From 461b0f47fc921abc595971e1f8c597d85e0918af Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 29 May 2026 16:21:46 -0500
Subject: [PATCH 001/113] Add recommendation_v4 (HSTU/DLRM-v3
 generative-recommenders fork @ d97e51c)

Vendored snapshot of chriscai-amd/generative-recommenders branch chcai/dlrmv4
(HEAD d97e51c) as a sibling of recommendation_v2/torchrec_dlrm. The Python
package generative_recommenders keeps its original name so all imports work
unchanged from the new location.

- recommendation_v4/generative_recommenders/: dlrm_v3, modules, ops, research, tests
- recommendation_v4/configs/: research HSTU gins
- recommendation_v4/scripts/launch_smoke_8gpu.sh: sanitized 8-GPU yambda-5b launcher
  (resolves package root from script path; AMD env defaults; pip_local override)
- recommendation_v4/{setup.py,requirements.txt,main.py,...}: upstream entry points
- .gitmodules: cutlass registered at parent repo level

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .gitmodules                                   |    3 +
 recommendation_v4/.gitignore                  |  158 +
 recommendation_v4/LICENSE                     |  202 ++
 recommendation_v4/README.MD                   |  135 +
 .../hstu-sampled-softmax-n512-final.gin       |   49 +
 .../hstu-sampled-softmax-n512-large-final.gin |   49 +
 .../sasrec-sampled-softmax-n512-final.gin     |   50 +
 .../ml-1m/hstu-sampled-softmax-n128-final.gin |   45 +
 .../hstu-sampled-softmax-n128-large-final.gin |   45 +
 .../sasrec-sampled-softmax-n128-final.gin     |   44 +
 .../hstu-sampled-softmax-n128-final.gin       |   45 +
 .../hstu-sampled-softmax-n128-large-final.gin |   45 +
 .../sasrec-sampled-softmax-n128-final.gin     |   44 +
 ...tu-sampled-softmax-n96-seqlen500-final.gin |   42 +
 ...pled-softmax-n96-seqlen500-large-final.gin |   42 +
 ...ec-sampled-softmax-n96-seqlen500-final.gin |   42 +
 .../generative_recommenders/README.md         |    0
 .../generative_recommenders/common.py         |  513 +++
 .../dlrm_v3/checkpoint.py                     |  258 ++
 .../dlrm_v3/configs.py                        |  704 ++++
 .../dlrm_v3/datasets/dataset.py               |  398 +++
 .../dlrm_v3/datasets/kuairand.py              |  163 +
 .../dlrm_v3/datasets/movie_lens.py            |  177 +
 .../dlrm_v3/datasets/synthetic_movie_lens.py  |   83 +
 .../dlrm_v3/datasets/synthetic_streaming.py   |  400 +++
 .../dlrm_v3/datasets/utils.py                 |  146 +
 .../dlrm_v3/datasets/yambda.py                |  608 ++++
 .../dlrm_v3/inference/README.md               |   88 +
 .../dlrm_v3/inference/accuracy.py             |   86 +
 .../dlrm_v3/inference/cpp/hstu_runner.cpp     |  215 ++
 .../dlrm_v3/inference/data_producer.py        |  227 ++
 .../dlrm_v3/inference/dense_predict_module.py |   96 +
 .../dlrm_v3/inference/end_to_end_test.py      |  795 +++++
 .../dlrm_v3/inference/gin/debug.gin           |   13 +
 .../dlrm_v3/inference/gin/kuairand_1k.gin     |   14 +
 .../dlrm_v3/inference/gin/movielens_13b.gin   |   16 +
 .../dlrm_v3/inference/gin/streaming_100b.gin  |   15 +
 .../dlrm_v3/inference/gin/streaming_400m.gin  |   15 +
 .../dlrm_v3/inference/inference_modules.py    |  253 ++
 .../dlrm_v3/inference/main.py                 |  805 +++++
 .../dlrm_v3/inference/mlperf.conf             |   98 +
 .../dlrm_v3/inference/model_family.py         |  705 ++++
 .../inference/sparse_predict_module.py        |  106 +
 .../dlrm_v3/inference/tests/inference_test.py |   39 +
 .../inference/tests/test_scripted_parity.py   |  236 ++
 .../thirdparty/loadgen/.clang-format          |    2 +
 .../thirdparty/loadgen/CMakeLists.txt         |  113 +
 .../inference/thirdparty/loadgen/MANIFEST.in  |    2 +
 .../inference/thirdparty/loadgen/README.md    |  223 ++
 .../thirdparty/loadgen/README_BUILD.md        |   47 +
 .../thirdparty/loadgen/README_FAQ.md          |   78 +
 .../inference/thirdparty/loadgen/VERSION.txt  |    1 +
 .../thirdparty/loadgen/benchmark/.gitignore   |    2 +
 .../thirdparty/loadgen/benchmark/README.md    |   10 +
 .../thirdparty/loadgen/benchmark/repro.cpp    |  296 ++
 .../thirdparty/loadgen/benchmark/run.sh       |   21 +
 .../thirdparty/loadgen/benchmark/run_debug.sh |   21 +
 .../thirdparty/loadgen/bindings/c_api.cc      |  176 +
 .../thirdparty/loadgen/bindings/c_api.h       |   95 +
 .../thirdparty/loadgen/bindings/python_api.cc |  484 +++
 .../thirdparty/loadgen/demos/lon/README.md    |   67 +
 .../loadgen/demos/lon/py_demo_server_lon.py   |  191 +
 .../demos/lon/sut_over_network_demo.py        |   88 +
 .../loadgen/demos/py_demo_multi_stream.py     |   86 +
 .../loadgen/demos/py_demo_offline.py          |   81 +
 .../loadgen/demos/py_demo_server.py           |   74 +
 .../loadgen/demos/py_demo_single_stream.py    |   84 +
 .../token_metrics/py_demo_multi_stream.py     |  142 +
 .../demos/token_metrics/py_demo_offline.py    |  130 +
 .../token_metrics/py_demo_offline_inferred.py |  130 +
 .../demos/token_metrics/py_demo_server.py     |  132 +
 .../token_metrics/py_demo_server_inferred.py  |  125 +
 .../token_metrics/py_demo_single_stream.py    |  129 +
 .../loadgen/diagram_network_submission.png    |  Bin 0 -> 51192 bytes
 .../thirdparty/loadgen/diagram_submission.png |  Bin 0 -> 36510 bytes
 .../thirdparty/loadgen/docs/src/BUILD.gn      |   33 +
 .../thirdparty/loadgen/docs/src/README.md     |   34 +
 .../thirdparty/loadgen/docs/src/doxygen.cfg   | 2495 +++++++++++++
 .../loadgen/docs/src/doxygen_footer.html      |   26 +
 .../loadgen/docs/src/doxygen_header.html      |   49 +
 .../docs/src/doxygen_html_generator.py        |   37 +
 .../loadgen/docs/src/doxygen_layout.xml       |  211 ++
 .../loadgen/docs/src/doxygen_stylesheet.css   | 1629 +++++++++
 .../docs/src/loadgen_integration_diagram.dia  |  Bin 0 -> 1943 bytes
 .../loadgen/docs/src/mlperf_icon.png          |  Bin 0 -> 4632 bytes
 .../docs/src/mlperf_logo_horizontal_color.svg |   55 +
 .../thirdparty/loadgen/early_stopping.cc      |  117 +
 .../thirdparty/loadgen/early_stopping.h       |   27 +
 .../loadgen/generated/version_generated.cc    |   98 +
 .../loadgen/issue_query_controller.cc         |  552 +++
 .../loadgen/issue_query_controller.h          |  215 ++
 .../inference/thirdparty/loadgen/loadgen.cc   | 1345 +++++++
 .../inference/thirdparty/loadgen/loadgen.h    |  103 +
 .../loadgen/loadgen_integration_diagram.svg   |   85 +
 .../inference/thirdparty/loadgen/logging.cc   | 1301 +++++++
 .../inference/thirdparty/loadgen/logging.h    |  816 +++++
 .../inference/thirdparty/loadgen/mlperf.conf  |  164 +
 .../thirdparty/loadgen/mlperf_conf.h          |  167 +
 .../thirdparty/loadgen/pyproject.toml         |    7 +
 .../loadgen/query_dispatch_library.h          |   42 +
 .../thirdparty/loadgen/query_sample.h         |   91 +
 .../thirdparty/loadgen/query_sample_library.h |   75 +
 .../thirdparty/loadgen/requirements.txt       |    1 +
 .../inference/thirdparty/loadgen/results.cc   |  856 +++++
 .../inference/thirdparty/loadgen/results.h    |  128 +
 .../inference/thirdparty/loadgen/setup.py     |  136 +
 .../thirdparty/loadgen/system_under_test.h    |   67 +
 .../thirdparty/loadgen/test_settings.h        |  329 ++
 .../loadgen/test_settings_internal.cc         |  800 +++++
 .../loadgen/test_settings_internal.h          |  182 +
 .../thirdparty/loadgen/tests/BUILD.gn         |   25 +
 .../thirdparty/loadgen/tests/README.md        |   42 +
 .../thirdparty/loadgen/tests/basic.cc         |  314 ++
 .../thirdparty/loadgen/tests/loadgen_test.h   |  198 ++
 .../loadgen/tests/loadgen_test_main.cc        |   33 +
 .../loadgen/tests/perftests_null_sut.cc       |  230 ++
 .../loadgen/tests/perftests_null_sut.py       |   61 +
 .../loadgen/tools/mlperf-trace.ipynb          |  441 +++
 .../inference/thirdparty/loadgen/utils.cc     |  124 +
 .../inference/thirdparty/loadgen/utils.h      |   70 +
 .../inference/thirdparty/loadgen/version.cc   |   85 +
 .../inference/thirdparty/loadgen/version.h    |   39 +
 .../thirdparty/loadgen/version_generator.py   |  141 +
 .../dlrm_v3/inference/ts_types.py             |   70 +
 .../dlrm_v3/inference/user.conf               |    5 +
 .../dlrm_v3/preprocess_public_data.py         |  211 ++
 .../dlrm_v3/streaming_synthetic_data.py       |  664 ++++
 .../dlrm_v3/train/gin/debug.gin               |   35 +
 .../dlrm_v3/train/gin/kuairand_1k.gin         |   41 +
 .../dlrm_v3/train/gin/movielens_13b.gin       |   41 +
 .../dlrm_v3/train/gin/movielens_18b.gin       |   56 +
 .../dlrm_v3/train/gin/movielens_1m.gin        |   38 +
 .../dlrm_v3/train/gin/movielens_20m.gin       |   56 +
 .../dlrm_v3/train/gin/streaming_100b.gin      |   52 +
 .../dlrm_v3/train/gin/streaming_200b.gin      |   63 +
 .../dlrm_v3/train/gin/streaming_400m.gin      |   61 +
 .../dlrm_v3/train/gin/yambda_5b.gin           |   50 +
 .../dlrm_v3/train/tests/train_test.py         |   29 +
 .../dlrm_v3/train/train_ranker.py             |  190 +
 .../dlrm_v3/train/utils.py                    |  902 +++++
 .../generative_recommenders/dlrm_v3/utils.py  |  652 ++++
 .../modules/action_encoder.py                 |  112 +
 .../modules/content_encoder.py                |  109 +
 .../contextual_interleave_preprocessor.py     |  357 ++
 .../modules/contextualize_mlps.py             |  141 +
 .../modules/dlrm_hstu.py                      |  626 ++++
 .../modules/dynamic_stu.py                    |  304 ++
 .../modules/hstu_transducer.py                |  330 ++
 .../modules/multitask_module.py               |  288 ++
 .../modules/positional_encoder.py             |   75 +
 .../modules/postprocessors.py                 |  182 +
 .../modules/preprocessors.py                  |  334 ++
 .../generative_recommenders/modules/stu.py    |  471 +++
 .../modules/tests/action_encoder_test.py      |  113 +
 .../modules/tests/content_encoder_test.py     |   74 +
 ...contextual_interleave_preprocessor_test.py |  499 +++
 .../modules/tests/dynamic_stu_test.py         |  279 ++
 .../modules/tests/multitask_module_test.py    |  233 ++
 .../modules/tests/stu_test.py                 |  453 +++
 .../ops/benchmarks/addmm_bench.py             |  174 +
 .../ops/benchmarks/hstu_attention_bench.py    |  406 +++
 .../ops/benchmarks/jagged_dense_bmm_bench.py  |  199 ++
 .../jagged_dense_bmm_broadcast_add_bench.py   |  270 ++
 .../jagged_dense_broadcast_add_bench.py       |  205 ++
 .../concat_1d_jagged_jagged_bench.py          |  125 +
 .../benchmarks/jagged_transpose_1d_bench.py   |  117 +
 .../replace_last_n_with_jagged_bench.py       |  150 +
 .../split_1d_jagged_jagged_bench.py           |  116 +
 .../generative_recommenders/ops/cpp/common.h  |   60 +
 .../ops/cpp/complete_cumsum.cpp               |   44 +
 .../ops/cpp/complete_cumsum.cu                |   51 +
 .../ops/cpp/concat_1d_jagged_jagged.cpp       |  111 +
 .../ops/cpp/concat_1d_jagged_jagged.cu        |  130 +
 .../ops/cpp/cpp_ops.cpp                       |  207 ++
 .../ops/cpp/cuda_hstu_attention.py            |  193 +
 .../cpp/cuda_hstu_preprocess_and_attention.py |  668 ++++
 .../ops/cpp/expand_1d_jagged_to_dense.cpp     |   97 +
 .../ops/cpp/expand_1d_jagged_to_dense.cu      |  103 +
 .../hstu_attention/copy_sm90_bulk_reduce.h    |   66 +
 .../ops/cpp/hstu_attention/epilogue_bwd.h     |  481 +++
 .../ops/cpp/hstu_attention/epilogue_fwd.h     |  550 +++
 .../ops/cpp/hstu_attention/flash.h            |  157 +
 .../ops/cpp/hstu_attention/flash_api.cpp      |  322 ++
 .../ops/cpp/hstu_attention/flash_api_cpu.cpp  |  256 ++
 .../hstu_attention/flash_bwd_kernel_sm90.h    |  402 +++
 .../flash_bwd_launch_template.h               |  492 +++
 .../flash_bwd_postprocess_kernel.h            |  348 ++
 .../flash_bwd_preprocess_kernel.h             |  349 ++
 .../ops/cpp/hstu_attention/flash_common.cpp   | 1165 ++++++
 .../ops/cpp/hstu_attention/flash_common.h     |  149 +
 .../cpp/hstu_attention/flash_common_cpu.cpp   |  172 +
 .../ops/cpp/hstu_attention/flash_common_cpu.h |  114 +
 .../hstu_attention/flash_fwd_kernel_sm90.h    |  511 +++
 .../flash_fwd_launch_template.h               |  376 ++
 .../cpp/hstu_attention/generate_kernels.py    |  236 ++
 ...lash_bwd_hdim128_bf16_softmaxfalse_sm90.cu |   33 +
 ...flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu |   33 +
 ...lash_bwd_hdim128_fp16_softmaxfalse_sm90.cu |   33 +
 ...flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu |   33 +
 ...lash_bwd_hdim192_bf16_softmaxfalse_sm90.cu |   33 +
 ...flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu |   33 +
 ...lash_bwd_hdim192_fp16_softmaxfalse_sm90.cu |   33 +
 ...flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu |   33 +
 ...lash_bwd_hdim256_bf16_softmaxfalse_sm90.cu |   33 +
 ...flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu |   33 +
 ...lash_bwd_hdim256_fp16_softmaxfalse_sm90.cu |   33 +
 ...flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu |   33 +
 ...flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu |   33 +
 .../flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu |   33 +
 ...flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu |   33 +
 .../flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu |   33 +
 ...flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu |   33 +
 .../flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu |   33 +
 ...flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu |   33 +
 .../flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu |   33 +
 ...lash_fwd_hdim128_bf16_softmaxfalse_sm90.cu |   33 +
 ...flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu |   33 +
 ...lash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu |   33 +
 ...flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu |   33 +
 ...lash_fwd_hdim128_fp16_softmaxfalse_sm90.cu |   33 +
 ...flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu |   33 +
 ...lash_fwd_hdim192_bf16_softmaxfalse_sm90.cu |   33 +
 ...flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu |   33 +
 ...lash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu |   33 +
 ...flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu |   33 +
 ...lash_fwd_hdim192_fp16_softmaxfalse_sm90.cu |   33 +
 ...flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu |   33 +
 ...lash_fwd_hdim256_bf16_softmaxfalse_sm90.cu |   33 +
 ...flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu |   33 +
 ...lash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu |   33 +
 ...flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu |   33 +
 ...lash_fwd_hdim256_fp16_softmaxfalse_sm90.cu |   33 +
 ...flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu |   33 +
 ...flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu |   33 +
 .../flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu |   33 +
 ...flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu |   33 +
 .../flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu |   33 +
 ...flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu |   33 +
 .../flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu |   33 +
 ...flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu |   33 +
 .../flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu |   33 +
 ...flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu |   33 +
 .../flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu |   33 +
 ...flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu |   33 +
 .../flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu |   33 +
 .../mainloop_bwd_sm90_tma_gmma_ws.h           | 3166 +++++++++++++++++
 .../mainloop_fwd_sm90_tma_gmma_ws.h           | 2180 ++++++++++++
 .../ops/cpp/hstu_attention/mask.h             |  396 +++
 .../ops/cpp/hstu_attention/named_barrier.h    |  101 +
 .../ops/cpp/hstu_attention/seqlen.h           |  134 +
 .../hstu_attention/sm90_pipeline_no_cluster.h |  150 +
 .../ops/cpp/hstu_attention/softmax.h          |  256 ++
 .../ops/cpp/hstu_attention/static_switch.h    |  135 +
 .../ops/cpp/hstu_attention/tile_scheduler.h   |  616 ++++
 .../ops/cpp/hstu_attention/tile_size.h        |  220 ++
 .../ops/cpp/hstu_attention/utils.h            |  789 ++++
 .../ops/cpp/hstu_attention/version.txt        |    1 +
 .../ops/cpp/jagged_transpose_1d.cpp           |  130 +
 .../ops/cpp/jagged_transpose_1d.cu            |  127 +
 .../ops/cpp/replace_last_n_with_jagged.cpp    |  139 +
 .../ops/cpp/replace_last_n_with_jagged.cu     |  156 +
 .../generative_recommenders/ops/cpp/setup.py  |  487 +++
 .../ops/cpp/sort_kv_pairs_cuda.cpp            |   40 +
 .../sort_kv_pairs_cuda_kernels_template.cu    |   82 +
 .../cpp/sort_kv_pairs_cuda_kernels_template.h |   15 +
 .../ops/cpp/split_1d_jagged_jagged.cpp        |  136 +
 .../ops/cpp/split_1d_jagged_jagged.cu         |  147 +
 .../cpp/tests/concat_1d_jagged_jagged_test.py |  135 +
 .../ops/cpp/tests/hstu_mha_cpu_test.py        |   39 +
 .../ops/cpp/tests/jagged_transpose_1d_test.py |  132 +
 .../tests/replace_last_n_with_jagged_test.py  |  105 +
 .../cpp/tests/split_1d_jagged_jagged_test.py  |  100 +
 .../ops/hstu_attention.py                     |  353 ++
 .../ops/hstu_compute.py                       |  390 ++
 .../ops/jagged_tensors.py                     |  451 +++
 .../generative_recommenders/ops/layer_norm.py |  330 ++
 .../generative_recommenders/ops/mm.py         |   60 +
 .../generative_recommenders/ops/position.py   |  147 +
 .../ops/pytorch/pt_hstu_attention.py          |  251 ++
 .../ops/pytorch/pt_hstu_linear.py             |  130 +
 .../ops/pytorch/pt_jagged.py                  |  258 ++
 .../ops/pytorch/pt_jagged_tensors.py          |  246 ++
 .../ops/pytorch/pt_layer_norm.py              |   81 +
 .../ops/pytorch/pt_position.py                |  142 +
 .../ops/tests/fake_signature_test.py          |  162 +
 .../ops/tests/hstu_attention_test.py          |  485 +++
 .../ops/tests/hstu_attention_tma_test.py      |  270 ++
 .../ops/tests/hstu_compute_test.py            |  503 +++
 .../ops/tests/jagged_tensors_test.py          |  963 +++++
 .../ops/tests/layer_norm_test.py              |  231 ++
 .../ops/tests/mm_test.py                      |  156 +
 .../ops/tests/position_test.py                |  234 ++
 .../ops/tests/rms_norm_test.py                |  229 ++
 .../ops/triton/triton_addmm.py                | 1706 +++++++++
 .../ops/triton/triton_attention_utils.py      |   64 +
 .../ops/triton/triton_hstu_attention.py       | 3134 ++++++++++++++++
 .../ops/triton/triton_hstu_linear.py          | 3042 ++++++++++++++++
 .../triton_hstu_preprocess_and_attention.py   |  342 ++
 .../ops/triton/triton_jagged.py               | 2533 +++++++++++++
 .../ops/triton/triton_jagged_tensors.py       | 1067 ++++++
 .../ops/triton/triton_layer_norm.py           | 1327 +++++++
 .../ops/triton/triton_position.py             |  438 +++
 .../ops/triton/triton_swiglu.py               |  753 ++++
 .../ops/triton_aot/README.md                  |   54 +
 .../ops/triton_aot/compile/arg_descriptor.py  |  146 +
 .../ops/triton_aot/compile/codegen.py         |  780 ++++
 .../ops/triton_aot/compile/compile_state.py   |  409 +++
 .../ops/triton_aot/compile/pipeline.py        |  300 ++
 .../ops/triton_aot/compile/spec_processing.py |  593 +++
 .../ops/triton_aot/compile/stable_types.py    |   35 +
 .../triton_aot/compile/triton_aot_compile.py  |  149 +
 .../ops/triton_aot/compile/utils.py           |   47 +
 .../ops/triton_aot/preprocess.py              |   76 +
 .../ops/triton_aot/shared/compat.py           |   91 +
 .../ops/triton_aot/shared/spec_conversion.py  |  389 ++
 .../ops/triton_aot/shared/types.py            |   58 +
 .../triton_aot/templates/embedded_cubins.cpp  |    7 +
 .../ops/triton_aot/templates/kernel.cpp       |  104 +
 .../ops/triton_aot/templates/kernel.h         |   36 +
 .../triton_aot/templates/template_utils.py    |   96 +
 .../ops/triton_aot/templates/torch_op.cpp     |   22 +
 .../ops/triton_aot/transform/import_utils.py  |   89 +
 .../transform/kernel_wrapper_codegen.py       |  500 +++
 .../triton_aot/transform/replace_kernels.py   |  137 +
 .../triton_aot/transform/transform_kernels.py |   29 +
 .../ops/triton_aot/triton_addmm.py            |  347 ++
 .../ops/triton_aot/triton_concat_2d_jagged.py |  183 +
 .../triton_group_norm_mul_dropout.py          |  124 +
 .../ops/triton_aot/triton_layer_norm.py       |  119 +
 .../triton_layer_norm_mul_dropout.py          |  162 +
 .../ops/triton_aot/triton_position.py         |  176 +
 .../triton_ragged_hstu_attention.py           |  366 ++
 .../ops/triton_aot/triton_rms_norm.py         |  114 +
 .../ops/triton_aot/triton_split_2d_jagged.py  |  138 +
 .../ops/triton_aot/types.py                   |  181 +
 .../generative_recommenders/ops/utils.py      |   93 +
 .../research/data/dataset.py                  |  248 ++
 .../research/data/eval.py                     |  263 ++
 .../research/data/item_features.py            |   29 +
 .../research/data/preprocessor.py             |  474 +++
 .../research/data/reco_dataset.py             |  176 +
 .../research/indexing/candidate_index.py      |  179 +
 .../research/indexing/utils.py                |   43 +
 .../research/modeling/initialization.py       |   35 +
 .../sequential/autoregressive_losses.py       |  477 +++
 .../modeling/sequential/embedding_modules.py  |  108 +
 .../modeling/sequential/encoder_utils.py      |  150 +
 .../research/modeling/sequential/features.py  |   94 +
 .../research/modeling/sequential/hstu.py      |  808 +++++
 .../input_features_preprocessors.py           |  259 ++
 .../sequential/losses/sampled_softmax.py      |  193 +
 .../sequential/output_postprocessors.py       |   82 +
 .../research/modeling/sequential/sasrec.py    |  316 ++
 .../research/modeling/sequential/utils.py     |  129 +
 .../research/modeling/similarity_module.py    |   68 +
 .../research/modeling/similarity_utils.py     |  222 ++
 .../rails/indexing/candidate_index.py         |   41 +
 .../research/rails/indexing/mips_top_k.py     |   80 +
 .../research/rails/indexing/mol_top_k.py      |  132 +
 .../similarities/dot_product_similarity_fn.py |   68 +
 .../research/rails/similarities/layers.py     |   82 +
 .../research/rails/similarities/module.py     |   55 +
 .../rails/similarities/mol/embeddings_fn.py   |   52 +
 .../similarities/mol/item_embeddings_fn.py    |   99 +
 .../similarities/mol/query_embeddings_fn.py   |  164 +
 .../rails/similarities/mol/similarity_fn.py   |  388 ++
 .../research/trainer/data_loader.py           |   57 +
 .../research/trainer/train.py                 |  532 +++
 .../tests/test_common.py                      |   41 +
 recommendation_v4/main.py                     |   82 +
 recommendation_v4/preprocess_public_data.py   |   32 +
 recommendation_v4/requirements.txt            |    7 +
 recommendation_v4/run_fractal_expansion.py    |  588 +++
 .../scripts/launch_smoke_8gpu.sh              |   38 +
 recommendation_v4/setup.py                    |   41 +
 375 files changed, 88510 insertions(+)
 create mode 100644 recommendation_v4/.gitignore
 create mode 100644 recommendation_v4/LICENSE
 create mode 100644 recommendation_v4/README.MD
 create mode 100644 recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin
 create mode 100644 recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin
 create mode 100644 recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin
 create mode 100644 recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin
 create mode 100644 recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin
 create mode 100644 recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin
 create mode 100644 recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin
 create mode 100644 recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin
 create mode 100644 recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin
 create mode 100644 recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin
 create mode 100644 recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin
 create mode 100644 recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin
 create mode 100644 recommendation_v4/generative_recommenders/README.md
 create mode 100644 recommendation_v4/generative_recommenders/common.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/configs.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/loadgen_integration_diagram.dia
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_icon.png
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h
 create mode 100755 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/streaming_synthetic_data.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/debug.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/tests/train_test.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/utils.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/action_encoder.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/content_encoder.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/contextual_interleave_preprocessor.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/contextualize_mlps.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/dynamic_stu.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/hstu_transducer.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/multitask_module.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/positional_encoder.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/postprocessors.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/preprocessors.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/stu.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/tests/action_encoder_test.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/tests/content_encoder_test.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/tests/contextual_interleave_preprocessor_test.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/tests/dynamic_stu_test.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/tests/multitask_module_test.py
 create mode 100644 recommendation_v4/generative_recommenders/modules/tests/stu_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/benchmarks/addmm_bench.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_bench.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_broadcast_add_bench.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_broadcast_add_bench.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/common.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/setup.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/hstu_attention.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/hstu_compute.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/jagged_tensors.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/layer_norm.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/mm.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/position.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_attention.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_linear.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged_tensors.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/pytorch/pt_layer_norm.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/pytorch/pt_position.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/tests/fake_signature_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/tests/hstu_attention_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/tests/hstu_attention_tma_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/tests/hstu_compute_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/tests/jagged_tensors_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/tests/layer_norm_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/tests/mm_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/tests/position_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/tests/rms_norm_test.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_attention_utils.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_position.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/triton_swiglu.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/README.md
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/types.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/utils.py
 create mode 100644 recommendation_v4/generative_recommenders/research/data/dataset.py
 create mode 100644 recommendation_v4/generative_recommenders/research/data/eval.py
 create mode 100644 recommendation_v4/generative_recommenders/research/data/item_features.py
 create mode 100644 recommendation_v4/generative_recommenders/research/data/preprocessor.py
 create mode 100644 recommendation_v4/generative_recommenders/research/data/reco_dataset.py
 create mode 100644 recommendation_v4/generative_recommenders/research/indexing/candidate_index.py
 create mode 100644 recommendation_v4/generative_recommenders/research/indexing/utils.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/initialization.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/features.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/similarity_module.py
 create mode 100644 recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/layers.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/module.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py
 create mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py
 create mode 100644 recommendation_v4/generative_recommenders/research/trainer/data_loader.py
 create mode 100644 recommendation_v4/generative_recommenders/research/trainer/train.py
 create mode 100644 recommendation_v4/generative_recommenders/tests/test_common.py
 create mode 100644 recommendation_v4/main.py
 create mode 100644 recommendation_v4/preprocess_public_data.py
 create mode 100644 recommendation_v4/requirements.txt
 create mode 100644 recommendation_v4/run_fractal_expansion.py
 create mode 100755 recommendation_v4/scripts/launch_smoke_8gpu.sh
 create mode 100644 recommendation_v4/setup.py

diff --git a/.gitmodules b/.gitmodules
index 51d8eac03..6ebcea592 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,3 +6,6 @@
 	path = text_to_image/torchtitan
 	url = https://github.com/pytorch/torchtitan.git
 	branch = mlperf-training-flux.1
+[submodule "recommendation_v4/cutlass"]
+	path = recommendation_v4/generative_recommenders/ops/cpp/cutlass
+	url = https://github.com/NVIDIA/cutlass.git
diff --git a/recommendation_v4/.gitignore b/recommendation_v4/.gitignore
new file mode 100644
index 000000000..560f823c4
--- /dev/null
+++ b/recommendation_v4/.gitignore
@@ -0,0 +1,158 @@
+# Don't check in parsed data files and other temporary files
+tmp/
+exps/
+ckpts/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
diff --git a/recommendation_v4/LICENSE b/recommendation_v4/LICENSE
new file mode 100644
index 000000000..d64569567
--- /dev/null
+++ b/recommendation_v4/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
new file mode 100644
index 000000000..a60d0d3d7
--- /dev/null
+++ b/recommendation_v4/README.MD
@@ -0,0 +1,135 @@
+# Generative Recommenders
+
+Repository hosting code for ``Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations`` ([ICML'24 paper](https://proceedings.mlr.press/v235/zhai24a.html)) and related code, where we demonstrate that the ubiquitously used classical deep learning recommendation paradigm (DLRMs) can be reformulated as a generative modeling problem (Generative Recommenders or GRs) to overcome known compute scaling bottlenecks, propose efficient algorithms such as HSTU and M-FALCON to accelerate training and inference for large-scale sequential models by 10x-1000x, and demonstrate scaling law for the first-time in deployed, billion-user scale recommendation systems.
+
+## Getting started
+
+We recommend using `requirements.txt`. This has been tested with Ubuntu 22.04, CUDA 12.4, and Python 3.10.
+
+```bash
+pip3 install -r requirements.txt
+```
+
+Alternatively, you can manually install PyTorch based on official instructions. Then,
+
+```bash
+pip3 install gin-config pandas fbgemm_gpu torchrec tensorboard
+```
+
+## Experiments
+
+### Public Experiments
+
+To reproduce the public experiments in our paper (traditional sequential recommender setting, Section 4.1.1) on MovieLens and Amazon Reviews in the paper, please follow these steps:
+
+#### Download and preprocess data.
+
+```bash
+mkdir -p tmp/ && python3 preprocess_public_data.py
+```
+
+A GPU with 24GB or more HBM should work for most datasets.
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin --master_port=12345
+```
+
+Other configurations are included in configs/ml-1m, configs/ml-20m, and configs/amzn-books to make reproducing these experiments easier.
+
+#### Verify results.
+
+By default we write experimental logs to exps/. We can launch tensorboard with something like the following:
+
+```bash
+tensorboard --logdir ~/generative-recommenders/exps/ml-1m-l200/ --port 24001 --bind_all
+tensorboard --logdir ~/generative-recommenders/exps/ml-20m-l200/ --port 24001 --bind_all
+tensorboard --logdir ~/generative-recommenders/exps/amzn-books-l50/ --port 24001 --bind_all
+```
+
+With the provided configuration (.gin) files, you should be able to reproduce the following results (verified as of 04/15/2024):
+
+**MovieLens-1M (ML-1M)**:
+
+| Method        | HR@10            | NDCG@10         | HR@50           | NDCG@50         | HR@200          | NDCG@200        |
+| ------------- | ---------------- | ----------------| --------------- | --------------- | --------------- | --------------- |
+| SASRec        | 0.2853           | 0.1603          | 0.5474          | 0.2185          | 0.7528          | 0.2498          |
+| BERT4Rec      | 0.2843 (-0.4%)   | 0.1537 (-4.1%)  |                 |                 |                 |                 |
+| GRU4Rec       | 0.2811 (-1.5%)   | 0.1648 (+2.8%)  |                 |                 |                 |                 |
+| HSTU          | 0.3097 (+8.6%)   | 0.1720 (+7.3%)  | 0.5754 (+5.1%)  | 0.2307 (+5.6%)  | 0.7716 (+2.5%)  | 0.2606 (+4.3%)  |
+| HSTU-large    | **0.3294 (+15.5%)**  | **0.1893 (+18.1%)** | **0.5935 (+8.4%)**  | **0.2481 (+13.5%)** | **0.7839 (+4.1%)**  | **0.2771 (+10.9%)** |
+
+**MovieLens-20M (ML-20M)**:
+
+| Method        | HR@10            | NDCG@10         | HR@50           | NDCG@50         | HR@200          | NDCG@200        |
+| ------------- | ---------------- | --------------- | --------------- | --------------- | --------------- | --------------- |
+| SASRec        | 0.2889           | 0.1621          | 0.5503          | 0.2199          | 0.7661          | 0.2527          |
+| BERT4Rec      | 0.2816 (-2.5%)   | 0.1703 (+5.1%)  |                 |                 |                 |                 |
+| GRU4Rec       | 0.2813 (-2.6%)   | 0.1730 (+6.7%)  |                 |                 |                 |                 |
+| HSTU          | 0.3273 (+13.3%)  | 0.1895 (+16.9%) | 0.5889 (+7.0%)  | 0.2473 (+12.5%) | 0.7952 (+3.8%)  | 0.2787 (+10.3%) |
+| HSTU-large    | **0.3556 (+23.1%)**  | **0.2098 (+29.4%)** | **0.6143 (+11.6%)** | **0.2671 (+21.5%)** | **0.8074 (+5.4%)**  | **0.2965 (+17.4%)** |
+
+**Amazon Reviews (Books)**:
+
+| Method        | HR@10            | NDCG@10         | HR@50           | NDCG@50         | HR@200          | NDCG@200        |
+| ------------- | ---------------- | ----------------|---------------- | --------------- | --------------- | --------------- |
+| SASRec        | 0.0306           | 0.0164          | 0.0754          | 0.0260          | 0.1431          | 0.0362          |
+| HSTU          | 0.0416 (+36.4%)  | 0.0227 (+39.3%) | 0.0957 (+27.1%) | 0.0344 (+32.3%) | 0.1735 (+21.3%) | 0.0461 (+27.7%) |
+| HSTU-large    | **0.0478 (+56.7%)**  | **0.0262 (+60.7%)** | **0.1082 (+43.7%)** | **0.0393 (+51.2%)** | **0.1908 (+33.4%)** | **0.0517 (+43.2%)** |
+
+for all three tables above, the ``SASRec`` rows are based on [Self-Attentive Sequential Recommendation](https://arxiv.org/abs/1808.09781) but with the original binary cross entropy loss
+replaced with sampled softmax losses proposed in [Revisiting Neural Retrieval on Accelerators](https://arxiv.org/abs/2306.04039). These rows are reproducible with ``configs/*/sasrec-*-final.gin``.
+The ``BERT4Rec`` and ``GRU4Rec`` rows are based on results reported by [Turning Dross Into Gold Loss: is BERT4Rec really better than SASRec?](https://arxiv.org/abs/2309.07602) -
+note that the comparison slightly favors these two, due to them using full negatives whereas the other rows used 128/512 sampled negatives. The ``HSTU`` and ``HSTU-large`` rows are based on [Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations](https://arxiv.org/abs/2402.17152); in particular, HSTU rows utilize identical configurations as SASRec. ``HSTU`` and ``HSTU-large`` results can be reproduced with ``configs/*/hstu-*-final.gin``.
+
+### Synthetic Dataset / MovieLens-3B
+
+We support generating synthetic dataset with fractal expansion introduced in https://arxiv.org/abs/1901.08910. This allows us to expand the current 20 million real-world ratings in ML-20M to 3 billion.
+
+To download the pre-generated synthetic dataset:
+
+```bash
+pip3 install gdown
+mkdir -p tmp/ && cd tmp/
+gdown https://drive.google.com/uc?id=1-jZ6k0el7e7PyFnwqMLfqUTRh_Qdumt-
+unzip ml-3b.zip && rm ml-3b.zip
+```
+
+To generate the synthetic dataset on your own:
+
+```bash
+python3 run_fractal_expansion.py --input-csv-file tmp/ml-20m/ratings.csv --write-dataset True --output-prefix tmp/ml-3b/
+```
+
+### Efficiency experiments
+
+``ops/triton`` contains triton kernels needed for efficiency experiments. ``ops/cpp`` contains efficient CUDA kernels. In particular, ``ops/cpp/hstu_attention`` contains the attention implementation based on [FlashAttention V3](https://github.com/Dao-AILab/flash-attention) with state-of-the-art efficiency on H100 GPUs.
+
+## DLRM-v3
+
+We have created a DLRM model using HSTU and have developed benchmarks for both training and inference to faciliate production RecSys use cases.
+
+#### Run model training with 4 GPUs
+
+```bash
+LOCAL_WORLD_SIZE=4 WORLD_SIZE=4 python3 generative_recommenders/dlrm_v3/train/train_ranker.py --dataset debug --mode train
+```
+
+#### Run model inference with 4 GPUs
+
+```bash
+git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
+cd mlperf_inference/loadgen
+CFLAGS="-std=c++14 -O3" python -m pip install .
+
+LOCAL_WORLD_SIZE=4 WORLD_SIZE=4 python3 generative_recommenders/dlrm_v3/inference/main.py --dataset debug
+```
+
+## License
+This codebase is Apache 2.0 licensed, as found in the [LICENSE](LICENSE) file.
+
+## Contributors
+The overall project is made possible thanks to the joint work from many technical contributors (listed in alphabetical order):
+
+Adnan Akhundov, Bugra Akyildiz, Shabab Ayub, Alex Bao, Renqin Cai, Jennifer Cao, Xuan Cao, Guoqiang Jerry Chen, Lei Chen, Li Chen, Sean Chen, Xianjie Chen, Huihui Cheng, Weiwei Chu, Ted Cui, Shiyan Deng, Nimit Desai, Fei Ding, Shilin Ding, Francois Fagan, Lu Fang, Leon Gao, Zhaojie Gong, Fangda Gu, Liang Guo, Liz Guo, Jeevan Gyawali, Yuchen Hao, Daisy Shi He, Michael Jiayuan He, Yu He, Samuel Hsia, Jie Hua, Yanzun Huang, Hongyi Jia, Rui Jian, Jian Jin, Rafay Khurram, Rahul Kindi, Changkyu Kim, Yejin Lee, Fu Li, Han Li, Hong Li, Shen Li, Rui Li, Wei Li, Zhijing Li, Lucy Liao, Xueting Liao, Emma Lin, Hao Lin, Chloe Liu, Jingzhou Liu, Xing Liu, Xingyu Liu, Kai Londenberg, Yinghai Lu, Liang Luo, Linjian Ma, Matt Ma, Yun Mao, Bert Maher, Ajit Mathews, Matthew Murphy, Satish Nadathur, Min Ni, Jongsoo Park, Colin Peppler, Jing Qian, Lijing Qin, Jing Shan, Alex Singh, Timothy Shi,  Yu Shi, Dennis van der Staay, Xiao Sun, Colin Taylor, Shin-Yeh Tsai, Rohan Varma, Omkar Vichare, Alyssa Wang, Pengchao Wang, Shengzhi Wang, Wenting Wang, Xiaolong Wang, Yueming Wang, Zhiyong Wang, Wei Wei, Bin Wen, Carole-Jean Wu, Yanhong Wu, Eric Xu, Bi Xue, Hong Yan, Zheng Yan, Chao Yang, Junjie Yang, Wen-Yun Yang, Ze Yang, Zimeng Yang, Yuanjun Yao, Chunxing Yin, Daniel Yin, Yiling You, Jiaqi Zhai, Keke Zhai, Yanli Zhao, Zhuoran Zhao, Hui Zhang, Jingjing Zhang, Lu Zhang, Lujia Zhang, Na Zhang, Rui Zhang, Xiong Zhang, Ying Zhang, Zhiyun Zhang, Charles Zheng, Erheng Zhong, Zhao Zhu, Xin Zhuang.
+
+For the initial paper describing the Generative Recommender problem formulation and the algorithms used, including HSTU and M-FALCON, please refer to ``Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations``([ICML'24 paper](https://dl.acm.org/doi/10.5555/3692070.3694484), [slides](https://icml.cc/media/icml-2024/Slides/32684.pdf)).
diff --git a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin
new file mode 100644
index 000000000..8fb8b258c
--- /dev/null
+++ b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/amzn-books-l50/
+# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/hstu-sampled-softmax-n512-final.gin --master_port=12346 2>&1 | tee logs/amzn-books-l50/hstu-sampled-softmax-n512-final.log
+
+train_fn.dataset_name = "amzn-books"
+train_fn.max_sequence_length = 50
+train_fn.local_batch_size = 128
+train_fn.eval_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.5
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.item_embedding_dim = 64
+
+hstu_encoder.num_blocks = 4
+hstu_encoder.num_heads = 4
+hstu_encoder.dv = 16
+hstu_encoder.dqk = 16
+hstu_encoder.linear_dropout_rate = 0.5
+
+train_fn.eval_interval = 4000
+train_fn.num_epochs = 201
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 512
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+train_fn.full_eval_every_n = 5
+train_fn.partial_eval_num_iters = 64
+
+create_data_loader.prefetch_factor = 1024
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin
new file mode 100644
index 000000000..097d4cbc7
--- /dev/null
+++ b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin
@@ -0,0 +1,49 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on HSTU-large results in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/amzn-books-l50/
+# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin --master_port=12346 2>&1 | tee logs/amzn-books-l50/hstu-sampled-softmax-n512-large-final2.log
+
+train_fn.dataset_name = "amzn-books"
+train_fn.max_sequence_length = 50
+train_fn.local_batch_size = 128
+train_fn.eval_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.5
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.item_embedding_dim = 64
+
+hstu_encoder.num_blocks = 16
+hstu_encoder.num_heads = 8
+hstu_encoder.dv = 8
+hstu_encoder.dqk = 8
+hstu_encoder.linear_dropout_rate = 0.5
+
+train_fn.eval_interval = 4000
+train_fn.num_epochs = 201
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 512
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+train_fn.full_eval_every_n = 5
+train_fn.partial_eval_num_iters = 64
+
+create_data_loader.prefetch_factor = 1024
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin b/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin
new file mode 100644
index 000000000..bc899c9fb
--- /dev/null
+++ b/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin
@@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
+#
+# Run this as:
+# mkdir -p logs/amzn-books-l50/
+# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/sasrec-sampled-softmax-n512-final.gin  --master_port=12346 2>&1 | tee logs/amzn-books-l50/sasrec-sampled-softmax-n512-final.log
+
+train_fn.dataset_name = "amzn-books"
+train_fn.max_sequence_length = 50
+train_fn.local_batch_size = 128
+train_fn.eval_batch_size = 128
+
+train_fn.main_module = "SASRec"
+train_fn.dropout_rate = 0.5
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.item_embedding_dim = 64
+
+sasrec_encoder.num_blocks = 4
+sasrec_encoder.num_heads = 4
+sasrec_encoder.ffn_dropout_rate = 0.5
+sasrec_encoder.ffn_hidden_dim = 64
+sasrec_encoder.ffn_activation_fn = "relu"
+
+train_fn.eval_interval = 4000
+train_fn.num_epochs = 201
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.save_ckpt_every_n = 10
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 512
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+train_fn.full_eval_every_n = 5
+train_fn.partial_eval_num_iters = 64
+
+create_data_loader.prefetch_factor = 1024
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin
new file mode 100644
index 000000000..841b1c80a
--- /dev/null
+++ b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/11/2024.
+# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/ml-1m-l200/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-1m-l200/hstu-sampled-softmax-n128-final.log
+
+train_fn.dataset_name = "ml-1m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 50
+
+hstu_encoder.num_blocks = 2
+hstu_encoder.num_heads = 1
+hstu_encoder.dqk = 50
+hstu_encoder.dv = 50
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin
new file mode 100644
index 000000000..7ffc7ef64
--- /dev/null
+++ b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/11/2024.
+# Based on HSTU-large results in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/ml-1m-l200/
+# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin --master_port=12346 2>&1 | tee logs/ml-1m-l200/hstu-sampled-softmax-n128-large-final.log
+
+train_fn.dataset_name = "ml-1m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 50
+
+hstu_encoder.num_blocks = 8
+hstu_encoder.num_heads = 2
+hstu_encoder.dqk = 25
+hstu_encoder.dv = 25
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin
new file mode 100644
index 000000000..ead7bb21c
--- /dev/null
+++ b/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin
@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/11/2024.
+# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
+#
+# Run this as:
+# mkdir -p logs/ml-1m-l200/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/sasrec-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-1m-l200/sasrec-sampled-softmax-n128-final.log
+
+train_fn.dataset_name = "ml-1m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "SASRec"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 50
+
+sasrec_encoder.num_blocks = 2
+sasrec_encoder.num_heads = 1
+sasrec_encoder.ffn_dropout_rate = 0.2
+sasrec_encoder.ffn_hidden_dim = 50
+sasrec_encoder.ffn_activation_fn = "relu"
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.top_k_method = "MIPSBruteForceTopK"
+train_fn.interaction_module_type = "DotProduct"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin
new file mode 100644
index 000000000..5823ad5b6
--- /dev/null
+++ b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/ml-20m-l200/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-20m/hstu-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/hstu-sampled-softmax-n128-final.log
+
+train_fn.dataset_name = "ml-20m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 256
+
+hstu_encoder.num_blocks = 4
+hstu_encoder.num_heads = 4
+hstu_encoder.dv = 64
+hstu_encoder.dqk = 64
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin
new file mode 100644
index 000000000..0199afa24
--- /dev/null
+++ b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin
@@ -0,0 +1,45 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on HSTU-large results in
+# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
+#
+# Run this as:
+# mkdir -p logs/ml-20m-l200/
+# CUDA_VISIBLE_DEVICES=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python3 main.py --gin_config_file=configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/hstu-sampled-softmax-n128-large-final.log
+
+train_fn.dataset_name = "ml-20m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 256
+
+hstu_encoder.num_blocks = 16
+hstu_encoder.num_heads = 8
+hstu_encoder.dv = 32
+hstu_encoder.dqk = 32
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin
new file mode 100644
index 000000000..3c666f802
--- /dev/null
+++ b/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin
@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Frozen config, validated on 04/12/2024.
+# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
+#
+# Run this as:
+# mkdir -p logs/ml-20m-l200/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-20m/sasrec-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/sasrec-sampled-softmax-n128-final.log
+
+train_fn.dataset_name = "ml-20m"
+train_fn.max_sequence_length = 200
+train_fn.local_batch_size = 128
+
+train_fn.main_module = "SASRec"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 101
+train_fn.item_embedding_dim = 256
+
+sasrec_encoder.num_blocks = 4
+sasrec_encoder.num_heads = 4
+sasrec_encoder.ffn_dropout_rate = 0.2
+sasrec_encoder.ffn_hidden_dim = 256
+sasrec_encoder.ffn_activation_fn = "relu"
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.top_k_method = "MIPSBruteForceTopK"
+train_fn.interaction_module_type = "DotProduct"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin
new file mode 100644
index 000000000..ac7a85350
--- /dev/null
+++ b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Run this as:
+# mkdir -p logs/ml-3b-l500/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/hstu-sampled-softmax-n96-seqlen500-final.log
+
+train_fn.dataset_name = "ml-3b"
+train_fn.max_sequence_length = 500
+train_fn.local_batch_size = 96
+train_fn.eval_batch_size = 96
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 100
+train_fn.item_embedding_dim = 256
+
+hstu_encoder.num_blocks = 4
+hstu_encoder.num_heads = 4
+hstu_encoder.dv = 64
+hstu_encoder.dqk = 64
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin
new file mode 100644
index 000000000..a30ad3657
--- /dev/null
+++ b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Run this as:
+# mkdir -p logs/ml-3b-l500/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/hstu-sampled-softmax-n96-seqlen500-large-final.log
+
+train_fn.dataset_name = "ml-3b"
+train_fn.max_sequence_length = 500
+train_fn.local_batch_size = 96
+train_fn.eval_batch_size = 96
+
+train_fn.main_module = "HSTU"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 100
+train_fn.item_embedding_dim = 256
+
+hstu_encoder.num_blocks = 16
+hstu_encoder.num_heads = 8
+hstu_encoder.dv = 32
+hstu_encoder.dqk = 32
+hstu_encoder.linear_dropout_rate = 0.2
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.interaction_module_type = "DotProduct"
+train_fn.top_k_method = "MIPSBruteForceTopK"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin b/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin
new file mode 100644
index 000000000..034c478b4
--- /dev/null
+++ b/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin
@@ -0,0 +1,42 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Run this as:
+# mkdir -p logs/ml-3b-l500/
+# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/sasrec-sampled-softmax-n96-seqlen500-final.log
+
+train_fn.dataset_name = "ml-3b"
+train_fn.max_sequence_length = 500
+train_fn.local_batch_size = 96
+train_fn.eval_batch_size = 96
+
+train_fn.main_module = "SASRec"
+train_fn.dropout_rate = 0.2
+train_fn.user_embedding_norm = "l2_norm"
+train_fn.num_epochs = 100
+train_fn.item_embedding_dim = 256
+
+sasrec_encoder.num_blocks = 4
+sasrec_encoder.num_heads = 4
+sasrec_encoder.ffn_dropout_rate = 0.2
+sasrec_encoder.ffn_hidden_dim = 256
+sasrec_encoder.ffn_activation_fn = "relu"
+
+train_fn.learning_rate = 1e-3
+train_fn.weight_decay = 0
+train_fn.num_warmup_steps = 0
+
+train_fn.top_k_method = "MIPSBruteForceTopK"
+train_fn.interaction_module_type = "DotProduct"
+
+train_fn.loss_module = "SampledSoftmaxLoss"
+train_fn.num_negatives = 128
+
+train_fn.sampling_strategy = "local"
+train_fn.temperature = 0.05
+train_fn.item_l2_norm = True
+train_fn.l2_norm_eps = 1e-6
+
+train_fn.enable_tf32 = True
+
+create_data_loader.prefetch_factor = 128
+create_data_loader.num_workers = 8
diff --git a/recommendation_v4/generative_recommenders/README.md b/recommendation_v4/generative_recommenders/README.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/recommendation_v4/generative_recommenders/common.py b/recommendation_v4/generative_recommenders/common.py
new file mode 100644
index 000000000..2ff8edf80
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/common.py
@@ -0,0 +1,513 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import abc
+import copy
+import os
+from enum import Enum, unique
+from typing import Any, Callable, List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.ops.utils import is_sm100_plus, is_sm90_plus
+from torch.fx._symbolic_trace import is_fx_tracing
+from torch.utils._python_dispatch import _get_current_dispatch_mode_stack
+
+# @manual=//triton:triton
+from triton.runtime.autotuner import Autotuner
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    HAS_TLX = False
+
+try:
+    from generative_recommenders.fb.triton_cc.utils import triton_cc
+    from hammer.ops.triton.utils import triton_autotune
+    from hammer.utils import is_dev_mode, set_dev_mode, set_verbose_level
+except ImportError:
+    # pyre-ignore
+    def triton_cc(annotations):
+        # pyre-ignore
+        def decorator(fn):
+            return fn
+
+        return decorator
+
+    # pyre-ignore
+    def triton_autotune(
+        configs: List[triton.Config],
+        key: List[str],
+        # pyre-ignore
+        prune_configs_by=None,
+        # pyre-ignore
+        reset_to_zero=None,
+        # pyre-ignore
+        restore_value=None,
+        warmup: int = 25,
+        rep: int = 100,
+    ):
+        # pyre-ignore
+        def decorator(fn):
+            return Autotuner(
+                fn,
+                fn.arg_names,
+                configs,
+                key,
+                reset_to_zero,
+                restore_value,
+                pre_hook=None,
+                post_hook=None,
+                prune_configs_by=prune_configs_by,
+                warmup=warmup,
+                rep=rep,
+            )
+
+        return decorator
+
+    DEV_MODE: bool = False
+    VERBOSE_LEVEL: int = 0
+
+    def set_dev_mode(val: bool) -> None:
+        global DEV_MODE
+        DEV_MODE = val
+
+    def is_dev_mode() -> bool:
+        global DEV_MODE  # noqa: F824
+        return DEV_MODE
+
+    def set_verbose_level(level: int) -> None:
+        global VERBOSE_LEVEL
+        VERBOSE_LEVEL = level
+
+    def get_verbose_level() -> int:
+        global VERBOSE_LEVEL  # noqa: F824
+        return VERBOSE_LEVEL
+
+
+@unique
+class HammerKernel(Enum):
+    TRITON = "TRITON"
+    TLX = "TLX"
+    PYTORCH = "PYTORCH"
+    CUDA = "CUDA"
+    TRITON_CC = "TRITON_CC"
+    TRITON_INFERENCE = "TRITON_INFERENCE"
+    CUTEDSL = "CUTEDSL"
+
+
+class HammerModule(torch.nn.Module, abc.ABC):
+    _is_inference: bool = False
+    _use_triton_cc: bool = True
+    _training_dtype: torch.dtype = torch.float32
+    _hammer_kernel: Optional[HammerKernel] = None
+
+    def __init__(
+        self,
+        is_inference: bool,
+        training_dytpe: torch.dtype = torch.float32,
+        use_triton_cc: bool = _use_triton_cc,
+        hammer_kernel: Optional[HammerKernel] = None,
+    ) -> None:
+        super().__init__()
+        self._is_inference = is_inference
+        self._training_dtype = training_dytpe
+        self._hammer_kernel = hammer_kernel
+        self._use_triton_cc = use_triton_cc
+
+    def hammer_kernel(self) -> HammerKernel:
+        kernel = self._hammer_kernel
+        if kernel is not None:
+            return kernel
+        if self._is_inference and self._use_triton_cc:
+            return HammerKernel.TRITON_CC
+        else:
+            return HammerKernel.TRITON
+
+    # pyre-ignore[2]
+    def recursive_setattr(self, name: str, value: Any) -> None:
+        for _, module in self.named_modules():
+            if hasattr(module, name):
+                setattr(module, name, value)
+
+    def set_use_triton_cc(self, use_triton_cc: bool) -> None:
+        self._use_triton_cc = use_triton_cc
+        self.recursive_setattr("_use_triton_cc", use_triton_cc)
+
+    def set_is_inference(self, is_inference: bool) -> None:
+        self._is_inference = is_inference
+        self.recursive_setattr("_is_inference", is_inference)
+
+    def set_training_dtype(self, training_dtype: torch.dtype) -> None:
+        self._training_dtype = training_dtype
+        self.recursive_setattr("_training_dtype", training_dtype)
+
+    def set_hammer_kernel(self, hammer_kernel: HammerKernel) -> None:
+        self._hammer_kernel = hammer_kernel
+        self.recursive_setattr("_hammer_kernel", hammer_kernel)
+
+    @property
+    def is_inference(self) -> bool:
+        return self._is_inference
+
+    @property
+    def is_eval(self) -> bool:
+        return (not self._is_inference) and (not self.training)
+
+    @property
+    def is_train(self) -> bool:
+        return (not self._is_inference) and self.training
+
+
+def generate_sparse_seq_len(
+    size: int,
+    max_seq_len: int,
+    sparsity: float,
+    device: torch.device,
+) -> torch.Tensor:
+    if sparsity == 0.0:
+        return torch.zeros(size=(size,), device=device, dtype=torch.int)
+    elif sparsity == 1.0:
+        return torch.ones(size=(size,), device=device, dtype=torch.int) * max_seq_len
+    elif sparsity >= 0.5:
+        min_seq_len: int = int((2 * sparsity - 1.0) * max_seq_len)
+        return torch.randint(
+            low=min_seq_len,
+            high=max_seq_len,
+            size=(size,),
+            device=device,
+            dtype=torch.int,
+        )
+    else:
+        min_seq_len: int = 0
+        max_seq_len: int = int(2 * sparsity * max_seq_len)
+        return torch.randint(
+            low=min_seq_len,
+            high=max_seq_len,
+            size=(size,),
+            device=device,
+            dtype=torch.int,
+        )
+
+
+def apply_sampling(
+    lengths: torch.Tensor,
+    alpha: float,
+    max_seq_len: int,
+) -> torch.Tensor:
+    threshold = int(max_seq_len ** (alpha / 2))
+    no_sample_prob = (max_seq_len**alpha) / torch.pow(lengths, 2)
+    users_to_sample = torch.logical_and(
+        lengths > threshold,
+        torch.rand_like(no_sample_prob) < 1 - no_sample_prob,
+    )
+    lengths = torch.where(users_to_sample, threshold, lengths)
+    return lengths
+
+
+nv_gpu_unavailable: Tuple[bool, str] = (
+    not torch.cuda.is_available() or torch.cuda.device_count() == 0,
+    "CUDA is not available or no GPUs detected",
+)
+nv_gpu_available: bool = not nv_gpu_unavailable[0]
+
+
+amd_gpu_unavailable: Tuple[bool, str] = (
+    not torch.version.hip,
+    "AMD HIP not available or no GPUs detected",
+)
+amd_gpu_available: bool = not amd_gpu_unavailable[0]
+
+gpu_unavailable: Tuple[bool, str] = (
+    not nv_gpu_available and not amd_gpu_available,
+    "CUDA/HIP is not available or no GPUs detected",
+)
+
+gpu_available: bool = not gpu_unavailable[0]
+
+blackwell_tlx_unavailable: Tuple[bool, str] = (
+    not is_sm100_plus() or not HAS_TLX,
+    "Skip TLX and blackwell only tests",
+)
+
+tma_unavailable: Tuple[bool, str] = (
+    not is_sm90_plus(),  # noqa
+    "Skip TMA only tests",
+)
+
+
+def switch_to_contiguous_if_needed(x: torch.Tensor) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        if x.stride(-1) == 1:
+            return x
+        return x.contiguous()
+    if torch.compiler.is_compiling():
+        # Tell Dynamo this data-dependent value is in the range (0, 10**9)
+        torch._check(x.size(0) > 0)
+        torch._check(x.size(0) < 10**9)
+    # FX cannot trace Python control flow over symbolic stride checks
+    # (`x.stride(-1) == 1`). For AOT-T lowering, conservatively emit the
+    # contiguous op instead of branching on a symbolic value.
+    if is_fx_tracing():
+        return x.contiguous()
+    if x.stride(-1) == 1:
+        return x
+    return x.contiguous()
+
+
+def cdiv(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def backend_allow_tf32() -> bool:
+    return True
+
+
+BACKEND_ALLOW_TF32: bool = backend_allow_tf32()
+
+
+def next_power_of_2(n: int) -> int:
+    """Return the smallest power of 2 greater than or equal to n"""
+    n -= 1
+    n |= n >> 1
+    n |= n >> 2
+    n |= n >> 4
+    n |= n >> 8
+    n |= n >> 16
+    n |= n >> 32
+    n += 1
+    return n
+
+
+def _prev_power_of_2_bitwise(x: int) -> int:
+    """Return the largest power of 2 less than or equal to x."""
+    x |= x >> 1
+    x |= x >> 2
+    x |= x >> 4
+    x |= x >> 8
+    x |= x >> 16
+    x |= x >> 32
+    return (x >> 1) + 1
+
+
+@torch.fx.wrap
+def _prev_power_of_2_legacy(x: int) -> int:
+    if torch.compiler.is_compiling():
+        # Re-write to make Dynamo happy
+        x_tensor = torch.scalar_tensor(x, dtype=torch.int64)  # type: ignore[arg-type]
+        x_tensor_orig = x_tensor.clone()
+        out_val = next_power_of_2(int(x_tensor.item()))  # type: ignore[arg-type]
+        out = torch.scalar_tensor(out_val, dtype=torch.int64)
+        return int(torch.where(torch.lt(x_tensor_orig, out), out // 2, out).item())  # type: ignore[return-value]
+    else:
+        out = next_power_of_2(x)
+        return out // 2 if out > x else out
+
+
+prev_power_of_2: Callable[[int], int] = (
+    _prev_power_of_2_legacy
+    if os.environ.get("PREV_POWER_OF_2_IMPL", "legacy") == "legacy"
+    else _prev_power_of_2_bitwise
+)
+
+
+STATIC_MAX_SEQ_LENS: List[int] = []
+USE_RUNTIME_MAX_SEQ_LEN: bool = False
+
+
+def set_static_max_seq_lens(max_seq_lens: List[int]) -> None:
+    global STATIC_MAX_SEQ_LENS
+    STATIC_MAX_SEQ_LENS = copy.deepcopy(max_seq_lens)
+    STATIC_MAX_SEQ_LENS.sort()
+
+
+def set_use_runtime_max_seq_len(use_runtime_max_seq_len: bool) -> None:
+    global USE_RUNTIME_MAX_SEQ_LEN
+    USE_RUNTIME_MAX_SEQ_LEN = use_runtime_max_seq_len
+
+
+def autotune_max_seq_len(runtime_max_seq_len: int) -> int:
+    global USE_RUNTIME_MAX_SEQ_LEN  # noqa: F824
+
+    if USE_RUNTIME_MAX_SEQ_LEN:
+        return prev_power_of_2(runtime_max_seq_len)
+    else:
+        if STATIC_MAX_SEQ_LENS == []:
+            return 1
+        for max_len in STATIC_MAX_SEQ_LENS:
+            if max_len >= runtime_max_seq_len:
+                return max_len
+        return STATIC_MAX_SEQ_LENS[-1]
+
+
+def fine_grained_autotune_max_seq_len(runtime_max_seq_len: int) -> int:
+    global USE_RUNTIME_MAX_SEQ_LEN  # noqa: F824
+
+    if USE_RUNTIME_MAX_SEQ_LEN:
+        return _fine_grained_bucket_size(runtime_max_seq_len)
+    else:
+        if STATIC_MAX_SEQ_LENS == []:
+            return 1
+        for max_len in STATIC_MAX_SEQ_LENS:
+            if max_len >= runtime_max_seq_len:
+                return max_len
+        return STATIC_MAX_SEQ_LENS[-1]
+
+
+def _generate_fine_grained_buckets() -> List[int]:
+    buckets = [
+        1024,
+        2048,
+        4096,
+        8192,
+        12288,
+        16384,
+        24576,
+        32768,
+        40960,
+        49152,
+        65536,
+        81920,
+        98304,
+    ]
+    return buckets
+
+
+@torch.fx.wrap
+def _fine_grained_bucket_size(x: int) -> int:
+    if torch.compiler.is_compiling():
+        x_tensor = torch.scalar_tensor(x, dtype=torch.int64)
+        buckets = torch.tensor(_generate_fine_grained_buckets(), dtype=torch.int64)
+
+        mask = buckets >= x_tensor
+        valid_buckets = torch.where(
+            mask, buckets, torch.tensor(2**31 - 1, dtype=torch.int64)
+        )
+
+        result = torch.where(mask.any(), valid_buckets.min(), buckets[-1])
+
+        return int(result.item())
+    else:
+        buckets = _generate_fine_grained_buckets()
+
+        for bucket in buckets:
+            if x <= bucket:
+                return bucket
+
+        return buckets[-1]
+
+
+@torch.fx.wrap
+def fx_unwrap_optional_tensor(optional: Optional[torch.Tensor]) -> torch.Tensor:
+    assert optional is not None, "Expected optional to be non-None Tensor"
+    return optional
+
+
+@torch.fx.wrap
+def fx_arange(len: int, device: torch.device) -> torch.Tensor:
+    return torch.arange(len, device=device)
+
+
+@torch.fx.wrap
+def fx_infer_max_len(
+    lengths: torch.Tensor,
+) -> int:
+    # Do not call ".item()" to avoid unbacked symint problems for lowering
+    max_len = int(lengths.max())
+    if not torch.jit.is_scripting() and torch.compiler.is_compiling():
+        # Tell Dynamo this data-dependent value is in the range [0, 10**9)
+        torch._check_is_size(max_len)
+        torch._check(max_len < 10**9)
+        torch._check(max_len > 0)
+    return max_len
+
+
+@torch.fx.wrap
+def fx_mark_length_features(tensor: torch.Tensor) -> torch.Tensor:
+    return tensor
+
+
+@torch.fx.wrap
+def fx_torch_ones(
+    shape: List[int],
+    device: torch.device,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    return torch.ones(shape, device=device, dtype=dtype)
+
+
+@torch.fx.wrap
+def fx_torch_zeros(shape: List[int], device: torch.device) -> torch.Tensor:
+    return torch.zeros(shape, device=device)
+
+
+def _is_in_dispatch_modes(mode_names: List[str]) -> bool:
+    modes = _get_current_dispatch_mode_stack()
+    return any(mode.__class__.__name__ in mode_names for mode in modes)
+
+
+def should_trigger_eager_impl() -> bool:
+    if torch.jit.is_scripting():
+        return True
+    if torch.compiler.is_compiling():
+        return False
+    return _is_in_dispatch_modes(["SplitDispatchMode", "FakeTensorMode"])
+
+
+@torch.fx.wrap
+def jagged_to_padded_dense(
+    values: torch.Tensor,
+    offsets: List[torch.Tensor],
+    max_lengths: List[int],
+    padding_value: float,
+) -> torch.Tensor:
+    return torch.ops.fbgemm.jagged_to_padded_dense(
+        values=values,
+        offsets=offsets,
+        max_lengths=max_lengths,
+        padding_value=padding_value,
+    )
+
+
+@torch.fx.wrap
+def dense_to_jagged(
+    dense: torch.Tensor,
+    x_offsets: List[torch.Tensor],
+) -> torch.Tensor:
+    return torch.ops.fbgemm.dense_to_jagged(
+        dense=dense,
+        x_offsets=x_offsets,
+    )[0]
+
+
+def init_mlp_weights_optional_bias(m: torch.nn.Module) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform_(m.weight)
+        if m.bias is not None:
+            m.bias.data.fill_(0.0)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
new file mode 100644
index 000000000..33445bce9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
@@ -0,0 +1,258 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Checkpoint utilities for saving and loading DLRMv3 model checkpoints.
+
+This module provides functions for saving and loading distributed model checkpoints,
+including both sparse (embedding) and dense (non-embedding) components.
+"""
+
+import gc
+import os
+from datetime import datetime
+from typing import Any, Dict, Optional, Set
+
+import gin
+import torch
+from generative_recommenders.dlrm_v3.utils import MetricsLogger
+from torch.distributed.checkpoint.stateful import Stateful
+from torch.optim.optimizer import Optimizer
+from torchrec.distributed.types import ShardedTensor
+
+
+class SparseState(Stateful):
+    """
+    Stateful wrapper for sparse (embedding) tensors in a model.
+
+    This class implements the Stateful interface for distributed checkpointing,
+    allowing sparse tensors to be saved and loaded separately from dense tensors.
+
+    Args:
+        model: The PyTorch model containing sparse tensors.
+        sparse_tensor_keys: Set of keys identifying sparse tensors in the model's state dict.
+    """
+
+    def __init__(self, model: torch.nn.Module, sparse_tensor_keys: Set[str]) -> None:
+        self.model = model
+        self.sparse_tensor_keys = sparse_tensor_keys
+
+    def state_dict(self) -> Dict[str, torch.Tensor]:
+        out_dict: Dict[str, torch.Tensor] = {}
+        is_sharded_tensor: Optional[bool] = None
+        for k, v in self.model.state_dict().items():
+            if k in self.sparse_tensor_keys:
+                if is_sharded_tensor is None:
+                    is_sharded_tensor = isinstance(v, ShardedTensor)
+                assert is_sharded_tensor == isinstance(v, ShardedTensor)
+                out_dict[k] = v
+        return out_dict
+
+    def load_state_dict(self, state_dict: Dict[str, torch.Tensor]) -> None:
+        incompatible_keys = self.model.load_state_dict(state_dict, strict=False)
+        assert not incompatible_keys.unexpected_keys
+
+
+def is_sparse_key(k: str, v: torch.Tensor) -> bool:
+    return isinstance(v, ShardedTensor) or "embedding_collection" in k
+
+
+def load_dense_state_dict(model: torch.nn.Module, state_dict: Dict[str, Any]) -> None:
+    own_state = model.state_dict()
+    own_state_dense_keys = {k for k, v in own_state.items() if not is_sparse_key(k, v)}
+    state_dict_dense_keys = {
+        k for k, v in state_dict.items() if not is_sparse_key(k, v)
+    }
+    assert own_state_dense_keys == state_dict_dense_keys, (
+        f"expects {own_state_dense_keys} but gets {state_dict_dense_keys}"
+    )
+    for name in state_dict_dense_keys:
+        param = state_dict[name]
+        if isinstance(param, torch.nn.Parameter):
+            # backwards compatibility for serialized parameters
+            param = param.data
+        own_state[name].copy_(param)
+
+
+@gin.configurable
+def save_dmp_checkpoint(
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    rank: int,
+    batch_idx: int,
+    path: str = "",
+) -> None:
+    """
+    Save a distributed model checkpoint including sparse and dense components.
+
+    Saves the model's sparse tensors using distributed checkpointing and dense
+    tensors, optimizer state, and metrics using standard PyTorch serialization.
+
+    Args:
+        model: The model to checkpoint.
+        optimizer: The optimizer whose state should be saved.
+        metric_logger: The metrics logger containing training/eval metrics.
+        rank: The current process rank in distributed training.
+        batch_idx: The current batch index (used for checkpoint naming).
+        path: Base path for saving the checkpoint. If empty, no checkpoint is saved.
+    """
+    if path == "":
+        return
+    now = datetime.now()
+    formatted_datetime = now.strftime("%Y_%m_%d_%H_%M_%S")
+    path = f"{path}/{batch_idx}"
+    if not os.path.exists(path) and rank == 0:
+        os.makedirs(path)
+    sparse_path = f"{path}/sparse/"
+    if not os.path.exists(sparse_path) and rank == 0:
+        os.makedirs(sparse_path)
+    non_sparse_ckpt = f"{path}/non_sparse.ckpt"
+
+    sparse_tensor_keys = {
+        k for k, v in model.state_dict().items() if isinstance(v, ShardedTensor)
+    }
+    if rank == 0:
+        dense_state_dict = {
+            k: v
+            for k, v in model.state_dict().items()
+            if not isinstance(v, ShardedTensor)
+        }
+        class_metric_state_dict = {
+            "train": [m.state_dict() for m in metric_logger.class_metrics["train"]],
+            "eval": [m.state_dict() for m in metric_logger.class_metrics["eval"]],
+        }
+        regression_metric_state_dict = {
+            "train": [
+                m.state_dict() for m in metric_logger.regression_metrics["train"]
+            ],
+            "eval": [m.state_dict() for m in metric_logger.regression_metrics["eval"]],
+        }
+        torch.save(
+            {
+                "dense_dict": dense_state_dict,
+                "optimizer_dict": optimizer.state_dict(),
+                "class_metrics": class_metric_state_dict,
+                "reg_metrics": regression_metric_state_dict,
+                "global_step": metric_logger.global_step,
+                "sparse_tensor_keys": sparse_tensor_keys,
+            },
+            non_sparse_ckpt,
+        )
+    torch.distributed.barrier()
+    sparse_dict = {"sparse_dict": SparseState(model, sparse_tensor_keys)}
+    torch.distributed.checkpoint.save(
+        sparse_dict,
+        storage_writer=torch.distributed.checkpoint.FileSystemWriter(sparse_path),
+    )
+    torch.distributed.barrier()
+    print("checkpoint successfully saved")
+
+
+@gin.configurable
+def load_sparse_checkpoint(
+    model: torch.nn.Module,
+    path: str = "",
+) -> None:
+    if path == "":
+        return
+    sparse_path = f"{path}/sparse/"
+
+    sparse_tensor_keys = {
+        k for k, v in model.state_dict().items() if is_sparse_key(k, v)
+    }
+    sparse_dict = {"sparse_dict": SparseState(model, sparse_tensor_keys)}
+    gc.collect()
+    torch.distributed.checkpoint.load(
+        sparse_dict,
+        storage_reader=torch.distributed.checkpoint.FileSystemReader(sparse_path),
+    )
+    gc.collect()
+    print("sparse checkpoint successfully loaded")
+
+
+@gin.configurable
+def load_nonsparse_checkpoint(
+    model: torch.nn.Module,
+    device: torch.device,
+    optimizer: Optional[Optimizer] = None,
+    metric_logger: Optional[MetricsLogger] = None,
+    path: str = "",
+) -> None:
+    """
+    Load non-sparse (dense) components from a checkpoint.
+
+    Loads dense model parameters, and optionally optimizer state and metrics.
+
+    Args:
+        model: The model to load dense parameters into.
+        device: The device to load tensors onto.
+        optimizer: Optional optimizer to restore state for.
+        metric_logger: Optional metrics logger to restore state for.
+        path: Base path of the checkpoint. If empty, no loading is performed.
+    """
+    if path == "":
+        return
+    non_sparse_ckpt = f"{path}/non_sparse.ckpt"
+
+    non_sparse_state_dict = torch.load(non_sparse_ckpt, map_location=device)
+    load_dense_state_dict(model, non_sparse_state_dict["dense_dict"])
+    print("dense checkpoint successfully loaded")
+    if optimizer is not None:
+        optimizer.load_state_dict(non_sparse_state_dict["optimizer_dict"])
+        print("optimizer checkpoint successfully loaded")
+    if metric_logger is not None:
+        metric_logger.global_step = non_sparse_state_dict["global_step"]
+        class_metric_state_dict = non_sparse_state_dict["class_metrics"]
+        regression_metric_state_dict = non_sparse_state_dict["reg_metrics"]
+        for i, m in enumerate(metric_logger.class_metrics["train"]):
+            m.load_state_dict(class_metric_state_dict["train"][i])
+        for i, m in enumerate(metric_logger.class_metrics["eval"]):
+            m.load_state_dict(class_metric_state_dict["eval"][i])
+        for i, m in enumerate(metric_logger.regression_metrics["train"]):
+            m.load_state_dict(regression_metric_state_dict["train"][i])
+        for i, m in enumerate(metric_logger.regression_metrics["eval"]):
+            m.load_state_dict(regression_metric_state_dict["eval"][i])
+
+
+@gin.configurable
+def load_dmp_checkpoint(
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    path: str = "",
+) -> None:
+    """
+    Load a complete distributed model checkpoint (both sparse and dense components).
+
+    This is a convenience function that calls both load_sparse_checkpoint and
+    load_nonsparse_checkpoint.
+
+    Args:
+        model: The model to load the checkpoint into.
+        optimizer: The optimizer to restore state for.
+        metric_logger: The metrics logger to restore state for.
+        device: The device to load tensors onto.
+        path: Base path of the checkpoint. If empty, no loading is performed.
+    """
+    load_sparse_checkpoint(model=model, path=path)
+    load_nonsparse_checkpoint(
+        model=model,
+        optimizer=optimizer,
+        metric_logger=metric_logger,
+        path=path,
+        device=device,
+    )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/configs.py b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
new file mode 100644
index 000000000..2981f01e3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
@@ -0,0 +1,704 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Configuration module for DLRMv3 model.
+
+This module provides configuration functions for the HSTU model architecture and embedding table configurations.
+"""
+
+from typing import Dict
+
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from generative_recommenders.modules.multitask_module import (
+    MultitaskTaskType,
+    TaskConfig,
+)
+from torchrec.modules.embedding_configs import DataType, EmbeddingConfig
+
+HSTU_EMBEDDING_DIM = 512  # final DLRMv3 model
+HASH_SIZE = 10_000_000
+HASH_SIZE_1B = 1_000_000_000
+
+YAMBDA_EMBEDDING_DIM = 512
+
+# (name, keys, num_embeddings, salt) — single source of truth for both
+# get_embedding_table_config("yambda-5b") and the dataset's cross-hash inputs.
+# Sizes mirror Primus-DLRM/configs/bench_onetrans_large_5b_cross_feat_shampoo.yaml.
+YAMBDA_5B_CROSS_SPECS = [
+    ("user_x_artist",        ("uid", "artist_id"),                  100_000_000, 0),
+    ("user_x_album",         ("uid", "album_id"),                    40_000_000, 0),
+    ("user_x_hour",          ("uid", "hour_of_day"),                 24_000_000, 0),
+    ("item_x_hour",          ("item_id", "hour_of_day"),             40_000_000, 0),
+    ("artist_x_hour",        ("artist_id", "hour_of_day"),           32_000_000, 0),
+    ("user_x_is_organic",    ("uid", "is_organic"),                   2_000_000, 0),
+    ("user_x_artist_x_hour", ("uid", "artist_id", "hour_of_day"),    40_000_000, 0),
+]
+
+
+def get_hstu_configs(dataset: str = "debug") -> DlrmHSTUConfig:
+    """
+    Create and return HSTU model configuration.
+
+    Builds a complete DlrmHSTUConfig with default hyperparameters for the HSTU
+    architecture including attention settings, embedding dimensions, dropout rates,
+    and feature name mappings.
+
+    Args:
+        dataset: Dataset identifier (currently unused, reserved for dataset-specific configs).
+
+    Returns:
+        DlrmHSTUConfig: Complete configuration object for the HSTU model.
+    """
+    hstu_config = DlrmHSTUConfig(
+        hstu_num_heads=4,
+        hstu_attn_linear_dim=128,
+        hstu_attn_qk_dim=128,
+        hstu_attn_num_layers=5,
+        hstu_embedding_table_dim=HSTU_EMBEDDING_DIM,
+        hstu_preprocessor_hidden_dim=256,
+        hstu_transducer_embedding_dim=512,
+        hstu_group_norm=False,
+        hstu_input_dropout_ratio=0.2,
+        hstu_linear_dropout_rate=0.1,
+        causal_multitask_weights=0.2,
+    )
+    if "movielens" in dataset:
+        assert dataset in [
+            "movielens-1m",
+            "movielens-20m",
+            "movielens-13b",
+            "movielens-18b",
+        ]
+        hstu_config.user_embedding_feature_names = (
+            [
+                "movie_id",
+                "user_id",
+                "sex",
+                "age_group",
+                "occupation",
+                "zip_code",
+            ]
+            if dataset == "movielens-1m"
+            else [
+                "movie_id",
+                "user_id",
+            ]
+        )
+        hstu_config.item_embedding_feature_names = [
+            "item_movie_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "movie_id"
+        hstu_config.uih_action_time_feature_name = "action_timestamp"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.candidates_weight_feature_name = "item_action_weights"
+        hstu_config.uih_weight_feature_name = "item_weights"
+        hstu_config.candidates_watchtime_feature_name = "item_movie_rating"
+        hstu_config.action_weights = [1, 2, 4, 8, 16]
+        hstu_config.contextual_feature_to_max_length = (
+            {
+                "user_id": 1,
+                "sex": 1,
+                "age_group": 1,
+                "occupation": 1,
+                "zip_code": 1,
+            }
+            if dataset == "movielens-1m"
+            else {
+                "user_id": 1,
+            }
+        )
+        hstu_config.contextual_feature_to_min_uih_length = (
+            {
+                "user_id": 20,
+                "sex": 20,
+                "age_group": 20,
+                "occupation": 20,
+                "zip_code": 20,
+            }
+            if dataset == "movielens-1m"
+            else {
+                "user_id": 20,
+            }
+        )
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("movie_id", "item_movie_id"),
+            ("movie_rating", "item_movie_rating"),
+            ("action_timestamp", "item_query_time"),
+            ("item_weights", "item_action_weights"),
+            ("dummy_watch_time", "item_dummy_watchtime"),
+        ]
+        hstu_config.hstu_uih_feature_names = (
+            [
+                "user_id",
+                "sex",
+                "age_group",
+                "occupation",
+                "zip_code",
+                "movie_id",
+                "movie_rating",
+                "action_timestamp",
+                "item_weights",
+                "dummy_watch_time",
+            ]
+            if dataset == "movielens-1m"
+            else [
+                "user_id",
+                "movie_id",
+                "movie_rating",
+                "action_timestamp",
+                "item_weights",
+                "dummy_watch_time",
+            ]
+        )
+        hstu_config.hstu_candidate_feature_names = [
+            "item_movie_id",
+            "item_movie_rating",
+            "item_query_time",
+            "item_action_weights",
+            "item_dummy_watchtime",
+        ]
+        hstu_config.max_num_candidates = 10
+        hstu_config.max_num_candidates_inference = (
+            5 if dataset not in ["movielens-13b", "movielens-18b"] else 2048
+        )
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="rating",
+                task_weight=1,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            )
+        ]
+    elif "streaming" in dataset:
+        hstu_config.user_embedding_feature_names = [
+            "item_id",
+            "user_id",
+            "item_category_id",
+        ]
+        hstu_config.item_embedding_feature_names = [
+            "item_candidate_id",
+            "item_candidate_category_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "item_id"
+        hstu_config.uih_action_time_feature_name = "action_timestamp"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.candidates_weight_feature_name = "item_action_weights"
+        hstu_config.uih_weight_feature_name = "item_weights"
+        hstu_config.candidates_watchtime_feature_name = "item_rating"
+        hstu_config.action_weights = [1, 2, 4, 8, 16]
+        hstu_config.action_embedding_init_std = 5.0
+        hstu_config.contextual_feature_to_max_length = {"user_id": 1}
+        hstu_config.contextual_feature_to_min_uih_length = {"user_id": 20}
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("item_id", "item_candidate_id"),
+            ("item_rating", "item_candidate_rating"),
+            ("action_timestamp", "item_query_time"),
+            ("item_weights", "item_action_weights"),
+            ("dummy_watch_time", "item_dummy_watchtime"),
+            ("item_category_id", "item_candidate_category_id"),
+        ]
+        hstu_config.hstu_uih_feature_names = [
+            "user_id",
+            "item_id",
+            "item_rating",
+            "action_timestamp",
+            "item_weights",
+            "dummy_watch_time",
+            "item_category_id",
+        ]
+        hstu_config.hstu_candidate_feature_names = [
+            "item_candidate_id",
+            "item_candidate_rating",
+            "item_query_time",
+            "item_action_weights",
+            "item_dummy_watchtime",
+            "item_candidate_category_id",
+        ]
+        hstu_config.max_num_candidates = 32
+        hstu_config.max_num_candidates_inference = 2048
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="rating",
+                task_weight=1,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            )
+        ]
+    elif "kuairand" in dataset:
+        hstu_config.user_embedding_feature_names = [
+            "video_id",
+            "user_id",
+            "user_active_degree",
+            "follow_user_num_range",
+            "fans_user_num_range",
+            "friend_user_num_range",
+            "register_days_range",
+        ]
+        hstu_config.item_embedding_feature_names = [
+            "item_video_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "video_id"
+        hstu_config.uih_action_time_feature_name = "action_timestamp"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.uih_weight_feature_name = "action_weight"
+        hstu_config.candidates_weight_feature_name = "item_action_weight"
+        hstu_config.candidates_watchtime_feature_name = "item_target_watchtime"
+        # There are more contextual features in the dataset, see https://kuairand.com/ for details
+        hstu_config.contextual_feature_to_max_length = {
+            "user_id": 1,
+            "user_active_degree": 1,
+            "follow_user_num_range": 1,
+            "fans_user_num_range": 1,
+            "friend_user_num_range": 1,
+            "register_days_range": 1,
+        }
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("video_id", "item_video_id"),
+            ("action_timestamp", "item_query_time"),
+            ("action_weight", "item_action_weight"),
+            ("watch_time", "item_target_watchtime"),
+        ]
+        hstu_config.hstu_uih_feature_names = [
+            "user_id",
+            "user_active_degree",
+            "follow_user_num_range",
+            "fans_user_num_range",
+            "friend_user_num_range",
+            "register_days_range",
+            "video_id",
+            "action_timestamp",
+            "action_weight",
+            "watch_time",
+        ]
+        hstu_config.hstu_candidate_feature_names = [
+            "item_video_id",
+            "item_action_weight",
+            "item_target_watchtime",
+            "item_query_time",
+        ]
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="is_click",
+                task_weight=1,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_like",
+                task_weight=2,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_follow",
+                task_weight=4,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_comment",
+                task_weight=8,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_forward",
+                task_weight=16,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_hate",
+                task_weight=32,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="long_view",
+                task_weight=64,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+            TaskConfig(
+                task_name="is_profile_enter",
+                task_weight=128,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            ),
+        ]
+        hstu_config.action_weights = [1, 2, 4, 8, 16, 32, 64, 128]
+    elif "yambda" in dataset:
+        assert dataset in ["yambda-5b"]
+        cross_names = [name for (name, _k, _n, _s) in YAMBDA_5B_CROSS_SPECS]
+        # Smaller per-table dim for yambda (see YAMBDA_EMBEDDING_DIM); transducer
+        # still projects to 512.
+        hstu_config.hstu_embedding_table_dim = YAMBDA_EMBEDDING_DIM
+        hstu_config.hstu_transducer_embedding_dim = 512
+        hstu_config.max_seq_len = 8192
+        hstu_config.max_num_candidates = 1
+        hstu_config.max_num_candidates_inference = 1
+        # Per dlrm_hstu convention (see streaming-100b/movielens):
+        #  - user_embedding_feature_names = UIH-side post-id features + contextual features.
+        #    After main_forward merges UIH + candidate, only these entries hold the merged
+        #    sequence (used by user-side transducer).
+        #  - item_embedding_feature_names = candidate-side names only. _item_forward
+        #    concats these along dim=-1 to feed the item MLP (per-candidate, not per-position).
+        hstu_config.user_embedding_feature_names = (
+            ["uid"]
+            + cross_names
+            + ["item_id", "artist_id", "album_id"]
+        )
+        hstu_config.item_embedding_feature_names = [
+            "item_candidate_id",
+            "item_candidate_artist_id",
+            "item_candidate_album_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "item_id"
+        hstu_config.uih_action_time_feature_name = "action_timestamp"
+        hstu_config.uih_weight_feature_name = "action_weight"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.candidates_weight_feature_name = "item_action_weight"
+        hstu_config.candidates_watchtime_feature_name = "item_dummy_watchtime"
+        hstu_config.action_weights = [1, 2, 4]  # lp, like, skip bits
+        hstu_config.contextual_feature_to_max_length = {
+            "uid": 1,
+            **{name: 1 for name in cross_names},
+        }
+        hstu_config.contextual_feature_to_min_uih_length = {
+            "uid": 0,
+            **{name: 0 for name in cross_names},
+        }
+        # uih names map to candidate names (no name collisions allowed):
+        # item_id/artist_id/album_id appear with prefix "item_" on candidate side.
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("item_id", "item_candidate_id"),
+            ("artist_id", "item_candidate_artist_id"),
+            ("album_id", "item_candidate_album_id"),
+            ("action_weight", "item_action_weight"),
+            ("action_timestamp", "item_query_time"),
+            ("dummy_watch_time", "item_dummy_watchtime"),
+        ]
+        hstu_config.hstu_uih_feature_names = (
+            ["uid"]
+            + cross_names
+            + [
+                "item_id",
+                "artist_id",
+                "album_id",
+                "action_weight",
+                "action_timestamp",
+                "dummy_watch_time",
+            ]
+        )
+        hstu_config.hstu_candidate_feature_names = [
+            "item_candidate_id",
+            "item_candidate_artist_id",
+            "item_candidate_album_id",
+            "item_query_time",
+            "item_action_weight",
+            "item_dummy_watchtime",
+        ]
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="listen_plus",
+                task_weight=1,  # matches action_weights[0] (lp bit)
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            )
+        ]
+    else:
+        hstu_config.user_embedding_feature_names = [
+            "uih_post_id",
+            "uih_owner_id",
+            "viewer_id",
+            "dummy_contexual",
+        ]
+        hstu_config.item_embedding_feature_names = [
+            "item_post_id",
+            "item_owner_id",
+        ]
+        hstu_config.uih_post_id_feature_name = "uih_post_id"
+        hstu_config.uih_action_time_feature_name = "uih_action_time"
+        hstu_config.candidates_querytime_feature_name = "item_query_time"
+        hstu_config.candidates_weight_feature_name = "item_action_weight"
+        hstu_config.candidates_watchtime_feature_name = "item_target_watchtime"
+        hstu_config.contextual_feature_to_max_length = {
+            "viewer_id": 1,
+            "dummy_contexual": 1,
+        }
+        hstu_config.contextual_feature_to_min_uih_length = {
+            "viewer_id": 128,
+            "dummy_contexual": 128,
+        }
+        hstu_config.merge_uih_candidate_feature_mapping = [
+            ("uih_post_id", "item_post_id"),
+            ("uih_owner_id", "item_owner_id"),
+            ("uih_action_time", "item_query_time"),
+            ("uih_weight", "item_action_weight"),
+            ("uih_watchtime", "item_target_watchtime"),
+            ("uih_video_length", "item_video_length"),
+            ("uih_surface_type", "item_surface_type"),
+        ]
+        hstu_config.hstu_uih_feature_names = [
+            "uih_post_id",
+            "uih_action_time",
+            "uih_weight",
+            "uih_owner_id",
+            "uih_watchtime",
+            "uih_surface_type",
+            "uih_video_length",
+            "viewer_id",
+            "dummy_contexual",
+        ]
+        hstu_config.hstu_candidate_feature_names = [
+            "item_post_id",
+            "item_owner_id",
+            "item_surface_type",
+            "item_video_length",
+            "item_action_weight",
+            "item_target_watchtime",
+            "item_query_time",
+        ]
+        hstu_config.multitask_configs = [
+            TaskConfig(
+                task_name="vvp100",
+                task_weight=1,
+                task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+            )
+        ]
+    return hstu_config
+
+
+def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingConfig]:
+    """
+    Create and return embedding table configurations.
+
+    Defines the embedding table configurations for item IDs, category IDs, and user IDs
+    with their respective dimensions and data types.
+
+    Args:
+        dataset: Dataset identifier (currently unused, reserved for dataset-specific configs).
+
+    Returns:
+        Dict mapping table names to their EmbeddingConfig objects.
+    """
+    if "movielens" in dataset:
+        assert dataset in [
+            "movielens-1m",
+            "movielens-20m",
+            "movielens-13b",
+            "movielens-18b",
+        ]
+        return (
+            {
+                "movie_id": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    name="movie_id",
+                    data_type=DataType.FP16,
+                    feature_names=["movie_id", "item_movie_id"],
+                ),
+                "user_id": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    name="user_id",
+                    data_type=DataType.FP16,
+                    feature_names=["user_id"],
+                ),
+                "sex": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    name="sex",
+                    data_type=DataType.FP16,
+                    feature_names=["sex"],
+                ),
+                "age_group": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    name="age_group",
+                    data_type=DataType.FP16,
+                    feature_names=["age_group"],
+                ),
+                "occupation": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    name="occupation",
+                    data_type=DataType.FP16,
+                    feature_names=["occupation"],
+                ),
+                "zip_code": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE,
+                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    name="zip_code",
+                    data_type=DataType.FP16,
+                    feature_names=["zip_code"],
+                ),
+            }
+            if dataset == "movielens-1m"
+            else {
+                "movie_id": EmbeddingConfig(
+                    num_embeddings=HASH_SIZE_1B,
+                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    name="movie_id",
+                    data_type=DataType.FP16,
+                    feature_names=["movie_id", "item_movie_id"],
+                ),
+                "user_id": EmbeddingConfig(
+                    num_embeddings=3_000_000,
+                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    name="user_id",
+                    data_type=DataType.FP16,
+                    feature_names=["user_id"],
+                ),
+            }
+        )
+    elif "streaming" in dataset:
+        return {
+            "item_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE_1B,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="item_id",
+                data_type=DataType.FP16,
+                feature_names=["item_id", "item_candidate_id"],
+            ),
+            "item_category_id": EmbeddingConfig(
+                num_embeddings=128,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="item_category_id",
+                data_type=DataType.FP16,
+                weight_init_max=1.0,
+                weight_init_min=-1.0,
+                feature_names=["item_category_id", "item_candidate_category_id"],
+            ),
+            "user_id": EmbeddingConfig(
+                num_embeddings=10_000_000,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="user_id",
+                data_type=DataType.FP16,
+                feature_names=["user_id"],
+            ),
+        }
+    elif "kuairand" in dataset:
+        return {
+            "video_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="video_id",
+                data_type=DataType.FP16,
+                feature_names=["video_id", "item_video_id"],
+            ),
+            "user_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="user_id",
+                data_type=DataType.FP16,
+                feature_names=["user_id"],
+            ),
+            "user_active_degree": EmbeddingConfig(
+                num_embeddings=8,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="user_active_degree",
+                data_type=DataType.FP16,
+                feature_names=["user_active_degree"],
+            ),
+            "follow_user_num_range": EmbeddingConfig(
+                num_embeddings=9,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="follow_user_num_range",
+                data_type=DataType.FP16,
+                feature_names=["follow_user_num_range"],
+            ),
+            "fans_user_num_range": EmbeddingConfig(
+                num_embeddings=9,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="fans_user_num_range",
+                data_type=DataType.FP16,
+                feature_names=["fans_user_num_range"],
+            ),
+            "friend_user_num_range": EmbeddingConfig(
+                num_embeddings=8,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="friend_user_num_range",
+                data_type=DataType.FP16,
+                feature_names=["friend_user_num_range"],
+            ),
+            "register_days_range": EmbeddingConfig(
+                num_embeddings=8,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="register_days_range",
+                data_type=DataType.FP16,
+                feature_names=["register_days_range"],
+            ),
+        }
+    elif "yambda" in dataset:
+        assert dataset in ["yambda-5b"]
+        tables: Dict[str, EmbeddingConfig] = {
+            "item_id": EmbeddingConfig(
+                num_embeddings=9_390_000,
+                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                name="item_id",
+                data_type=DataType.FP32,
+                feature_names=["item_id", "item_candidate_id"],
+            ),
+            "artist_id": EmbeddingConfig(
+                num_embeddings=1_290_000,
+                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                name="artist_id",
+                data_type=DataType.FP32,
+                feature_names=["artist_id", "item_candidate_artist_id"],
+            ),
+            "album_id": EmbeddingConfig(
+                num_embeddings=3_370_000,
+                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                name="album_id",
+                data_type=DataType.FP32,
+                feature_names=["album_id", "item_candidate_album_id"],
+            ),
+            "uid": EmbeddingConfig(
+                num_embeddings=1_000_000,
+                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                name="uid",
+                data_type=DataType.FP32,
+                feature_names=["uid"],
+            ),
+        }
+        for name, _keys, num_embeddings, _salt in YAMBDA_5B_CROSS_SPECS:
+            tables[name] = EmbeddingConfig(
+                num_embeddings=num_embeddings,
+                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                name=name,
+                data_type=DataType.FP32,
+                feature_names=[name],
+            )
+        return tables
+    else:
+        return {
+            "post_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="post_id",
+                data_type=DataType.FP16,
+                feature_names=[
+                    "uih_post_id",
+                    "item_post_id",
+                    "uih_owner_id",
+                    "item_owner_id",
+                ],
+            ),
+            "viewer_id": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="viewer_id",
+                data_type=DataType.FP16,
+                feature_names=["viewer_id"],
+            ),
+            "dummy_contexual": EmbeddingConfig(
+                num_embeddings=HASH_SIZE,
+                embedding_dim=HSTU_EMBEDDING_DIM,
+                name="dummy_contexual",
+                data_type=DataType.FP16,
+                feature_names=["dummy_contexual"],
+            ),
+        }
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py
new file mode 100644
index 000000000..a1cbb33fa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py
@@ -0,0 +1,398 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""
+Dataset implementations for DLRMv3.
+
+This module provides dataset classes for loading and processing recommendation
+data, including sample containers, collation functions, and random data generation.
+"""
+
+import logging
+import time
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+
+import torch
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+
+logging.basicConfig(level=logging.INFO)
+logger: logging.Logger = logging.getLogger("dlrmv3_dataset")
+
+
+@dataclass
+class Samples:
+    """
+    Container for batched samples with user interaction history and candidate features.
+
+    Attributes:
+        uih_features_kjt: User interaction history features as KeyedJaggedTensor.
+        candidates_features_kjt: Candidate item features as KeyedJaggedTensor.
+    """
+
+    uih_features_kjt: KeyedJaggedTensor
+    candidates_features_kjt: KeyedJaggedTensor
+
+    def to(self, device: torch.device) -> None:
+        """
+        Move all tensors to the specified device.
+
+        Args:
+            device: Target device to move tensors to.
+        """
+        for attr in vars(self):
+            setattr(self, attr, getattr(self, attr).to(device=device))
+
+    def batch_size(self) -> int:
+        """
+        Get the batch size of the samples.
+
+        Returns:
+            Number of samples in the batch.
+        """
+        return self.uih_features_kjt.stride()
+
+
+def collate_fn(
+    samples: List[Tuple[KeyedJaggedTensor, KeyedJaggedTensor]],
+) -> Samples:
+    """
+    Collate multiple samples into a batched Samples object.
+
+    Args:
+        samples: List of (uih_features, candidates_features) tuples.
+
+    Returns:
+        Batched Samples object with concatenated features.
+    """
+    (
+        uih_features_kjt_list,
+        candidates_features_kjt_list,
+    ) = list(zip(*samples))
+
+    return Samples(
+        uih_features_kjt=kjt_batch_func(uih_features_kjt_list),
+        candidates_features_kjt=kjt_batch_func(candidates_features_kjt_list),
+    )
+
+
+class Dataset:
+    """
+    Base dataset class for DLRMv3.
+
+    Provides the interface for loading, accessing, and managing samples
+    for recommendation model training and inference.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        **args: Additional arguments (unused in base class).
+    """
+
+    def __init__(self, hstu_config: DlrmHSTUConfig, **args):
+        self.arrival = None
+        self.image_list = []
+        self.label_list = []
+        self.image_list_inmemory = {}
+        self.last_loaded = -1.0
+
+    def preprocess(self, use_cache=True):
+        """
+        Preprocess the dataset.
+
+        Args:
+            use_cache: Whether to use cached preprocessed data.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:preprocess")
+
+    def get_item_count(self):
+        """
+        Get the total number of items in the dataset.
+
+        Returns:
+            Number of items.
+        """
+        return len(self.image_list)
+
+    def load_query_samples(self, sample_list):
+        """
+        Load specified samples into memory.
+
+        Args:
+            sample_list: List of sample indices to load.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:load_query_samples")
+
+    def unload_query_samples(self, sample_list):
+        """
+        Unload specified samples from memory.
+
+        Args:
+            sample_list: List of sample indices to unload.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:unload_query_samples")
+
+    def get_sample(self, id: int):
+        """
+        Get a single sample by ID.
+
+        Args:
+            id: Sample identifier.
+
+        Raises:
+            NotImplementedError: Subclasses must implement this method.
+        """
+        raise NotImplementedError("Dataset:get_sample")
+
+    def get_samples(self, id_list: List[int]) -> Samples:
+        """
+        Get multiple samples and collate them into a batch.
+
+        Args:
+            id_list: List of sample identifiers.
+
+        Returns:
+            Collated Samples object containing the batch.
+        """
+        list_samples = [self.get_sample(ix) for ix in id_list]
+        return collate_fn(list_samples)
+
+
+@torch.jit.script
+def kjt_batch_func(
+    kjt_list: List[KeyedJaggedTensor],
+) -> KeyedJaggedTensor:
+    """
+    Batch multiple KeyedJaggedTensors into a single tensor.
+
+    Uses FBGEMM operations for efficient batching and reordering of
+    jagged tensor data.
+
+    Args:
+        kjt_list: List of KeyedJaggedTensors to batch.
+
+    Returns:
+        Batched KeyedJaggedTensor with reordered indices and lengths.
+    """
+    bs_list = [kjt.stride() for kjt in kjt_list]
+    bs = sum(bs_list)
+    batched_length = torch.cat([kjt.lengths() for kjt in kjt_list], dim=0)
+    batched_indices = torch.cat([kjt.values() for kjt in kjt_list], dim=0)
+    bs_offset = torch.ops.fbgemm.asynchronous_complete_cumsum(
+        torch.tensor(bs_list)
+    ).int()
+    batched_offset = torch.ops.fbgemm.asynchronous_complete_cumsum(batched_length)
+    reorder_length = torch.ops.fbgemm.reorder_batched_ad_lengths(
+        batched_length, bs_offset, bs
+    )
+    reorder_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(reorder_length)
+    reorder_indices = torch.ops.fbgemm.reorder_batched_ad_indices(
+        batched_offset, batched_indices, reorder_offsets, bs_offset, bs
+    )
+    out = KeyedJaggedTensor(
+        keys=kjt_list[0].keys(),
+        lengths=reorder_length.long(),
+        values=reorder_indices.long(),
+    )
+    return out
+
+
+def get_random_data(
+    contexual_features: List[str],
+    hstu_uih_keys: List[str],
+    hstu_candidates_keys: List[str],
+    uih_max_seq_len: int,
+    max_num_candidates: int,
+    value_bound: int = 1000,
+):
+    """
+    Generate random sample data for testing and debugging.
+
+    Creates synthetic user interaction history and candidate features
+    with random values.
+
+    Args:
+        contexual_features: List of contextual feature names.
+        hstu_uih_keys: List of UIH feature keys.
+        hstu_candidates_keys: List of candidate feature keys.
+        uih_max_seq_len: Maximum sequence length for UIH.
+        max_num_candidates: Maximum number of candidates.
+        value_bound: Upper bound for random values.
+
+    Returns:
+        Tuple of (uih_features_kjt, candidates_features_kjt).
+    """
+    uih_non_seq_feature_keys = contexual_features
+    uih_seq_feature_keys = [
+        k for k in hstu_uih_keys if k not in uih_non_seq_feature_keys
+    ]
+    uih_seq_len = torch.randint(
+        int(uih_max_seq_len * 0.8),
+        uih_max_seq_len + 1,
+        (1,),
+    ).item()
+    uih_lengths = torch.tensor(
+        [1 for _ in uih_non_seq_feature_keys]
+        + [uih_seq_len for _ in uih_seq_feature_keys]
+    )
+    # logging.info(f"uih_lengths: {uih_lengths}")
+    uih_values = torch.randint(
+        1,
+        value_bound,
+        # pyre-ignore[6]
+        (uih_seq_len * len(uih_seq_feature_keys) + len(uih_non_seq_feature_keys),),
+    )
+    uih_features_kjt = KeyedJaggedTensor(
+        keys=uih_non_seq_feature_keys + uih_seq_feature_keys,
+        lengths=uih_lengths.long(),
+        values=uih_values.long(),
+    )
+    num_candidates = torch.randint(
+        1,
+        max_num_candidates + 1,
+        (1,),
+    ).item()
+    candidates_lengths = num_candidates * torch.ones(len(hstu_candidates_keys))
+    candidates_values = torch.randint(
+        1,
+        value_bound,
+        (num_candidates * len(hstu_candidates_keys),),  # pyre-ignore[6]
+    )
+    candidates_features_kjt = KeyedJaggedTensor(
+        keys=hstu_candidates_keys,
+        lengths=candidates_lengths.long(),
+        values=candidates_values.long(),
+    )
+    return uih_features_kjt, candidates_features_kjt
+
+
+class DLRMv3RandomDataset(Dataset):
+    """
+    Dataset that generates random synthetic data for DLRMv3.
+
+    Useful for testing and benchmarking without real data dependencies.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        num_aggregated_samples: Total number of samples to generate.
+        is_inference: Whether the dataset is used for inference mode.
+        *args: Additional positional arguments.
+        **kwargs: Additional keyword arguments.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        num_aggregated_samples: int = 10000,
+        is_inference: bool = False,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(
+            hstu_config=hstu_config,
+        )
+        self.hstu_config: DlrmHSTUConfig = hstu_config
+        self._max_num_candidates: int = hstu_config.max_num_candidates
+        self._max_num_candidates_inference: int = (
+            hstu_config.max_num_candidates_inference
+        )
+        self._max_seq_len: int = hstu_config.max_seq_len
+        self._uih_keys: List[str] = hstu_config.hstu_uih_feature_names
+        self._candidates_keys: List[str] = hstu_config.hstu_candidate_feature_names
+        self._contextual_feature_to_max_length: Dict[str, int] = (
+            hstu_config.contextual_feature_to_max_length
+        )
+        self._max_uih_len: int = (
+            self._max_seq_len
+            - self._max_num_candidates
+            - (
+                len(self._contextual_feature_to_max_length)
+                if self._contextual_feature_to_max_length
+                else 0
+            )
+        )
+        self._is_inference = is_inference
+
+        self.contexual_features = []
+        if hstu_config.contextual_feature_to_max_length is not None:
+            self.contexual_features = [
+                p[0] for p in hstu_config.contextual_feature_to_max_length
+            ]
+
+        self.num_aggregated_samples = num_aggregated_samples
+        self.items_in_memory = {}
+
+    def get_sample(self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        """
+        Get a sample by ID from in-memory storage.
+
+        Args:
+            id: Sample identifier.
+
+        Returns:
+            Tuple of (uih_features_kjt, candidates_features_kjt).
+        """
+        return self.items_in_memory[id]
+
+    def get_item_count(self):
+        """
+        Get the total number of samples in the dataset.
+
+        Returns:
+            Number of aggregated samples.
+        """
+        return self.num_aggregated_samples
+
+    def unload_query_samples(self, sample_list):
+        """
+        Clear all samples from memory.
+
+        Args:
+            sample_list: Ignored; clears all samples.
+        """
+        self.items_in_memory = {}
+
+    def load_query_samples(self, sample_list):
+        """
+        Generate and load random samples into memory.
+
+        Args:
+            sample_list: List of sample IDs to generate.
+        """
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        self.items_in_memory = {}
+        for sample in sample_list:
+            self.items_in_memory[sample] = get_random_data(
+                contexual_features=self.contexual_features,
+                hstu_uih_keys=self.hstu_config.hstu_uih_feature_names,
+                hstu_candidates_keys=self.hstu_config.hstu_candidate_feature_names,
+                uih_max_seq_len=self._max_uih_len,
+                max_num_candidates=max_num_candidates,
+            )
+        self.last_loaded = time.time()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py
new file mode 100644
index 000000000..f6cd9e672
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/kuairand.py
@@ -0,0 +1,163 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+import json
+import time
+from functools import partial
+from typing import Any, Dict, List
+
+import pandas as pd
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset
+from generative_recommenders.dlrm_v3.datasets.utils import (
+    maybe_truncate_seq,
+    separate_uih_candidates,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+
+def process_and_hash_x(x: Any, hash_size: int) -> Any:
+    if isinstance(x, str):
+        x = json.loads(x)
+    if isinstance(x, list):
+        return [x_i % hash_size for x_i in x]
+    else:
+        return x % hash_size
+
+
+class DLRMv3KuaiRandDataset(DLRMv3RandomDataset):
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        embedding_config: Dict[str, Any],
+        seq_logs_file: str,
+        is_inference: bool,
+        **kwargs,
+    ) -> None:
+        super().__init__(hstu_config=hstu_config, is_inference=is_inference)
+        self.seq_logs_frame: pd.DataFrame = pd.read_csv(seq_logs_file, delimiter=",")
+        # apply hashing from embedding table config
+        for key, table in embedding_config.items():
+            assert key in self.seq_logs_frame.columns, (
+                "Rename key in embedding table configs!"
+            )
+            hash_size = table.num_embeddings
+            self.seq_logs_frame[key] = self.seq_logs_frame[key].apply(
+                partial(process_and_hash_x, hash_size=hash_size)
+            )
+
+    def get_item_count(self):
+        return len(self.seq_logs_frame)
+
+    def unload_query_samples(self, sample_list):
+        self.items_in_memory = {}
+
+    def load_query_samples(self, sample_list):
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        self.items_in_memory = {}
+        for idx in sample_list:
+            data = self.seq_logs_frame.iloc[idx]
+            if len(data.video_id) <= max_num_candidates:
+                continue
+            sample = self.load_item(data, max_num_candidates)
+            self.items_in_memory[idx] = sample
+
+        self.last_loaded = time.time()
+
+    def load_item(self, data, max_num_candidates):
+        with torch.profiler.record_function("load_item"):
+            video_history_uih, video_history_candidates = separate_uih_candidates(
+                data.video_id,
+                candidates_max_seq_len=max_num_candidates,
+            )
+            action_weights_uih, action_weights_candidates = separate_uih_candidates(
+                data.action_weights,
+                candidates_max_seq_len=max_num_candidates,
+            )
+            timestamps_uih, _ = separate_uih_candidates(
+                data.time_ms,
+                candidates_max_seq_len=max_num_candidates,
+            )
+            watch_time_uih, watch_time_candidates = separate_uih_candidates(
+                data.play_time_ms,
+                candidates_max_seq_len=max_num_candidates,
+            )
+
+            video_history_uih = maybe_truncate_seq(video_history_uih, self._max_uih_len)
+            action_weights_uih = maybe_truncate_seq(
+                action_weights_uih, self._max_uih_len
+            )
+            timestamps_uih = maybe_truncate_seq(timestamps_uih, self._max_uih_len)
+            watch_time_uih = maybe_truncate_seq(watch_time_uih, self._max_uih_len)
+
+            uih_seq_len = len(video_history_uih)
+            assert uih_seq_len == len(timestamps_uih), (
+                "history len differs from timestamp len."
+            )
+            assert uih_seq_len == len(action_weights_uih), (
+                "history len differs from weights len."
+            )
+            assert uih_seq_len == len(watch_time_uih), (
+                "history len differs from watch time len."
+            )
+
+            uih_kjt_values: List[torch.Tensor] = []
+            uih_kjt_lengths: List[torch.Tensor] = []
+            for name, length in self._contextual_feature_to_max_length.items():
+                uih_kjt_values.append(data[name])
+                uih_kjt_lengths.append(length)
+
+            uih_kjt_values.extend(
+                video_history_uih + timestamps_uih + action_weights_uih + watch_time_uih
+            )
+
+            uih_kjt_lengths.extend(
+                [
+                    uih_seq_len
+                    for _ in range(
+                        len(self._uih_keys)
+                        - len(self._contextual_feature_to_max_length)
+                    )
+                ]
+            )
+
+            dummy_query_time = max(timestamps_uih)
+            uih_features_kjt = KeyedJaggedTensor(
+                keys=self._uih_keys,
+                lengths=torch.tensor(uih_kjt_lengths).long(),
+                values=torch.tensor(uih_kjt_values).long(),
+            )
+
+            candidates_kjt_lengths = max_num_candidates * torch.ones(
+                len(self._candidates_keys)
+            )
+            candidates_kjt_values = (
+                video_history_candidates
+                + action_weights_candidates
+                + watch_time_candidates
+                + [dummy_query_time] * max_num_candidates
+            )
+            candidates_features_kjt = KeyedJaggedTensor(
+                keys=self._candidates_keys,
+                lengths=torch.tensor(candidates_kjt_lengths).long(),
+                values=torch.tensor(candidates_kjt_values).long(),
+            )
+
+        return uih_features_kjt, candidates_features_kjt
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py
new file mode 100644
index 000000000..d74fb575b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/movie_lens.py
@@ -0,0 +1,177 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+import logging
+import time
+from typing import List, Optional
+
+import pandas as pd
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset
+from generative_recommenders.dlrm_v3.datasets.utils import (
+    maybe_truncate_seq,
+    separate_uih_candidates,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+logger = logging.getLogger(__name__)
+
+
+class DLRMv3MovieLensDataset(DLRMv3RandomDataset):
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        ratings_file: str,
+        is_inference: bool,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(hstu_config=hstu_config, is_inference=is_inference)
+        self.ratings_frame: Optional[pd.DataFrame] = None
+        if ratings_file != "":
+            self.ratings_frame = pd.read_csv(
+                ratings_file,
+                delimiter=",",
+            )
+        assert hstu_config.action_weights is not None
+        self.action_weights: List[int] = hstu_config.action_weights
+
+    def get_item_count(self):
+        assert self.ratings_frame is not None
+        return len(self.ratings_frame)
+
+    def unload_query_samples(self, sample_list):
+        self.items_in_memory = {}
+
+    def iloc(self, idx):
+        assert self.ratings_frame is not None
+        return self.ratings_frame.iloc[idx]
+
+    def load_query_samples(self, sample_list):
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        self.items_in_memory = {}
+        for idx in sample_list:
+            data = self.iloc(idx)
+            if len(data.sequence_item_ids) <= max_num_candidates:
+                continue
+            sample = self.load_item(data, max_num_candidates)
+            self.items_in_memory[idx] = sample
+
+        self.last_loaded = time.time()
+
+    def get_timestamp_uih(self, data, max_num_candidates, size):
+        movie_timestamps_uih, _ = separate_uih_candidates(
+            data.sequence_timestamps,
+            candidates_max_seq_len=max_num_candidates,
+        )
+        return movie_timestamps_uih
+
+    def load_item(self, data, max_num_candidates):
+        movie_history_uih, movie_history_candidates = separate_uih_candidates(
+            data.sequence_item_ids,
+            candidates_max_seq_len=max_num_candidates,
+        )
+        movie_history_ratings_uih, movie_history_ratings_candidates = (
+            separate_uih_candidates(
+                data.sequence_ratings,
+                candidates_max_seq_len=max_num_candidates,
+            )
+        )
+        movie_timestamps_uih = self.get_timestamp_uih(
+            data=data,
+            max_num_candidates=max_num_candidates,
+            size=len(movie_history_uih),
+        )
+
+        assert len(movie_history_uih) == len(movie_timestamps_uih), (
+            "history len differs from timestamp len."
+        )
+        assert len(movie_history_uih) == len(movie_history_ratings_uih), (
+            "history len differs from ratings len."
+        )
+
+        movie_history_uih = maybe_truncate_seq(movie_history_uih, self._max_uih_len)
+        movie_history_ratings_uih = maybe_truncate_seq(
+            movie_history_ratings_uih, self._max_uih_len
+        )
+        movie_timestamps_uih = maybe_truncate_seq(
+            movie_timestamps_uih, self._max_uih_len
+        )
+
+        uih_kjt_values: List[torch.Tensor] = []
+        uih_kjt_lengths: List[torch.Tensor] = []
+        for name, length in self._contextual_feature_to_max_length.items():
+            uih_kjt_values.append(data[name])
+            uih_kjt_lengths.append(length)
+
+        uih_seq_len = len(movie_history_uih)
+        movie_dummy_watch_times_uih = [0 for _ in range(uih_seq_len)]
+        action_weights_uih = [
+            self.action_weights[int(rating) - 1] for rating in movie_history_ratings_uih
+        ]
+        uih_kjt_values.extend(
+            movie_history_uih
+            + movie_history_ratings_uih
+            + movie_timestamps_uih
+            + action_weights_uih
+            + movie_dummy_watch_times_uih
+        )
+        uih_kjt_lengths.extend(
+            [
+                uih_seq_len
+                for _ in range(
+                    len(self._uih_keys) - len(self._contextual_feature_to_max_length)
+                )
+            ]
+        )
+
+        dummy_query_time = (
+            0 if movie_timestamps_uih == [] else max(movie_timestamps_uih)
+        )
+        uih_kjt_values.append(dummy_query_time)
+        uih_kjt_lengths.append(1)
+        uih_features_kjt = KeyedJaggedTensor(
+            keys=self._uih_keys + ["dummy_query_time"],
+            lengths=torch.tensor(uih_kjt_lengths).long(),
+            values=torch.tensor(uih_kjt_values).long(),
+        )
+
+        candidates_kjt_lengths = max_num_candidates * torch.ones(
+            len(self._candidates_keys)
+        )
+        action_weights_candidates = [
+            int(rating >= 3.5) for rating in movie_history_ratings_candidates
+        ]
+        candidates_kjt_values = (
+            movie_history_candidates
+            + movie_history_ratings_candidates
+            + [dummy_query_time] * max_num_candidates  # item_query_time
+            + action_weights_candidates
+            + [1] * max_num_candidates  # item_dummy_watchtime
+        )
+        candidates_features_kjt = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_kjt_lengths.detach().clone().long(),
+            values=torch.tensor(candidates_kjt_values).long(),
+        )
+        return (
+            uih_features_kjt,
+            candidates_features_kjt,
+        )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py
new file mode 100644
index 000000000..6cf8a5f56
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_movie_lens.py
@@ -0,0 +1,83 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+import csv
+import linecache
+import logging
+import sys
+from typing import List
+
+import numpy as np
+import pandas as pd
+from generative_recommenders.dlrm_v3.datasets.movie_lens import DLRMv3MovieLensDataset
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+
+csv.field_size_limit(sys.maxsize)
+logger = logging.getLogger(__name__)
+
+
+class DLRMv3SyntheticMovieLensDataset(DLRMv3MovieLensDataset):
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        ratings_file_prefix: str,
+        is_inference: bool,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(
+            hstu_config=hstu_config, is_inference=is_inference, ratings_file=""
+        )
+        self.ratings_file_prefix = ratings_file_prefix
+        with open(f"{self.ratings_file_prefix}_users.csv", "r") as file:
+            reader = csv.reader(file)
+            self.users_cumsum: List[int] = np.cumsum(
+                [int(row[1]) for row in reader]
+            ).tolist()
+
+    def get_item_count(self):
+        return self.users_cumsum[-1]
+
+    def _process_line(self, line: str) -> pd.Series:
+        reader = csv.reader([line])
+        parsed_line = next(reader)
+        user_id = int(parsed_line[0])
+        sequence_item_ids = parsed_line[1]
+        sequence_ratings = parsed_line[2]
+        return pd.Series(
+            data={
+                "user_id": user_id,
+                "sequence_item_ids": sequence_item_ids,
+                "sequence_ratings": sequence_ratings,
+            }
+        )
+
+    def iloc(self, idx) -> pd.Series:
+        assert idx < self.users_cumsum[-1]
+        file_idx: int = 0
+        while self.users_cumsum[file_idx] <= idx:
+            file_idx += 1
+        if file_idx == 0:
+            local_idx = idx
+        else:
+            local_idx = idx - self.users_cumsum[file_idx - 1]
+        line = linecache.getline(
+            f"{self.ratings_file_prefix}_{file_idx}.csv", local_idx + 1
+        )
+        data = self._process_line(line)
+        return data
+
+    def get_timestamp_uih(self, data, max_num_candidates, size):
+        return [1] * size
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py
new file mode 100644
index 000000000..437e5ae8e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py
@@ -0,0 +1,400 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Synthetic streaming dataset for DLRMv3 inference benchmarking.
+
+This module provides a streaming dataset implementation that loads user interaction
+data from pre-generated CSV files with temporal (timestamp) organization, suitable
+for simulating real-time recommendation scenarios.
+"""
+
+import csv
+import logging
+import sys
+import time
+from typing import Any, Dict, List, Set, Tuple
+
+import pandas as pd
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import (
+    collate_fn,
+    DLRMv3RandomDataset,
+    Samples,
+)
+from generative_recommenders.dlrm_v3.datasets.utils import (
+    json_loads,
+    maybe_truncate_seq,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+csv.field_size_limit(sys.maxsize)
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class DLRMv3SyntheticStreamingDataset(DLRMv3RandomDataset):
+    """
+    Streaming dataset that loads pre-generated synthetic recommendation data.
+
+    Supports timestamp-based data organization for simulating streaming scenarios
+    where user interaction histories evolve over time.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        ratings_file_prefix: Path prefix for rating data files.
+        is_inference: Whether dataset is used for inference.
+        train_ts: Number of timestamps used for training.
+        total_ts: Total number of timestamps in the data.
+        num_files: Number of data files (for parallelization).
+        num_users: Total number of users in the dataset.
+        num_items: Total number of items in the catalog.
+        num_categories: Number of item categories.
+        *args: Additional positional arguments.
+        **kwargs: Additional keyword arguments.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        ratings_file_prefix: str,
+        is_inference: bool,
+        train_ts: int,
+        total_ts: int,
+        num_files: int,
+        num_users: int,
+        num_items: int,
+        num_categories: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        super().__init__(hstu_config=hstu_config, is_inference=is_inference)
+        self.ratings_file_prefix = ratings_file_prefix
+        self.file_to_offsets: Dict[int, List[int]] = {}
+        with open(f"{self.ratings_file_prefix}offset.csv", "r") as file:
+            reader = csv.reader(file)
+            for size in range(num_files):
+                row = next(reader)
+                assert len(row) == 1
+                offset = json_loads(row[0])
+                assert len(offset) == num_users // num_files
+                self.file_to_offsets[size] = offset
+        self.ts_requests_offsets: List[int] = []
+        with open(f"{self.ratings_file_prefix}requests_per_ts_offset.csv", "r") as file:
+            reader = csv.reader(file)
+            row = next(reader)
+            assert len(row) == 1
+            self.ts_requests_offsets = json_loads(row[0])
+            assert len(self.ts_requests_offsets) == total_ts
+        self.requests: List[int] = []
+        self.ts_to_users_cumsum: Dict[int, List[int]] = {}
+        with open(
+            f"{self.ratings_file_prefix}users_cumsum_per_ts.csv", "r"
+        ) as cumsum_file:
+            reader = csv.reader(cumsum_file)
+            ts = 0
+            for row in reader:
+                assert len(row) == 1
+                cumsum = json_loads(row[0])
+                self.ts_to_users_cumsum[ts] = cumsum
+                ts += 1
+        self.train_ts = train_ts
+        self.total_ts = total_ts
+        self.num_files = num_files
+        self.ts: int = -1
+        self.is_inference: bool = False
+        self.is_eval: bool = False
+        self.users_per_file: int = num_users // num_files
+        self.cached_files: Set[str] = set()
+        self.items_per_category: int = num_items // num_categories
+        assert hstu_config.action_weights is not None
+        self.action_weights: List[int] = hstu_config.action_weights
+        self.items_in_memory: Dict[
+            int, Dict[int, Tuple[KeyedJaggedTensor, KeyedJaggedTensor]]
+        ] = {}
+
+    def get_item_count(self) -> int:
+        return len(self.requests)
+
+    def load_query_samples(self, sample_list: List[int]) -> None:
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        for idx in sample_list:
+            data = self.iloc(idx)
+            sample = self.load_item(data, max_num_candidates)
+            if self.ts not in self.items_in_memory:
+                self.items_in_memory[self.ts] = {}
+            self.items_in_memory[self.ts][idx] = sample
+
+        self.last_loaded = time.time()
+
+    def unload_query_samples(self, sample_list: List[int]) -> None:
+        self.items_in_memory = {}
+
+    def get_sample(self, id: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        return self.items_in_memory[self.ts][id]
+
+    def get_sample_with_ts(
+        self, id: int, ts: int
+    ) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        """
+        Get a sample for a specific timestamp.
+
+        Args:
+            id: Sample identifier.
+            ts: Timestamp index.
+
+        Returns:
+            Tuple of (uih_features_kjt, candidates_features_kjt).
+        """
+        return self.items_in_memory[ts][id]
+
+    def get_samples_with_ts(self, id_list: List[int], ts: int) -> Samples:
+        """
+        Get and collate multiple samples for a specific timestamp.
+
+        Args:
+            id_list: List of sample identifiers.
+            ts: Timestamp index.
+
+        Returns:
+            Collated Samples object.
+        """
+        list_samples = [self.get_sample_with_ts(ix, ts) for ix in id_list]
+        return collate_fn(list_samples)
+
+    def _process_line(self, line: str, user_id: int) -> pd.Series:
+        """
+        Parse a CSV line into a pandas Series with user interaction data.
+
+        Args:
+            line: CSV line containing user data.
+            user_id: User identifier.
+
+        Returns:
+            pd.Series with parsed user interaction history and candidates.
+        """
+        reader = csv.reader([line])
+        parsed_line = next(reader)
+        # total ts + one more eval ts + one base ts so that uih won't be zero
+        # for each ts, ordered as candidate_ids, candidate_ratings, uih_ids, uih_ratings
+        assert len(parsed_line) == 4 * (self.total_ts + 2)
+        uih_item_ids_list = []
+        uih_ratings_list = []
+        candidate_item_ids = ""
+        candidate_ratings = ""
+        if (not self.is_eval) and (not self.is_inference):
+            assert self.ts < self.train_ts
+            for i in range(self.ts + 1):
+                if parsed_line[4 * i]:
+                    uih_item_ids_list.append(parsed_line[2 + 4 * i])
+                    uih_ratings_list.append(parsed_line[3 + 4 * i])
+            candidate_item_ids = parsed_line[4 * (self.ts + 1)]
+            candidate_ratings = parsed_line[1 + 4 * (self.ts + 1)]
+        elif self.is_eval:
+            for i in range(self.ts + 1):
+                if parsed_line[4 * i]:
+                    uih_item_ids_list.append(parsed_line[2 + 4 * i])
+                    uih_ratings_list.append(parsed_line[3 + 4 * i])
+            candidate_item_ids = parsed_line[4 * (self.ts + 1)]
+            candidate_ratings = parsed_line[1 + 4 * (self.ts + 1)]
+        else:
+            assert self.is_inference is True
+            assert self.ts >= self.train_ts
+            for i in range(self.train_ts + 1):
+                if parsed_line[4 * i]:
+                    uih_item_ids_list.append(parsed_line[2 + 4 * i])
+                    uih_ratings_list.append(parsed_line[3 + 4 * i])
+            for i in range(self.train_ts + 2, self.ts + 2):
+                if parsed_line[4 * i]:
+                    uih_item_ids_list.append(parsed_line[2 + 4 * i])
+                    uih_ratings_list.append(parsed_line[3 + 4 * i])
+            candidate_item_ids = parsed_line[4 * (self.ts + 2)]
+            candidate_ratings = parsed_line[1 + 4 * (self.ts + 2)]
+        uih_item_ids = ",".join(uih_item_ids_list)
+        uih_ratings = ",".join(uih_ratings_list)
+        assert candidate_item_ids != "" and candidate_ratings != ""
+        return pd.Series(
+            data={
+                "user_id": user_id,
+                "uih_item_ids": uih_item_ids,
+                "uih_ratings": uih_ratings,
+                "candidate_item_ids": candidate_item_ids,
+                "candidate_ratings": candidate_ratings,
+            }
+        )
+
+    def iloc(self, idx: int) -> pd.Series:
+        """
+        Get user data by request index using file offsets for efficient access.
+
+        Args:
+            idx: Request index within the current timestamp.
+
+        Returns:
+            pd.Series with parsed user interaction data.
+        """
+        cumsum: List[int] = self.ts_to_users_cumsum[self.ts]
+        assert cumsum != []
+        assert idx < cumsum[-1]
+        file_idx: int = 0
+        while cumsum[file_idx] <= idx:
+            file_idx += 1
+        user_idx = self.requests[idx]
+        filename = f"{self.ratings_file_prefix}{file_idx}.csv"
+        with open(filename, "r") as file:
+            idx = user_idx % self.users_per_file
+            file.seek(self.file_to_offsets[file_idx][idx])
+            line = file.readline()
+        data = self._process_line(line=line, user_id=user_idx)
+        return data
+
+    def get_timestamp_uih(
+        self, data: pd.Series, max_num_candidates: int, size: int
+    ) -> List[int]:
+        return [1] * size
+
+    def set_ts(self, ts: int) -> None:
+        """
+        Set the current timestamp and load associated request data.
+
+        Args:
+            ts: Timestamp index to set.
+        """
+        logger.warning(f"Streaming dataset ts set to {ts}")
+        if ts == self.ts:
+            return
+        self.ts = ts
+        with open(
+            f"{self.ratings_file_prefix}requests_per_ts.csv", "r"
+        ) as request_file:
+            request_file.seek(self.ts_requests_offsets[self.ts])
+            line = request_file.readline()
+            reader = csv.reader([line])
+            row = next(reader)
+            assert len(row) == 1
+            requests = json_loads(row[0])
+            self.requests = requests
+            logger.warning(f"DLRMv3SyntheticStreamingDataset: ts={ts} requests loaded")
+        assert self.ts_to_users_cumsum[self.ts][-1] == len(self.requests)
+        logger.warning(
+            f"DLRMv3SyntheticStreamingDataset: ts={ts} users_cumsum={self.ts_to_users_cumsum[self.ts]}"
+        )
+
+    def load_item(
+        self, data: pd.Series, max_num_candidates: int
+    ) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        """
+        Load and process a single user's data into KeyedJaggedTensors.
+
+        Converts parsed user data into feature tensors suitable for model input,
+        including truncation to maximum sequence lengths.
+
+        Args:
+            data: pd.Series with user interaction history and candidates.
+            max_num_candidates: Maximum number of candidates to include.
+
+        Returns:
+            Tuple of (uih_features_kjt, candidates_features_kjt).
+        """
+        ids_uih = json_loads(data.uih_item_ids)
+        ids_candidates = json_loads(data.candidate_item_ids)
+        ratings_uih = json_loads(data.uih_ratings)
+        ratings_candidates = json_loads(data.candidate_ratings)
+        timestamps_uih = self.get_timestamp_uih(
+            data=data,
+            max_num_candidates=max_num_candidates,
+            size=len(ids_uih),
+        )
+        assert len(ids_uih) == len(timestamps_uih), (
+            "history len differs from timestamp len."
+        )
+        assert len(ids_uih) == len(ratings_uih), (
+            f"history len {len(ids_uih)} differs from ratings len {len(ratings_uih)}."
+        )
+        assert len(ids_candidates) == len(ratings_candidates), (
+            f"candidates len {len(ids_candidates)} differs from ratings len {len(ratings_candidates)}."
+        )
+
+        ids_uih = maybe_truncate_seq(ids_uih, self._max_uih_len)
+        ratings_uih = maybe_truncate_seq(ratings_uih, self._max_uih_len)
+        timestamps_uih = maybe_truncate_seq(timestamps_uih, self._max_uih_len)
+        ids_candidates = maybe_truncate_seq(ids_candidates, max_num_candidates)
+        num_candidates = len(ids_candidates)
+        ratings_candidates = maybe_truncate_seq(ratings_candidates, max_num_candidates)
+        action_weights_uih = [
+            self.action_weights[int(rating) - 1] for rating in ratings_uih
+        ]
+        action_weights_candidates = [
+            int(rating >= 3.5) for rating in ratings_candidates
+        ]
+
+        uih_kjt_values: List[int] = []
+        uih_kjt_lengths: List[int] = []
+        for name, length in self._contextual_feature_to_max_length.items():
+            uih_kjt_values.append(data[name])
+            uih_kjt_lengths.append(length)
+
+        uih_seq_len = len(ids_uih)
+        dummy_watch_times_uih = [0 for _ in range(uih_seq_len)]
+        item_category_ids = [id // self.items_per_category for id in ids_uih]
+        extend_uih_kjt_values: List[int] = (
+            ids_uih
+            + ratings_uih
+            + timestamps_uih
+            + action_weights_uih
+            + dummy_watch_times_uih
+            + item_category_ids
+        )
+        uih_kjt_values.extend(extend_uih_kjt_values)
+        uih_kjt_lengths.extend(
+            [
+                uih_seq_len
+                for _ in range(
+                    len(self._uih_keys) - len(self._contextual_feature_to_max_length)
+                )
+            ]
+        )
+
+        dummy_query_time = 0 if timestamps_uih == [] else max(timestamps_uih)
+        uih_kjt_values.append(dummy_query_time)
+        uih_kjt_lengths.append(1)
+        uih_features_kjt: KeyedJaggedTensor = KeyedJaggedTensor(
+            keys=self._uih_keys + ["dummy_query_time"],
+            lengths=torch.tensor(uih_kjt_lengths).long(),
+            values=torch.tensor(uih_kjt_values).long(),
+        )
+
+        candidates_kjt_lengths = num_candidates * torch.ones(len(self._candidates_keys))
+        item_candidate_category_ids = [
+            id // self.items_per_category for id in ids_candidates
+        ]
+        candidates_kjt_values = (
+            ids_candidates
+            + ratings_candidates
+            + [dummy_query_time] * num_candidates  # item_query_time
+            + action_weights_candidates
+            + [1] * num_candidates  # item_dummy_watchtime
+            + item_candidate_category_ids
+        )
+        candidates_features_kjt: KeyedJaggedTensor = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_kjt_lengths.detach().clone().long(),
+            values=torch.tensor(candidates_kjt_values).long(),
+        )
+        return uih_features_kjt, candidates_features_kjt
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py
new file mode 100644
index 000000000..aeca75d41
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/utils.py
@@ -0,0 +1,146 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""
+Utility functions for dataset processing.
+
+This module provides helper functions for parsing and processing data
+in the DLRMv3 dataset pipeline.
+"""
+
+import json
+import struct
+from typing import Dict, List, Sequence, Tuple
+
+import numpy as np
+import xxhash
+
+
+def json_loads(
+    x: str | int | List[int],
+) -> List[int]:
+    """
+    Parse a JSON-like string into a list of integers.
+
+    Handles multiple input formats including JSON arrays, comma-separated
+    strings, and single values.
+
+    Args:
+        x: Input that can be a JSON array string, a single integer,
+           or already a list of integers.
+
+    Returns:
+        List of integers parsed from the input.
+    """
+    if isinstance(x, str):
+        if x[0] != "[" and x[-1] != "]":
+            x = "[" + x + "]"
+        y = json.loads(x)
+    else:
+        y = x
+    y_list = [y] if type(y) == int else list(y)
+    return y_list
+
+
+def separate_uih_candidates(
+    x: str | int | List[int],
+    candidates_max_seq_len: int,
+) -> Tuple[List[int], List[int]]:
+    """
+    Separate a sequence into user interaction history (UIH) and candidates.
+
+    Splits the input sequence such that the last `candidates_max_seq_len`
+    elements become candidates and the rest become UIH.
+
+    Args:
+        x: Input sequence as JSON string, single int, or list of ints.
+        candidates_max_seq_len: Number of items at the end to use as candidates.
+
+    Returns:
+        Tuple of (uih, candidates) where both are lists of integers.
+    """
+    if isinstance(x, str):
+        if x[0] != "[" and x[-1] != "]":
+            x = "[" + x + "]"
+        y = json.loads(x)
+    else:
+        y = x
+    y_list = [y] if type(y) == int else list(y)
+    candidates, uih = (
+        y_list[-candidates_max_seq_len:],
+        y_list[:-candidates_max_seq_len],
+    )
+    return uih, candidates
+
+
+def maybe_truncate_seq(
+    y: List[int],
+    max_seq_len: int,
+) -> List[int]:
+    """
+    Truncate a sequence if it exceeds the maximum length.
+
+    Args:
+        y: Input sequence to potentially truncate.
+        max_seq_len: Maximum allowed sequence length.
+
+    Returns:
+        The input sequence, truncated to max_seq_len if necessary.
+    """
+    y_len = len(y)
+    if y_len > max_seq_len:
+        y = y[:max_seq_len]
+    return y
+
+
+def xxhash_cross(
+    anchor: Dict[str, int],
+    keys: Sequence[str],
+    table_size: int,
+    salt: int = 0,
+) -> int:
+    """xxhash64(seed=salt) over little-endian int64 concat(anchor[k] for k in keys), mod table_size.
+
+    Bit-identical to primus_dlrm.data.hashing.cross_hash_nway — embedding rows
+    are interchangeable with Primus-trained ones.
+    """
+    n = len(keys)
+    assert n >= 2, f"xxhash_cross needs >=2 keys, got {n}"
+    digest = xxhash.xxh64(seed=salt)
+    digest.update(struct.Struct(f"<{n}q").pack(*(int(anchor[k]) for k in keys)))
+    return digest.intdigest() % table_size
+
+
+def xxhash_cross_batch(
+    arr_by_key: Dict[str, np.ndarray],
+    keys: Sequence[str],
+    table_size: int,
+    salt: int = 0,
+) -> np.ndarray:
+    """Vectorised xxhash_cross over equal-length int64 arrays (one per key)."""
+    n = len(keys)
+    assert n >= 2
+    cols = [np.asarray(arr_by_key[k], dtype=np.int64).ravel() for k in keys]
+    length = cols[0].shape[0]
+    for c in cols:
+        assert c.shape[0] == length
+    pack = struct.Struct(f"<{n}q").pack
+    digest_cls = xxhash.xxh64
+    out = np.empty(length, dtype=np.int64)
+    for i in range(length):
+        d = digest_cls(seed=salt)
+        d.update(pack(*(int(c[i]) for c in cols)))
+        out[i] = d.intdigest() % table_size
+    return out
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
new file mode 100644
index 000000000..fb8b212b1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
@@ -0,0 +1,608 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+
+# pyre-unsafe
+"""
+Yambda dataset for the DLRMv3 HSTU `modules/` path.
+
+Reads the same parquets produced by Primus-DLRM's preprocessing (no runtime
+dep on Primus). Each sample is one anchor LISTEN event with:
+  * label = (played_ratio >= LISTEN_PLUS_THRESHOLD) — the listen_plus bit
+  * a chronologically interleaved 3-pool history (listen+/like/skip), with
+    pool identity tagged per-position in `action_weight` (bits 1/2/4)
+  * 7 pre-hashed cross-feature ids exposed as length-1 contextual entries
+
+Hash formula is byte-identical to `primus_dlrm.data.hashing.cross_hash_nway`
+so embedding rows are interchangeable.
+"""
+
+import logging
+import mmap as _mmap_mod
+import os
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import polars as pl
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset
+from generative_recommenders.dlrm_v3.datasets.utils import xxhash_cross
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+logger = logging.getLogger(__name__)
+
+
+def _load_npy_readonly(path: Union[str, Path]) -> np.ndarray:
+    # MAP_SHARED + PROT_READ so the kernel does not charge the mapping against
+    # vm.overcommit_memory=2 limits. numpy's mmap_mode='r' uses MAP_PRIVATE and
+    # reserves per-process commit; at 8 ranks × ~190 GB store, that OOMs.
+    path = Path(path)
+    with open(path, "rb") as f:
+        version = np.lib.format.read_magic(f)
+        if version[0] == 1:
+            shape, _, dtype = np.lib.format.read_array_header_1_0(f)
+        else:
+            shape, _, dtype = np.lib.format.read_array_header_2_0(f)
+        offset = f.tell()
+    fd = os.open(str(path), os.O_RDONLY)
+    try:
+        buf = _mmap_mod.mmap(fd, 0, access=_mmap_mod.ACCESS_READ)
+    finally:
+        os.close(fd)
+    arr = np.ndarray(shape, dtype=dtype, buffer=buf, offset=offset)
+    arr.flags.writeable = False
+    return arr
+
+# Match primus_dlrm.data.preprocessing.EVENT_TYPE_MAP / dataset.LISTEN_PLUS_THRESHOLD
+LISTEN_TYPE = 0
+LIKE_TYPE = 1
+LISTEN_PLUS_THRESHOLD = 50
+
+# Action-weight bits (must match hstu_config.action_weights = [1, 2, 4]).
+LP_BIT = 1
+LIKE_BIT = 2
+SKIP_BIT = 4
+
+
+class _FlatEventStore:
+    """Minimal port of Primus-DLRM's FlatEventStore.
+
+    Reads `train_sessions.parquet` and explodes per-session arrays into flat
+    numpy columns + per-user `(start, end)` index arrays. Cache-compatible
+    layout, but writes nothing (rebuilds from parquet each construction).
+    """
+
+    # On-disk column layout written by Primus-DLRM's FlatEventStore.save_mmap.
+    # Bit-identical to that schema so the cache is interchangeable.
+    _MMAP_COLS = (
+        "flat_uid", "flat_item_ids", "flat_timestamps",
+        "flat_event_types", "flat_played_ratio",
+        "flat_is_listen_plus", "flat_is_like", "flat_is_skip",
+        "flat_is_organic",
+        "user_start", "user_end", "unique_uids",
+    )
+
+    def __init__(self, sessions_df: pl.DataFrame) -> None:
+        logger.info("Building flat event store from sessions...")
+        sorted_sessions = sessions_df.sort(["uid", "session_id"])
+        exploded = sorted_sessions.explode(
+            ["item_ids", "timestamps", "event_types", "is_organic", "played_ratio_pct"]
+        )
+
+        self.flat_uid: np.ndarray = exploded["uid"].to_numpy().astype(np.int64)
+        self.flat_item_ids: np.ndarray = exploded["item_ids"].to_numpy().astype(np.int64)
+        self.flat_timestamps: np.ndarray = exploded["timestamps"].to_numpy().astype(np.int64)
+        self.flat_event_types: np.ndarray = exploded["event_types"].to_numpy().astype(np.int64)
+        self.flat_played_ratio: np.ndarray = exploded["played_ratio_pct"].to_numpy().astype(np.float32)
+        self.flat_is_organic: np.ndarray = exploded["is_organic"].to_numpy().astype(np.int8)
+        np.nan_to_num(self.flat_played_ratio, copy=False, nan=0.0)
+
+        is_listen = self.flat_event_types == LISTEN_TYPE
+        self.flat_is_listen_plus: np.ndarray = is_listen & (
+            self.flat_played_ratio >= LISTEN_PLUS_THRESHOLD
+        )
+        self.flat_is_like: np.ndarray = self.flat_event_types == LIKE_TYPE
+        self.flat_is_skip: np.ndarray = is_listen & (
+            self.flat_played_ratio < LISTEN_PLUS_THRESHOLD
+        )
+
+        uid_changes = np.where(np.diff(self.flat_uid) != 0)[0] + 1
+        starts = np.concatenate([[0], uid_changes])
+        ends = np.concatenate([uid_changes, [len(self.flat_uid)]])
+        uid_vals = self.flat_uid[starts]
+        max_uid = int(uid_vals.max()) + 1
+        self.user_start: np.ndarray = np.full(max_uid, -1, dtype=np.int64)
+        self.user_end: np.ndarray = np.full(max_uid, -1, dtype=np.int64)
+        self.user_start[uid_vals] = starts
+        self.user_end[uid_vals] = ends
+        self.unique_uids: np.ndarray = uid_vals
+        self.num_users: int = len(uid_vals)
+        self.total_events: int = len(self.flat_item_ids)
+        logger.info(
+            f"FlatEventStore: {self.total_events:,} events, {self.num_users:,} users"
+        )
+
+    @classmethod
+    def load_mmap(cls, cache_dir: Union[str, Path]) -> "_FlatEventStore":
+        """Load flat columns by MAP_SHARED+PROT_READ from a prebuilt cache.
+        All ranks on a node share the same physical pages."""
+        import json as _json
+        cache_dir = Path(cache_dir)
+        with open(cache_dir / "store_meta.json") as f:
+            meta = _json.load(f)
+        store = object.__new__(cls)
+        for name in cls._MMAP_COLS:
+            setattr(store, name, _load_npy_readonly(cache_dir / f"{name}.npy"))
+        store.num_users = int(meta["num_users"])
+        store.total_events = int(meta["total_events"])
+        logger.info(
+            f"FlatEventStore mmap from {cache_dir}: "
+            f"{store.total_events:,} events, {store.num_users:,} users"
+        )
+        return store
+
+    def save_mmap(self, cache_dir: Union[str, Path]) -> None:
+        """Persist flat columns to disk as .npy, then write a sentinel.
+        Subsequent runs (any rank, any node sharing the FS) load via mmap."""
+        import json as _json
+        cache_dir = Path(cache_dir)
+        cache_dir.mkdir(parents=True, exist_ok=True)
+        for name in self._MMAP_COLS:
+            np.save(cache_dir / f"{name}.npy", getattr(self, name))
+        with open(cache_dir / "store_meta.json", "w") as f:
+            _json.dump(
+                {"num_users": self.num_users, "total_events": self.total_events}, f
+            )
+        # Sentinel — readers check this before mmap'ing to avoid partial files.
+        (cache_dir / "_READY").touch()
+        logger.info(f"FlatEventStore saved to {cache_dir}")
+
+
+class DLRMv3YambdaDataset(DLRMv3RandomDataset):
+    """Yambda-5b dataset for the DLRMv3 HSTU modules/ path.
+
+    Args:
+        hstu_config: DlrmHSTUConfig (must come from `get_hstu_configs("yambda-5b")`).
+        processed_dir: directory with `train_sessions.parquet` + `item_popularity.npy`.
+        metadata_dir: directory with `{artist,album}_item_mapping.parquet`.
+        history_length: per-pool truncation cap (total interleaved ≤ 3 * this).
+        scan_window: how far back to scan when filling each pool.
+        cross_specs: list of (name, keys, num_embeddings, salt). Source of truth
+            in `dlrm_v3/configs.py:YAMBDA_5B_CROSS_SPECS`.
+        is_inference: passed through to base class.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        processed_dir: str,
+        metadata_dir: str,
+        history_length: int = 2048,
+        scan_window: int = 20000,
+        cross_specs: Optional[Sequence[Tuple[str, Sequence[str], int, int]]] = None,
+        cache_dir: Optional[str] = None,
+        is_inference: bool = False,
+        *args,
+        **kwargs,
+    ) -> None:
+        super().__init__(hstu_config=hstu_config, is_inference=is_inference)
+        self._processed_dir: str = processed_dir
+        self._metadata_dir: str = metadata_dir
+        self._history_length: int = history_length
+        self._scan_window: int = scan_window
+        self._cache_dir: Optional[str] = cache_dir
+        self._cross_specs: List[Tuple[str, Tuple[str, ...], int, int]] = [
+            (name, tuple(keys), n, s) for (name, keys, n, s) in (cross_specs or [])
+        ]
+        assert hstu_config.action_weights is not None
+        self._action_weights: List[int] = hstu_config.action_weights
+
+        self._load_metadata(metadata_dir)
+        # Build-once-mmap-many: first rank to arrive acquires the build lock
+        # and explodes the parquet (one ~190 GB in-memory pass), then writes
+        # flat .npy columns + _READY sentinel. All ranks (including the
+        # builder, after dropping its in-memory copy) reload via MAP_SHARED+
+        # PROT_READ — kernel shares physical pages across ranks so the steady-
+        # state per-rank RSS for the dataset is ~0.
+        if cache_dir is None:
+            cache_dir = os.path.join(processed_dir, f"hstu_cache_L{history_length}")
+        self._cache_dir = cache_dir
+        self._ensure_cache_built(cache_dir, processed_dir, history_length)
+        self.store: _FlatEventStore = _FlatEventStore.load_mmap(cache_dir)
+        # Mmap the positions file built alongside the flat columns.
+        self._positions: np.ndarray = _load_npy_readonly(
+            os.path.join(cache_dir, f"positions_L{history_length}.npy")
+        )
+        logger.info(
+            f"Yambda dataset ready: {self.store.total_events:,} events, "
+            f"{len(self._positions):,} training positions"
+        )
+
+    @staticmethod
+    def _ensure_cache_built(
+        cache_dir: str, processed_dir: str, history_length: int
+    ) -> None:
+        """File-locked one-shot build with column-at-a-time explode.
+
+        A naive `pl.read_parquet(...).explode([5 list cols])` peaks at ~1.6 TB
+        on the 5b dataset (polars holds input list-columns + dense output +
+        parallel-worker scratch all together). Instead we:
+          1) Read parquet + sort once (sorted list-column DF, ~80 GB).
+          2) For each output column: select that single list, explode, write
+             .npy, drop. Bounds incremental peak to one column (~38 GB).
+          3) Derive bool flags and indices from the on-disk mmaps.
+
+        Peak RAM: ~150 GB. Steady state across all ranks afterward: ~0
+        incremental thanks to MAP_SHARED in load_mmap.
+        """
+        import fcntl
+        import gc
+        import json as _json
+
+        ready = os.path.join(cache_dir, "_READY")
+        if os.path.exists(ready):
+            return
+        os.makedirs(cache_dir, exist_ok=True)
+        lock_path = os.path.join(cache_dir, "_lock")
+        with open(lock_path, "w") as lf:
+            logger.info(f"Acquiring build lock for {cache_dir}...")
+            fcntl.flock(lf, fcntl.LOCK_EX)
+            try:
+                if os.path.exists(ready):
+                    return
+                parquet_path = os.path.join(processed_dir, "train_sessions.parquet")
+                logger.info(
+                    f"Building flat-event cache from {parquet_path} "
+                    f"(column-at-a-time, ~150 GB peak RAM)"
+                )
+
+                # Step 1: read + sort. List columns stay nested at this stage.
+                sessions = pl.read_parquet(parquet_path).sort(["uid", "session_id"])
+                logger.info(f"Sessions sorted: {sessions.shape}")
+
+                # Per-session lengths + uids — used to derive flat_uid via
+                # np.repeat (cheap) without exploding the whole DF at once.
+                lengths = (
+                    sessions.select(pl.col("item_ids").list.len())
+                    .to_numpy()
+                    .reshape(-1)
+                    .astype(np.int64)
+                )
+                session_uids = sessions["uid"].to_numpy().astype(np.int64)
+                N = int(lengths.sum())
+                num_users = int(np.unique(session_uids).shape[0])
+                logger.info(f"Total events: {N:,}, users: {num_users:,}")
+
+                # Step 2: column-at-a-time explode → save → drop.
+                # uid is per-session scalar; expand via np.repeat.
+                flat_uid = np.repeat(session_uids, lengths).astype(np.int64)
+                np.save(os.path.join(cache_dir, "flat_uid.npy"), flat_uid)
+                del flat_uid, session_uids, lengths
+                gc.collect()
+                logger.info("Wrote flat_uid.npy")
+
+                # Derived columns flat_is_listen_plus/like/skip depend on
+                # event_types + played_ratio. Save those two first, then
+                # derive the bools from the mmaps.
+                _list_cols = [
+                    ("item_ids", "flat_item_ids", np.int64),
+                    ("timestamps", "flat_timestamps", np.int64),
+                    ("event_types", "flat_event_types", np.int64),
+                    ("is_organic", "flat_is_organic", np.int8),
+                    ("played_ratio_pct", "flat_played_ratio", np.float32),
+                ]
+                for src_col, dst_name, dtype in _list_cols:
+                    exploded = sessions.select(pl.col(src_col).explode())
+                    arr = exploded[src_col].to_numpy().astype(dtype, copy=False)
+                    if dtype == np.float32:
+                        np.nan_to_num(arr, copy=False, nan=0.0)
+                    np.save(os.path.join(cache_dir, f"{dst_name}.npy"), arr)
+                    del exploded, arr
+                    gc.collect()
+                    logger.info(f"Wrote {dst_name}.npy")
+
+                # Drop the sessions DF now that all source columns are on disk.
+                del sessions
+                gc.collect()
+
+                # Step 3: derive bool flags from the just-written mmaps.
+                event_types = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_event_types.npy")
+                )
+                played_ratio = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_played_ratio.npy")
+                )
+                is_listen = event_types == LISTEN_TYPE
+                np.save(
+                    os.path.join(cache_dir, "flat_is_listen_plus.npy"),
+                    is_listen & (played_ratio >= LISTEN_PLUS_THRESHOLD),
+                )
+                np.save(
+                    os.path.join(cache_dir, "flat_is_like.npy"),
+                    event_types == LIKE_TYPE,
+                )
+                np.save(
+                    os.path.join(cache_dir, "flat_is_skip.npy"),
+                    is_listen & (played_ratio < LISTEN_PLUS_THRESHOLD),
+                )
+                del is_listen, played_ratio
+                gc.collect()
+                logger.info("Wrote flat_is_listen_plus/like/skip.npy")
+
+                # user_start / user_end / unique_uids from flat_uid mmap.
+                flat_uid = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_uid.npy")
+                )
+                uid_changes = np.where(np.diff(flat_uid) != 0)[0] + 1
+                starts = np.concatenate([[0], uid_changes])
+                ends = np.concatenate([uid_changes, [len(flat_uid)]])
+                uid_vals = flat_uid[starts]
+                max_uid = int(uid_vals.max()) + 1
+                user_start = np.full(max_uid, -1, dtype=np.int64)
+                user_end = np.full(max_uid, -1, dtype=np.int64)
+                user_start[uid_vals] = starts
+                user_end[uid_vals] = ends
+                np.save(os.path.join(cache_dir, "user_start.npy"), user_start)
+                np.save(os.path.join(cache_dir, "user_end.npy"), user_end)
+                np.save(os.path.join(cache_dir, "unique_uids.npy"), uid_vals)
+                logger.info("Wrote user_start/end/unique_uids.npy")
+
+                # Positions: LISTEN events with ≥history_length prior history.
+                # Done now (before dropping user_start) so all sibling ranks
+                # just mmap the result instead of each running a 75 GB build.
+                user_start_per_event = user_start[flat_uid]
+                idx = np.arange(len(flat_uid), dtype=np.int64)
+                keep = (idx - user_start_per_event >= history_length) & (
+                    event_types == LISTEN_TYPE
+                )
+                positions = np.where(keep)[0].astype(np.int64)
+                np.save(
+                    os.path.join(cache_dir, f"positions_L{history_length}.npy"),
+                    positions,
+                )
+                logger.info(
+                    f"Wrote positions_L{history_length}.npy: {len(positions):,}"
+                )
+                del (
+                    flat_uid, event_types, user_start, user_end, uid_vals,
+                    starts, ends, uid_changes, idx, user_start_per_event,
+                    keep, positions,
+                )
+                gc.collect()
+
+                # Meta + sentinel — written last; readers gate on _READY.
+                with open(os.path.join(cache_dir, "store_meta.json"), "w") as f:
+                    _json.dump(
+                        {"num_users": num_users, "total_events": N}, f
+                    )
+                open(os.path.join(cache_dir, "_READY"), "w").close()
+                logger.info(f"Cache build complete: {cache_dir}")
+            finally:
+                fcntl.flock(lf, fcntl.LOCK_UN)
+
+    def _load_metadata(self, metadata_dir: str) -> None:
+        item_pop_path = os.path.join(metadata_dir, "item_popularity.npy")
+        if os.path.exists(item_pop_path):
+            item_popularity = np.load(item_pop_path)
+        else:
+            # Fallback: derive vocab size from the artist+album maps.
+            item_popularity = None
+
+        artist_map = pl.read_parquet(os.path.join(metadata_dir, "artist_item_mapping.parquet"))
+        album_map = pl.read_parquet(os.path.join(metadata_dir, "album_item_mapping.parquet"))
+        n_items = int(
+            max(
+                int(artist_map["item_id"].max()) + 1,
+                int(album_map["item_id"].max()) + 1,
+                len(item_popularity) if item_popularity is not None else 0,
+            )
+        )
+        self.item_to_artist: np.ndarray = np.zeros(n_items, dtype=np.int64)
+        valid = artist_map.filter(pl.col("item_id") < n_items)
+        self.item_to_artist[valid["item_id"].to_numpy()] = valid["artist_id"].to_numpy()
+        self.item_to_album: np.ndarray = np.zeros(n_items, dtype=np.int64)
+        valid = album_map.filter(pl.col("item_id") < n_items)
+        self.item_to_album[valid["item_id"].to_numpy()] = valid["album_id"].to_numpy()
+        self.num_items: int = n_items
+
+    def get_item_count(self) -> int:
+        return int(len(self._positions))
+
+    def iloc(self, idx: int) -> int:
+        return int(self._positions[idx])
+
+    def load_query_samples(self, sample_list) -> None:
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        self.items_in_memory = {}
+        for idx in sample_list:
+            flat_pos = self.iloc(idx)
+            self.items_in_memory[idx] = self._build_sample(flat_pos, max_num_candidates)
+        self.last_loaded = time.time()
+
+    def get_sample(self, idx: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        if idx in self.items_in_memory:
+            return self.items_in_memory[idx]
+        max_num_candidates = (
+            self._max_num_candidates_inference
+            if self._is_inference
+            else self._max_num_candidates
+        )
+        flat_pos = self.iloc(idx)
+        return self._build_sample(flat_pos, max_num_candidates)
+
+    def _gather_interleaved_history(
+        self, flat_pos: int, user_start: int
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Build a single chronologically-ordered history sequence from the 3
+        behavior pools. Each event's `action_weight` carries the pool bitmask
+        (LP_BIT/LIKE_BIT/SKIP_BIT). Per-pool cap = history_length // 3."""
+        L = self._history_length
+        per_pool = max(1, L // 3)
+        scan_start = max(int(user_start), int(flat_pos) - self._scan_window)
+        scan_end = int(flat_pos)
+        if scan_end <= scan_start:
+            empty = np.empty(0, dtype=np.int64)
+            return empty, empty, empty, empty, empty
+
+        item_ids = self.store.flat_item_ids[scan_start:scan_end]
+        timestamps = self.store.flat_timestamps[scan_start:scan_end]
+        is_lp = self.store.flat_is_listen_plus[scan_start:scan_end]
+        is_like = self.store.flat_is_like[scan_start:scan_end]
+        is_skip = self.store.flat_is_skip[scan_start:scan_end]
+
+        # Local indices into the scan window — preserves chronological order
+        # within each pool and lets us interleave by re-sorting.
+        idx_all = np.arange(item_ids.shape[0], dtype=np.int64)
+        lp_idx = idx_all[is_lp][-per_pool:]
+        like_idx = idx_all[is_like][-per_pool:]
+        skip_idx = idx_all[is_skip][-per_pool:]
+
+        keep_local = np.concatenate([lp_idx, like_idx, skip_idx])
+        if keep_local.size == 0:
+            empty = np.empty(0, dtype=np.int64)
+            return empty, empty, empty, empty, empty
+
+        order = np.argsort(keep_local, kind="stable")
+        keep_local = keep_local[order]
+
+        items = item_ids[keep_local]
+        ts = timestamps[keep_local]
+        artists = self.item_to_artist[np.clip(items, 0, self.item_to_artist.shape[0] - 1)]
+        albums = self.item_to_album[np.clip(items, 0, self.item_to_album.shape[0] - 1)]
+
+        # Pool bitmask per kept event (LP/LIKE/SKIP are mutually exclusive in
+        # the source data, but OR is safe and forward-compatible).
+        weight = np.zeros(keep_local.shape[0], dtype=np.int64)
+        weight[is_lp[keep_local]] |= LP_BIT
+        weight[is_like[keep_local]] |= LIKE_BIT
+        weight[is_skip[keep_local]] |= SKIP_BIT
+
+        return items, artists, albums, ts, weight
+
+    def _build_sample(
+        self, flat_pos: int, max_num_candidates: int
+    ) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        uid = int(self.store.flat_uid[flat_pos])
+        user_start = int(self.store.user_start[uid])
+
+        items, artists, albums, ts, weight = self._gather_interleaved_history(
+            flat_pos, user_start
+        )
+
+        target_item = int(self.store.flat_item_ids[flat_pos])
+        target_artist = int(
+            self.item_to_artist[target_item]
+            if target_item < self.item_to_artist.shape[0]
+            else 0
+        )
+        target_album = int(
+            self.item_to_album[target_item]
+            if target_item < self.item_to_album.shape[0]
+            else 0
+        )
+        target_ts = int(self.store.flat_timestamps[flat_pos])
+
+        played_ratio = float(self.store.flat_played_ratio[flat_pos])
+        is_lp = (
+            int(self.store.flat_event_types[flat_pos]) == LISTEN_TYPE
+            and played_ratio >= LISTEN_PLUS_THRESHOLD
+        )
+        # Label encoded into the candidate's action_weight via the LP bit, so
+        # _get_supervision_labels_and_weights sees the right supervision.
+        candidate_action_weight = LP_BIT if is_lp else 0
+
+        cross_id_anchor: Dict[str, int] = {
+            "uid": uid,
+            "item_id": target_item,
+            "artist_id": target_artist,
+            "album_id": target_album,
+            "hour_of_day": int((target_ts // 3600) % 24),
+            "is_organic": int(self.store.flat_is_organic[flat_pos]),
+        }
+        cross_ids: Dict[str, int] = {
+            name: xxhash_cross(cross_id_anchor, list(keys), n, salt)
+            for (name, keys, n, salt) in self._cross_specs
+        }
+
+        # ---- Truncate UIH to fit max_seq_len budget ----
+        uih_seq_len_budget = (
+            self._max_seq_len
+            - max_num_candidates
+            - len(self._contextual_feature_to_max_length or {})
+        )
+        if items.shape[0] > uih_seq_len_budget:
+            items = items[-uih_seq_len_budget:]
+            artists = artists[-uih_seq_len_budget:]
+            albums = albums[-uih_seq_len_budget:]
+            ts = ts[-uih_seq_len_budget:]
+            weight = weight[-uih_seq_len_budget:]
+        uih_seq_len = int(items.shape[0])
+        dummy_watch_time = np.zeros(uih_seq_len, dtype=np.int64)
+
+        # ---- Build UIH KJT ----
+        # Contextual features (length-1 each) iterated in the same order as
+        # `_contextual_feature_to_max_length` (matches movielens reference).
+        uih_kjt_values: List[int] = []
+        uih_kjt_lengths: List[int] = []
+        for name, length in (self._contextual_feature_to_max_length or {}).items():
+            assert length == 1, f"yambda contextuals are length-1, got {name}={length}"
+            if name == "uid":
+                uih_kjt_values.append(uid)
+            else:
+                uih_kjt_values.append(int(cross_ids[name]))
+            uih_kjt_lengths.append(1)
+
+        # Sequential features — order must match the trailing entries of
+        # hstu_uih_feature_names in configs.py:
+        #   item_id, artist_id, album_id, action_weight, action_timestamp, dummy_watch_time
+        uih_kjt_values.extend(items.tolist())
+        uih_kjt_values.extend(artists.tolist())
+        uih_kjt_values.extend(albums.tolist())
+        uih_kjt_values.extend(weight.tolist())
+        uih_kjt_values.extend(ts.tolist())
+        uih_kjt_values.extend(dummy_watch_time.tolist())
+        n_sequential = len(self._uih_keys) - len(self._contextual_feature_to_max_length or {})
+        uih_kjt_lengths.extend([uih_seq_len] * n_sequential)
+
+        dummy_query_time = int(ts[-1]) if uih_seq_len > 0 else target_ts
+        uih_kjt_values.append(dummy_query_time)
+        uih_kjt_lengths.append(1)
+
+        uih_features_kjt = KeyedJaggedTensor(
+            keys=self._uih_keys + ["dummy_query_time"],
+            lengths=torch.tensor(uih_kjt_lengths, dtype=torch.long),
+            values=torch.tensor(uih_kjt_values, dtype=torch.long),
+        )
+
+        # ---- Build candidates KJT ----
+        # Order must match configs.py:hstu_candidate_feature_names exactly:
+        #   item_candidate_id, item_candidate_artist_id, item_candidate_album_id,
+        #   item_query_time, item_action_weight, item_dummy_watchtime
+        candidates_kjt_lengths = max_num_candidates * torch.ones(
+            len(self._candidates_keys), dtype=torch.long
+        )
+        candidates_kjt_values: List[int] = (
+            [target_item] * max_num_candidates
+            + [target_artist] * max_num_candidates
+            + [target_album] * max_num_candidates
+            + [dummy_query_time] * max_num_candidates
+            + [candidate_action_weight] * max_num_candidates
+            + [1] * max_num_candidates  # item_dummy_watchtime
+        )
+        candidates_features_kjt = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_kjt_lengths,
+            values=torch.tensor(candidates_kjt_values, dtype=torch.long),
+        )
+        return uih_features_kjt, candidates_features_kjt
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md
new file mode 100644
index 000000000..ef1c9686d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md
@@ -0,0 +1,88 @@
+# MLPerf Inference reference implementation for DLRMv3
+
+## Install dependencies
+
+The reference implementation has been tested on a single host, with x86_64 CPUs
+and 8 NVIDIA H100/B200 GPUs. Dependencies can be installed below,
+
+```
+cd generative_recommenders/
+pip install -e .
+```
+
+## Build loadgen
+
+```
+cd generative_recommenders/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/
+CFLAGS="-std=c++14 -O3" python -m pip install .
+```
+
+## Dataset download
+
+DLRMv3 uses a synthetic dataset specifically designed to match the model and
+system characteristics of large-scale sequential recommendation (large item set
+and long average sequence length for each request). To generate the dataset used
+for both training and inference, run
+
+```
+cd generative_recommenders/dlrm_v3/
+python streaming_synthetic_data.py
+```
+
+The generated dataset has 2TB size, and contains 5 million users interacting
+with a billion items over 100 timestamps.
+
+Only 1% of the dataset is used in the inference benchmark. The sampled DLRMv3
+dataset and trained checkpoint are available at
+https://inference.mlcommons-storage.org/.
+
+Script to download the sampled dataset used in inference benchmark:
+
+```
+bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-dataset.uri
+```
+
+Script to download the 1TB trained checkpoint:
+
+```
+bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-checkpoint.uri
+```
+
+## Inference benchmark
+
+```
+cd generative_recommenders/generative_recommenders/dlrm_v3/inference/
+WORLD_SIZE=8 python main.py --dataset sampled-streaming-100b
+```
+
+The config file is listed in `dlrm_v3/inference/gin/streaming_100b.gin`.
+`WORLD_SIZE` is the number of GPUs used in the inference benchmark.
+
+To load checkpoint from training, modify `run.model_path` inside the inference
+gin config file. (We will relase the checkpoint soon.)
+
+To achieve the best performance, tune `run.target_qps` and `run.batch_size` in
+the config file.
+
+## Accuracy test
+
+Set `run.compute_eval` will run the accuracy test and dump prediction outputs in
+`mlperf_log_accuracy.json`. To check the accuracy, run
+
+```
+python accuracy.py --path path/to/mlperf_log_accuracy.json
+```
+
+We use normalized entropy (NE), accuracy, and AUC as the metrics to evaluate the model quality. For accepted submissions, all three metrics (NE, Accuracy, AUC) must be within 99% of the reference implementation values. The accuracy for the reference implementation evaluated on 34,996 requests across 10 inference timestamps are listed below:
+
+```
+NE: 86.687%
+Accuracy: 69.651%
+AUC: 78.663%
+```
+
+## Run unit tests
+
+```
+python tests/inference_test.py
+```
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py
new file mode 100644
index 000000000..19242f7bd
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py
@@ -0,0 +1,86 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Tool to calculate accuracy for loadgen accuracy output found in mlperf_log_accuracy.json
+"""
+
+import argparse
+import json
+import logging
+
+import numpy as np
+import torch
+from generative_recommenders.dlrm_v3.configs import get_hstu_configs
+from generative_recommenders.dlrm_v3.utils import MetricsLogger
+
+logger: logging.Logger = logging.getLogger("main")
+
+
+def get_args() -> argparse.Namespace:
+    """Parse commandline."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--path",
+        required=True,
+        help="path to mlperf_log_accuracy.json",
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main() -> None:
+    """
+    Main function to calculate accuracy metrics from loadgen output.
+
+    Reads the mlperf_log_accuracy.json file, parses the results, and computes
+    accuracy metrics using the MetricsLogger. Each result entry contains
+    predictions, labels, and weights packed as float32 numpy arrays.
+    """
+    args = get_args()
+    logger.warning("Parsing loadgen accuracy log...")
+    with open(args.path, "r") as f:
+        results = json.load(f)
+    hstu_config = get_hstu_configs(dataset="sampled-streaming-100b")
+    metrics = MetricsLogger(
+        multitask_configs=hstu_config.multitask_configs,
+        batch_size=1,
+        window_size=3000,
+        device=torch.device("cpu"),
+        rank=0,
+    )
+    logger.warning(f"results have {len(results)} entries")
+    for result in results:
+        data = np.frombuffer(bytes.fromhex(result["data"]), np.float32)
+        num_candidates = data[-1].astype(int)
+        assert len(data) == 1 + num_candidates * 3
+        mt_target_preds = torch.from_numpy(data[0:num_candidates])
+        mt_target_labels = torch.from_numpy(data[num_candidates : num_candidates * 2])
+        mt_target_weights = torch.from_numpy(
+            data[num_candidates * 2 : num_candidates * 3]
+        )
+        num_candidates = torch.tensor([num_candidates])
+        metrics.update(
+            predictions=mt_target_preds.view(1, -1),
+            labels=mt_target_labels.view(1, -1),
+            weights=mt_target_weights.view(1, -1),
+            num_candidates=num_candidates,
+        )
+    for k, v in metrics.compute().items():
+        logger.warning(f"{k}: {v}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp b/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp
new file mode 100644
index 000000000..d4d0d4082
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp
@@ -0,0 +1,215 @@
+// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+//
+// End-to-end runner for the HSTU torch.jit / torch.package artifacts produced
+// by generative_recommenders/dlrm_v3/inference/packager.py and exercised by
+// :end_to_end_test.
+//
+// CLI:
+//   hstu_runner [--aott_library <lib.so> ...] <sparse.pt> <dense.pt>
+//       <inputs.pt> <output.pt>
+//
+// Where:
+//   sparse.pt   ScriptModule whose forward(uih, candidates) returns
+//               Tuple[Dict[str,Tensor], Dict[str,Tensor],
+//                     Dict[str,Tensor], Tensor, Tensor]
+//   dense.pt    ScriptModule (cuda:0, bf16) whose forward(...) returns
+//               Tuple[Tensor, Optional[Tensor], Optional[Tensor]]
+//   inputs.pt   ScriptModule whose forward() returns
+//               Tuple[KeyedJaggedTensor, KeyedJaggedTensor]
+//   output.pt   torch::pickle_save destination for the predictions tensor;
+//               readable from Python as ``torch.load(output.pt)``.
+
+#include <dlfcn.h>
+
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <torch/csrc/jit/serialization/import.h>
+#include <torch/script.h>
+
+namespace {
+
+struct RunnerArgs {
+  std::vector<std::string> aottLibraryPaths;
+  std::string sparsePath;
+  std::string densePath;
+  std::string inputsPath;
+  std::string outputPath;
+};
+
+RunnerArgs parseArgs(int argc, char** argv) {
+  RunnerArgs args;
+  std::vector<std::string> positional;
+  for (int i = 1; i < argc; ++i) {
+    const std::string arg{argv[i]};
+    if (arg == "--aott_library") {
+      if (++i >= argc) {
+        throw std::runtime_error("--aott_library requires a path");
+      }
+      args.aottLibraryPaths.emplace_back(argv[i]);
+    } else {
+      positional.push_back(arg);
+    }
+  }
+
+  if (positional.size() != 4) {
+    throw std::runtime_error(
+        "Usage: hstu_runner [--aott_library <lib.so> ...] <sparse.pt> "
+        "<dense.pt> <inputs.pt> <output.pt>");
+  }
+  args.sparsePath = positional[0];
+  args.densePath = positional[1];
+  args.inputsPath = positional[2];
+  args.outputPath = positional[3];
+  return args;
+}
+
+void loadAottLibraries(
+    const std::vector<std::string>& libraryPaths,
+    const std::function<void(const std::string&)>& log) {
+  for (const auto& path : libraryPaths) {
+    log("[runner] loading AOT-T library " + path);
+    void* handle = dlopen(path.c_str(), RTLD_GLOBAL | RTLD_NOW);
+    if (handle == nullptr) {
+      throw std::runtime_error(
+          "failed to dlopen AOT-T library " + path + ": " + dlerror());
+    }
+  }
+}
+
+torch::jit::Module loadModule(const std::string& path) {
+  // @patternlint-disable-next-line no-torch-low-level-api
+  auto m = torch::jit::load(path);
+  m.eval();
+  return m;
+}
+
+// Walk a Dict<str, Tensor> and replace every value with .to(device) (and
+// optionally .to(bfloat16)). C++ analog of move_sparse_output_to_device.
+void moveDictToDevice(
+    c10::impl::GenericDict& d,
+    const torch::Device& device,
+    bool toBfloat16) {
+  for (auto& kv : d) {
+    auto t = kv.value().toTensor().to(device);
+    if (toBfloat16) {
+      t = t.to(torch::kBFloat16);
+    }
+    d.insert_or_assign(kv.key(), t);
+  }
+}
+
+void writePickle(const torch::Tensor& t, const std::string& path) {
+  // torch::pickle_save returns a byte buffer in the same wire format as
+  // ``torch.save(tensor, ...)``, so the Python side can read it with
+  // ``torch.load(path)``.
+  const auto data = torch::jit::pickle_save(c10::IValue(t));
+  std::ofstream out(path, std::ios::binary);
+  if (!out) {
+    throw std::runtime_error("failed to open output: " + path);
+  }
+  out.write(data.data(), static_cast<std::streamsize>(data.size()));
+}
+
+} // namespace
+
+int main(int argc, char** argv) {
+  RunnerArgs args;
+  try {
+    args = parseArgs(argc, argv);
+  } catch (const std::exception& e) {
+    std::cerr << e.what() << '\n';
+    return 1;
+  }
+
+  // Log to a file next to the output so we can inspect even if
+  // buck2 swallows stderr.
+  const std::string logPath = args.outputPath + ".log";
+  std::ofstream logFile(logPath);
+  auto log = [&](const std::string& msg) {
+    logFile << msg << std::endl;
+    logFile.flush();
+    std::cerr << msg << std::endl;
+  };
+
+  try {
+    log("[runner] step 0: loading AOT-T libraries");
+    loadAottLibraries(args.aottLibraryPaths, log);
+    log("[runner] step 0 done: loaded " +
+        std::to_string(args.aottLibraryPaths.size()) + " AOT-T libraries");
+
+    log("[runner] step 1: loading sparse module from " + args.sparsePath);
+    auto sparse = loadModule(args.sparsePath);
+
+    log("[runner] step 2: loading dense module from " + args.densePath);
+    auto dense = loadModule(args.densePath);
+
+    log("[runner] step 3: loading inputs module from " + args.inputsPath);
+    auto inputs = loadModule(args.inputsPath);
+
+    log("[runner] step 4: running inputs.forward()");
+    auto inputsTuple = inputs.forward({}).toTuple();
+    auto uihLengths = inputsTuple->elements()[0];
+    auto uihValues = inputsTuple->elements()[1];
+    auto candidatesLengths = inputsTuple->elements()[2];
+    auto candidatesValues = inputsTuple->elements()[3];
+    log("[runner] step 4 done: got 4 input tensors");
+
+    log("[runner] step 5: running sparse.forward()");
+    std::vector<c10::IValue> sparseInputs{
+        uihLengths, uihValues, candidatesLengths, candidatesValues};
+    auto sparseOut = sparse.forward(sparseInputs).toTuple();
+    log("[runner] step 5 done: sparse forward returned " +
+        std::to_string(sparseOut->elements().size()) + " elements");
+
+    log("[runner] step 6: unpacking sparse output dicts");
+    auto seqEmbValues = sparseOut->elements()[0].toGenericDict();
+    auto seqEmbLengths = sparseOut->elements()[1].toGenericDict();
+    auto payloadFeatures = sparseOut->elements()[2].toGenericDict();
+    auto uihSeqLengths = sparseOut->elements()[3].toTensor();
+    auto numCandidates = sparseOut->elements()[4].toTensor();
+    log("[runner] step 6 done: unpacked dicts");
+
+    log("[runner] step 7: moving dicts to cuda:0");
+    const auto device = torch::Device(torch::kCUDA, 0);
+    moveDictToDevice(seqEmbValues, device, /*toBfloat16=*/true);
+    log("[runner] step 7a: seqEmbValues moved");
+    moveDictToDevice(seqEmbLengths, device, /*toBfloat16=*/false);
+    log("[runner] step 7b: seqEmbLengths moved");
+    moveDictToDevice(payloadFeatures, device, /*toBfloat16=*/false);
+    log("[runner] step 7c: payloadFeatures moved");
+    uihSeqLengths = uihSeqLengths.to(device);
+    numCandidates = numCandidates.to(device);
+    log("[runner] step 7 done: all on cuda:0");
+
+    log("[runner] step 8: running dense.forward()");
+    std::vector<c10::IValue> denseInputs{
+        seqEmbValues,
+        seqEmbLengths,
+        payloadFeatures,
+        uihSeqLengths,
+        numCandidates,
+    };
+    auto denseOut = dense.forward(denseInputs);
+    log("[runner] step 8 done: dense forward returned");
+
+    auto preds = denseOut.toTensor().detach().cpu();
+    log("[runner] step 9: preds on cpu");
+
+    std::cout << "preds shape: " << preds.sizes() << '\n';
+    std::cout << "preds sum:   "
+              << preds.to(torch::kFloat32).sum().item<float>() << '\n';
+
+    writePickle(preds, args.outputPath);
+    std::cout << "wrote " << args.outputPath << '\n';
+    log("[runner] step 10: done, wrote output");
+    return 0;
+  } catch (const std::exception& e) {
+    log(std::string("hstu_runner FAILED: ") + e.what());
+    return 1;
+  }
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py
new file mode 100644
index 000000000..6a8db77c8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py
@@ -0,0 +1,227 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+Data producer module for DLRMv3 inference.
+
+This module provides classes for producing and managing query data during inference,
+supporting both single-threaded and multi-threaded data production modes.
+"""
+
+import logging
+import threading
+import time
+from queue import Queue
+from typing import List, Optional, Tuple, Union
+
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import Dataset, Samples
+
+logging.basicConfig(level=logging.INFO)
+logger: logging.Logger = logging.getLogger("data_producer")
+
+
+class QueryItem:
+    """
+    Container for a query item to be processed by the inference thread pool.
+
+    Attributes:
+        query_ids: List of unique identifiers for the queries in this batch.
+        samples: The sample data containing features for the queries.
+        start: Time when the query was first received.
+        dt_queue: Time spent in the queue before processing.
+        dt_batching: Time spent on batching the data.
+    """
+
+    def __init__(
+        self,
+        query_ids: List[int],
+        samples: Samples,
+        start: float,
+        dt_queue: float,
+        dt_batching: float,
+    ) -> None:
+        self.query_ids = query_ids
+        self.samples = samples
+        self.start: float = start
+        self.dt_queue: float = dt_queue
+        self.dt_batching: float = dt_batching
+
+
+class SingleThreadDataProducer:
+    """
+    Single-threaded data producer for synchronous query processing.
+
+    This producer processes queries on the main thread without any parallelism,
+    suitable for debugging or low-throughput scenarios.
+
+    Args:
+        ds: The dataset to fetch samples from.
+        run_one_item: Callback function to process a single QueryItem.
+    """
+
+    def __init__(self, ds: Dataset, run_one_item) -> None:  # pyre-ignore [2]
+        self.ds = ds
+        self.run_one_item = run_one_item  # pyre-ignore [4]
+
+    def enqueue(
+        self, query_ids: List[int], content_ids: List[int], t0: float, dt_queue: float
+    ) -> None:
+        """
+        Enqueue queries for immediate synchronous processing.
+
+        Args:
+            query_ids: List of unique query identifiers.
+            content_ids: List of content/sample identifiers to fetch.
+            t0: Timestamp when the query batch was created.
+            dt_queue: Time spent waiting in the queue.
+        """
+        with torch.profiler.record_function("data batching"):
+            t0_batching: float = time.time()
+            samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids)
+            dt_batching: float = time.time() - t0_batching
+            if isinstance(samples, Samples):
+                query = QueryItem(
+                    query_ids=query_ids,
+                    samples=samples,
+                    start=t0,
+                    dt_queue=dt_queue,
+                    dt_batching=dt_batching,
+                )
+                self.run_one_item(query)
+            else:
+                start_idx = 0
+                for sample in samples:
+                    batch_size: int = sample.batch_size()
+                    query = QueryItem(
+                        query_ids=query_ids[start_idx : start_idx + batch_size],
+                        samples=sample,
+                        start=t0,
+                        dt_queue=dt_queue,
+                        dt_batching=dt_batching,
+                    )
+                    start_idx += batch_size
+                    self.run_one_item(query)
+
+    def finish(self) -> None:
+        """Finalize the producer. No-op for single-threaded mode."""
+        pass
+
+
+class MultiThreadDataProducer:
+    """
+    Multi-threaded data producer for parallel query processing.
+
+    Uses a thread pool to fetch and batch data in parallel with model inference,
+    improving throughput for high-load scenarios.
+
+    Args:
+        ds: The dataset to fetch samples from.
+        threads: Number of worker threads to use.
+        run_one_item: Callback function to process a single QueryItem.
+    """
+
+    def __init__(
+        self,
+        ds: Dataset,
+        threads: int,
+        run_one_item,  # pyre-ignore [2]
+    ) -> None:
+        queue_size_multiplier = 4
+        self.ds = ds
+        self.threads = threads
+        self.run_one_item = run_one_item  # pyre-ignore [4]
+        self.tasks: Queue[Optional[Tuple[List[int], List[int], float, float]]] = Queue(
+            maxsize=threads * queue_size_multiplier
+        )
+        self.workers: List[threading.Thread] = []
+        for _ in range(self.threads):
+            worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,))
+            worker.daemon = True
+            self.workers.append(worker)
+            worker.start()
+
+    def handle_tasks(
+        self, tasks_queue: Queue[Optional[Tuple[List[int], List[int], float, float]]]
+    ) -> None:
+        """
+        Worker thread main loop to process tasks from the queue.
+
+        Each worker maintains its own CUDA stream for parallel execution.
+
+        Args:
+            tasks_queue: Queue containing task tuples or None for termination.
+        """
+        stream = torch.cuda.Stream()
+        while True:
+            query_and_content_ids = tasks_queue.get()
+            if query_and_content_ids is None:
+                tasks_queue.task_done()
+                break
+            query_ids, content_ids, t0, dt_queue = query_and_content_ids
+            t0_batching: float = time.time()
+            samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids)
+            dt_batching: float = time.time() - t0_batching
+            if isinstance(samples, Samples):
+                qitem = QueryItem(
+                    query_ids=query_ids,
+                    samples=samples,
+                    start=t0,
+                    dt_queue=dt_queue,
+                    dt_batching=dt_batching,
+                )
+                with torch.inference_mode(), torch.cuda.stream(stream):
+                    self.run_one_item(qitem)
+            else:
+                start_idx = 0
+                for sample in samples:
+                    batch_size: int = sample.batch_size()
+                    qitem = QueryItem(
+                        query_ids=query_ids[start_idx : start_idx + batch_size],
+                        samples=sample,
+                        start=t0,
+                        dt_queue=dt_queue,
+                        dt_batching=dt_batching,
+                    )
+                    start_idx += batch_size
+                    with torch.inference_mode(), torch.cuda.stream(stream):
+                        self.run_one_item(qitem)
+            tasks_queue.task_done()
+
+    def enqueue(
+        self, query_ids: List[int], content_ids: List[int], t0: float, dt_queue: float
+    ) -> None:
+        """
+        Enqueue queries for asynchronous processing by worker threads.
+
+        Args:
+            query_ids: List of unique query identifiers.
+            content_ids: List of content/sample identifiers to fetch.
+            t0: Timestamp when the query batch was created.
+            dt_queue: Time spent waiting in the queue.
+        """
+        with torch.profiler.record_function("data batching"):
+            self.tasks.put((query_ids, content_ids, t0, dt_queue))
+
+    def finish(self) -> None:
+        """
+        Signal all worker threads to terminate and wait for completion.
+
+        Sends None to each worker to trigger graceful shutdown.
+        """
+        for _ in self.workers:
+            self.tasks.put(None)
+        for worker in self.workers:
+            worker.join()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py
new file mode 100644
index 000000000..add2781bc
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+TorchScript-friendly wrapper for the HSTU dense path (GPU transformer).
+
+``HSTUDenseScriptModule`` accepts the *flattened* sparse-output dicts produced
+by :class:`HSTUSparseScriptModule`, reconstructs ``Dict[str,
+SequenceEmbedding]`` for the existing :meth:`DlrmHSTU.main_forward` and
+returns a 3-tuple of ``(preds, labels, weights)`` -- the only fields the
+predictor actually consumes.
+"""
+
+from typing import Dict
+
+import torch
+from generative_recommenders.dlrm_v3.inference.inference_modules import get_hstu_model
+from generative_recommenders.dlrm_v3.inference.ts_types import (
+    SeqEmbLengths,
+    SeqEmbValues,
+    unflatten_seq_embeddings,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTU, DlrmHSTUConfig
+from torchrec.modules.embedding_configs import EmbeddingConfig
+
+
+class HSTUDenseScriptModule(torch.nn.Module):
+    """Script-friendly dense module.
+
+    The wrapper owns a dense-only :class:`DlrmHSTU` (no
+    ``_embedding_collection``) and delegates to ``main_forward`` after
+    reconstructing the ``SequenceEmbedding`` NamedTuple form.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+    ) -> None:
+        super().__init__()
+        self._hstu_model: DlrmHSTU = get_hstu_model(
+            table_config=table_config,
+            hstu_config=hstu_config,
+            table_device="cpu",
+            is_dense=True,
+        )
+
+    def forward(
+        self,
+        seq_emb_values: SeqEmbValues,
+        seq_emb_lengths: SeqEmbLengths,
+        payload_features: Dict[str, torch.Tensor],
+        uih_seq_lengths: torch.Tensor,
+        num_candidates: torch.Tensor,
+    ) -> torch.Tensor:
+        # TorchScript supports ``int(tensor.item())`` on a 0-d tensor.
+        max_uih_len: int = int(uih_seq_lengths.max().item())
+        max_num_candidates: int = int(num_candidates.max().item())
+
+        seq_embeddings = unflatten_seq_embeddings(seq_emb_values, seq_emb_lengths)
+
+        (
+            _,
+            _,
+            _,
+            mt_target_preds,
+            _mt_target_labels,
+            _mt_target_weights,
+        ) = self._hstu_model.main_forward(
+            seq_embeddings=seq_embeddings,
+            payload_features=payload_features,
+            max_uih_len=max_uih_len,
+            uih_seq_lengths=uih_seq_lengths,
+            max_num_candidates=max_num_candidates,
+            num_candidates=num_candidates,
+        )
+        assert mt_target_preds is not None
+        # Return just the predictions tensor; labels/weights are unused by
+        # the predictor at inference time and would force ``Optional[Tensor]``
+        # in the return type, which torch.jit.trace rejects ("Only tensors,
+        # lists, tuples of tensors, or dictionary of tensors can be output
+        # from traced functions").
+        return mt_target_preds
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py
new file mode 100644
index 000000000..f1b956d9c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py
@@ -0,0 +1,795 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+End-to-end smoke test for the HSTU TorchScript + C++ deployment pipeline.
+
+What this binary does, in order:
+
+1. Build a synthetic batch (uih_kjt, candidates_kjt) via :func:`get_random_data`.
+2. Build the eager :class:`HSTUSparseScriptModule` and
+   :class:`HSTUDenseScriptModule`.
+3. Run them eagerly to obtain the reference ``preds_eager``.
+4. ``torch.jit.script`` + save:
+       - ``sparse.pt`` (CPU)
+       - ``dense.pt``  (cuda:0, bf16)
+       - ``inputs.pt`` (an :class:`InputsBundle` ScriptModule whose
+         ``forward()`` returns ``Tuple[KeyedJaggedTensor, KeyedJaggedTensor]``)
+5. Run the C++ runner
+       ``hstu_runner [--aott_library <lib.so> ...] <sparse.pt> <dense.pt> <inputs.pt> <preds_cpp.pt>``.
+6. ``torch.load`` the runner's output and compare against ``preds_eager``
+   with :func:`torch.testing.assert_close` (loose tolerance because the
+   scripted path may use either the PyTorch fallback trace or AOT-T-loaded
+   Triton inference kernels).
+
+Usage (manual override of the runner path):
+
+    buck2 run @mode/opt //generative_recommenders/dlrm_v3/inference:end_to_end_test \\
+        -- --cpp_runner /path/to/hstu_runner
+
+By default the binary locates the runner via ``libfb.py.parutil`` -- it ships
+inside the par as a resource (see BUCK).
+"""
+
+import argparse
+import logging
+import os
+import shutil
+import sys
+import tempfile
+from typing import Any, Dict, List, Tuple
+
+import torch
+from generative_recommenders.dlrm_v3.configs import (
+    get_embedding_table_config,
+    get_hstu_configs,
+)
+from generative_recommenders.dlrm_v3.datasets.dataset import get_random_data
+from generative_recommenders.dlrm_v3.inference.dense_predict_module import (
+    HSTUDenseScriptModule,
+)
+from generative_recommenders.dlrm_v3.inference.sparse_predict_module import (
+    HSTUSparseScriptModule,
+)
+from generative_recommenders.dlrm_v3.inference.ts_types import (
+    SeqEmbLengths,
+    SeqEmbValues,
+    unflatten_seq_embeddings,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from security.frameworks.python.exec.subprocess import TrustedSubprocessWithList
+from torchrec.modules.embedding_configs import EmbeddingConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+_DEFAULT_DATASET = "kuairand-1k"
+
+
+class InputsBundle(torch.nn.Module):
+    """Scripted holder for the test inputs.
+
+    Returns the constituent tensors of the two KJTs as a 4-tuple
+    ``(uih_lengths, uih_values, candidates_lengths, candidates_values)`` so
+    the traced sparse module can rebuild the KJTs inside its forward (KJT
+    instances themselves are not traceable inputs).
+    """
+
+    def __init__(
+        self,
+        uih_kjt: KeyedJaggedTensor,
+        candidates_kjt: KeyedJaggedTensor,
+    ) -> None:
+        super().__init__()
+        self.register_buffer("uih_lengths", uih_kjt.lengths())
+        self.register_buffer("uih_values", uih_kjt.values())
+        self.register_buffer("candidates_lengths", candidates_kjt.lengths())
+        self.register_buffer("candidates_values", candidates_kjt.values())
+
+    def forward(
+        self,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        return (
+            self.uih_lengths,
+            self.uih_values,
+            self.candidates_lengths,
+            self.candidates_values,
+        )
+
+
+class _SparseTraceShim(torch.nn.Module):
+    """Adapter that takes raw tensors and rebuilds the KJTs inside forward.
+
+    ``torch.jit.trace`` does not accept ``KeyedJaggedTensor`` (or any
+    non-Tensor / non-collection-of-Tensor type) as a top-level forward
+    input, so we make the traced boundary tensor-only and bake the
+    ``List[str]`` of feature keys in as Python constants captured by the
+    closure / module attribute.
+    """
+
+    def __init__(
+        self,
+        sparse_module: HSTUSparseScriptModule,
+        uih_keys: List[str],
+        candidates_keys: List[str],
+    ) -> None:
+        super().__init__()
+        self._sparse_module: HSTUSparseScriptModule = sparse_module
+        self._uih_keys: List[str] = uih_keys
+        self._candidates_keys: List[str] = candidates_keys
+
+    def forward(
+        self,
+        uih_lengths: torch.Tensor,
+        uih_values: torch.Tensor,
+        candidates_lengths: torch.Tensor,
+        candidates_values: torch.Tensor,
+    ) -> Tuple[
+        SeqEmbValues,
+        SeqEmbLengths,
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        uih_kjt = KeyedJaggedTensor(
+            keys=self._uih_keys,
+            lengths=uih_lengths,
+            values=uih_values,
+        )
+        candidates_kjt = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_lengths,
+            values=candidates_values,
+        )
+        return self._sparse_module(
+            uih_features=uih_kjt, candidates_features=candidates_kjt
+        )
+
+
+class _DenseAottTraceShim(torch.nn.Module):
+    """FX-traceable dense adapter for the representative AOT-T shape."""
+
+    def __init__(
+        self,
+        dense_module: HSTUDenseScriptModule,
+        max_uih_len: int,
+        max_num_candidates: int,
+        total_uih_len: int,
+        total_targets: int,
+    ) -> None:
+        super().__init__()
+        self._dense_module: HSTUDenseScriptModule = dense_module
+        self._max_uih_len: int = max_uih_len
+        self._max_num_candidates: int = max_num_candidates
+        self._total_uih_len: int = total_uih_len
+        self._total_targets: int = total_targets
+
+    def forward(
+        self,
+        seq_emb_values: SeqEmbValues,
+        seq_emb_lengths: SeqEmbLengths,
+        payload_features: Dict[str, torch.Tensor],
+        uih_seq_lengths: torch.Tensor,
+        num_candidates: torch.Tensor,
+    ) -> torch.Tensor:
+        seq_embeddings = unflatten_seq_embeddings(seq_emb_values, seq_emb_lengths)
+
+        (
+            _,
+            _,
+            _,
+            mt_target_preds,
+            _mt_target_labels,
+            _mt_target_weights,
+        ) = self._dense_module._hstu_model.main_forward(
+            seq_embeddings=seq_embeddings,
+            payload_features=payload_features,
+            max_uih_len=self._max_uih_len,
+            uih_seq_lengths=uih_seq_lengths,
+            max_num_candidates=self._max_num_candidates,
+            num_candidates=num_candidates,
+            total_uih_len=self._total_uih_len,
+            total_targets=self._total_targets,
+        )
+        assert mt_target_preds is not None
+        return mt_target_preds
+
+
+def _dense_aott_concrete_args(
+    dense_inputs: Tuple[
+        Dict[str, torch.Tensor],
+        Dict[str, torch.Tensor],
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ],
+) -> Dict[str, Any]:
+    from torch.fx._symbolic_trace import PH
+
+    seq_emb_values, seq_emb_lengths, payload_features, _, _ = dense_inputs
+    return {
+        "seq_emb_values": {key: PH for key in seq_emb_values},
+        "seq_emb_lengths": {key: PH for key in seq_emb_lengths},
+        "payload_features": {key: PH for key in payload_features},
+    }
+
+
+def _find_cpp_runner() -> str:
+    """Locate the bundled hstu_runner binary.
+
+    Tries ``importlib.resources`` (the canonical fbcode resource resolver,
+    works whether the binary is in a par or unpacked), and falls back to
+    looking next to ``sys.argv[0]``.
+    """
+    try:
+        from importlib.resources import files
+
+        path = files("generative_recommenders.dlrm_v3.inference.cpp").joinpath(
+            "hstu_runner"
+        )
+        if path.is_file():
+            return str(path)
+    except Exception as exc:
+        logger.debug("importlib.resources lookup failed: %s", exc)
+
+    candidate = os.path.join(
+        os.path.dirname(os.path.abspath(sys.argv[0])), "hstu_runner"
+    )
+    if os.path.exists(candidate):
+        return candidate
+
+    raise RuntimeError(
+        "Could not find hstu_runner binary. "
+        "Pass --cpp_runner=<path> or build the cpp_binary target first."
+    )
+
+
+def _eager_run(
+    sparse_module: HSTUSparseScriptModule,
+    dense_module: HSTUDenseScriptModule,
+    uih_kjt: KeyedJaggedTensor,
+    candidates_kjt: KeyedJaggedTensor,
+    device: torch.device,
+) -> torch.Tensor:
+    """Reference path: sparse → device-move + bf16 → dense, all in Python."""
+    with torch.no_grad():
+        seq_emb_values, seq_emb_lengths, payload, uih_lens, num_cands = sparse_module(
+            uih_features=uih_kjt, candidates_features=candidates_kjt
+        )
+        seq_emb_values = {
+            k: v.to(device).to(torch.bfloat16) for k, v in seq_emb_values.items()
+        }
+        seq_emb_lengths = {k: v.to(device) for k, v in seq_emb_lengths.items()}
+        payload = {k: v.to(device) for k, v in payload.items()}
+        uih_lens = uih_lens.to(device)
+        num_cands = num_cands.to(device)
+        preds = dense_module(
+            seq_emb_values, seq_emb_lengths, payload, uih_lens, num_cands
+        )
+    return preds.detach().to(torch.float32).cpu()
+
+
+def _find_aott_libraries() -> List[str]:
+    from generative_recommenders.ops.triton_aot.compile.compile_state import (
+        get_aott_compile_path,
+    )
+
+    compile_path = get_aott_compile_path()
+    libraries: List[str] = []
+    for root, _, files in os.walk(compile_path):
+        for filename in files:
+            if filename.endswith(".so"):
+                libraries.append(os.path.join(root, filename))
+    return sorted(libraries)
+
+
+def _copy_aott_libraries_to_workdir(
+    library_paths: List[str], workdir: str
+) -> List[str]:
+    copied: List[str] = []
+    for index, path in enumerate(library_paths):
+        dst = os.path.join(workdir, f"aott_{index}_{os.path.basename(path)}")
+        shutil.copy2(path, dst)
+        copied.append(dst)
+    return copied
+
+
+def _load_aott_libraries_for_python(library_paths: List[str]) -> None:
+    for library_path in library_paths:
+        logger.info("Python roundtrip: loading AOT-T library %s", library_path)
+        torch.ops.load_library(library_path)
+
+
+def _save_aott_dense_module(
+    dense_module: HSTUDenseScriptModule,
+    dense_inputs: Tuple[
+        Dict[str, torch.Tensor],
+        Dict[str, torch.Tensor],
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ],
+    dense_path: str,
+    workdir: str,
+    atol: float,
+    rtol: float,
+) -> List[str]:
+    """Lower the dense module with AOT-T and save a TorchScript artifact.
+
+    This follows the AOT-T example flow:
+
+    1. FX trace the module.
+    2. Unwrap outer `aot_triton_kernel_wrapper_*` nodes.
+    3. Run representative CUDA inputs under `TritonAOTCompile`.
+    4. `transform_kernels` to replace wrappers with `torch.ops.triton_aot.*`.
+    5. Script and save the transformed dense module.
+
+    The full HSTU dense wrapper has historically needed tracing rather than FX,
+    so failures here are reported with context and the default path remains the
+    D102 traced TorchScript fallback.
+    """
+    from generative_recommenders.ops.triton_aot.compile.triton_aot_compile import (
+        TritonAOTCompile,
+    )
+    from generative_recommenders.ops.triton_aot.preprocess import (
+        unwrap_aott_wrapper_nodes,
+    )
+    from generative_recommenders.ops.triton_aot.transform.transform_kernels import (
+        transform_kernels,
+    )
+    from tgif.fx.tgif_tracer import TGIFTracer
+
+    max_uih_len = int(dense_inputs[3].max().item())
+    max_num_candidates = int(dense_inputs[4].max().item())
+    total_uih_len = int(dense_inputs[3].sum().item())
+    total_targets = int(dense_inputs[4].sum().item())
+    trace_shim = _DenseAottTraceShim(
+        dense_module=dense_module,
+        max_uih_len=max_uih_len,
+        max_num_candidates=max_num_candidates,
+        total_uih_len=total_uih_len,
+        total_targets=total_targets,
+    ).eval()
+
+    logger.info(
+        "AOT-T dense: FX tracing representative shape "
+        "(max_uih_len=%d, max_num_candidates=%d, "
+        "total_uih_len=%d, total_targets=%d)...",
+        max_uih_len,
+        max_num_candidates,
+        total_uih_len,
+        total_targets,
+    )
+    try:
+        fx_dense = TGIFTracer().symbolic_trace(
+            trace_shim,
+            concrete_args=_dense_aott_concrete_args(dense_inputs),
+        )
+        lowered_dense = unwrap_aott_wrapper_nodes(fx_dense, TGIFTracer())
+    except Exception as exc:
+        raise RuntimeError(
+            "AOT-T dense lowering requires an FX-traceable dense entry point. "
+            "Use --dense_backend=torchscript to fall back to the D102 traced "
+            "TorchScript path."
+        ) from exc
+
+    logger.info("AOT-T dense: compiling Triton kernels from sample inputs...")
+    with torch.no_grad():
+        with TritonAOTCompile():
+            ref_output = lowered_dense(*dense_inputs)
+
+    original_code = lowered_dense.code
+    lowered_dense = transform_kernels(lowered_dense)
+    if lowered_dense.code == original_code:
+        logger.warning(
+            "AOT-T dense: transform_kernels did not change the FX graph. "
+            "This usually means no aot_triton_kernel_wrapper_* nodes were "
+            "present in the dense path."
+        )
+
+    libraries = _find_aott_libraries()
+    if not libraries:
+        raise RuntimeError(
+            "AOT-T dense lowering produced no .so files. Ensure the dense path "
+            "uses HammerKernel.TRITON_INFERENCE branches backed by triton_aot ops."
+        )
+
+    with torch.no_grad():
+        lowered_output = lowered_dense(*dense_inputs)
+    torch.testing.assert_close(ref_output, lowered_output, atol=atol, rtol=rtol)
+
+    logger.info("AOT-T dense: tracing transformed module...")
+    torch.jit.trace(
+        lowered_dense,
+        example_inputs=dense_inputs,
+        strict=False,
+        check_trace=False,
+    ).save(dense_path)
+    copied_libraries = _copy_aott_libraries_to_workdir(libraries, workdir)
+    logger.info("AOT-T dense: copied %d library file(s)", len(copied_libraries))
+    return copied_libraries
+
+
+def _build_synthetic_inputs(
+    hstu_config: DlrmHSTUConfig,
+    table_config: Dict[str, EmbeddingConfig],
+    uih_max_seq_len: int,
+) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+    contextual: List[str] = list(hstu_config.contextual_feature_to_max_length.keys())
+    # The kuairand-1k dataset has tiny embedding tables for some contextual
+    # features (e.g. user_active_degree has num_embeddings=8). Clamp the
+    # random value range so every index stays in range for every table.
+    min_rows = min(t.num_embeddings for t in table_config.values())
+    value_bound = max(2, min_rows)
+    logger.info(
+        "synthetic value_bound=%d (min table rows=%d across %d tables)",
+        value_bound,
+        min_rows,
+        len(table_config),
+    )
+    return get_random_data(
+        contexual_features=contextual,
+        hstu_uih_keys=hstu_config.hstu_uih_feature_names,
+        hstu_candidates_keys=hstu_config.hstu_candidate_feature_names,
+        uih_max_seq_len=uih_max_seq_len,
+        max_num_candidates=hstu_config.max_num_candidates_inference,
+        value_bound=value_bound,
+    )
+
+
+def _parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--cpp_runner",
+        type=str,
+        default=None,
+        help="Path to the hstu_runner binary; default: bundled resource.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=_DEFAULT_DATASET,
+        help="Dataset key for HSTU/embedding configs.",
+    )
+    parser.add_argument(
+        "--device", type=str, default="cuda:0", help="Dense-module device."
+    )
+    parser.add_argument(
+        "--uih_max_seq_len",
+        type=int,
+        default=128,
+        help="Max UIH length for the synthetic batch.",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--atol", type=float, default=1e-2)
+    parser.add_argument("--rtol", type=float, default=1e-2)
+    parser.add_argument(
+        "--dense_backend",
+        choices=("torchscript", "aott"),
+        default="torchscript",
+        help="Dense artifact backend. aott lowers TRITON_INFERENCE wrappers and passes compiled libraries to the C++ runner.",
+    )
+    parser.add_argument(
+        "--aott_library",
+        action="append",
+        default=[],
+        help="Additional prebuilt AOT-T shared library to dlopen before loading dense.pt. May be repeated.",
+    )
+    parser.add_argument(
+        "--keep_workdir",
+        action="store_true",
+        help="Do not delete the temp dir holding the saved artifacts.",
+    )
+    return parser.parse_args()
+
+
+def main() -> None:  # noqa: C901
+    logging.basicConfig(level=logging.INFO, format="[e2e] %(message)s", force=True)
+    logger.setLevel(logging.DEBUG)
+    args = _parse_args()
+
+    if not torch.cuda.is_available():
+        logger.error("CUDA is required; aborting.")
+        sys.exit(2)
+
+    runner_path = args.cpp_runner or _find_cpp_runner()
+    logger.info("Using C++ runner: %s", runner_path)
+
+    torch.manual_seed(args.seed)
+    device = torch.device(args.device)
+    torch.cuda.set_device(device)
+
+    hstu_config = get_hstu_configs(args.dataset)
+    table_config = get_embedding_table_config(args.dataset)
+
+    uih_kjt, candidates_kjt = _build_synthetic_inputs(
+        hstu_config, table_config, args.uih_max_seq_len
+    )
+
+    sparse_module = HSTUSparseScriptModule(
+        table_config=table_config,
+        hstu_config=hstu_config,
+        use_no_copy_embedding_collection=True,
+    ).eval()
+    dense_module = (
+        HSTUDenseScriptModule(hstu_config=hstu_config, table_config=table_config)
+        .to(torch.bfloat16)
+        .to(device)
+        .eval()
+    )
+
+    from generative_recommenders.common import HammerKernel
+
+    dense_kernel = (
+        HammerKernel.TRITON_INFERENCE
+        if args.dense_backend == "aott"
+        else HammerKernel.PYTORCH
+    )
+    sparse_module._sparse._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH)
+    dense_module._hstu_model.set_hammer_kernel(dense_kernel)
+
+    # Diagnostic: walk every HammerModule submodule and print its effective
+    # kernel selection, so any submodule that didn't pick up the override
+    # surfaces immediately. Triton/Triton-CC selections will fail at trace
+    # time, so this print is critical for triaging the next iteration if
+    # tracing fails.
+    from generative_recommenders.common import HammerModule as _HM
+
+    for name, m in list(sparse_module.named_modules()) + list(
+        dense_module.named_modules()
+    ):
+        if isinstance(m, _HM):
+            logger.info(
+                "kernel-pin %-60s -> %s (is_inference=%s, use_triton_cc=%s)",
+                name or "<root>",
+                m.hammer_kernel().value,
+                m._is_inference,
+                m._use_triton_cc,
+            )
+
+    # === 1. Eager reference ===
+    logger.info("Running eager reference...")
+    preds_eager = _eager_run(
+        sparse_module, dense_module, uih_kjt, candidates_kjt, device
+    )
+    logger.info(
+        "preds_eager shape=%s sum=%.6f",
+        tuple(preds_eager.shape),
+        preds_eager.sum().item(),
+    )
+
+    # === 2. Trace/lower + save ===
+    # The default path keeps D102's trace-based TorchScript artifact. The
+    # AOT-T path follows ModelStore's compile/transform flow and saves a
+    # scripted FX module whose Triton kernels dispatch through torch.ops.
+    workdir = tempfile.mkdtemp(prefix="hstu_e2e_")
+    sparse_path = os.path.join(workdir, "sparse.pt")
+    dense_path = os.path.join(workdir, "dense.pt")
+    inputs_path = os.path.join(workdir, "inputs.pt")
+    cpp_out_path = os.path.join(workdir, "preds_cpp.pt")
+    eager_out_path = os.path.join(workdir, "preds_eager.pt")
+    aott_library_paths: List[str] = list(args.aott_library)
+    python_roundtrip_aott_library_paths: List[str] = list(args.aott_library)
+    logger.info("workdir: %s", workdir)
+
+    # Re-run sparse eagerly to capture an example output that can drive the
+    # dense trace.
+    with torch.no_grad():
+        sparse_out = sparse_module(
+            uih_features=uih_kjt, candidates_features=candidates_kjt
+        )
+        seq_emb_values = {
+            k: v.to(device).to(torch.bfloat16) for k, v in sparse_out[0].items()
+        }
+        seq_emb_lengths = {k: v.to(device) for k, v in sparse_out[1].items()}
+        payload = {k: v.to(device) for k, v in sparse_out[2].items()}
+        uih_lens = sparse_out[3].to(device)
+        num_cands = sparse_out[4].to(device)
+
+    logger.info("Tracing sparse module via raw-tensor shim (CPU)...")
+    sparse_shim = _SparseTraceShim(
+        sparse_module=sparse_module,
+        uih_keys=list(uih_kjt.keys()),
+        candidates_keys=list(candidates_kjt.keys()),
+    )
+    traced_sparse = torch.jit.trace(
+        sparse_shim,
+        example_inputs=(
+            uih_kjt.lengths(),
+            uih_kjt.values(),
+            candidates_kjt.lengths(),
+            candidates_kjt.values(),
+        ),
+        strict=False,
+        check_trace=False,
+    )
+    traced_sparse.save(sparse_path)
+
+    dense_inputs = (
+        seq_emb_values,
+        seq_emb_lengths,
+        payload,
+        uih_lens,
+        num_cands,
+    )
+    if args.dense_backend == "aott":
+        logger.info("Lowering dense module with AOT-T...")
+        generated_aott_library_paths = _save_aott_dense_module(
+            dense_module,
+            dense_inputs,
+            dense_path,
+            workdir,
+            args.atol,
+            args.rtol,
+        )
+        aott_library_paths.extend(generated_aott_library_paths)
+    else:
+        logger.info("Tracing dense module (cuda:0, bf16)...")
+        traced_dense = torch.jit.trace(
+            dense_module,
+            example_inputs=dense_inputs,
+            strict=False,
+            check_trace=False,
+        )
+        traced_dense.save(dense_path)
+
+    logger.info("Scripting + saving inputs bundle...")
+    torch.jit.script(InputsBundle(uih_kjt, candidates_kjt)).save(inputs_path)
+    torch.save(preds_eager, eager_out_path)
+
+    # === 2.5. Python-side roundtrip verification ===
+    # Load the saved traced artifacts back in Python and verify they produce
+    # the same results as the eager run. This proves the artifacts are correct
+    # independently of the C++ runner.
+    logger.info("Python roundtrip: loading traced artifacts back...")
+    if python_roundtrip_aott_library_paths:
+        _load_aott_libraries_for_python(python_roundtrip_aott_library_paths)
+    rt_inputs = torch.jit.load(inputs_path)
+    rt_sparse = torch.jit.load(sparse_path)
+    rt_dense = torch.jit.load(dense_path)
+
+    with torch.no_grad():
+        rt_uih_l, rt_uih_v, rt_cand_l, rt_cand_v = rt_inputs()
+        logger.info(
+            "  rt inputs: uih_l=%s uih_v=%s cand_l=%s cand_v=%s",
+            rt_uih_l.shape,
+            rt_uih_v.shape,
+            rt_cand_l.shape,
+            rt_cand_v.shape,
+        )
+
+        rt_sparse_out = rt_sparse(rt_uih_l, rt_uih_v, rt_cand_l, rt_cand_v)
+
+        for i, elem in enumerate(rt_sparse_out):
+            if isinstance(elem, dict):
+                for k, v in elem.items():
+                    has_nan = torch.isnan(v).any().item()
+                    has_inf = torch.isinf(v).any().item()
+                    logger.info(
+                        "  sparse_out[%d][%s] shape=%s dtype=%s nan=%s inf=%s",
+                        i,
+                        k,
+                        tuple(v.shape),
+                        v.dtype,
+                        has_nan,
+                        has_inf,
+                    )
+            elif isinstance(elem, torch.Tensor):
+                logger.info(
+                    "  sparse_out[%d] shape=%s dtype=%s nan=%s inf=%s",
+                    i,
+                    tuple(elem.shape),
+                    elem.dtype,
+                    torch.isnan(elem).any().item(),
+                    torch.isinf(elem).any().item(),
+                )
+
+        rt_sev = {
+            k: v.to(device).to(torch.bfloat16) for k, v in rt_sparse_out[0].items()
+        }
+        rt_sel = {k: v.to(device) for k, v in rt_sparse_out[1].items()}
+        rt_pay = {k: v.to(device) for k, v in rt_sparse_out[2].items()}
+        rt_uih = rt_sparse_out[3].to(device)
+        rt_nc = rt_sparse_out[4].to(device)
+
+        preds_rt = rt_dense(rt_sev, rt_sel, rt_pay, rt_uih, rt_nc)
+
+    preds_rt_cpu = preds_rt.detach().to(torch.float32).cpu()
+    logger.info(
+        "preds_roundtrip shape=%s sum=%.6f nan=%s inf=%s",
+        tuple(preds_rt_cpu.shape),
+        preds_rt_cpu.sum().item(),
+        torch.isnan(preds_rt_cpu).any().item(),
+        torch.isinf(preds_rt_cpu).any().item(),
+    )
+
+    try:
+        torch.testing.assert_close(
+            preds_eager, preds_rt_cpu, atol=args.atol, rtol=args.rtol
+        )
+    except AssertionError as e:
+        logger.error("PYTHON ROUNDTRIP PARITY FAILED: %s", e)
+        if not args.keep_workdir:
+            logger.info("(workdir kept for inspection: %s)", workdir)
+        sys.exit(1)
+    logger.info("PYTHON ROUNDTRIP PASSED (atol=%g rtol=%g)", args.atol, args.rtol)
+
+    # === 3. Invoke C++ runner ===
+    runner_args: List[str] = []
+    for library_path in aott_library_paths:
+        runner_args.extend(["--aott_library", library_path])
+    runner_args.extend([sparse_path, dense_path, inputs_path, cpp_out_path])
+
+    logger.info("Running C++: %s %s", runner_path, " ".join(runner_args))
+    # pyre-fixme[6]: TrustedSubprocessWithList requires Literal[str] but this
+    # runner is resolved from a built resource or explicit test argument.
+    result = TrustedSubprocessWithList.run(
+        executable=runner_path,
+        cmd_args=runner_args,
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if result.stdout:
+        logger.info("--- runner stdout ---\n%s", result.stdout.rstrip())
+    if result.stderr:
+        logger.info("--- runner stderr ---\n%s", result.stderr.rstrip())
+    if result.returncode != 0:
+        if result.returncode == -11:
+            logger.warning(
+                "C++ runner SIGSEGV (exit -11). This is a known issue with "
+                "torch-cpp-cuda static initialization on some machines. "
+                "Python roundtrip verification passed above. "
+                "Artifacts in: %s",
+                workdir,
+            )
+            args.keep_workdir = True
+        else:
+            logger.error("C++ runner exited with code %d", result.returncode)
+        if not args.keep_workdir:
+            shutil.rmtree(workdir, ignore_errors=True)
+        sys.exit(result.returncode)
+
+    # === 4. Compare ===
+    if not os.path.exists(cpp_out_path):
+        logger.error("C++ runner did not produce %s", cpp_out_path)
+        sys.exit(1)
+    preds_cpp = torch.load(cpp_out_path, weights_only=False).to(torch.float32).cpu()
+    logger.info(
+        "preds_cpp   shape=%s sum=%.6f",
+        tuple(preds_cpp.shape),
+        preds_cpp.sum().item(),
+    )
+
+    try:
+        torch.testing.assert_close(
+            preds_eager, preds_cpp, atol=args.atol, rtol=args.rtol
+        )
+    except AssertionError as e:
+        logger.error("PARITY FAILED: %s", e)
+        if not args.keep_workdir:
+            logger.info("(workdir kept for inspection: %s)", workdir)
+        sys.exit(1)
+
+    logger.info("PASSED: eager and C++ agree (atol=%g rtol=%g)", args.atol, args.rtol)
+    if not args.keep_workdir:
+        shutil.rmtree(workdir, ignore_errors=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin
new file mode 100644
index 000000000..e2025dee0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin
@@ -0,0 +1,13 @@
+run.model_path = ""
+run.scenario_name = "Server"
+run.batchsize = 16
+run.output_trace = False
+run.data_producer_threads = 4
+run.compute_eval = False
+run.find_peak_performance = False
+run.train_split_percentage = 0.75
+
+# below will override mlperf rules compliant settings - don't use for official submission
+run.target_qps = 2000
+run.num_queries = 10000
+run.numpy_rand_seed = 123
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin
new file mode 100644
index 000000000..a770aa014
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin
@@ -0,0 +1,14 @@
+# run.model_path = "/home/linjianma/ckpts/kuairand_1k/2025_01_12_17_56_43/"
+run.scenario_name = "Server"
+run.batchsize = 16
+run.output_trace = False
+run.data_producer_threads = 4
+run.compute_eval = False
+run.find_peak_performance = False
+run.train_split_percentage = 0.75
+
+# below will override mlperf rules compliant settings - don't use for official submission
+run.target_qps = 2000
+run.num_queries = 10000
+run.numpy_rand_seed = 123
+run.dataset_path_prefix = "/home/linjianma"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin
new file mode 100644
index 000000000..3121ac0e7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin
@@ -0,0 +1,16 @@
+run.model_path = ""
+run.scenario_name = "Server"
+run.batchsize = 5
+run.output_trace = False
+run.data_producer_threads = 8
+run.compute_eval = False
+run.find_peak_performance = False
+run.train_split_percentage = 0.75
+run.sparse_quant = False
+
+# below will override mlperf rules compliant settings - don't use for official submission
+run.target_qps = 5000
+run.num_queries = 30000
+run.numpy_rand_seed = 123
+run.dataset_path_prefix = "/home/linjianma"
+run.dataset_percentage = 0.0625
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin
new file mode 100644
index 000000000..0655734c2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin
@@ -0,0 +1,15 @@
+# run.model_path = "/home/linjianma/ckpts/streaming_100b/89/"
+run.scenario_name = "Server"
+run.batchsize = 10
+run.output_trace = False
+run.data_producer_threads = 16
+run.compute_eval = False
+run.find_peak_performance = False
+run.sparse_quant = False
+run.numpy_rand_seed = 123
+run.dataset_path_prefix = "/home/linjianma"
+run.dataset_percentage = 0.001
+run.warmup_ratio = 0.3
+run.num_queries = 20000
+# Needs to be tuned for different implementations to balance latency and throughput
+run.target_qps = 1000
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin
new file mode 100644
index 000000000..eed13e0ff
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin
@@ -0,0 +1,15 @@
+run.model_path = ""
+run.scenario_name = "Server"
+run.batchsize = 5
+run.output_trace = False
+run.data_producer_threads = 8
+run.compute_eval = False
+run.find_peak_performance = False
+run.train_split_percentage = 0.75
+run.sparse_quant = False
+
+# below will override mlperf rules compliant settings - don't use for official submission
+run.target_qps = 5000
+run.numpy_rand_seed = 123
+run.dataset_path_prefix = "/home/linjianma"
+run.dataset_percentage = 0.00625
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py
new file mode 100644
index 000000000..cb567df63
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py
@@ -0,0 +1,253 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""
+Inference modules for DLRMv3.
+
+This module provides inference-specific components for the HSTU model,
+including sparse inference modules and utilities for moving tensors between devices.
+"""
+
+from typing import Dict, Optional, Tuple
+
+import torch
+import torchrec
+from generative_recommenders.modules.dlrm_hstu import (
+    DlrmHSTU,
+    DlrmHSTUConfig,
+    SequenceEmbedding,
+)
+from torchrec.modules.embedding_modules import (
+    EmbeddingBagCollection,
+    EmbeddingCollection,
+)
+from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor
+from torchrec.sparse.tensor_dict import maybe_td_to_kjt
+
+
+IS_INFERENCE: bool = True
+
+
+class _NoCopyEmbeddingCollection(torchrec.EmbeddingCollection):
+    """
+    EmbeddingCollection variant that skips the dtype-cast copy in
+    ``EmbeddingCollection.forward`` and clamps indices into the hash-size
+    range. This is the script-mode replacement for the
+    ``functools.partial`` monkey-patch in
+    :func:`generative_recommenders.dlrm_v3.inference.model_family.ec_patched_forward_wo_embedding_copy`.
+
+    The body mirrors that helper exactly so that the eager and scripted paths
+    produce the same embeddings.
+    """
+
+    def forward(
+        self,
+        features: KeyedJaggedTensor,
+    ) -> Dict[str, JaggedTensor]:
+        features = maybe_td_to_kjt(features, None)
+        feature_embeddings: Dict[str, JaggedTensor] = {}
+        jt_dict: Dict[str, JaggedTensor] = features.to_dict()
+        # Inline HASH_SIZE_1B - 1 as a literal so TorchScript can see it; the
+        # imported module-level constant is treated as an opaque "closed-over
+        # global" by jit.script and would fail with
+        # "python value of type 'int' cannot be used as a value".
+        max_index: int = 999_999_999  # HASH_SIZE_1B - 1
+        for i, emb_module in enumerate(self.embeddings.values()):
+            feature_names = self._feature_names[i]
+            embedding_names = self._embedding_names_by_table[i]
+            for j, embedding_name in enumerate(embedding_names):
+                feature_name = feature_names[j]
+                f = jt_dict[feature_name]
+                indices = torch.clamp(f.values(), min=0, max=max_index)
+                lookup = emb_module(input=indices)
+                feature_embeddings[embedding_name] = JaggedTensor(
+                    values=lookup,
+                    lengths=f.lengths(),
+                    weights=f.values() if self._need_indices else None,
+                )
+        return feature_embeddings
+
+
+def set_is_inference(is_inference: bool = False) -> None:
+    """
+    Set the global inference mode flag.
+
+    Args:
+        is_inference: If True, model operates in inference mode (no labels/weights).
+                     If False, model operates in training/eval mode with labels.
+    """
+    global IS_INFERENCE
+    IS_INFERENCE = is_inference
+
+
+def get_hstu_model(
+    table_config,
+    hstu_config: DlrmHSTUConfig,
+    table_device: str = "meta",
+    max_hash_size: Optional[int] = None,
+    is_dense: bool = False,
+) -> DlrmHSTU:
+    """
+    Create and initialize an HSTU model for inference.
+
+    Args:
+        table_config: Dictionary of embedding table configurations.
+        hstu_config: HSTU model configuration object.
+        table_device: Device to place embedding tables on ('meta', 'cpu', or 'cuda').
+        max_hash_size: Optional maximum hash size to cap embedding table sizes.
+        is_dense: If True, creates model for dense-only operations.
+
+    Returns:
+        Initialized DlrmHSTU model in eval mode.
+    """
+    if max_hash_size is not None:
+        for t in table_config.values():
+            t.num_embeddings = (
+                max_hash_size if t.num_embeddings > max_hash_size else t.num_embeddings
+            )
+    model = DlrmHSTU(
+        hstu_configs=hstu_config,
+        embedding_tables=table_config,
+        is_inference=IS_INFERENCE,
+        is_dense=is_dense,
+    )
+    model.eval()
+    model.recursive_setattr("_use_triton_cc", False)
+    for _, module in model.named_modules():
+        if isinstance(module, EmbeddingBagCollection) or isinstance(
+            module, EmbeddingCollection
+        ):
+            module.to_empty(device=table_device)
+            # to_empty leaves parameters uninitialized; fill with small random
+            # values so downstream bf16 ops don't produce NaN from
+            # uninitialized memory.
+            for p in module.parameters():
+                if not p.is_meta:
+                    torch.nn.init.uniform_(p, -0.01, 0.01)
+    return model
+
+
+class HSTUSparseInferenceModule(torch.nn.Module):
+    """
+    Module for sparse (embedding) inference operations.
+
+    Handles embedding lookups and preprocessing for the HSTU model,
+    running on CPU to handle large embedding tables.
+
+    Args:
+        table_config: Dictionary of embedding table configurations.
+        hstu_config: HSTU model configuration object.
+    """
+
+    def __init__(
+        self,
+        table_config,
+        hstu_config: DlrmHSTUConfig,
+    ) -> None:
+        super().__init__()
+        self._hstu_model: DlrmHSTU = get_hstu_model(
+            table_config,
+            hstu_config,
+            table_device="cpu",
+        )
+
+    def forward(
+        self,
+        uih_features: KeyedJaggedTensor,
+        candidates_features: KeyedJaggedTensor,
+    ) -> Tuple[
+        Dict[str, SequenceEmbedding],
+        Dict[str, torch.Tensor],
+        int,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+    ]:
+        """
+        Run sparse preprocessing and embedding lookups.
+
+        Args:
+            uih_features: User interaction history features as KeyedJaggedTensor.
+            candidates_features: Candidate item features as KeyedJaggedTensor.
+
+        Returns:
+            Tuple containing:
+                - seq_embeddings: Dictionary of sequence embeddings per feature.
+                - payload_features: Dictionary of payload feature tensors.
+                - max_uih_len: Maximum user interaction history length.
+                - uih_seq_lengths: Tensor of UIH sequence lengths per batch item.
+                - max_num_candidates: Maximum number of candidates.
+                - num_candidates: Tensor of candidate counts per batch item.
+        """
+        (
+            seq_embeddings,
+            payload_features,
+            max_uih_len,
+            uih_seq_lengths,
+            max_num_candidates,
+            num_candidates,
+        ) = self._hstu_model.preprocess(
+            uih_features=uih_features,
+            candidates_features=candidates_features,
+        )
+        return (
+            seq_embeddings,
+            payload_features,
+            max_uih_len,
+            uih_seq_lengths,
+            max_num_candidates,
+            num_candidates,
+        )
+
+
+def move_sparse_output_to_device(
+    seq_embeddings: Dict[str, SequenceEmbedding],
+    payload_features: Dict[str, torch.Tensor],
+    uih_seq_lengths: torch.Tensor,
+    num_candidates: torch.Tensor,
+    device: torch.device,
+) -> Tuple[
+    Dict[str, SequenceEmbedding],
+    Dict[str, torch.Tensor],
+    torch.Tensor,
+    torch.Tensor,
+]:
+    """
+    Move sparse module outputs from CPU to the target device (typically GPU).
+
+    Converts embeddings to bfloat16 for efficient GPU computation.
+
+    Args:
+        seq_embeddings: Dictionary of sequence embeddings to move.
+        payload_features: Dictionary of payload features to move.
+        uih_seq_lengths: UIH sequence lengths tensor to move.
+        num_candidates: Number of candidates tensor to move.
+        device: Target device (e.g., torch.device('cuda:0')).
+
+    Returns:
+        Tuple of moved tensors on the target device.
+    """
+    num_candidates = num_candidates.to(device)
+    uih_seq_lengths = uih_seq_lengths.to(device)
+    seq_embeddings = {
+        k: SequenceEmbedding(
+            lengths=seq_embeddings[k].lengths.to(device),
+            embedding=seq_embeddings[k].embedding.to(device).to(torch.bfloat16),
+        )
+        for k in seq_embeddings.keys()
+    }
+    for k, v in payload_features.items():
+        payload_features[k] = v.to(device)
+    return seq_embeddings, payload_features, uih_seq_lengths, num_candidates
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py
new file mode 100644
index 000000000..00e334119
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py
@@ -0,0 +1,805 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+mlperf dlrm_v3 inference benchmarking tool.
+"""
+
+import argparse
+import array
+import logging
+import random
+import threading
+
+logging.basicConfig(level=logging.INFO)
+import os
+import sys
+import time
+from typing import Any, Dict, List, Optional, Union
+
+import gin
+
+# pyre-ignore [21]
+import mlperf_loadgen as lg  # @manual
+import numpy as np
+import torch
+from generative_recommenders.common import set_dev_mode, set_verbose_level
+from generative_recommenders.dlrm_v3.configs import (
+    get_embedding_table_config,
+    get_hstu_configs,
+)
+from generative_recommenders.dlrm_v3.datasets.dataset import Dataset, Samples
+from generative_recommenders.dlrm_v3.datasets.synthetic_streaming import (
+    DLRMv3SyntheticStreamingDataset,
+)
+from generative_recommenders.dlrm_v3.inference.data_producer import (
+    MultiThreadDataProducer,
+    QueryItem,
+    SingleThreadDataProducer,
+)
+from generative_recommenders.dlrm_v3.inference.inference_modules import set_is_inference
+from generative_recommenders.dlrm_v3.inference.model_family import HSTUModelFamily
+from generative_recommenders.dlrm_v3.utils import (
+    get_dataset,
+    profiler_or_nullcontext,
+    SUPPORTED_DATASETS,
+)
+
+
+logger: logging.Logger = logging.getLogger("main")
+
+torch.multiprocessing.set_start_method("spawn", force=True)
+
+USER_CONF = f"{os.path.dirname(__file__)}/user.conf"
+
+SUPPORTED_CONFIGS = {
+    "debug": "debug.gin",
+    "kuairand-1k": "kuairand_1k.gin",
+    "movielens-13b": "movielens_13b.gin",
+    "streaming-400m": "streaming_400m.gin",
+    "sampled-streaming-100b": "streaming_100b.gin",
+}
+
+
+SCENARIO_MAP = {  # pyre-ignore [5]
+    "Server": lg.TestScenario.Server,
+    "Offline": lg.TestScenario.Offline,
+}
+
+
+def get_args():  # pyre-ignore [3]
+    """Parse commandline."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dataset", default="debug", choices=SUPPORTED_DATASETS, help="dataset"
+    )
+    args, unknown_args = parser.parse_known_args()
+    logger.warning(f"unknown_args: {unknown_args}")
+    return args
+
+
+class Runner:
+    """
+    Orchestrates inference benchmark execution.
+
+    Manages data production, model inference, and result collection for
+    MLPerf LoadGen-based benchmarking.
+
+    Args:
+        model: The HSTU model family instance for making predictions.
+        ds: Dataset to fetch samples from.
+        num_queries: Total number of queries to process.
+        data_producer_threads: Number of threads for data loading (default: 1).
+        batchsize: Batch size for inference (default: 128).
+        compute_eval: Whether to compute evaluation metrics (default: False).
+    """
+
+    def __init__(
+        self,
+        model: HSTUModelFamily,
+        ds: Dataset,
+        num_queries: int,
+        data_producer_threads: int = 1,
+        batchsize: int = 128,
+        compute_eval: bool = False,
+    ) -> None:
+        self.model = model
+        if data_producer_threads == 1:
+            self.data_producer: Union[
+                MultiThreadDataProducer, SingleThreadDataProducer
+            ] = SingleThreadDataProducer(ds, self.run_one_item)
+        else:
+            self.data_producer = MultiThreadDataProducer(
+                ds, data_producer_threads, self.run_one_item
+            )
+        self.batchsize = batchsize
+        self.compute_eval = compute_eval
+        self.reset_states(num_queries=num_queries)
+
+    def reset_states(self, num_queries: int) -> None:
+        """
+        Reset all internal state for a new benchmark run.
+
+        Args:
+            num_queries: Number of queries expected in this run.
+        """
+        self.result_timing: List[Dict[str, float]] = []
+        self.result_batches: List[int] = []
+        self.current_query_ids: List[int] = []
+        self.current_content_ids: List[int] = []
+        self.current_t0: List[float] = []
+        self.num_queries: int = num_queries
+        self.processed_queries: int = 0
+
+    def run_one_item(self, qitem: QueryItem) -> None:
+        """
+        Process a single query item through model inference.
+
+        Runs prediction, records timing metrics, and sends results back to LoadGen.
+
+        Args:
+            qitem: Query item containing batch of samples to process.
+        """
+        try:
+            t0_prediction: float = time.time()
+            prediction_output = self.model.predict(qitem.samples)
+            dt_prediction: float = time.time() - t0_prediction
+            assert prediction_output is not None
+            (
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+                dt_sparse,
+                dt_dense,
+            ) = prediction_output
+            if self.compute_eval:
+                assert mt_target_labels is not None
+                assert mt_target_weights is not None
+            self.result_timing.append(
+                {
+                    "total": time.time() - qitem.start,
+                    "prediction": dt_prediction,
+                    "queue": qitem.dt_queue,
+                    "batching": qitem.dt_batching,
+                    "sparse": dt_sparse,
+                    "dense": dt_dense,
+                }
+            )
+            self.result_batches.append(len(qitem.query_ids))
+        except Exception as ex:  # pylint: disable=broad-except
+            logger.error("thread: failed, %s", ex)
+        finally:
+            candidate_size = mt_target_preds.size(1) // len(qitem.query_ids)
+            if not self.compute_eval:
+                for i, query_id in enumerate(qitem.query_ids):
+                    query_mt_target_preds = (
+                        mt_target_preds[  # pyre-ignore [61]
+                            0,
+                            candidate_size * i : candidate_size * (i + 1),
+                        ]
+                        .view(-1)
+                        .float()
+                        .numpy()
+                    )
+                    response_array = array.array("B", query_mt_target_preds.tobytes())
+                    bi = response_array.buffer_info()
+                    # since we send buffer to loadgen, needs `response_array` in memory during send
+                    lg.QuerySamplesComplete(
+                        [lg.QuerySampleResponse(query_id, bi[0], bi[1])]
+                    )
+            else:
+                for i, query_id in enumerate(qitem.query_ids):
+                    query_mt_target_preds = (
+                        mt_target_preds[  # pyre-ignore [61]
+                            0, candidate_size * i : candidate_size * (i + 1)
+                        ]
+                        .view(-1)
+                        .float()
+                        .numpy()
+                    )
+                    query_mt_target_labels = (
+                        mt_target_labels[  # pyre-ignore [16,61]
+                            0, candidate_size * i : candidate_size * (i + 1)
+                        ]
+                        .view(-1)
+                        .float()
+                        .numpy()
+                    )
+                    query_mt_target_weights = (
+                        mt_target_weights[  # pyre-ignore [61]
+                            0, candidate_size * i : candidate_size * (i + 1)
+                        ]
+                        .view(-1)
+                        .float()
+                        .numpy()
+                    )
+                    np_array = np.concatenate(
+                        [
+                            query_mt_target_preds,
+                            query_mt_target_labels,
+                            query_mt_target_weights,
+                            np.array([candidate_size]).astype(np.float32),
+                        ]
+                    )
+                    response_array = array.array("B", np_array.tobytes())
+                    bi = response_array.buffer_info()
+                    # since we send buffer to loadgen, needs `response_array` in memory during send
+                    lg.QuerySamplesComplete(
+                        [lg.QuerySampleResponse(query_id, bi[0], bi[1])]
+                    )
+
+    def enqueue(self, query_samples, t0: float) -> None:  # pyre-ignore [2]
+        """
+        Enqueue query samples for batch processing.
+
+        Collects samples until batch size is reached, then dispatches to data producer.
+
+        Args:
+            query_samples: List of LoadGen query sample objects.
+            t0: Timestamp when this batch started.
+        """
+        self.current_query_ids.extend([q.id for q in query_samples])
+        self.current_content_ids.extend([q.index for q in query_samples])
+        self.current_t0.append(t0)
+        self.processed_queries += len(query_samples)
+        t0: float = min(self.current_t0)
+        dt_queue: float = max(self.current_t0) - min(self.current_t0)
+        if (
+            self.processed_queries >= self.num_queries
+            or len(self.current_query_ids) >= self.batchsize
+        ):
+            for i in range(len(self.current_query_ids) // self.batchsize):
+                self.data_producer.enqueue(
+                    query_ids=self.current_query_ids[
+                        i * self.batchsize : (i + 1) * self.batchsize
+                    ],
+                    content_ids=self.current_content_ids[
+                        i * self.batchsize : (i + 1) * self.batchsize
+                    ],
+                    t0=t0,
+                    dt_queue=dt_queue,
+                )
+            remaining_s: int = len(self.current_query_ids) % self.batchsize
+            if remaining_s > 0:
+                self.data_producer.enqueue(
+                    query_ids=self.current_query_ids[-remaining_s:],
+                    content_ids=self.current_content_ids[-remaining_s:],
+                    t0=t0,
+                    dt_queue=dt_queue,
+                )
+            self.current_query_ids = []
+            self.current_content_ids = []
+            self.current_t0 = []
+
+    def finish(self) -> None:
+        """Signal data producer to finish and wait for completion."""
+        self.data_producer.finish()
+
+
+def add_results(
+    final_results: Dict[str, Any],
+    result_timing: List[Dict[str, float]],
+    result_batches: List[int],
+) -> None:
+    """
+    Aggregate and log benchmark results.
+
+    Computes percentile statistics and QPS metrics from timing data.
+
+    Args:
+        final_results: Dictionary to populate with aggregated results.
+        result_timing: List of timing dictionaries for each batch.
+        result_batches: List of batch sizes processed.
+    """
+    percentiles: list[float] = [50.0, 80.0, 90.0, 95.0, 99.0, 99.9]
+    buckets_dict: Dict[str, List[float]] = {}
+    buckets_str_dict: Dict[str, str] = {}
+    total_timing: list[float] = [result["total"] for result in result_timing]
+    for key in ["total", "prediction", "queue", "batching", "sparse", "dense"]:
+        timing: list[float] = [result[key] for result in result_timing]
+        buckets: List[float] = np.percentile(timing, percentiles).tolist()
+        buckets_str: str = ",".join(
+            ["| {}:{:.4f}| ".format(p, b) for p, b in zip(percentiles, buckets)]
+        )
+        buckets_dict[key] = buckets
+        buckets_str_dict[key] = buckets_str
+    total_batches = sum(result_batches)
+
+    final_results["good"] = len(total_timing)
+    final_results["avg_time"] = np.mean(total_timing)
+    final_results["percentiles"] = {
+        str(k): v for k, v in zip(percentiles, buckets_dict["total"])
+    }
+    final_results["qps"] = total_batches / final_results["took"]
+    final_results["count"] = total_batches
+
+    for i, timing in enumerate(result_timing):
+        logger.warning(f"timing of {i}: {timing}")
+
+    logger.warning(
+        "{} qps={:.2f}, avg_query_time={:.4f}, time={:.3f}, queries={}, tiles={}".format(
+            final_results["scenario"],
+            final_results["qps"],
+            final_results["avg_time"],
+            final_results["took"],
+            len(result_timing),
+            buckets_str_dict["total"],
+        )
+    )
+    for key in ["prediction", "queue", "batching", "sparse", "dense"]:
+        logger.warning(f"{key}: {buckets_str_dict[key]}")
+
+
+def get_num_queries(
+    input_size: Optional[int],
+    one_pass_size: int,
+    scenario_name: str,
+    offline_target_qps: int,
+    target_duration: float,
+) -> int:
+    """
+    Determine the number of queries to run based on scenario and settings.
+
+    Args:
+        input_size: User-specified query count (None to use defaults).
+        one_pass_size: Size of one complete pass through the dataset.
+        scenario_name: MLPerf scenario name ('Server' or 'Offline').
+        offline_target_qps: Target QPS for offline scenario.
+        target_duration: Target duration in milliseconds.
+
+    Returns:
+        Number of queries to execute in the benchmark run.
+    """
+    if scenario_name == "Offline":
+        # consistent with https://github.com/mlcommons/inference/blob/8999c4d686f6e4a180da14597c97063fce7c9f33/loadgen/test_settings_internal.cc#L147
+        return int(1.1 * target_duration / 1000 * offline_target_qps)
+    else:
+        if input_size is None:
+            return one_pass_size
+        return input_size
+
+
+class StreamingQuerySampler:
+    """
+    Sampler for streaming dataset
+    The execution order is determined by `StreamingQuerySampler.run_order`, not by the QSL or input query ID.
+    This ensures that queries are executed according to their timestamp constraints.
+    """
+
+    def __init__(
+        self,
+        ds: DLRMv3SyntheticStreamingDataset,
+        dataset_percentage: float,
+        scenario_name: str,
+        offline_target_qps: int,
+        target_duration: float,
+        input_queries: Optional[int] = None,
+        compute_eval: bool = False,
+    ) -> None:
+        self.ds: DLRMv3SyntheticStreamingDataset = ds
+        self.ds.is_inference = True
+        self.inference_ts: int = self.ds.total_ts - self.ds.train_ts
+        self.start_ts: int = self.ds.train_ts
+        self.dataset_percentage: float = dataset_percentage
+        self.num_unique_requests: List[int] = self.get_num_unique_requests(
+            warmup_ratio=1.0
+        )
+        self.num_unique_requests_cumsum: List[int] = np.cumsum(
+            self.num_unique_requests
+        ).tolist()
+        self.total_requests: int = sum(self.num_unique_requests)
+        self.run_order: List[List[int]] = self.build_random_exec_order()
+        self.ts_idx: int = 0
+        self.ts_processed_cnt: int = 0
+        self.last_loaded: float = -1.0
+        num_queries: int = get_num_queries(
+            input_size=input_queries,
+            one_pass_size=self.total_requests,
+            scenario_name=scenario_name,
+            offline_target_qps=offline_target_qps,
+            target_duration=target_duration,
+        )
+        logger.warning(
+            f"StreamingQuerySampler constructred to handle {num_queries} queries"
+        )
+        self.num_repeats: int = (
+            max(1, num_queries // self.total_requests) if not compute_eval else 1
+        )
+        self.remaining_queries: int = (
+            num_queries % self.total_requests if not compute_eval else 0
+        )
+        self._lock = threading.Lock()
+
+    def get_num_unique_requests(self, warmup_ratio: float) -> List[int]:
+        """
+        Calculate number of unique requests per timestamp.
+
+        Args:
+            warmup_ratio: Fraction of users to include in warmup.
+
+        Returns:
+            List of request counts per timestamp.
+        """
+        num_unique_requests = [
+            int(
+                self.ds.ts_to_users_cumsum[t][-1]
+                * self.dataset_percentage
+                * warmup_ratio
+            )
+            for t in range(self.start_ts, self.start_ts + self.inference_ts)
+        ]
+        return num_unique_requests
+
+    def build_random_exec_order(self) -> List[List[int]]:
+        """
+        Build randomized execution order for each timestamp.
+
+        Returns:
+            List of shuffled index lists, one per timestamp.
+        """
+        order = []
+        for req_size in self.num_unique_requests:
+            within_ts_order = list(range(req_size))
+            random.shuffle(within_ts_order)
+            order.append(within_ts_order)
+        return order
+
+    def init_sut(self) -> None:
+        """Initialize System Under Test state for a new benchmark run."""
+        self.ts_idx = 0
+        self.ts_processed_cnt = 0
+        self.ds.set_ts(self.start_ts)
+
+    def load_query_samples(self, query_ids: List[Optional[int]]) -> None:
+        """
+        Load query samples into memory for the benchmark.
+
+        Args:
+            query_ids: List of query identifiers to load.
+        """
+        length = len(query_ids)
+        ts_idx: int = 0
+        while self.num_unique_requests_cumsum[ts_idx] < length:
+            ts_idx += 1
+        for i in range(0, ts_idx):
+            self.ds.set_ts(i + self.start_ts)
+            self.ds.load_query_samples(self.run_order[i])
+        self.ds.set_ts(ts_idx + self.start_ts)
+        delta_length = (
+            length
+            if ts_idx == 0
+            else length - self.num_unique_requests_cumsum[ts_idx - 1]
+        )
+        self.ds.load_query_samples(self.run_order[ts_idx][:delta_length])
+        self.init_sut()
+        self.last_loaded = time.time()
+
+    def unload_query_samples(self, sample_list: List[int]) -> None:
+        """
+        Unload query samples from memory.
+
+        Args:
+            sample_list: List of sample identifiers to unload.
+        """
+        self.ds.unload_query_samples(sample_list)
+
+    def get_samples(self, id_list: List[int]) -> List[Samples]:
+        """
+        Get samples for a batch of queries, handling timestamp boundaries.
+
+        Args:
+            id_list: List of query identifiers.
+
+        Returns:
+            List of Samples objects, potentially spanning multiple timestamps.
+        """
+        batch_size: int = len(id_list)
+        with self._lock:
+            curr_ts_idx: int = self.ts_idx
+            curr_ts_unique_requests: int = self.num_unique_requests[curr_ts_idx]
+            curr_ts_queries: int = curr_ts_unique_requests * self.num_repeats
+            if curr_ts_idx == self.inference_ts - 1:
+                curr_ts_queries += self.remaining_queries
+            begin_query_idx: int = self.ts_processed_cnt
+            end_query_idx: int = min(begin_query_idx + batch_size, curr_ts_queries)
+            begin_request_idx: int = begin_query_idx % curr_ts_unique_requests
+            end_request_idx: int = end_query_idx % curr_ts_unique_requests
+            if begin_query_idx + batch_size >= curr_ts_queries:
+                self.ts_idx += 1
+                self.ts_processed_cnt = begin_query_idx + batch_size - curr_ts_queries
+            else:
+                self.ts_processed_cnt = begin_query_idx + batch_size
+        # requests of current ts
+        outputs: List[Samples] = []
+        if end_request_idx > begin_request_idx:
+            output: Samples = self.ds.get_samples_with_ts(
+                self.run_order[curr_ts_idx][begin_request_idx:end_request_idx],
+                curr_ts_idx + self.start_ts,
+            )
+            outputs.append(output)
+        else:
+            if begin_request_idx < curr_ts_unique_requests:
+                output: Samples = self.ds.get_samples_with_ts(
+                    self.run_order[curr_ts_idx][begin_request_idx:],
+                    curr_ts_idx + self.start_ts,
+                )
+                outputs.append(output)
+            if end_request_idx > 0:
+                output = self.ds.get_samples_with_ts(
+                    self.run_order[curr_ts_idx][0:end_request_idx],
+                    curr_ts_idx + self.start_ts,
+                )
+                outputs.append(output)
+        # requests of next ts
+        if begin_query_idx + batch_size > curr_ts_queries:
+            output: Samples = self.ds.get_samples_with_ts(
+                self.run_order[curr_ts_idx + 1][
+                    : begin_query_idx + batch_size - curr_ts_queries
+                ],
+                curr_ts_idx + 1 + self.start_ts,
+            )
+            outputs.append(output)
+        return outputs
+
+    def get_item_count(self) -> int:
+        """
+        Get total number of items in the dataset.
+
+        Returns:
+            Total request count across all timestamps.
+        """
+        return self.total_requests
+
+
+@gin.configurable
+def run(
+    dataset: str = "sampled-streaming-100b",
+    model_path: str = "",
+    scenario_name: str = "Server",
+    batchsize: int = 16,
+    output_trace: bool = False,
+    data_producer_threads: int = 4,
+    compute_eval: bool = False,
+    find_peak_performance: bool = False,
+    dataset_path_prefix: str = "",
+    train_split_percentage: float = 0.75,
+    warmup_ratio: float = 0.1,
+    target_qps: Optional[int] = None,
+    num_queries: Optional[int] = None,
+    numpy_rand_seed: int = 123,
+    sparse_quant: bool = False,
+    dataset_percentage: float = 1.0,
+) -> None:
+    """
+    Execute the MLPerf DLRMv3 inference benchmark.
+
+    Sets up the model, dataset, and LoadGen infrastructure, then runs
+    warmup and official benchmark phases.
+
+    Args:
+        dataset: Dataset identifier to use.
+        model_path: Path to model checkpoint directory.
+        scenario_name: MLPerf scenario ('Server' or 'Offline').
+        batchsize: Batch size for inference.
+        output_trace: Whether to output profiling traces.
+        data_producer_threads: Number of data loading threads.
+        compute_eval: Whether to compute accuracy metrics.
+        find_peak_performance: Whether to run peak performance finding mode.
+        dataset_path_prefix: Prefix path for dataset files.
+        warmup_ratio: Fraction of data to use for warmup.
+        target_qps: Target queries per second.
+        num_queries: Number of queries to run (None for automatic).
+        numpy_rand_seed: Random seed for reproducibility.
+        sparse_quant: Whether to quantize sparse embeddings.
+        dataset_percentage: Fraction of dataset to use.
+    """
+    set_dev_mode(False)
+    if scenario_name not in SCENARIO_MAP:
+        raise NotImplementedError("valid scanarios:" + str(list(SCENARIO_MAP.keys())))
+    scenario = SCENARIO_MAP[scenario_name]
+    np.random.seed(numpy_rand_seed)
+    random.seed(numpy_rand_seed)
+
+    hstu_config = get_hstu_configs(dataset)
+    hstu_config.max_num_candidates = hstu_config.max_num_candidates_inference
+    table_config = get_embedding_table_config(dataset)
+    set_is_inference(is_inference=not compute_eval)
+
+    user_conf = os.path.abspath(USER_CONF)
+    if not os.path.exists(user_conf):
+        logger.error("{} not found".format(user_conf))
+        sys.exit(1)
+
+    settings = lg.TestSettings()
+    settings.FromConfig(user_conf, model_path, scenario_name)
+    settings.scenario = scenario
+    settings.mode = lg.TestMode.PerformanceOnly
+    if compute_eval:
+        settings.mode = lg.TestMode.AccuracyOnly
+    if find_peak_performance:
+        settings.mode = lg.TestMode.FindPeakPerformance
+    if target_qps:
+        settings.server_target_qps = float(target_qps)
+        settings.offline_expected_qps = float(target_qps)
+
+    model_family = HSTUModelFamily(
+        hstu_config=hstu_config,
+        table_config=table_config,
+        sparse_quant=sparse_quant,
+        output_trace=output_trace,
+        compute_eval=compute_eval,
+    )
+    is_streaming: bool = "streaming" in dataset
+    dataset, kwargs = get_dataset(dataset, dataset_path_prefix)
+
+    ds: Dataset = dataset(
+        hstu_config=hstu_config,
+        embedding_config=table_config,
+        is_inference=not compute_eval,
+        **kwargs,
+    )
+    if is_streaming:
+        ds = StreamingQuerySampler(  # pyre-ignore
+            ds=ds,  # pyre-ignore [6]
+            dataset_percentage=dataset_percentage,
+            input_queries=num_queries,
+            compute_eval=compute_eval,
+            scenario_name=scenario_name,
+            offline_target_qps=settings.offline_expected_qps,
+            target_duration=settings.min_duration_ms,
+        )
+    model_family.load(model_path)
+
+    # warmup
+    for autotune_bs in range(batchsize, 0, -1):
+        logger.warning(f"Autotune for batch size {autotune_bs}")
+        warmup_ids = list(range(autotune_bs))
+        ds.load_query_samples(warmup_ids)
+        for _ in range(4 * int(os.environ.get("WORLD_SIZE", 1))):
+            if is_streaming:
+                ds.init_sut()  # pyre-ignore [16]
+            sample: Union[Samples, List[Samples]] = ds.get_samples(warmup_ids)
+            if isinstance(sample, Samples):
+                model_family.predict(sample)
+            else:
+                for s in sample:
+                    model_family.predict(s)
+        ds.unload_query_samples(None)
+    for h in logger.handlers:
+        h.flush()
+    logger.info("Model forward warmup done")
+
+    count = int(
+        ds.get_item_count() * dataset_percentage
+        if not is_streaming
+        else ds.get_item_count()
+    )
+    train_size: int = round(train_split_percentage * count) if not is_streaming else 0
+    if compute_eval:
+        count = count - train_size
+
+    runner: Runner = Runner(
+        model_family,
+        ds,
+        data_producer_threads=data_producer_threads,
+        batchsize=batchsize,
+        compute_eval=compute_eval,
+        num_queries=count,
+    )
+
+    def issue_queries(query_samples) -> None:  # pyre-ignore [2]
+        if compute_eval:
+            for sample in query_samples:
+                sample.index = sample.index + train_size
+        runner.enqueue(query_samples, time.time())
+
+    def load_query_samples(query_ids: List[int]) -> None:
+        if compute_eval:
+            query_ids = [q + train_size for q in query_ids]
+        ds.load_query_samples(query_ids)
+
+    def flush_queries() -> None:
+        pass
+
+    if scenario == lg.TestScenario.Server:
+        # inference benchmark warmup
+        if is_streaming:
+            ds.init_sut()
+            warmup_count: int = sum(
+                ds.get_num_unique_requests(  # pyre-ignore [16]
+                    warmup_ratio=warmup_ratio
+                )
+            )
+        else:
+            warmup_count: int = int(count * warmup_ratio)
+        runner.reset_states(num_queries=warmup_count)
+        final_results = {
+            "runtime": model_family.name(),
+            "version": model_family.version(),
+            "time": int(time.time()),
+            "scenario": str(scenario),
+        }
+        settings.min_query_count = warmup_count
+        settings.max_query_count = warmup_count
+        sut = lg.ConstructSUT(issue_queries, flush_queries)
+        qsl = lg.ConstructQSL(
+            warmup_count,
+            warmup_count,
+            load_query_samples,
+            ds.unload_query_samples,
+        )
+        with profiler_or_nullcontext(enabled=output_trace, with_stack=False):
+            logger.info(f"starting warmup {scenario} with {warmup_count} queries")
+            lg.StartTest(sut, qsl, settings)
+            lg.DestroyQSL(qsl)
+            lg.DestroySUT(sut)
+
+    # official run
+    if is_streaming:
+        ds.init_sut()
+    final_results = {
+        "runtime": model_family.name(),
+        "version": model_family.version(),
+        "time": int(time.time()),
+        "scenario": str(scenario),
+    }
+    query_size: int = get_num_queries(
+        input_size=num_queries,
+        one_pass_size=count,
+        scenario_name=scenario_name,
+        offline_target_qps=settings.offline_expected_qps,
+        target_duration=settings.min_duration_ms,
+    )
+    settings.min_query_count = query_size
+    settings.max_query_count = query_size
+    runner.reset_states(num_queries=query_size if not compute_eval else count)
+    sut = lg.ConstructSUT(issue_queries, flush_queries)
+    qsl = lg.ConstructQSL(
+        count,
+        count,
+        load_query_samples,
+        ds.unload_query_samples,
+    )
+    with profiler_or_nullcontext(enabled=output_trace, with_stack=False):
+        logger.info(
+            f"starting {scenario} with {query_size} queries and {query_size // count} repeats"
+        )
+        lg.StartTest(sut, qsl, settings)
+        runner.finish()
+        final_results["took"] = time.time() - ds.last_loaded
+        lg.DestroyQSL(qsl)
+        lg.DestroySUT(sut)
+
+    add_results(
+        final_results,
+        runner.result_timing,
+        runner.result_batches,
+    )
+    # If multiple subprocesses are running the model send a signal to stop them
+    if int(os.environ.get("WORLD_SIZE", 1)) > 1:
+        model_family.predict(None)
+
+
+def main() -> None:
+    set_verbose_level(1)
+    args = get_args()
+    logger.info(args)
+    gin_path = f"{os.path.dirname(__file__)}/gin/{SUPPORTED_CONFIGS[args.dataset]}"
+    gin.parse_config_file(gin_path)
+    run(dataset=args.dataset)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf b/recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf
new file mode 100644
index 000000000..a2b4f6fff
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf
@@ -0,0 +1,98 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model maybe '*' as wildcard. In that case the value applies to all models.
+# All times are in milli seconds
+
+# Set performance_sample_count for each model.
+# User can optionally set this to higher values in user.conf.
+resnet50.*.performance_sample_count_override = 1024
+ssd-mobilenet.*.performance_sample_count_override = 256
+retinanet.*.performance_sample_count_override = 64
+bert.*.performance_sample_count_override = 10833
+dlrm.*.performance_sample_count_override = 204800
+dlrm-v2.*.performance_sample_count_override = 204800
+rnnt.*.performance_sample_count_override = 2513
+gptj.*.performance_sample_count_override = 13368
+llama2-70b.*.performance_sample_count_override = 24576
+stable-diffusion-xl.*.performance_sample_count_override = 5000
+# set to 0 to let entire sample set to be performance sample
+3d-unet.*.performance_sample_count_override = 0
+
+# Set seeds. The seeds will be distributed two weeks before the submission.
+*.*.qsl_rng_seed = 3066443479025735752
+*.*.sample_index_rng_seed = 10688027786191513374
+*.*.schedule_rng_seed = 14962580496156340209
+# Set seeds for TEST_05. The seeds will be distributed two weeks before the submission.
+*.*.test05_qsl_rng_seed = 16799458546791641818
+*.*.test05_sample_index_rng_seed = 5453809927556429288
+*.*.test05_schedule_rng_seed = 5435552105434836064
+
+
+*.SingleStream.target_latency_percentile = 90
+*.SingleStream.min_duration = 600000
+
+*.MultiStream.target_latency_percentile = 99
+*.MultiStream.samples_per_query = 8
+*.MultiStream.min_duration = 600000
+*.MultiStream.min_query_count = 662
+retinanet.MultiStream.target_latency = 528
+
+# 3D-UNet uses equal issue mode because it has non-uniform inputs
+3d-unet.*.sample_concatenate_permutation = 1
+
+# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
+gptj.*.sample_concatenate_permutation = 1
+llama2-70b.*.sample_concatenate_permutation = 1
+mixtral-8x7b.*.sample_concatenate_permutation = 1
+
+*.Server.target_latency = 10
+*.Server.target_latency_percentile = 99
+*.Server.target_duration = 0
+*.Server.min_duration = 600000
+resnet50.Server.target_latency = 15
+retinanet.Server.target_latency = 100
+bert.Server.target_latency = 130
+dlrm.Server.target_latency = 60
+dlrm-v2.Server.target_latency = 60
+rnnt.Server.target_latency = 1000
+gptj.Server.target_latency = 20000
+stable-diffusion-xl.Server.target_latency = 20000
+# Llama2-70b benchmarks measures token latencies
+llama2-70b.*.use_token_latencies = 1
+mixtral-8x7b.*.use_token_latencies = 1
+# gptj benchmark infers token latencies
+gptj.*.infer_token_latencies = 1
+gptj.*.token_latency_scaling_factor = 69
+# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7B benchmark therefore target_latency = 0
+llama2-70b.Server.target_latency = 0
+llama2-70b.Server.ttft_latency = 2000
+llama2-70b.Server.tpot_latency = 200
+
+mixtral-8x7b.Server.target_latency = 0
+mixtral-8x7b.Server.ttft_latency = 2000
+mixtral-8x7b.Server.tpot_latency = 200
+
+*.Offline.target_latency_percentile = 90
+*.Offline.min_duration = 600000
+
+# In Offline scenario, we always have one query. But LoadGen maps this to
+# min_sample_count internally in Offline scenario. If the dataset size is larger
+# than 24576 we limit the min_query_count to 24576 and otherwise we use
+# the dataset size as the limit
+
+resnet50.Offline.min_query_count = 24576
+retinanet.Offline.min_query_count = 24576
+dlrm-v2.Offline.min_query_count = 24576
+bert.Offline.min_query_count = 10833
+gptj.Offline.min_query_count = 13368
+rnnt.Offline.min_query_count = 2513
+3d-unet.Offline.min_query_count = 43
+stable-diffusion-xl.Offline.min_query_count = 5000
+llama2-70b.Offline.min_query_count = 24576
+mixtral-8x7b.Offline.min_query_count = 15000
+
+# These fields should be defined and overridden by user.conf.
+*.SingleStream.target_latency = 10
+*.MultiStream.target_latency = 80
+*.Server.target_qps = 1.0
+*.Offline.target_qps = 1.0
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py
new file mode 100644
index 000000000..1c8bcd237
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py
@@ -0,0 +1,705 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""
+model_family for dlrm_v3.
+"""
+
+import copy
+import functools
+import logging
+import os
+import time
+import uuid
+from threading import Event
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.multiprocessing as mp
+import torchrec
+from generative_recommenders.dlrm_v3.checkpoint import (
+    load_nonsparse_checkpoint,
+    load_sparse_checkpoint,
+)
+from generative_recommenders.dlrm_v3.configs import HASH_SIZE_1B
+from generative_recommenders.dlrm_v3.datasets.dataset import Samples
+from generative_recommenders.dlrm_v3.inference.inference_modules import (
+    get_hstu_model,
+    HSTUSparseInferenceModule,
+    move_sparse_output_to_device,
+    set_is_inference,
+)
+from generative_recommenders.dlrm_v3.utils import Profiler
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig, SequenceEmbedding
+from pyre_extensions import none_throws
+from torch import quantization as quant
+from torchrec.distributed.quant_embedding import QuantEmbeddingCollection
+from torchrec.modules.embedding_configs import EmbeddingConfig, QuantConfig
+from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor
+from torchrec.sparse.tensor_dict import maybe_td_to_kjt
+from torchrec.test_utils import get_free_port
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class HSTUModelFamily:
+    """
+    High-level interface for the HSTU model family.
+
+    Manages both sparse (embedding) and dense (transformer) components of the
+    HSTU model, supporting distributed inference across multiple GPUs.
+
+    Args:
+        hstu_config: Configuration object for the HSTU model.
+        table_config: Dictionary of embedding table configurations.
+        output_trace: Whether to enable profiling trace output.
+        sparse_quant: Whether to quantize sparse embeddings.
+        compute_eval: Whether to compute evaluation metrics (includes labels).
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+        output_trace: bool = False,
+        sparse_quant: bool = False,
+        compute_eval: bool = False,
+    ) -> None:
+        self.hstu_config = hstu_config
+        self.table_config = table_config
+        self.sparse: ModelFamilySparseDist = ModelFamilySparseDist(
+            hstu_config=hstu_config,
+            table_config=table_config,
+            quant=sparse_quant,
+        )
+
+        assert torch.cuda.is_available(), "CUDA is required for this benchmark."
+        ngpus = torch.cuda.device_count()
+        self.world_size = int(os.environ.get("WORLD_SIZE", str(ngpus)))
+        logger.warning(f"Using {self.world_size} GPU(s)...")
+        dense_model_family_clazz = (
+            ModelFamilyDenseDist
+            if self.world_size > 1
+            else ModelFamilyDenseSingleWorker
+        )
+        self.dense: Union[ModelFamilyDenseDist, ModelFamilyDenseSingleWorker] = (
+            dense_model_family_clazz(
+                hstu_config=hstu_config,
+                table_config=table_config,
+                output_trace=output_trace,
+                compute_eval=compute_eval,
+            )
+        )
+
+    def version(self) -> str:
+        """Return the PyTorch version string."""
+        return torch.__version__
+
+    def name(self) -> str:
+        """Return the model family name identifier."""
+        return "model-family-hstu"
+
+    def load(self, model_path: str) -> None:
+        """
+        Load model checkpoints from disk.
+
+        Args:
+            model_path: Base path to the model checkpoint directory.
+        """
+        self.sparse.load(model_path=model_path)
+        self.dense.load(model_path=model_path)
+
+    def predict(
+        self, samples: Optional[Samples]
+    ) -> Optional[
+        Tuple[
+            torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], float, float
+        ]
+    ]:
+        """
+        Run inference on a batch of samples.
+
+        Processes samples through sparse embeddings, then dense forward pass.
+
+        Args:
+            samples: Input samples containing features. If None, signals shutdown.
+
+        Returns:
+            Tuple of (predictions, labels, weights, sparse_time, dense_time) or None.
+        """
+        with torch.no_grad():
+            if samples is None:
+                self.dense.predict(None, None, 0, None, 0, None)
+                return None
+            (
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+                dt_sparse,
+            ) = self.sparse.predict(samples)
+            out = self.dense.predict(
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+            )
+            (  # pyre-ignore [23]
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+                dt_dense,
+            ) = out
+            return (
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+                dt_sparse,
+                dt_dense,
+            )
+
+
+def ec_patched_forward_wo_embedding_copy(
+    ec_module: torchrec.EmbeddingCollection,
+    features: KeyedJaggedTensor,  # can also take TensorDict as input
+) -> Dict[str, JaggedTensor]:
+    """
+    Run the EmbeddingBagCollection forward pass. This method takes in a `KeyedJaggedTensor`
+    and returns a `Dict[str, JaggedTensor]`, which is the result of the individual embeddings for each feature.
+
+    Args:
+        features (KeyedJaggedTensor): KJT of form [F X B X L].
+
+    Returns:
+        Dict[str, JaggedTensor]
+    """
+    features = maybe_td_to_kjt(features, None)
+    feature_embeddings: Dict[str, JaggedTensor] = {}
+    jt_dict: Dict[str, JaggedTensor] = features.to_dict()
+    for i, emb_module in enumerate(ec_module.embeddings.values()):
+        feature_names = ec_module._feature_names[i]
+        embedding_names = ec_module._embedding_names_by_table[i]
+        for j, embedding_name in enumerate(embedding_names):
+            feature_name = feature_names[j]
+            f = jt_dict[feature_name]
+            indices = torch.clamp(f.values(), min=0, max=HASH_SIZE_1B - 1)
+            lookup = emb_module(
+                input=indices
+            )  # remove the dtype cast at https://github.com/meta-pytorch/torchrec/blob/0a2cebd5472a7edc5072b3c912ad8aaa4179b9d9/torchrec/modules/embedding_modules.py#L486
+            feature_embeddings[embedding_name] = JaggedTensor(
+                values=lookup,
+                lengths=f.lengths(),
+                weights=f.values() if ec_module._need_indices else None,
+            )
+    return feature_embeddings
+
+
+class ModelFamilySparseDist:
+    """
+    Sparse Arch module manager.
+
+    Handles loading and inference of sparse embedding lookups, optionally
+    with quantization for memory efficiency.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        table_config: Embedding table configurations.
+        quant: Whether to apply dynamic quantization to embeddings.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+        quant: bool = False,
+    ) -> None:
+        super(ModelFamilySparseDist, self).__init__()
+        self.hstu_config = hstu_config
+        self.table_config = table_config
+        self.module: Optional[torch.nn.Module] = None
+        self.quant: bool = quant
+
+    def load(self, model_path: str) -> None:
+        """
+        Load sparse model checkpoint and optionally apply quantization.
+
+        Args:
+            model_path: Path to the model checkpoint directory.
+        """
+        logger.warning(f"Loading sparse module from {model_path}")
+
+        sparse_arch: HSTUSparseInferenceModule = HSTUSparseInferenceModule(
+            table_config=self.table_config,
+            hstu_config=self.hstu_config,
+        )
+        load_sparse_checkpoint(model=sparse_arch._hstu_model, path=model_path)
+        sparse_arch.eval()
+        if self.quant:
+            self.module = quant.quantize_dynamic(
+                sparse_arch,
+                qconfig_spec={
+                    torchrec.EmbeddingCollection: QuantConfig(
+                        activation=quant.PlaceholderObserver.with_args(
+                            dtype=torch.float
+                        ),
+                        weight=quant.PlaceholderObserver.with_args(dtype=torch.int8),
+                    ),
+                },
+                mapping={
+                    torchrec.EmbeddingCollection: QuantEmbeddingCollection,
+                },
+                inplace=False,
+            )
+        else:
+            sparse_arch._hstu_model._embedding_collection.forward = (  # pyre-ignore[8]
+                functools.partial(
+                    ec_patched_forward_wo_embedding_copy,
+                    sparse_arch._hstu_model._embedding_collection,
+                )
+            )
+            self.module = sparse_arch
+        logger.warning(f"sparse module is {self.module}")
+
+    def predict(
+        self, samples: Samples
+    ) -> Tuple[
+        Dict[str, SequenceEmbedding],
+        Dict[str, torch.Tensor],
+        int,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+        float,
+    ]:
+        """
+        Run sparse forward pass (embedding lookups).
+
+        Args:
+            samples: Input samples with feature tensors.
+
+        Returns:
+            Tuple of (seq_embeddings, payload_features, max_uih_len, uih_seq_lengths,
+            max_num_candidates, num_candidates, elapsed_time).
+        """
+        with torch.profiler.record_function("sparse forward"):
+            module: torch.nn.Module = none_throws(self.module)
+            assert self.module is not None
+            uih_features = samples.uih_features_kjt
+            candidates_features = samples.candidates_features_kjt
+            t0: float = time.time()
+            (
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+            ) = module(
+                uih_features=uih_features,
+                candidates_features=candidates_features,
+            )
+            dt_sparse: float = time.time() - t0
+            return (
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+                dt_sparse,
+            )
+
+
+class ModelFamilyDenseDist:
+    """
+    Distributed dense module manager for multi-GPU inference.
+
+    Spawns worker processes for each GPU to run dense forward passes in parallel,
+    with samples distributed via inter-process queues.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        table_config: Embedding table configurations.
+        output_trace: Whether to enable profiling traces.
+        compute_eval: Whether to compute evaluation metrics.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+        output_trace: bool = False,
+        compute_eval: bool = False,
+    ) -> None:
+        super(ModelFamilyDenseDist, self).__init__()
+        self.hstu_config = hstu_config
+        self.table_config = table_config
+        self.output_trace = output_trace
+        self.compute_eval = compute_eval
+
+        ngpus = torch.cuda.device_count()
+        self.world_size = int(os.environ.get("WORLD_SIZE", str(ngpus)))
+        self.rank = 0
+        os.environ["MASTER_ADDR"] = "localhost"
+        os.environ["MASTER_PORT"] = str(get_free_port())
+        self.dist_backend = "nccl"
+
+        ctx = mp.get_context("spawn")
+        self.samples_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)]
+        self.result_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)]
+
+    def load(self, model_path: str) -> None:
+        """
+        Load dense model and spawn worker processes for distributed inference.
+
+        Args:
+            model_path: Path to the model checkpoint directory.
+        """
+        logger.warning(f"Loading dense module from {model_path}")
+
+        ctx = mp.get_context("spawn")
+        processes = []
+        for rank in range(self.world_size):
+            p = ctx.Process(
+                target=self.distributed_setup,
+                args=(
+                    rank,
+                    self.world_size,
+                    model_path,
+                ),
+            )
+            p.start()
+            processes.append(p)
+
+    def distributed_setup(self, rank: int, world_size: int, model_path: str) -> None:
+        """
+        Initialize and run a dense worker process.
+
+        Each worker loads the model, processes samples from its queue, and
+        returns results.
+
+        Args:
+            rank: Process rank (GPU index).
+            world_size: Total number of worker processes.
+            model_path: Path to model checkpoint.
+        """
+        nprocs_per_rank = 16
+        start_core: int = nprocs_per_rank * rank
+        cores: set[int] = set([start_core + i for i in range(nprocs_per_rank)])
+        os.sched_setaffinity(0, cores)
+        set_is_inference(is_inference=not self.compute_eval)
+        model = get_hstu_model(
+            table_config=self.table_config,
+            hstu_config=self.hstu_config,
+            table_device="cpu",
+            max_hash_size=100,
+            is_dense=True,
+        ).to(torch.bfloat16)
+        model.set_training_dtype(torch.bfloat16)
+        device = torch.device(f"cuda:{rank}")
+        torch.cuda.set_device(f"cuda:{rank}")
+        load_nonsparse_checkpoint(
+            model=model, device=device, optimizer=None, path=model_path
+        )
+        model = model.to(device)
+        model.eval()
+        profiler = Profiler(rank) if self.output_trace else None
+
+        with torch.no_grad():
+            while True:
+                item = self.samples_q[rank].get()
+                # If -1 is received terminate all subprocesses
+                if item == -1:
+                    break
+                if self.output_trace:
+                    assert profiler is not None
+                    profiler.step()
+                with torch.profiler.record_function("get_item_from_queue"):
+                    # Copy here to release data in the producer to avoid invalid cuda caching allocator release.
+                    item = copy.deepcopy(item)
+                    (
+                        id,
+                        seq_embeddings,
+                        payload_features,
+                        max_uih_len,
+                        uih_seq_lengths,
+                        max_num_candidates,
+                        num_candidates,
+                    ) = item
+                    assert seq_embeddings is not None
+                with torch.profiler.record_function("dense forward"):
+                    (
+                        _,
+                        _,
+                        _,
+                        mt_target_preds,
+                        mt_target_labels,
+                        mt_target_weights,
+                    ) = model.main_forward(
+                        seq_embeddings=seq_embeddings,
+                        payload_features=payload_features,
+                        max_uih_len=max_uih_len,
+                        uih_seq_lengths=uih_seq_lengths,
+                        max_num_candidates=max_num_candidates,
+                        num_candidates=num_candidates,
+                    )
+                    # mt_target_preds = torch.empty(1, 2048 * 20).to(device="cpu")
+                    # mt_target_labels = None
+                    # mt_target_weights = None
+                    assert mt_target_preds is not None
+                    mt_target_preds = mt_target_preds.detach().to(device="cpu")
+                    if mt_target_labels is not None:
+                        mt_target_labels = mt_target_labels.detach().to(device="cpu")
+                    if mt_target_weights is not None:
+                        mt_target_weights = mt_target_weights.detach().to(device="cpu")
+                    self.result_q[rank].put(
+                        (id, mt_target_preds, mt_target_labels, mt_target_weights)
+                    )
+
+    def capture_output(
+        self, id: uuid.UUID, rank: int
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        """
+        Retrieve inference results from a worker process.
+
+        Args:
+            id: Unique identifier for the request.
+            rank: Worker rank to retrieve from.
+
+        Returns:
+            Tuple of (predictions, labels, weights).
+        """
+        while True:
+            recv_id, preds, labels, weights = self.result_q[rank].get()
+            assert recv_id == id
+            return preds, labels, weights
+
+    def get_rank(self) -> int:
+        """
+        Get the next worker rank for load balancing.
+
+        Returns:
+            Rank index, cycling through available workers.
+        """
+        rank = self.rank
+        self.rank = (self.rank + 1) % self.world_size
+        return rank
+
+    def predict(
+        self,
+        seq_embeddings: Optional[Dict[str, SequenceEmbedding]],
+        payload_features: Optional[Dict[str, torch.Tensor]],
+        max_uih_len: int,
+        uih_seq_lengths: Optional[torch.Tensor],
+        max_num_candidates: int,
+        num_candidates: Optional[torch.Tensor],
+    ) -> Optional[
+        Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], float]
+    ]:
+        """
+        Run distributed dense forward pass.
+
+        Dispatches work to a worker process and collects results.
+
+        Args:
+            seq_embeddings: Sequence embeddings from sparse module.
+            payload_features: Additional feature tensors.
+            max_uih_len: Maximum UIH sequence length.
+            uih_seq_lengths: Per-sample UIH lengths.
+            max_num_candidates: Maximum candidates per sample.
+            num_candidates: Per-sample candidate counts.
+
+        Returns:
+            Tuple of (predictions, labels, weights, elapsed_time) or None if shutdown.
+        """
+        id = uuid.uuid4()
+        # If none is received terminate all subprocesses
+        if seq_embeddings is None:
+            for rank in range(self.world_size):
+                self.samples_q[rank].put(-1)
+            return None
+        rank = self.get_rank()
+        device = torch.device(f"cuda:{rank}")
+        assert (
+            payload_features is not None
+            and num_candidates is not None
+            and uih_seq_lengths is not None
+        )
+        t0: float = time.time()
+        seq_embeddings, payload_features, uih_seq_lengths, num_candidates = (
+            move_sparse_output_to_device(
+                seq_embeddings=seq_embeddings,
+                payload_features=payload_features,
+                uih_seq_lengths=uih_seq_lengths,
+                num_candidates=num_candidates,
+                device=device,
+            )
+        )
+        self.samples_q[rank].put(
+            (
+                id,
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+            )
+        )
+        (mt_target_preds, mt_target_labels, mt_target_weights) = self.capture_output(
+            id, rank
+        )
+        dt_dense = time.time() - t0
+        return (
+            mt_target_preds,
+            mt_target_labels,
+            mt_target_weights,
+            dt_dense,
+        )
+
+
+class ModelFamilyDenseSingleWorker:
+    """
+    Single-worker dense module manager for single-GPU inference.
+
+    Simpler alternative to ModelFamilyDenseDist for single-GPU setups.
+
+    Args:
+        hstu_config: HSTU model configuration.
+        table_config: Embedding table configurations.
+        output_trace: Whether to enable profiling traces.
+        compute_eval: Whether to compute evaluation metrics.
+    """
+
+    def __init__(
+        self,
+        hstu_config: DlrmHSTUConfig,
+        table_config: Dict[str, EmbeddingConfig],
+        output_trace: bool = False,
+        compute_eval: bool = False,
+    ) -> None:
+        self.model: Optional[torch.nn.Module] = None
+        self.hstu_config = hstu_config
+        self.table_config = table_config
+        self.output_trace = output_trace
+        self.device: torch.device = torch.device("cuda:0")
+        torch.cuda.set_device(self.device)
+        self.profiler: Optional[Profiler] = (
+            Profiler(rank=0) if self.output_trace else None
+        )
+
+    def load(self, model_path: str) -> None:
+        """
+        Load dense model for single-GPU inference.
+
+        Args:
+            model_path: Path to the model checkpoint directory.
+        """
+        logger.warning(f"Loading dense module from {model_path}")
+        self.model = (
+            get_hstu_model(
+                table_config=self.table_config,
+                hstu_config=self.hstu_config,
+                table_device="cpu",
+                is_dense=True,
+            )
+            .to(self.device)
+            .to(torch.bfloat16)
+        )
+        self.model.set_training_dtype(torch.bfloat16)
+        load_nonsparse_checkpoint(
+            model=self.model, device=self.device, optimizer=None, path=model_path
+        )
+        assert self.model is not None
+        self.model.eval()
+
+    def predict(
+        self,
+        seq_embeddings: Optional[Dict[str, SequenceEmbedding]],
+        payload_features: Optional[Dict[str, torch.Tensor]],
+        max_uih_len: int,
+        uih_seq_lengths: Optional[torch.Tensor],
+        max_num_candidates: int,
+        num_candidates: Optional[torch.Tensor],
+    ) -> Optional[
+        Tuple[
+            torch.Tensor,
+            Optional[torch.Tensor],
+            Optional[torch.Tensor],
+            float,
+        ]
+    ]:
+        """
+        Run dense forward pass on single GPU.
+
+        Args:
+            seq_embeddings: Sequence embeddings from sparse module.
+            payload_features: Additional feature tensors.
+            max_uih_len: Maximum UIH sequence length.
+            uih_seq_lengths: Per-sample UIH lengths.
+            max_num_candidates: Maximum candidates per sample.
+            num_candidates: Per-sample candidate counts.
+
+        Returns:
+            Tuple of (predictions, labels, weights, elapsed_time).
+        """
+        if self.output_trace:
+            assert self.profiler is not None
+            self.profiler.step()
+        assert (
+            payload_features is not None
+            and uih_seq_lengths is not None
+            and num_candidates is not None
+            and seq_embeddings is not None
+        )
+        t0: float = time.time()
+        with torch.profiler.record_function("dense forward"):
+            seq_embeddings, payload_features, uih_seq_lengths, num_candidates = (
+                move_sparse_output_to_device(
+                    seq_embeddings=seq_embeddings,
+                    payload_features=payload_features,
+                    uih_seq_lengths=uih_seq_lengths,
+                    num_candidates=num_candidates,
+                    device=self.device,
+                )
+            )
+            assert self.model is not None
+            (
+                _,
+                _,
+                _,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = self.model.main_forward(  # pyre-ignore [29]
+                seq_embeddings=seq_embeddings,
+                payload_features=payload_features,
+                max_uih_len=max_uih_len,
+                uih_seq_lengths=uih_seq_lengths,
+                max_num_candidates=max_num_candidates,
+                num_candidates=num_candidates,
+            )
+            assert mt_target_preds is not None
+            mt_target_preds = mt_target_preds.detach().to(device="cpu")
+            if mt_target_labels is not None:
+                mt_target_labels = mt_target_labels.detach().to(device="cpu")
+            if mt_target_weights is not None:
+                mt_target_weights = mt_target_weights.detach().to(device="cpu")
+            dt_dense: float = time.time() - t0
+            return mt_target_preds, mt_target_labels, mt_target_weights, dt_dense
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py
new file mode 100644
index 000000000..e3ec10415
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py
@@ -0,0 +1,106 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+TorchScript-friendly wrapper for the HSTU sparse path (CPU embedding lookup).
+
+``HSTUSparseScriptModule`` wraps :class:`HSTUSparseInferenceModule` and
+flattens the ``Dict[str, SequenceEmbedding]`` output into the parallel
+value/length dicts defined in :mod:`ts_types` so the boundary is composed
+entirely of TorchScript-supported types.
+"""
+
+from typing import Dict, Tuple
+
+import torch
+from generative_recommenders.dlrm_v3.inference.inference_modules import (
+    _NoCopyEmbeddingCollection,
+    HSTUSparseInferenceModule,
+)
+from generative_recommenders.dlrm_v3.inference.ts_types import (
+    flatten_seq_embeddings,
+    SeqEmbLengths,
+    SeqEmbValues,
+)
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
+from torchrec.modules.embedding_configs import EmbeddingConfig
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+
+class HSTUSparseScriptModule(torch.nn.Module):
+    """Script-friendly sparse module.
+
+    ``forward`` returns 5 tensors / dicts (no Python ``int`` scalars):
+
+    1. ``seq_emb_values``   ``Dict[str, Tensor]`` -- jagged embedding values.
+    2. ``seq_emb_lengths``  ``Dict[str, Tensor]`` -- per-feature lengths.
+    3. ``payload_features`` ``Dict[str, Tensor]`` -- side features.
+    4. ``uih_seq_lengths``  ``Tensor[B]``        -- UIH lengths.
+    5. ``num_candidates``   ``Tensor[B]``        -- candidate counts.
+
+    The dense module (or the C++ glue) recovers the ``int`` ``max_uih_len`` /
+    ``max_num_candidates`` values from these tensors via ``.max().item()``.
+    """
+
+    def __init__(
+        self,
+        table_config: Dict[str, EmbeddingConfig],
+        hstu_config: DlrmHSTUConfig,
+        use_no_copy_embedding_collection: bool = True,
+    ) -> None:
+        super().__init__()
+        self._sparse: HSTUSparseInferenceModule = HSTUSparseInferenceModule(
+            table_config=table_config,
+            hstu_config=hstu_config,
+        )
+        if use_no_copy_embedding_collection:
+            # Re-class the existing EmbeddingCollection so TorchScript picks up
+            # the no-copy ``forward`` override (matches the eager-only
+            # ``ec_patched_forward_wo_embedding_copy`` monkey-patch).
+            self._sparse._hstu_model._embedding_collection.__class__ = (
+                _NoCopyEmbeddingCollection
+            )
+
+    def forward(
+        self,
+        uih_features: KeyedJaggedTensor,
+        candidates_features: KeyedJaggedTensor,
+    ) -> Tuple[
+        SeqEmbValues,
+        SeqEmbLengths,
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        (
+            seq_embeddings,
+            payload_features,
+            _max_uih_len,
+            uih_seq_lengths,
+            _max_num_candidates,
+            num_candidates,
+        ) = self._sparse(
+            uih_features=uih_features,
+            candidates_features=candidates_features,
+        )
+        seq_emb_values, seq_emb_lengths = flatten_seq_embeddings(seq_embeddings)
+        return (
+            seq_emb_values,
+            seq_emb_lengths,
+            payload_features,
+            uih_seq_lengths,
+            num_candidates,
+        )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py
new file mode 100644
index 000000000..948f10618
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py
@@ -0,0 +1,39 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.dlrm_v3.inference.main import main
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class DLRMV3InferenceTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    @given(
+        world_size=st.sampled_from([1]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+        deadline=None,
+    )
+    def test_e2e(self, world_size: int) -> None:
+        os.environ["WORLD_SIZE"] = str(world_size)
+        main()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py
new file mode 100644
index 000000000..34d0388ea
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py
@@ -0,0 +1,236 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+Numerical parity test: eager HSTU vs traced (sparse + dense) on a synthetic
+batch.
+
+The production deployment path (see ``end_to_end_test.py``) uses
+``torch.jit.trace``, not ``torch.jit.script``, for the HSTU sparse/dense
+wrappers. Tracing records the actual tensor ops executed during a forward
+pass and ignores source-level dispatch logic (HammerKernel enum,
+``is_fx_tracing()``, ``torch.autocast``, IntEnum branches) that scripting
+cannot compile. This unit test mirrors that path.
+
+Tolerances are deliberately loose because the traced path replaces the
+Triton fused kernels with PyTorch fallbacks and skips ``torch.autocast`` in
+the user-forward block; both can perturb low-order bits in bf16.
+"""
+
+import unittest
+from typing import Dict, List, Tuple
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel
+from generative_recommenders.dlrm_v3.configs import (
+    get_embedding_table_config,
+    get_hstu_configs,
+)
+from generative_recommenders.dlrm_v3.datasets.dataset import get_random_data
+from generative_recommenders.dlrm_v3.inference.dense_predict_module import (
+    HSTUDenseScriptModule,
+)
+from generative_recommenders.dlrm_v3.inference.sparse_predict_module import (
+    HSTUSparseScriptModule,
+)
+from generative_recommenders.dlrm_v3.inference.ts_types import (
+    SeqEmbLengths,
+    SeqEmbValues,
+)
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+
+_DATASET = "kuairand-1k"
+
+
+def _move_dense_inputs(
+    seq_emb_values: Dict[str, torch.Tensor],
+    seq_emb_lengths: Dict[str, torch.Tensor],
+    payload_features: Dict[str, torch.Tensor],
+    uih_seq_lengths: torch.Tensor,
+    num_candidates: torch.Tensor,
+    device: torch.device,
+) -> Tuple[
+    Dict[str, torch.Tensor],
+    Dict[str, torch.Tensor],
+    Dict[str, torch.Tensor],
+    torch.Tensor,
+    torch.Tensor,
+]:
+    """C++-side ``move_sparse_output_to_device`` analog for the test."""
+    return (
+        {k: v.to(device).to(torch.bfloat16) for k, v in seq_emb_values.items()},
+        {k: v.to(device) for k, v in seq_emb_lengths.items()},
+        {k: v.to(device) for k, v in payload_features.items()},
+        uih_seq_lengths.to(device),
+        num_candidates.to(device),
+    )
+
+
+class _SparseTraceShim(torch.nn.Module):
+    """Adapter that takes raw tensors and rebuilds the KJTs inside forward.
+
+    ``torch.jit.trace`` does not accept ``KeyedJaggedTensor`` (or any
+    non-Tensor / non-collection-of-Tensor type) as a top-level forward
+    input, so we make the traced boundary tensor-only and bake the
+    ``List[str]`` of feature keys in as module attributes.
+    """
+
+    def __init__(
+        self,
+        sparse_module: HSTUSparseScriptModule,
+        uih_keys: List[str],
+        candidates_keys: List[str],
+    ) -> None:
+        super().__init__()
+        self._sparse_module: HSTUSparseScriptModule = sparse_module
+        self._uih_keys: List[str] = uih_keys
+        self._candidates_keys: List[str] = candidates_keys
+
+    def forward(
+        self,
+        uih_lengths: torch.Tensor,
+        uih_values: torch.Tensor,
+        candidates_lengths: torch.Tensor,
+        candidates_values: torch.Tensor,
+    ) -> Tuple[
+        SeqEmbValues,
+        SeqEmbLengths,
+        Dict[str, torch.Tensor],
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        uih_kjt = KeyedJaggedTensor(
+            keys=self._uih_keys,
+            lengths=uih_lengths,
+            values=uih_values,
+        )
+        candidates_kjt = KeyedJaggedTensor(
+            keys=self._candidates_keys,
+            lengths=candidates_lengths,
+            values=candidates_values,
+        )
+        return self._sparse_module(
+            uih_features=uih_kjt, candidates_features=candidates_kjt
+        )
+
+
+class HSTUScriptedParityTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_scripted_matches_eager(self) -> None:
+        torch.manual_seed(0)
+        device = torch.device("cuda:0")
+        torch.cuda.set_device(device)
+
+        hstu_config = get_hstu_configs(_DATASET)
+        table_config = get_embedding_table_config(_DATASET)
+
+        # Some embedding tables in kuairand-1k are tiny (e.g.
+        # user_active_degree has num_embeddings=8). Clamp the random value
+        # range so every index stays in range for every table; otherwise the
+        # default value_bound=1000 triggers an out-of-range embedding lookup.
+        min_rows = min(t.num_embeddings for t in table_config.values())
+        value_bound = max(2, min_rows)
+
+        uih_kjt, candidates_kjt = get_random_data(
+            contexual_features=list(
+                hstu_config.contextual_feature_to_max_length.keys()
+            ),
+            hstu_uih_keys=hstu_config.hstu_uih_feature_names,
+            hstu_candidates_keys=hstu_config.hstu_candidate_feature_names,
+            uih_max_seq_len=128,
+            max_num_candidates=hstu_config.max_num_candidates_inference,
+            value_bound=value_bound,
+        )
+
+        sparse_module = HSTUSparseScriptModule(
+            table_config=table_config,
+            hstu_config=hstu_config,
+            use_no_copy_embedding_collection=True,
+        ).eval()
+        dense_module = (
+            HSTUDenseScriptModule(
+                hstu_config=hstu_config,
+                table_config=table_config,
+            )
+            .to(torch.bfloat16)
+            .to(device)
+            .eval()
+        )
+
+        # Pin the HammerKernel to PyTorch on both wrappers. The Triton
+        # kernels use Python-level dispatch (autotune, constexpr arguments)
+        # that interacts badly with torch.jit.trace's recording pass. The
+        # eager reference run uses the same setting so the comparison is
+        # apples-to-apples.
+        sparse_module._sparse._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH)
+        dense_module._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH)
+
+        # === Eager reference path ===
+        with torch.no_grad():
+            sparse_out_e = sparse_module(
+                uih_features=uih_kjt, candidates_features=candidates_kjt
+            )
+            dense_inputs_e = _move_dense_inputs(*sparse_out_e, device=device)
+            preds_eager = dense_module(*dense_inputs_e)
+
+        # === Traced path ===
+        # Sparse is traced via a raw-tensor shim because KJT is not a valid
+        # traced input. Dense is traced directly with the eager sparse
+        # output as the example.
+        sparse_shim = _SparseTraceShim(
+            sparse_module=sparse_module,
+            uih_keys=list(uih_kjt.keys()),
+            candidates_keys=list(candidates_kjt.keys()),
+        )
+        traced_sparse = torch.jit.trace(
+            sparse_shim,
+            example_inputs=(
+                uih_kjt.lengths(),
+                uih_kjt.values(),
+                candidates_kjt.lengths(),
+                candidates_kjt.values(),
+            ),
+            strict=False,
+            check_trace=False,
+        )
+        traced_dense = torch.jit.trace(
+            dense_module,
+            example_inputs=tuple(dense_inputs_e),
+            strict=False,
+            check_trace=False,
+        )
+
+        with torch.no_grad():
+            sparse_out_t = traced_sparse(
+                uih_kjt.lengths(),
+                uih_kjt.values(),
+                candidates_kjt.lengths(),
+                candidates_kjt.values(),
+            )
+            dense_inputs_t = _move_dense_inputs(*sparse_out_t, device=device)
+            preds_traced = traced_dense(*dense_inputs_t)
+
+        torch.testing.assert_close(
+            preds_eager.float(),
+            preds_traced.float(),
+            atol=1e-2,
+            rtol=1e-2,
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format
new file mode 100644
index 000000000..f08c9c2c8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format
@@ -0,0 +1,2 @@
+BasedOnStyle: Google
+Standard: Cpp11
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt
new file mode 100644
index 000000000..4fec0e44f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt
@@ -0,0 +1,113 @@
+cmake_minimum_required(VERSION 3.12)
+
+project(mlperf_loadgen)
+
+# Read the version file
+file(READ "${CMAKE_SOURCE_DIR}/VERSION.txt" VERSION_CONTENTS)
+
+# Extract the major, minor, and patch versions from the VERSION file (assuming "MAJOR.MINOR.PATCH" format)
+string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)" VERSION_MATCH ${VERSION_CONTENTS})
+
+# Set the variables for the major, minor, and patch versions
+set(mlperf_loadgen_VERSION_MAJOR "${CMAKE_MATCH_1}")
+set(mlperf_loadgen_VERSION_MINOR "${CMAKE_MATCH_2}")
+set(mlperf_loadgen_VERSION_PATCH "${CMAKE_MATCH_3}")
+
+# Check if the version format was parsed correctly
+if(NOT DEFINED mlperf_loadgen_VERSION_MAJOR OR NOT DEFINED mlperf_loadgen_VERSION_MINOR OR NOT DEFINED mlperf_loadgen_VERSION_PATCH)
+    message(FATAL_ERROR "Version format in VERSION.txt is incorrect. Expected format: MAJOR.MINOR.PATCH")
+endif()
+
+# Print out the version
+message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}.${mlperf_loadgen_VERSION_PATCH}")
+
+# Set build options. NB: CXX_STANDARD is supported since CMake 3.1.
+if (NOT MSVC)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -W -Wall")
+endif()
+# Extra build options can be specified by setting the MLPERF_LOADGEN_CXX_FLAGS variable
+if (MLPERF_LOADGEN_CXX_FLAGS)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MLPERF_LOADGEN_CXX_FLAGS}")
+endif()
+message(STATUS "Using C++ compiler flags: ${CMAKE_CXX_FLAGS}")
+set(CMAKE_CXX_STANDARD "14")
+message(STATUS "Using C++ standard: ${CMAKE_CXX_STANDARD}")
+message(STATUS "Using static linker flags: ${CMAKE_STATIC_LINKER_FLAGS}")
+message(STATUS "Using shared linker flags: ${CMAKE_SHARED_LINKER_FLAGS}")
+
+# Output directory for libraries.
+set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+message(STATUS "Using output path: ${LIBRARY_OUTPUT_PATH}")
+
+# Detect Python to use for generating source file with version info.
+# NB: PythonInterp has been deprecated since CMake 3.12
+# but it works with earlier versions of CMake.
+find_package(PythonInterp)
+message(STATUS "Using Python interpreter: ${PYTHON_EXECUTABLE}")
+
+# Specify the source and destination files
+set(CONF_FILE "mlperf.conf")
+set(HEADER_FILE "mlperf_conf.h")
+
+# Read the content of the configuration file
+file(READ ${CONF_FILE} CONF_CONTENTS)
+
+# Escape all double quotes and backslashes
+string(REPLACE "\\" "\\\\" CONF_CONTENTS "${CONF_CONTENTS}")
+string(REPLACE "\"" "\\\"" CONF_CONTENTS "${CONF_CONTENTS}")
+
+# Handle new lines
+string(REPLACE "\n" "\\n\"\n\"" CONF_CONTENTS "${CONF_CONTENTS}")
+
+# Wrap the content in a C++ string declaration
+set(FORMATTED_CONTENT "const char* mlperf_conf =\n\"${CONF_CONTENTS}\";\n")
+
+# Write the formatted content to the header file
+file(WRITE ${HEADER_FILE} "${FORMATTED_CONTENT}")
+
+message(STATUS "Output config:  ${CMAKE_BINARY_DIR}/mlperf_conf.h")
+
+# Generate source file with version info.
+execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/version_generator.py ${CMAKE_BINARY_DIR}/version_generated.cc ${CMAKE_CURRENT_SOURCE_DIR})
+
+# Add source files.
+set(SOURCE
+  ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/early_stopping.cc  
+  ${CMAKE_CURRENT_SOURCE_DIR}/issue_query_controller.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/loadgen.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/logging.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/logging.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/utils.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/results.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/results.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/version.cc
+  ${CMAKE_CURRENT_SOURCE_DIR}/version.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/mlperf_conf.h
+  ${CMAKE_CURRENT_SOURCE_DIR}/VERSION.txt
+  ${CMAKE_BINARY_DIR}/version_generated.cc
+)
+
+include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+
+add_library(mlperf_loadgen STATIC ${SOURCE})
+target_link_libraries(mlperf_loadgen)
+
+if(WIN32)
+set (LIBS "")
+else()
+set (LIBS pthread)
+endif()
+
+add_executable(benchmark benchmark/repro.cpp)
+target_link_libraries(benchmark PUBLIC mlperf_loadgen ${LIBS})
+
+# Install library and headers.
+install(TARGETS mlperf_loadgen
+	DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/
+	DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h")
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in
new file mode 100644
index 000000000..152b53111
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in
@@ -0,0 +1,2 @@
+include VERSION.txt
+include mlperf.conf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md
new file mode 100644
index 000000000..212c8a53c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md
@@ -0,0 +1,223 @@
+# Overview {#mainpage}
+
+## Introduction
+
+* The LoadGen is a *reusable* module that *efficiently* and *fairly* measures
+  the performance of inference systems.
+* It generates traffic for scenarios as formulated by a diverse set of experts
+  in the [MLCommons working group](https://mlcommons.org/).
+* The scenarios emulate the workloads seen in mobile devices,
+  autonomous vehicles, robotics, and cloud-based setups.
+* Although the LoadGen is not model or dataset aware, its strength is in its
+  reusability with logic that is.
+
+## Integration Example and Flow
+The following is an diagram of how the LoadGen can be integrated into an
+inference system, resembling how some of the MLPerf reference models are
+implemented.
+<div style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
+<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/loadgen_integration_diagram.svg" width="500px" style="padding: 20px">
+<ol style="padding: 20px">
+<li>Benchmark knows the model, dataset, and preprocessing.</li>
+<li>Benchmark hands dataset sample IDs to LoadGen.</li>
+<li>LoadGen starts generating queries of sample IDs.</li>
+<li>Benchmark creates requests to backend.</li>
+<li>Result is post processed and forwarded to LoadGen.</li>
+<li>LoadGen outputs logs for analysis.<br>
+</ol>
+</div>
+
+## Useful Links
+* [FAQ](README_FAQ.md)
+* [LoadGen Build Instructions](README_BUILD.md)
+* [LoadGen API](loadgen.h)
+* [Test Settings](test_settings.h) -
+  A good description of available scenarios, modes, and knobs.
+* [MLPerf Inference Code](https://github.com/mlcommons/inference) -
+  Includes source for the LoadGen and reference models that use the LoadGen.
+* [MLPerf Inference Rules](https://github.com/mlcommons/inference_policies) -
+  Any mismatch with this is a bug in the LoadGen.
+
+## Scope of the LoadGen's Responsibilities
+
+### In Scope
+* **Provide a reusable** C++ library with python bindings.
+* **Implement** the traffic patterns of the MLPerf Inference scenarios and
+  modes.
+* **Record** all traffic generated and received for later analysis and
+  verification.
+* **Summarize** the results and whether performance constraints were met.
+* **Target high-performance** systems with efficient multi-thread friendly
+  logging utilities.
+* **Generate trust** via a shared, well-tested, and community-hardened
+  code base.
+
+### Out of Scope
+The LoadGen is:
+* **NOT** aware of the ML model it is running against.
+* **NOT** aware of the data formats of the model's inputs and outputs.
+* **NOT** aware of how to score the accuracy of a model's outputs.
+* **NOT** aware of MLPerf rules regarding scenario-specific constraints.
+
+Limitting the scope of the LoadGen in this way keeps it reusable across
+different models and datasets without modification. Using composition and
+dependency injection, the user can define their own model, datasets, and
+metrics.
+
+Additionally, not hardcoding MLPerf-specific test constraints, like test
+duration and performance targets, allows users to use the LoadGen unmodified
+for custom testing and continuous integration purposes.
+
+## Submission Considerations
+
+### Upstream all local modifications
+* As a rule, no local modifications to the LoadGen's C++ library are allowed
+for submission.
+* Please upstream early and often to keep the playing field level.
+
+### Choose your TestSettings carefully!
+* Since the LoadGen is oblivious to the model, it can't enforce the MLPerf
+requirements for submission. *e.g.:* target percentiles and latencies.
+* For verification, the values in TestSettings are logged.
+* To help make sure your settings are spec compliant, use
+TestSettings::FromConfig in conjunction with the relevant config file provided
+with the reference models.
+
+## Responsibilities of a LoadGen User
+
+### Implement the Interfaces
+* Implement the SystemUnderTest and QuerySampleLibrary interfaces and pass
+  them to the StartTest function.
+* Call QuerySampleComplete for every sample received by
+  SystemUnderTest::IssueQuery.
+
+### Assess Accuracy
+* Process the *mlperf_log_accuracy.json* output by the LoadGen to determine
+  the accuracy of your system.
+* For the official models, Python scripts will be provided by the MLPerf model
+  owners for you to do this automatically.
+
+For templates of how to do the above in detail, refer to code for the demos,
+tests, and reference models.
+
+
+## LoadGen over the Network
+
+For reference, on a high level a submission looks like this:
+
+<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
+<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_submission.png" width="300px" style="padding: 20px">
+</div>
+
+The LoadGen implementation is common to all submissions, while the QSL (“Query Sample Library”) and SUT (“System Under Test”) are implemented by submitters. QSL is responsible for loading the data and includes untimed preprocessing.
+
+A submission over the network introduces a new component “QDL” (query dispatch library) that is added to the system as presented in the following diagram:
+
+<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
+<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_network_submission.png" width="300px" style="padding: 20px">
+</div>
+
+QDL is a proxy for a load-balancer, that dispatches queries to SUT over a physical network, receives the responses and passes them back to LoadGen.  It is implemented by the submitter. The interface of the QDL is the same as the API to SUT. 
+
+In scenarios using QDL, data may be compressed in QSL at the choice of the submitter in order to reduce network transmission time. Decompression is part of the timed processing in SUT. A set of approved standard compression schemes will be specified for each benchmark; additional compression schemes must be approved in advance by the Working Group.
+
+All communication between LoadGen/QSL and SUT is via QDL, and all communication between QDL and SUT must pass over a physical network.
+
+QDL implements the protocol to transmit queries over the network and receive responses. It also implements decompression of any response returned by the SUT, where compression of responses is allowed. Performing any part of the timed preprocessing or inference in QDL is specifically disallowed. Currently no batching is allowed in QDL, although this may be revisited in future.
+
+The MLperf over the Network will run in Server mode and Offline mode. All LoadGen modes are expected to work as is with insignificant changes. These include running the test in performance mode, accuracy mode, find peak performance mode and compliance mode. The same applies for power measurements.
+
+### QDL details
+The Query Dispatch Library is implemented by the submitter and interfaces with LoadGen using the same SUT API. All MLPerf Inference SUTs implement the `mlperf::SystemUnderTest` class which is defined in system_under_test.h. The QDL implements `mlperf::QueryDispatchLibrary` class which inherits the `mlperf::SystemUnderTest` class and has the same API and support all existing `mlperf::SystemUnderTest` methods. It has a separate header file query_dispatch_library.h. Using sut with `mlperf::SystemUnderTest` class in LoadGen StartTest is natively upcasting `mlperf::QueryDispatchLibrary` class.
+
+#### QDL Query issue and response over the network
+
+The QDL gets the queries from the LoadGen through 
+```CPP
+void IssueQuery(const std::vector<QuerySample>& samples)
+```
+
+The QDL dispatches the queries to the SUT over the physical media. The exact method and implementation for it are submitter specific and would not be specified at MLCommons. Submitter implementation includes all methods required to serialize the query, load balance, drive it to the Operating system and network interface card and send to the SUT.
+
+The QDL receives the query responses over the network from the SUT. The exact method and implementation for it are submitter specific and would not be specified at MLCommons. The submitter implementation includes all methods required to receive the network data from the Network Interface card, go through the Operating system, deserialize the query response, and provide it back to the LoadGen through query completion by:
+
+```CPP
+struct QuerySampleResponse {
+  ResponseId id;
+  uintptr_t data;
+  size_t size;
+};
+void QuerySamplesComplete(QuerySampleResponse* responses, 
+                          size_t response_count);
+
+```
+
+#### QDL Additional Methods
+
+In addition to that the QDL needs to implement the following methods that are provided by the SUT interface to the LoadGen:
+```CPP
+const std::string& Name();
+```
+The `Name` function returns a known string for over the Network SUTs to identify it as over the network benchmark.
+```CPP
+void FlushQueries();
+```
+
+It is not specified here how the QDL would query and configure the SUT to execute the above methods. The QDL responds to the LoadGen after receiving its own response from the SUT.
+
+### Example
+
+Refer to [LON demo](demos/lon) for a reference example illustrating usage of Loadgen over the network.
+
+## Find Peak Performance Mode
+
+The Find Peak Performance mode can be used to find the optimal queries per second (QPS) for the server scenario.
+
+### Setup
+
+You can setup loadgen to run this mode by setting the `mode` variable in the `test_settings` used to run the test. Using the Python API:
+
+```python
+settings = mlperf_loadgen.TestSettings()
+settings.server_target_qps = 100
+settings.scenario = mlperf_loadgen.TestScenario.Server
+settings.mode = mlperf_loadgen.TestMode.FindPeakPerformance
+...
+
+mlperf_loadgen.StartTest(sut, qsl, settings)
+```
+
+Using the C/C++ API:
+```CPP
+mlperf::TestSettings settings;
+setting.server_target_qps = 100;
+settings.scenario = mlperf::TestScenario::Server;
+settings.mode = mlperf::TestMode::FindPeakPerformance;
+mlperf::LogSettings log_settings;
+/*
+Construct QSL and SUT
+*/
+mlperf::StartTest(&sut, &qsl, settings, log_settings);
+```
+
+**Note:** Make sure you are setting the TestScenario to server and you are providing an initial target QPS.
+
+### Description
+
+The Find Peak Performance mode works by finding a lower and upper boundary for the optimal QPS. Then performing a binary search between the lower and upper bound to find the optimal QPS.
+
+#### Finding lower and upper boundary
+
+LoadGen begins by running performance mode at the specified target QPS. If the test passes, this value is used as the lower bound; otherwise, an error is raised. The algorithm then guesses the upper bound as twice the target QPS. 
+
+Then LoadGen will run performance mode using the upper bound guess. If the test is successful, both the lower bound and upper bound will be doubled. This repeats until the upper bound guess fails the test.
+
+```
+[initial_target_qps, 2*initial_target_qps] -> [2*initial_target_qps, 4*initial_target_qps] -> [4*initial_target_qps, 8*initial_target_qps]...
+```
+
+Finally, the final lower bound and upper bound are set to their current values. This process assures that the lower bound passes the performance mode, but the upper bound doesn’t.
+
+#### Binary Search
+
+Once the lower and upper bounds are set, binary search can be performed over the range `[lower, upper]`` to find the optimal QPS. If a given QPS fails in performance mode, the optimal value lies below it; if it passes, the optimal is higher.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md
new file mode 100644
index 000000000..499cc360a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md
@@ -0,0 +1,47 @@
+# Building the LoadGen {#ReadmeBuild}
+
+## Prerequisites
+
+    sudo apt-get install libglib2.0-dev python-pip python3-pip
+    pip2 install absl-py numpy
+    pip3 install absl-py numpy
+
+## Quick Start
+### Installation - Python
+
+    pip install absl-py numpy
+    git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
+    cd mlperf_inference/loadgen
+    CFLAGS="-std=c++14 -O3" python -m pip install .
+
+This will fetch the loadgen source, build and install the loadgen as a python module, and run a simple end-to-end demo.
+
+Alternatively, we provide wheels for several python versions and operating system that can be installed using pip directly.
+
+    pip install mlperf-loadgen
+
+**NOTE:** Take into account that we only update the published wheels after an official release, they may not include the latest changes.
+
+### Testing your Installation
+The following command will run a simple end-to-end demo:
+
+    python mlperf_inference/loadgen/demos/py_demo_single_stream.py
+
+A summary of the test results can be found in the *"mlperf_log_summary.txt"* logfile.
+
+For a timeline visualization of what happened during the test, open the *"mlperf_log_trace.json"* file in Chrome:
+* Type “chrome://tracing” in the address bar, then drag-n-drop the json.
+* This may be useful for SUT performance tuning and understanding + debugging the loadgen.
+
+### Installation - C++
+To build the loadgen as a C++ library, rather than a python module:
+
+    git clone https://github.com/mlcommons/inference.git mlperf_inference
+    cd mlperf_inference
+    mkdir loadgen/build/ && cd loadgen/build/
+    cmake .. && cmake --build .
+    cp libmlperf_loadgen.a ..
+
+## Quick start: Loadgen Over the Network
+
+Refer to [LON demo](demos/lon/README.md) for a basic example.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md
new file mode 100644
index 000000000..ab4e0c75d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md
@@ -0,0 +1,78 @@
+# LoadGen FAQ {#ReadmeFAQ}
+
+## Q: The LoadGen does not match the MLPerf specification. Who is right?
+**A:**
+The MLPerf spec is *always* right.
+Please file a LoadGen bug so it may be resolved.
+
+## Q: How can I file a bug?
+**A:**
+On GitHub: https://github.com/mlcommons/inference/issues/new
+
+## Q: Can I make local modifications to the LoadGen for submission?
+**A:**
+No. To keep the playing field level, please upstream any local
+modificiations you need to make. Ideally upstream such changes behind a runtime
+flag or via an abstract interface the client can implement. This will help
+with testability.
+
+## Q: Where can I find the results of a test?
+**A:**
+By default, the loadgen will output an *mlperf_log_summary.txt* file
+that summarizes the target metrics and constraints of the test, along with
+other stats about the run.
+
+*Note:* LogSettings also has a flag to forward the results to stdout and
+there's an outstanding TODO to make this more programmable.
+
+## Q: The reference implementation for \<*some_model*\> prints out results of its own. Are those for submission?
+**A:**
+They are not. The LoadGen results are the ground truth for submission
+results since they will work even for systems that forgo the python bindings.
+If you notice a bug in the LoadGen's results, please file a bug or submit a
+patch.
+
+## Q: I'm getting linker errors for LoadgenVersion definitions. Where is *version_generated.cc*?
+**A:**
+If you have a custom build setup, make sure you run the *version_generator.py*
+script, which will create the cc file you are looking for. The official build
+files that come with the LoadGen do this for you out of the box.
+
+## Q: What is this *version_generator.py* script?
+**A:**
+The LoadGen records git stats (if available) and the SHA1 of all its
+source files (always) at build time for verification purposes. This is easy
+to circumvent, but try your best to run *version_generator.py* correctly;
+ideally integrated with your build system if you have a custom build.
+The intention is more to help with debugging efforts and detect accidental
+version missmatches than to detect bad actors.
+
+## Q: How do I view the *mlperf_log_trace.json* file?
+**A:**
+This file uses the [Trace Event Format]
+(https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit)
+to record a timeline of all the threads involved.
+You can view the file by typing [chrome://tracing](chrome://tracing) into
+Chrome's address bar and dragging the json file there.
+This file zips well and you can drag the zip file directly into
+[chrome://tracing](chrome://tracing) too.
+Please include zipped traces (and the other logs) when filing bug reports.
+
+## Q: Why is the code littered with so many lambdas? My eyes hurt.
+**A:**
+Lambdas are a convenient and efficient way to ship arbitrary data + deferred
+logic over to the logging thread without much boilerplate.
+Much of the loadgen is built on top of the logging utilities.
+Thus the lambdas. (Sorry about the eyes.)
+
+## Q: What C++ version does the LoadGen target?
+**A:**
+It currently targets and requires C++14. It should compile with recent
+versions of clang, gcc, and msvc.
+
+## Q: What dependencies does the LoadGen code have?
+**A:**
+The C++ code has no external dependencies. The loadgen itself, logging
+utilities, and unit test utilities are built solely on the C++ Standard Library.
+The python bindings, however, do require
+[pybind11](https://github.com/pybind/pybind11).
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt
new file mode 100644
index 000000000..ac14c3dfa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt
@@ -0,0 +1 @@
+5.1.1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore
new file mode 100644
index 000000000..e792c8e55
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore
@@ -0,0 +1,2 @@
+loadgen_build
+build
\ No newline at end of file
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md
new file mode 100644
index 000000000..24e872983
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md
@@ -0,0 +1,10 @@
+Note: please install jemalloc first. See: http://jemalloc.net/
+Command: bash run.sh <target_qps> <0=Basic,1=Queue> <numCompleteThreads> <maxSizeInComplete> <server_coalesce_queries=0or1>
+
+Experiments:
+- On Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz
+- Basic SUT : 500-600k i/s
+- Basic SUT + jemalloc: 800-900k i/s (`bash run.sh 800000 0`)
+- Queued SUT (2 complete threads) + jemalloc: 1.2-1.3M i/s (`bash run.sh 1200000 1 2 2048`)
+- Queued SUT (2 complete threads) + jemalloc + server_coalesce_queries: 1.4-1.5M is/ (`bash run.sh 1400000 1 2 512 1`)
+- Basic SUT + jemalloc + server_coalesce_queries + 4 IssueQueryThreads: 2.4-2.5M is/ (`bash run.sh 2400000 0 2 512 1 4`)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp
new file mode 100644
index 000000000..44ff53efa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cassert>
+#include <condition_variable>
+#include <deque>
+#include <iostream>
+#include <map>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+#include "loadgen.h"
+#include "query_sample_library.h"
+#include "system_under_test.h"
+#include "test_settings.h"
+
+class QSL : public mlperf::QuerySampleLibrary {
+ public:
+  ~QSL() override{};
+  const std::string& Name() override { return mName; }
+  size_t TotalSampleCount() override { return 1000000; }
+  size_t PerformanceSampleCount() override { return TotalSampleCount(); }
+  void LoadSamplesToRam(const std::vector<mlperf::QuerySampleIndex>&) override {
+  }
+  void UnloadSamplesFromRam(
+      const std::vector<mlperf::QuerySampleIndex>&) override {}
+
+ private:
+  std::string mName{"Dummy QSL"};
+};
+
+class BasicSUT : public mlperf::SystemUnderTest {
+ public:
+  BasicSUT() {
+    // Start with some large value so that we don't reallocate memory.
+    initResponse(10000);
+  }
+  ~BasicSUT() override {}
+  const std::string& Name() override { return mName; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    size_t n = samples.size();
+    if (n > mResponses.size()) {
+      std::cerr << "Warning: reallocating response buffer in BasicSUT. Maybe "
+                   "you should initResponse with larger value!?"
+                << std::endl;
+      initResponse(samples.size());
+    }
+    for (size_t i = 0; i < n; i++) {
+      mResponses[i].id = samples[i].id;
+    }
+    mlperf::QuerySamplesComplete(mResponses.data(), n);
+  }
+  void FlushQueries() override {}
+
+ private:
+  void initResponse(int size) {
+    mResponses.resize(size,
+                      {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
+  }
+  int mBuf{0};
+  std::string mName{"BasicSUT"};
+  std::vector<mlperf::QuerySampleResponse> mResponses;
+};
+
+class QueueSUT : public mlperf::SystemUnderTest {
+ public:
+  QueueSUT(int numCompleteThreads, int maxSize) {
+    // Each thread handle at most maxSize at a time.
+    std::cout << "QueueSUT: maxSize = " << maxSize << std::endl;
+    initResponse(numCompleteThreads, maxSize);
+    // Launch complete threads
+    for (int i = 0; i < numCompleteThreads; i++) {
+      mThreads.emplace_back(&QueueSUT::CompleteThread, this, i);
+    }
+  }
+  ~QueueSUT() override {
+    {
+      std::unique_lock<std::mutex> lck(mMtx);
+      mDone = true;
+      mCondVar.notify_all();
+    }
+    for (auto& thread : mThreads) {
+      thread.join();
+    }
+  }
+  const std::string& Name() override { return mName; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    std::unique_lock<std::mutex> lck(mMtx);
+    for (const auto& sample : samples) {
+      mIdQueue.push_back(sample.id);
+    }
+    // Let some worker thread to consume tasks
+    mCondVar.notify_one();
+  }
+  void FlushQueries() override {}
+
+ private:
+  void CompleteThread(int threadIdx) {
+    auto& responses = mResponses[threadIdx];
+    size_t maxSize{responses.size()};
+    size_t actualSize{0};
+    while (true) {
+      {
+        std::unique_lock<std::mutex> lck(mMtx);
+        mCondVar.wait(lck, [&]() { return !mIdQueue.empty() || mDone; });
+
+        if (mDone) {
+          break;
+        }
+
+        actualSize = std::min(maxSize, mIdQueue.size());
+        for (size_t i = 0; i < actualSize; i++) {
+          responses[i].id = mIdQueue.front();
+          mIdQueue.pop_front();
+        }
+        mCondVar.notify_one();
+      }
+      mlperf::QuerySamplesComplete(responses.data(), actualSize);
+    }
+  }
+  void initResponse(int numCompleteThreads, int size) {
+    mResponses.resize(numCompleteThreads);
+    for (auto& responses : mResponses) {
+      responses.resize(size,
+                       {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
+    }
+  }
+  int mBuf{0};
+  std::string mName{"QueueSUT"};
+  std::vector<std::vector<mlperf::QuerySampleResponse>> mResponses;
+  std::vector<std::thread> mThreads;
+  std::deque<mlperf::ResponseId> mIdQueue;
+  std::mutex mMtx;
+  std::condition_variable mCondVar;
+  bool mDone{false};
+};
+
+class MultiBasicSUT : public mlperf::SystemUnderTest {
+ public:
+  MultiBasicSUT(int numThreads)
+      : mNumThreads(numThreads), mResponses(numThreads) {
+    // Start with some large value so that we don't reallocate memory.
+    initResponse(10000);
+    for (int i = 0; i < mNumThreads; ++i) {
+      mThreads.emplace_back(&MultiBasicSUT::startIssueThread, this, i);
+    }
+  }
+  ~MultiBasicSUT() override {
+    for (auto& thread : mThreads) {
+      thread.join();
+    }
+  }
+  const std::string& Name() override { return mName; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    int thread_idx = mThreadMap[std::this_thread::get_id()];
+    size_t n = samples.size();
+    auto& reponses = mResponses[thread_idx];
+    if (n > reponses.size()) {
+      std::cout
+          << "Warning: reallocating response buffer in MultiBasicSUT. Maybe "
+             "you should initResponse with larger value!?"
+          << std::endl;
+      initResponse(samples.size());
+    }
+    for (size_t i = 0; i < n; i++) {
+      reponses[i].id = samples[i].id;
+    }
+    mlperf::QuerySamplesComplete(reponses.data(), n);
+  }
+  void FlushQueries() override {}
+
+ private:
+  void initResponse(int size) {
+    for (auto& responses : mResponses) {
+      responses.resize(size,
+                       {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
+    }
+  }
+  void startIssueThread(int thread_idx) {
+    {
+      std::lock_guard<std::mutex> lock(mMtx);
+      mThreadMap[std::this_thread::get_id()] = thread_idx;
+    }
+    mlperf::RegisterIssueQueryThread();
+  }
+  int mBuf{0};
+  int mNumThreads{0};
+  std::string mName{"MultiBasicSUT"};
+  std::vector<std::vector<mlperf::QuerySampleResponse>> mResponses;
+  std::mutex mMtx;
+  std::vector<std::thread> mThreads;
+  std::map<std::thread::id, int> mThreadMap;
+};
+
+int main(int argc, char** argv) {
+  assert(argc >= 2 && "Need to pass in at least one argument: target_qps");
+  int target_qps = std::stoi(argv[1]);
+  std::cout << "target_qps = " << target_qps << std::endl;
+
+  bool useQueue{false};
+  int numCompleteThreads{4};
+  int maxSize{1};
+  bool server_coalesce_queries{false};
+  int num_issue_threads{0};
+  if (argc >= 3) {
+    useQueue = std::stoi(argv[2]) != 0;
+  }
+  if (argc >= 4) {
+    numCompleteThreads = std::stoi(argv[3]);
+  }
+  if (argc >= 5) {
+    maxSize = std::stoi(argv[4]);
+  }
+  if (argc >= 6) {
+    server_coalesce_queries = std::stoi(argv[5]) != 0;
+  }
+  if (argc >= 7) {
+    num_issue_threads = std::stoi(argv[6]);
+  }
+
+  QSL qsl;
+  std::unique_ptr<mlperf::SystemUnderTest> sut;
+
+  // Configure the test settings
+  mlperf::TestSettings testSettings;
+  testSettings.scenario = mlperf::TestScenario::Server;
+  testSettings.mode = mlperf::TestMode::PerformanceOnly;
+  testSettings.server_target_qps = target_qps;
+  testSettings.server_target_latency_ns = 10000000;  // 10ms
+  testSettings.server_target_latency_percentile = 0.99;
+  testSettings.min_duration_ms = 60000;
+  testSettings.min_query_count = 270000;
+  testSettings.server_coalesce_queries = server_coalesce_queries;
+  std::cout << "testSettings.server_coalesce_queries = "
+            << (server_coalesce_queries ? "True" : "False") << std::endl;
+  testSettings.server_num_issue_query_threads = num_issue_threads;
+  std::cout << "num_issue_threads = " << num_issue_threads << std::endl;
+
+  // Configure the logging settings
+  mlperf::LogSettings logSettings;
+  logSettings.log_output.outdir = "build";
+  logSettings.log_output.prefix = "mlperf_log_";
+  logSettings.log_output.suffix = "";
+  logSettings.log_output.prefix_with_datetime = false;
+  logSettings.log_output.copy_detail_to_stdout = false;
+  logSettings.log_output.copy_summary_to_stdout = true;
+  logSettings.log_mode = mlperf::LoggingMode::AsyncPoll;
+  logSettings.log_mode_async_poll_interval_ms = 1000;
+  logSettings.enable_trace = false;
+
+  // Choose SUT
+  if (num_issue_threads == 0) {
+    if (useQueue) {
+      std::cout << "Using QueueSUT with " << numCompleteThreads
+                << " complete threads" << std::endl;
+      sut.reset(new QueueSUT(numCompleteThreads, maxSize));
+    } else {
+      std::cout << "Using BasicSUT" << std::endl;
+      sut.reset(new BasicSUT());
+    }
+  } else {
+    if (useQueue) {
+      std::cout << "Using MultiQueueSUT with " << numCompleteThreads
+                << " complete threads" << std::endl;
+      std::cerr << "!!!! MultiQueueSUT is NOT implemented yet !!!!"
+                << std::endl;
+      return 1;
+      // sut.reset(new MultiQueueSUT(num_issue_threads, numCompleteThreads,
+      // maxSize));
+    } else {
+      std::cout << "Using MultiBasicSUT" << std::endl;
+      sut.reset(new MultiBasicSUT(num_issue_threads));
+    }
+  }
+
+  // Start test
+  std::cout << "Start test..." << std::endl;
+  mlperf::StartTest(sut.get(), &qsl, testSettings, logSettings);
+  std::cout << "Test done. Clean up SUT..." << std::endl;
+  sut.reset();
+  std::cout << "Done!" << std::endl;
+  return 0;
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh
new file mode 100644
index 000000000..62559c1a8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/bash
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+echo "Building loadgen..."
+if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi;
+cd loadgen_build && cmake ../.. && make -j && cd ..
+echo "Building test program..."
+if [ ! -e build ]; then mkdir build; fi;
+g++ --std=c++11 -O3 -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \
+LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 build/repro.exe $1 $2 $3 $4 $5 $6
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh
new file mode 100644
index 000000000..ba63727c8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/bash
+# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+echo "Building loadgen in Debug mode..."
+if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi;
+cd loadgen_build && cmake -DCMAKE_BUILD_TYPE=Debug ../.. && make -j && cd ..
+echo "Building test program in Debug mode..."
+if [ ! -e build ]; then mkdir build; fi;
+g++ --std=c++11 -O0 -g -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \
+gdb --args build/repro.exe $1 $2 $3 $4 $5 $6
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc
new file mode 100644
index 000000000..0248a1c16
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc
@@ -0,0 +1,176 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "c_api.h"
+
+#include <string>
+
+#include "../loadgen.h"
+#include "../query_sample.h"
+#include "../query_sample_library.h"
+#include "../system_under_test.h"
+#include "../test_settings.h"
+
+namespace mlperf {
+namespace c {
+namespace {
+
+// Forwards SystemUnderTest calls to relevant callbacks.
+class SystemUnderTestTrampoline : public SystemUnderTest {
+ public:
+  SystemUnderTestTrampoline(ClientData client_data, std::string name,
+                            IssueQueryCallback issue_cb,
+                            FlushQueriesCallback flush_queries_cb)
+      : client_data_(client_data),
+        name_(std::move(name)),
+        issue_cb_(issue_cb),
+        flush_queries_cb_(flush_queries_cb) {}
+  ~SystemUnderTestTrampoline() override = default;
+
+  const std::string& Name() override { return name_; }
+
+  void IssueQuery(const std::vector<QuerySample>& samples) override {
+    (*issue_cb_)(client_data_, samples.data(), samples.size());
+  }
+
+  void FlushQueries() override { (*flush_queries_cb_)(); }
+
+ private:
+  ClientData client_data_;
+  std::string name_;
+  IssueQueryCallback issue_cb_;
+  FlushQueriesCallback flush_queries_cb_;
+};
+
+}  // namespace
+
+void* ConstructSUT(ClientData client_data, const char* name, size_t name_length,
+                   IssueQueryCallback issue_cb,
+                   FlushQueriesCallback flush_queries_cb) {
+  SystemUnderTestTrampoline* sut = new SystemUnderTestTrampoline(
+      client_data, std::string(name, name_length), issue_cb, flush_queries_cb);
+  return reinterpret_cast<void*>(sut);
+}
+
+void DestroySUT(void* sut) {
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  delete sut_cast;
+}
+
+namespace {
+
+// Forwards QuerySampleLibrary calls to relevant callbacks.
+class QuerySampleLibraryTrampoline : public QuerySampleLibrary {
+ public:
+  QuerySampleLibraryTrampoline(
+      ClientData client_data, std::string name, size_t total_sample_count,
+      size_t performance_sample_count,
+      LoadSamplesToRamCallback load_samples_to_ram_cb,
+      UnloadSamplesFromRamCallback unload_samples_from_ram_cb)
+      : client_data_(client_data),
+        name_(std::move(name)),
+        total_sample_count_(total_sample_count),
+        performance_sample_count_(performance_sample_count),
+        load_samples_to_ram_cb_(load_samples_to_ram_cb),
+        unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {}
+  ~QuerySampleLibraryTrampoline() override = default;
+
+  const std::string& Name() override { return name_; }
+  size_t TotalSampleCount() override { return total_sample_count_; }
+  size_t PerformanceSampleCount() override { return performance_sample_count_; }
+
+  void LoadSamplesToRam(const std::vector<QuerySampleIndex>& samples) override {
+    (*load_samples_to_ram_cb_)(client_data_, samples.data(), samples.size());
+  }
+  void UnloadSamplesFromRam(
+      const std::vector<QuerySampleIndex>& samples) override {
+    (*unload_samples_from_ram_cb_)(client_data_, samples.data(),
+                                   samples.size());
+  }
+
+ private:
+  ClientData client_data_;
+  std::string name_;
+  size_t total_sample_count_;
+  size_t performance_sample_count_;
+  LoadSamplesToRamCallback load_samples_to_ram_cb_;
+  UnloadSamplesFromRamCallback unload_samples_from_ram_cb_;
+};
+
+}  // namespace
+
+void* ConstructQSL(ClientData client_data, const char* name, size_t name_length,
+                   size_t total_sample_count, size_t performance_sample_count,
+                   LoadSamplesToRamCallback load_samples_to_ram_cb,
+                   UnloadSamplesFromRamCallback unload_samples_from_ram_cb) {
+  QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline(
+      client_data, std::string(name, name_length), total_sample_count,
+      performance_sample_count, load_samples_to_ram_cb,
+      unload_samples_from_ram_cb);
+  return reinterpret_cast<void*>(qsl);
+}
+
+void DestroyQSL(void* qsl) {
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  delete qsl_cast;
+}
+
+// mlperf::c::StartTest just forwards to mlperf::StartTest after doing the
+// proper cast.
+void StartTest(void* sut, void* qsl, const TestSettings& settings,
+               const std::string& audit_config_filename = "audit.config") {
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  LogSettings default_log_settings;
+  mlperf::StartTest(sut_cast, qsl_cast, settings, default_log_settings,
+                    audit_config_filename);
+}
+
+void QuerySamplesComplete(QuerySampleResponse* responses,
+                          size_t response_count) {
+  mlperf::QuerySamplesComplete(responses, response_count);
+}
+
+void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses,
+                                    size_t response_count,
+                                    ResponseCallback response_cb,
+                                    ClientData client_data) {
+  mlperf::QuerySamplesComplete(
+      responses, response_count,
+      [client_data, response_cb](QuerySampleResponse* response) {
+        response_cb(client_data, response);
+      });
+}
+
+void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count) {
+  mlperf::FirstTokenComplete(responses, response_count);
+}
+
+void FirstTokenCompleteResponseCb(QuerySampleResponse* responses,
+                                  size_t response_count,
+                                  ResponseCallback response_cb,
+                                  ClientData client_data) {
+  mlperf::FirstTokenComplete(
+      responses, response_count,
+      [client_data, response_cb](QuerySampleResponse* response) {
+        response_cb(client_data, response);
+      });
+}
+
+void RegisterIssueQueryThread() { mlperf::RegisterIssueQueryThread(); }
+
+}  // namespace c
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h
new file mode 100644
index 000000000..0ee44fb71
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h
@@ -0,0 +1,95 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief A C API wrapping the C++ loadgen. Not tested. Needs work.
+/// \details The C API allows a C or Python client to easily create
+/// a SystemUnderTest without having to expose the SystemUnderTest class
+/// directly.
+/// ConstructSUT works with a bunch of function poitners instead that are
+/// called from an underlying trampoline class.
+
+#ifndef SYSTEM_UNDER_TEST_C_API_H_
+#define SYSTEM_UNDER_TEST_C_API_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "../query_sample.h"
+#include "../test_settings.h"
+
+namespace mlperf {
+
+namespace c {
+
+/// \brief Optional opaque client data that creators of SUTs and QSLs can have
+/// the loadgen pass back to their callback invocations.
+/// Helps avoids global variables.
+typedef uintptr_t ClientData;
+
+typedef void (*IssueQueryCallback)(ClientData, const QuerySample*, size_t);
+typedef void (*FlushQueriesCallback)();
+typedef void (*ResponseCallback)(ClientData, QuerySampleResponse*);
+
+/// \brief SUT calls this function to report query result back to loadgen
+void QuerySamplesComplete(QuerySampleResponse* responses,
+                          size_t response_count);
+
+void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses,
+                                    size_t response_count,
+                                    ResponseCallback response_cb,
+                                    ClientData client_data);
+
+void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count);
+
+void FirstTokenCompleteResponseCb(QuerySampleResponse* responses,
+                                  size_t response_count,
+                                  ResponseCallback response_cb,
+                                  ClientData client_data);
+
+/// \brief Create an opaque SUT pointer based on C callbacks.
+void* ConstructSUT(ClientData client_data, const char* name, size_t name_length,
+                   IssueQueryCallback issue_cb,
+                   FlushQueriesCallback flush_queries_cb);
+/// \brief Destroys the SUT created by ConstructSUT.
+void DestroySUT(void* sut);
+
+typedef void (*LoadSamplesToRamCallback)(ClientData, const QuerySampleIndex*,
+                                         size_t);
+typedef void (*UnloadSamplesFromRamCallback)(ClientData,
+                                             const QuerySampleIndex*, size_t);
+
+/// \brief Create an opaque QSL pointer based on C callbacks.
+void* ConstructQSL(ClientData client_data, const char* name, size_t name_length,
+                   size_t total_sample_count, size_t performance_sample_count,
+                   LoadSamplesToRamCallback load_samples_to_ram_cb,
+                   UnloadSamplesFromRamCallback unload_samples_from_ram_cb);
+/// \brief Destroys the QSL created by ConsructQSL.
+void DestroyQSL(void* qsl);
+
+/// \brief Run tests on a SUT created by ConstructSUT().
+/// \details This is the C entry point. See mlperf::StartTest for the C++ entry
+/// point.
+void StartTest(void* sut, void* qsl, const TestSettings& settings,
+               const std::string& audit_config_filename);
+
+///
+/// \brief Register a thread for query issuing in Server scenario.
+/// \details This is the C entry point. See mlperf::RegisterIssueQueryThread for
+/// the C++ entry point.
+///
+void RegisterIssueQueryThread();
+
+}  // namespace c
+}  // namespace mlperf
+
+#endif  // SYSTEM_UNDER_TEST_C_API_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc
new file mode 100644
index 000000000..96396dab9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc
@@ -0,0 +1,484 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Python bindings for the loadgen using pybind11.
+
+#ifndef PYTHON_BINDINGS_H
+#define PYTHON_BINDINGS_H
+
+#include <functional>
+
+#include "../loadgen.h"
+#include "../query_dispatch_library.h"
+#include "../query_sample.h"
+#include "../query_sample_library.h"
+#include "../system_under_test.h"
+#include "../test_settings.h"
+#include "pybind11/functional.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+#include "pybind11/stl_bind.h"
+
+namespace mlperf {
+
+namespace {
+
+using IssueQueryCallback = std::function<void(std::vector<QuerySample>)>;
+using FastIssueQueriesCallback =
+    std::function<void(std::vector<ResponseId>, std::vector<QuerySampleIndex>)>;
+using FlushQueriesCallback = std::function<void()>;
+using NameCallback = std::function<std::string()>;
+
+// Forwards SystemUnderTest calls to relevant callbacks.
+class SystemUnderTestTrampoline : public SystemUnderTest {
+ public:
+  SystemUnderTestTrampoline(std::string name, IssueQueryCallback issue_cb,
+                            FlushQueriesCallback flush_queries_cb)
+      : name_(std::move(name)),
+        issue_cb_(issue_cb),
+        flush_queries_cb_(flush_queries_cb) {}
+  ~SystemUnderTestTrampoline() override = default;
+
+  const std::string& Name() override { return name_; }
+
+  void IssueQuery(const std::vector<QuerySample>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    issue_cb_(samples);
+  }
+
+  void FlushQueries() override { flush_queries_cb_(); }
+
+ protected:
+  std::string name_;
+  IssueQueryCallback issue_cb_;
+  FlushQueriesCallback flush_queries_cb_;
+};
+
+class FastSystemUnderTestTrampoline : public SystemUnderTestTrampoline {
+ public:
+  FastSystemUnderTestTrampoline(std::string name,
+                                FastIssueQueriesCallback fast_issue_cb,
+                                FlushQueriesCallback flush_queries_cb)
+      : SystemUnderTestTrampoline(name, nullptr, flush_queries_cb),
+        fast_issue_cb_(fast_issue_cb) {}
+  ~FastSystemUnderTestTrampoline() override = default;
+
+  void IssueQuery(const std::vector<QuerySample>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    std::vector<ResponseId> responseIds;
+    std::vector<QuerySampleIndex> querySampleIndices;
+    for (auto& s : samples) {
+      responseIds.push_back(s.id);
+      querySampleIndices.push_back(s.index);
+    }
+    fast_issue_cb_(responseIds, querySampleIndices);
+  }
+
+ private:
+  FastIssueQueriesCallback fast_issue_cb_;
+};
+
+using LoadSamplesToRamCallback =
+    std::function<void(std::vector<QuerySampleIndex>)>;
+using UnloadSamplesFromRamCallback =
+    std::function<void(std::vector<QuerySampleIndex>)>;
+
+// Forwards QuerySampleLibrary calls to relevant callbacks.
+class QuerySampleLibraryTrampoline : public QuerySampleLibrary {
+ public:
+  QuerySampleLibraryTrampoline(
+      std::string name, size_t total_sample_count,
+      size_t performance_sample_count,
+      LoadSamplesToRamCallback load_samples_to_ram_cb,
+      UnloadSamplesFromRamCallback unload_samples_from_ram_cb)
+      : name_(std::move(name)),
+        total_sample_count_(total_sample_count),
+        performance_sample_count_(performance_sample_count),
+        load_samples_to_ram_cb_(load_samples_to_ram_cb),
+        unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {}
+  ~QuerySampleLibraryTrampoline() override = default;
+
+  const std::string& Name() override { return name_; }
+  size_t TotalSampleCount() { return total_sample_count_; }
+  size_t PerformanceSampleCount() { return performance_sample_count_; }
+
+  void LoadSamplesToRam(const std::vector<QuerySampleIndex>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    load_samples_to_ram_cb_(samples);
+  }
+  void UnloadSamplesFromRam(
+      const std::vector<QuerySampleIndex>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    unload_samples_from_ram_cb_(samples);
+  }
+
+ private:
+  std::string name_;
+  size_t total_sample_count_;
+  size_t performance_sample_count_;
+  LoadSamplesToRamCallback load_samples_to_ram_cb_;
+  UnloadSamplesFromRamCallback unload_samples_from_ram_cb_;
+};
+
+// A QDL that allows defining callbacks for
+// IssueQuery, FlushQueries, and Name methods.
+class QueryDispatchLibraryTrampoline : public QueryDispatchLibrary {
+ public:
+  QueryDispatchLibraryTrampoline(IssueQueryCallback issue_query_callback,
+                                 FlushQueriesCallback flush_queries_callback,
+                                 NameCallback name_callback)
+      : issue_query_callback_(issue_query_callback),
+        flush_queries_callback_(flush_queries_callback),
+        name_callback_(name_callback) {}
+
+  // Returns the name of the SUT. Name shall be returned over the network
+  // TODO: other bindings should also be fixed eventually to be used over the
+  // network
+  const std::string& Name() override {
+    static std::string name;  // HACK: avoid returning a reference to temporary.
+    pybind11::gil_scoped_acquire gil_acquirer;
+    name = name_callback_();  // name_callback_() shall returned name over the
+                              // network.
+    return name;
+  }
+
+  void IssueQuery(const std::vector<QuerySample>& samples) override {
+    pybind11::gil_scoped_acquire gil_acquirer;
+    issue_query_callback_(samples);
+  }
+
+  void FlushQueries() override { flush_queries_callback_(); }
+
+ protected:
+  IssueQueryCallback issue_query_callback_;
+  FlushQueriesCallback flush_queries_callback_;
+  NameCallback name_callback_;
+};
+
+}  // namespace
+
+/// \brief Python bindings.
+namespace py {
+
+uintptr_t ConstructSUT(IssueQueryCallback issue_cb,
+                       FlushQueriesCallback flush_queries_cb) {
+  SystemUnderTestTrampoline* sut =
+      new SystemUnderTestTrampoline("PySUT", issue_cb, flush_queries_cb);
+  return reinterpret_cast<uintptr_t>(sut);
+}
+
+void DestroySUT(uintptr_t sut) {
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  delete sut_cast;
+}
+
+uintptr_t ConstructFastSUT(FastIssueQueriesCallback fast_issue_cb,
+                           FlushQueriesCallback flush_queries_cb) {
+  FastSystemUnderTestTrampoline* sut = new FastSystemUnderTestTrampoline(
+      "PyFastSUT", fast_issue_cb, flush_queries_cb);
+  return reinterpret_cast<uintptr_t>(sut);
+}
+
+void DestroyFastSUT(uintptr_t sut) {
+  FastSystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<FastSystemUnderTestTrampoline*>(sut);
+  delete sut_cast;
+}
+
+uintptr_t ConstructQSL(
+    size_t total_sample_count, size_t performance_sample_count,
+    LoadSamplesToRamCallback load_samples_to_ram_cb,
+    UnloadSamplesFromRamCallback unload_samples_from_ram_cb) {
+  QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline(
+      "PyQSL", total_sample_count, performance_sample_count,
+      load_samples_to_ram_cb, unload_samples_from_ram_cb);
+  return reinterpret_cast<uintptr_t>(qsl);
+}
+
+void DestroyQSL(uintptr_t qsl) {
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  delete qsl_cast;
+}
+
+uintptr_t ConstructQDL(IssueQueryCallback issue_cb,
+                       FlushQueriesCallback flush_queries_cb,
+                       NameCallback name_callback) {
+  QueryDispatchLibraryTrampoline* qdl = new QueryDispatchLibraryTrampoline(
+      issue_cb, flush_queries_cb, name_callback);
+  return reinterpret_cast<uintptr_t>(qdl);
+}
+
+void DestroyQDL(uintptr_t qdl) {
+  QueryDispatchLibraryTrampoline* qdl_cast =
+      reinterpret_cast<QueryDispatchLibraryTrampoline*>(qdl);
+  delete qdl_cast;
+}
+
+void StartTest(uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings,
+               const std::string& audit_config_filename) {
+  pybind11::gil_scoped_release gil_releaser;
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  LogSettings default_log_settings;
+  mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings,
+                    audit_config_filename);
+}
+
+void StartTestWithLogSettings(uintptr_t sut, uintptr_t qsl,
+                              mlperf::TestSettings test_settings,
+                              mlperf::LogSettings log_settings,
+                              const std::string& audit_config_filename) {
+  pybind11::gil_scoped_release gil_releaser;
+  SystemUnderTestTrampoline* sut_cast =
+      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
+  QuerySampleLibraryTrampoline* qsl_cast =
+      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
+  mlperf::StartTest(sut_cast, qsl_cast, test_settings, log_settings,
+                    audit_config_filename);
+}
+
+using ResponseCallback = std::function<void(QuerySampleResponse*)>;
+
+/// TODO: Get rid of copies.
+void QuerySamplesComplete(std::vector<QuerySampleResponse> responses,
+                          ResponseCallback response_cb = {}) {
+  pybind11::gil_scoped_release gil_releaser;
+  mlperf::QuerySamplesComplete(responses.data(), responses.size(), response_cb);
+}
+
+void FirstTokenComplete(std::vector<QuerySampleResponse> responses,
+                        ResponseCallback response_cb = {}) {
+  pybind11::gil_scoped_release gil_releaser;
+  mlperf::FirstTokenComplete(responses.data(), responses.size(), response_cb);
+}
+
+PYBIND11_MODULE(mlperf_loadgen, m) {
+  m.doc() = "MLPerf Inference load generator.";
+
+  pybind11::enum_<TestScenario>(m, "TestScenario")
+      .value("SingleStream", TestScenario::SingleStream)
+      .value("MultiStream", TestScenario::MultiStream)
+      .value("Server", TestScenario::Server)
+      .value("Offline", TestScenario::Offline);
+
+  pybind11::enum_<TestMode>(m, "TestMode")
+      .value("SubmissionRun", TestMode::SubmissionRun)
+      .value("AccuracyOnly", TestMode::AccuracyOnly)
+      .value("PerformanceOnly", TestMode::PerformanceOnly)
+      .value("FindPeakPerformance", TestMode::FindPeakPerformance);
+
+  pybind11::class_<TestSettings>(m, "TestSettings")
+      .def(pybind11::init<>())
+      .def_readwrite("scenario", &TestSettings::scenario)
+      .def_readwrite("mode", &TestSettings::mode)
+      .def_readwrite("single_stream_expected_latency_ns",
+                     &TestSettings::single_stream_expected_latency_ns)
+      .def_readwrite("single_stream_target_latency_percentile",
+                     &TestSettings::single_stream_target_latency_percentile)
+      .def_readwrite("multi_stream_expected_latency_ns",
+                     &TestSettings::multi_stream_expected_latency_ns)
+      .def_readwrite("multi_stream_target_latency_percentile",
+                     &TestSettings::multi_stream_target_latency_percentile)
+      .def_readwrite("multi_stream_samples_per_query",
+                     &TestSettings::multi_stream_samples_per_query)
+      .def_readwrite("server_target_qps", &TestSettings::server_target_qps)
+      .def_readwrite("server_target_latency_ns",
+                     &TestSettings::server_target_latency_ns)
+      .def_readwrite("server_target_latency_percentile",
+                     &TestSettings::server_target_latency_percentile)
+      .def_readwrite("server_coalesce_queries",
+                     &TestSettings::server_coalesce_queries)
+      .def_readwrite("server_find_peak_qps_decimals_of_precision",
+                     &TestSettings::server_find_peak_qps_decimals_of_precision)
+      .def_readwrite("server_find_peak_qps_boundary_step_size",
+                     &TestSettings::server_find_peak_qps_boundary_step_size)
+      .def_readwrite("server_max_async_queries",
+                     &TestSettings::server_max_async_queries)
+      .def_readwrite("server_num_issue_query_threads",
+                     &TestSettings::server_num_issue_query_threads)
+      .def_readwrite("offline_expected_qps",
+                     &TestSettings::offline_expected_qps)
+      .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms)
+      .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms)
+      .def_readwrite("min_query_count", &TestSettings::min_query_count)
+      .def_readwrite("max_query_count", &TestSettings::max_query_count)
+      .def_readwrite("qsl_rng_seed", &TestSettings::qsl_rng_seed)
+      .def_readwrite("sample_index_rng_seed",
+                     &TestSettings::sample_index_rng_seed)
+      .def_readwrite("schedule_rng_seed", &TestSettings::schedule_rng_seed)
+      .def_readwrite("accuracy_log_rng_seed",
+                     &TestSettings::accuracy_log_rng_seed)
+      .def_readwrite("accuracy_log_probability",
+                     &TestSettings::accuracy_log_probability)
+      .def_readwrite("print_timestamps", &TestSettings::print_timestamps)
+      .def_readwrite("performance_issue_unique",
+                     &TestSettings::performance_issue_unique)
+      .def_readwrite("performance_issue_same",
+                     &TestSettings::performance_issue_same)
+      .def_readwrite("performance_issue_same_index",
+                     &TestSettings::performance_issue_same_index)
+      .def_readwrite("performance_sample_count_override",
+                     &TestSettings::performance_sample_count_override)
+      .def_readwrite("test05", &TestSettings::test05)
+      .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed)
+      .def_readwrite("test05_sample_index_rng_seed",
+                     &TestSettings::test05_sample_index_rng_seed)
+      .def_readwrite("test05_schedule_rng_seed",
+                     &TestSettings::test05_schedule_rng_seed)
+      .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
+      .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
+      .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
+      .def_readwrite("infer_token_latencies",
+                     &TestSettings::infer_token_latencies)
+      .def_readwrite("token_latency_scaling_factor",
+                     &TestSettings::token_latency_scaling_factor)
+      .def("FromConfig", &TestSettings::FromConfig, pybind11::arg("path"),
+           pybind11::arg("model"), pybind11::arg("scenario"),
+           pybind11::arg("conf_type") = 1,
+           "This function configures settings from the given user "
+           "configuration file, model, and scenario. The conf_type flag "
+           "should be set to 1 for loading user.conf or else only the default "
+           "mlperf_conf file "
+           "will be loaded by the loadgen.");
+
+  pybind11::enum_<LoggingMode>(m, "LoggingMode")
+      .value("AsyncPoll", LoggingMode::AsyncPoll)
+      .value("EndOfTestOnly", LoggingMode::EndOfTestOnly)
+      .value("Synchronous", LoggingMode::Synchronous);
+
+  pybind11::class_<LogOutputSettings>(m, "LogOutputSettings")
+      .def(pybind11::init<>())
+      .def_readwrite("outdir", &LogOutputSettings::outdir)
+      .def_readwrite("prefix", &LogOutputSettings::prefix)
+      .def_readwrite("suffix", &LogOutputSettings::suffix)
+      .def_readwrite("prefix_with_datetime",
+                     &LogOutputSettings::prefix_with_datetime)
+      .def_readwrite("copy_detail_to_stdout",
+                     &LogOutputSettings::copy_detail_to_stdout)
+      .def_readwrite("copy_summary_to_stdout",
+                     &LogOutputSettings::copy_summary_to_stdout);
+
+  pybind11::class_<LogSettings>(m, "LogSettings")
+      .def(pybind11::init<>())
+      .def_readwrite("log_output", &LogSettings::log_output)
+      .def_readwrite("log_mode", &LogSettings::log_mode)
+      .def_readwrite("log_mode_async_poll_interval_ms",
+                     &LogSettings::log_mode_async_poll_interval_ms)
+      .def_readwrite("enable_trace", &LogSettings::enable_trace);
+
+  pybind11::class_<QuerySample>(m, "QuerySample")
+      .def(pybind11::init<>())
+      .def(pybind11::init<ResponseId, QuerySampleIndex>())
+      .def_readwrite("id", &QuerySample::id)
+      .def_readwrite("index", &QuerySample::index)
+      .def(pybind11::pickle(
+          [](const QuerySample& qs) {  // __getstate__
+            /*Return a tuple that fully encodes state of object*/
+            return pybind11::make_tuple(qs.id, qs.index);
+          },
+          [](pybind11::tuple t) {  // __setstate__
+            if (t.size() != 2)
+              throw std::runtime_error("Invalid state for QuerySample");
+            /* Create a new C++ instance*/
+            QuerySample q;
+            q.id = t[0].cast<uintptr_t>();
+            q.index = t[1].cast<size_t>();
+            return q;
+          }));
+
+  pybind11::class_<QuerySampleResponse>(m, "QuerySampleResponse")
+      .def(pybind11::init<>())
+      .def(pybind11::init<ResponseId, uintptr_t, size_t>())
+      .def(pybind11::init<ResponseId, uintptr_t, size_t, int64_t>())
+      .def_readwrite("id", &QuerySampleResponse::id)
+      .def_readwrite("data", &QuerySampleResponse::data)
+      .def_readwrite("size", &QuerySampleResponse::size)
+      .def_readwrite("n_tokens", &QuerySampleResponse::n_tokens)
+      .def(pybind11::pickle(
+          [](const QuerySampleResponse& qsr) {  // __getstate__
+            /* Return a tuple that fully encodes state of object*/
+            return pybind11::make_tuple(qsr.id, qsr.data, qsr.size);
+          },
+          [](pybind11::tuple t) {  // __setstate__
+            if ((t.size() != 3) || (t.size() != 4))
+              throw std::runtime_error("Invalid state for QuerySampleResponse");
+            /* Create a new C++ instance*/
+            QuerySampleResponse q;
+            q.id = t[0].cast<uintptr_t>();
+            q.data = t[1].cast<uintptr_t>();
+            q.size = t[2].cast<size_t>();
+            if (t.size() == 4) {
+              q.n_tokens = t[3].cast<int64_t>();
+            } else {
+              q.n_tokens = 0;
+            }
+            return q;
+          }));
+
+  // TODO: Use PYBIND11_MAKE_OPAQUE for the following vector types.
+  pybind11::bind_vector<std::vector<QuerySample>>(m, "VectorQuerySample");
+  pybind11::bind_vector<std::vector<QuerySampleResponse>>(
+      m, "VectorQuerySampleResponse");
+
+  m.def("ConstructSUT", &py::ConstructSUT, "Construct the system under test.");
+  m.def("DestroySUT", &py::DestroySUT,
+        "Destroy the object created by ConstructSUT.");
+
+  m.def("ConstructFastSUT", &py::ConstructFastSUT,
+        "Construct the system under test, fast issue query");
+  m.def("DestroyFastSUT", &py::DestroyFastSUT,
+        "Destroy the object created by ConstructFastSUT.");
+
+  m.def("ConstructQSL", &py::ConstructQSL,
+        "Construct the query sample library.");
+  m.def("DestroyQSL", &py::DestroyQSL,
+        "Destroy the object created by ConstructQSL.");
+
+  m.def("ConstructQDL", &py::ConstructQDL,
+        "Construct the query sample library, communicating with the SUT over "
+        "the network.");
+  m.def("DestroyQDL", &py::DestroyQDL,
+        "Destroy the object created by ConstructQDL.");
+
+  m.def("StartTest", &py::StartTest,
+        "Run tests on a SUT created by ConstructSUT() with the provided QSL. "
+        "Uses default log settings.",
+        pybind11::arg("sut"), pybind11::arg("qsl"),
+        pybind11::arg("test_settings"),
+        pybind11::arg("audit_config_filename") = "audit.config");
+  m.def("StartTestWithLogSettings", &py::StartTestWithLogSettings,
+        "Run tests on a SUT created by ConstructSUT() with the provided QSL. "
+        "Accepts custom log settings.",
+        pybind11::arg("sut"), pybind11::arg("qsl"),
+        pybind11::arg("test_settings"), pybind11::arg("log_settings"),
+        pybind11::arg("audit_config_filename") = "audit.config");
+  m.def("QuerySamplesComplete", &py::QuerySamplesComplete,
+        "Called by the SUT to indicate that samples from some combination of"
+        "IssueQuery calls have finished.",
+        pybind11::arg("responses"),
+        pybind11::arg("response_cb") = ResponseCallback{});
+  m.def("FirstTokenComplete", &py::FirstTokenComplete,
+        "Called by the SUT to indicate that tokens from some combination of"
+        "IssueQuery calls have finished.",
+        pybind11::arg("responses"),
+        pybind11::arg("response_cb") = ResponseCallback{});
+}
+
+}  // namespace py
+}  // namespace mlperf
+
+#endif  // PYTHON_BINDINGS_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md
new file mode 100644
index 000000000..f46e22a65
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md
@@ -0,0 +1,67 @@
+# Demo
+
+## Loadgen Over the Network
+
+### Overview
+
+
+This folder provides a demo implementation for LoadGen over the network.\
+Two sides are implemented:
+
+1. The SUT side which is implemented in [sut_over_network_demo.py](sut_over_network_demo.py). Each Node should run it for multiple Nodes operation.
+2. The LoadGen node running the LoadGen, QSL and QDL instances, implemented in [py_demo_server_lon.py](py_demo_server_lon.py)
+
+The demo SUT is implemented with a Flask server. the LON node implements a Flask client for network operation.
+
+The test runs in MLPerf Server mode. the SUT is not implementing a benchmark but contains dummy interface to preprocessing, postprocessing and  model calling functions.
+
+### Setup
+
+Install python packages:
+
+```sh
+pip install absl-py numpy wheel flask requests
+```
+
+Clone:
+
+```sh
+git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
+```
+
+Build:
+
+```sh
+cd mlperf_inference/loadgen
+CFLAGS="-std=c++14 -O3" python setup.py bdist_wheel
+cd ..; pip install --force-reinstall loadgen/dist/`ls -r loadgen/dist/ | head -n1` ; cd -
+```
+
+### Run the demo (single machine)
+
+Start the demo SUT server (run this at a separate terminal):
+
+```sh
+python demos/lon/sut_over_network_demo.py --port 8000
+```
+
+Start the test:
+
+```sh
+python demos/lon/py_demo_server_lon.py --sut_server http://localhost:8000
+```
+
+### Run the demo (over the network)
+
+To run over a network - simply run the demo SUT over on a different machine. For multiple Nodes run the demo SUT on each machine specifying the node number.\
+
+```sh
+python demos/lon/sut_over_network_demo.py --port 8000 --node N1
+```
+
+Then, when running the client, replace `localhost` with the correct IP.
+
+
+```sh
+python demos/lon/py_demo_server_lon.py --sut_server IP1:8000,IP2:8000,IP3:8000
+```
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py
new file mode 100644
index 000000000..1248215db
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py
@@ -0,0 +1,191 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""
+Python demo showing how to use the MLPerf Inference LoadGen over the Network bindings.
+This programs runs in the LON Node side.
+It runs the demo in MLPerf server mode over the network.
+It communicates over the network with a Network SUT node,
+which is running the Network SUT demo based on a flask server, implemented in SUT_over_network.py
+"""
+
+import threading
+import requests
+import array
+import time
+
+from absl import app
+from absl import flags
+import mlperf_loadgen
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_list(
+    "sut_server", "http://localhost:8000", "Address of the server(s) under test."
+)
+
+
+class QSL:
+    """Demo QuerySampleLibrary with dummy features."""
+
+    def __init__(self, total_sample_count, performance_sample_count):
+        self.eval_features = {
+            i: f"what_is_my_dummy_feature_{i}?" for i in range(total_sample_count)
+        }
+        self.qsl = mlperf_loadgen.ConstructQSL(
+            total_sample_count,
+            performance_sample_count,
+            self.load_samples_to_ram,
+            self.unload_samples_from_ram,
+        )
+
+    def get_features(self, sample_id):
+        """Returns the feature for a given sample id."""
+        return self.eval_features[sample_id]
+
+    def load_samples_to_ram(self, query_samples):
+        """Loads the features for the given query samples into RAM."""
+        # Current implementation is not using this functionality.
+        del query_samples
+        return
+
+    def unload_samples_from_ram(self, query_samples):
+        """Unloads the features for the given query samples from RAM."""
+        # Current implementation is not using this functionality.
+        del query_samples
+        return
+
+    def __del__(self):
+        mlperf_loadgen.DestroyQSL(self.qsl)
+
+
+class QDL:
+    """QDL acting as a proxy to the SUT.
+    This QDL communicates with the SUT via HTTP.
+    It uses two endpoints to communicate with the SUT:
+    - /predict/ : Send a query to the SUT and get a response.
+    - /getname/ : Get the name of the SUT. Send a getname to the SUT and get a response.
+    """
+
+    def __init__(self, qsl: QSL, sut_server_addr: list):
+        """
+        Constructor for the QDL.
+        Args:
+            qsl: The QSL to use.
+            sut_server_addr: A list of addresses of the SUT.
+        """
+        self.qsl = qsl
+
+        # Construct QDL from the python binding
+        self.qdl = mlperf_loadgen.ConstructQDL(
+            self.issue_query, self.flush_queries, self.client_get_name
+        )
+        self.sut_server_addr = sut_server_addr
+        self.num_nodes = len(sut_server_addr)
+
+        # For round robin between the SUTs:
+        self.next_sut_id = 0
+        self.lock = threading.Lock()
+
+    def issue_query(self, query_samples):
+        """Process the query to send to the SUT"""
+        threading.Thread(
+            target=self.process_query_async,
+            args=[query_samples]).start()
+
+    def flush_queries(self):
+        """Flush the queries. Dummy implementation."""
+        pass
+
+    def process_query_async(self, query_samples):
+        """
+        This function is called by the Loadgen in a separate thread.
+        It is responsible for
+            1. Creating a query for the SUT, by reading the features from the QSL.
+            2. Sending the query to the SUT.
+            3. Waiting for the response from the SUT.
+            4. Deserializing the response.
+            5. Calling mlperf_loadgen.QuerySamplesComplete(query_samples, response)
+        Args:
+            query_samples: A list of QuerySample objects.
+        """
+        responses = []
+        for s in query_samples:
+            # Overall process:
+            # QDL builds a real-world query and sends to SUT --> SUT processes --> SUT sends back to QDL
+            # Read features from the QSL
+            features = self.qsl.get_features(s.index)
+
+            time.sleep(0.001)  # Ensure a maximal rate of queries to the SUT
+
+            # Send the query to SUT in round robin
+            # Wait for a response
+            sut_result = self.client_predict(features, s.index)
+            response_array = array.array("B", sut_result.encode("utf-8"))
+            bi = response_array.buffer_info()
+            responses.append(
+                mlperf_loadgen.QuerySampleResponse(
+                    s.id, bi[0], bi[1]))
+        mlperf_loadgen.QuerySamplesComplete(responses)
+
+    def get_sut_id_round_robin(self):
+        """Get the SUT id in round robin."""
+        with self.lock:
+            res = self.next_sut_id
+            self.next_sut_id = (self.next_sut_id + 1) % self.num_nodes
+        return res
+
+    def client_predict(self, query, id):
+        """Serialize the query, send it to the SUT in round robin, and return the deserialized response."""
+        url = "{}/predict/".format(
+            self.sut_server_addr[self.get_sut_id_round_robin()])
+        response = requests.post(url, json={"query": query, id: id})
+        return response.json()["result"]
+
+    def client_get_name(self):
+        """Get the name of the SUT from ALL the SUTS."""
+        if len(self.sut_server_addr) == 1:
+            return requests.post(
+                f"{self.sut_server_addr[0]}/getname/").json()["name"]
+
+        sut_names = [
+            requests.post(f"{addr}/getname/").json()["name"]
+            for addr in self.sut_server_addr
+        ]
+        return "Multi-node SUT: " + ", ".join(sut_names)
+
+    def __del__(self):
+        mlperf_loadgen.DestroyQDL(self.qdl)
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.server_target_qps = 100
+    settings.server_target_latency_ns = 100000000
+    settings.min_query_count = 100
+    settings.min_duration_ms = 10000
+
+    # QDL and QSL
+    qsl = QSL(1024, 128)
+    qdl = QDL(qsl, sut_server_addr=FLAGS.sut_server)
+
+    mlperf_loadgen.StartTest(qdl.qdl, qsl.qsl, settings)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py
new file mode 100644
index 000000000..55e5e038d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py
@@ -0,0 +1,88 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+
+"""
+Python demo showing how to use the MLPerf Inference load generator bindings over the network.
+This part of the demo runs the "demo SUT" which is connected over the network to the LON node.
+A corresponding "demo LON node" with the demo test is implemented in py_demo_server_lon.py.
+
+The SUT is implemented using a Flask server, with dummy implementation of the inference processing.
+Two endpoints are exposed:
+- /predict/ : Receives a query (e.g., a text) runs inference, and returns a prediction.
+- /getname/ : Get the name of the SUT.
+
+The current implementation is a dummy implementation, which does not use
+a real DNN model, batching, or pre/postprocessing code,
+but rather just returns subset of the input query as a response,
+Yet, it illustrates the basic structure of a SUT server.
+"""
+
+import argparse
+from flask import Flask, request, jsonify
+
+
+app = Flask(__name__)
+
+
+node = ""
+
+
+def preprocess(query):
+    """[SUT Node] A dummy preprocess."""
+    # Here may come for example batching, tokenization, resizing,
+    # normalization, etc.
+    response = query
+    return response
+
+
+def dnn_model(query):
+    """[SUT Node] A dummy DNN model."""
+    # Here may come for example a call to a dnn model such as resnet, bert,
+    # etc.
+    response = query
+    return response
+
+
+def postprocess(query):
+    """[SUT Node] A dummy postprocess."""
+    # Here may come for example a postprocessing call, e.g., NMS,
+    # detokenization, etc.
+    response = query
+    return response
+
+
+@app.route("/predict/", methods=["POST"])
+def predict():
+    """Receives a query (e.g., a text) runs inference, and returns a prediction."""
+    query = request.get_json(force=True)["query"]
+    result = postprocess(dnn_model(preprocess(query)))
+    return jsonify(result=result)
+
+
+@app.route("/getname/", methods=["POST", "GET"])
+def getname():
+    """Returns the name of the SUT."""
+    return jsonify(name=f"Demo SUT (Network SUT) node" +
+                   (" " + node) if node else "")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--node", type=str, default="")
+    args = parser.parse_args()
+    node = args.node
+    app.run(debug=False, port=args.port)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py
new file mode 100644
index 000000000..f6082cad6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py
@@ -0,0 +1,86 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import threading
+import time
+
+from absl import app
+import mlperf_loadgen
+
+from datetime import datetime
+
+# Global var
+NUM_AGENTS = 8
+LOOPBACK_LATENCY_S = 0.001
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in NUM_AGENTS slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1))
+    responses = []
+    samples_to_complete = query_samples[i_slice: len(
+        query_samples): NUM_AGENTS]
+    for j, s in enumerate(samples_to_complete):
+        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def issue_query(query_samples):
+    for i in range(8):
+        threading.Thread(
+            target=process_query_async, args=(
+                query_samples, i)).start()
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.MultiStream
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.multi_stream_expected_latency_ns = 8000000
+    settings.multi_stream_samples_per_query = 8
+    settings.min_query_count = 100
+    settings.min_duration_ms = 10000
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py
new file mode 100644
index 000000000..909585edc
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py
@@ -0,0 +1,81 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import threading
+import time
+
+from absl import app
+import mlperf_loadgen
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in 3 slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(3 * (i_slice + 1))
+    responses = []
+    samples_to_complete = query_samples[i_slice: len(query_samples): 3]
+    for s in samples_to_complete:
+        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 0)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 1)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 2)).start()
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Offline
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.offline_expected_qps = 1000
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py
new file mode 100644
index 000000000..8b6f2b826
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py
@@ -0,0 +1,74 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import threading
+import time
+
+from absl import app
+import mlperf_loadgen
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    time.sleep(0.001)
+    responses = []
+    for s in query_samples:
+        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.server_target_qps = 100
+    settings.server_target_latency_ns = 100000000
+    settings.min_query_count = 100
+    settings.min_duration_ms = 10000
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py
new file mode 100644
index 000000000..8806271bd
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py
@@ -0,0 +1,84 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import array
+import threading
+import time
+
+from absl import app
+import mlperf_loadgen
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    """Processes the list of queries."""
+    time.sleep(0.001)
+    responses = []
+    response_array = array.array(
+        "f", [0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 254, 255]
+    )
+    response_info = response_array.buffer_info()
+    response_data = response_info[0]
+    response_size = response_info[1] * response_array.itemsize
+    for s in query_samples:
+        responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size)
+        )
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    settings.single_stream_expected_latency_ns = 1000000
+    settings.min_query_count = 100
+    settings.min_duration_ms = 10000
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py
new file mode 100644
index 000000000..e4b083853
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py
@@ -0,0 +1,142 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import threading
+import time
+import numpy as np
+import array
+
+import mlperf_loadgen
+
+from datetime import datetime
+
+# Global var
+NUM_AGENTS = 8
+LOOPBACK_LATENCY_S = 0.001
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in NUM_AGENTS slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1))
+    query_responses = []
+    samples_to_complete = query_samples[i_slice: len(
+        query_samples): NUM_AGENTS]
+    for j, s in enumerate(samples_to_complete):
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        mlperf_loadgen.FirstTokenComplete(
+            [
+                mlperf_loadgen.QuerySampleResponse(
+                    s.id, response_token_data, response_token_size
+                )
+            ]
+        )
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size, n_tokens
+            )
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    for i in range(8):
+        threading.Thread(
+            target=process_query_async, args=(
+                query_samples, i)).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--expected-latency", type=int, default=8000000)
+    parser.add_argument("--samples-per-query", type=int, default=8)
+    parser.add_argument("--min-query-count", type=int, default=100)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.MultiStream
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.multi_stream_expected_latency_ns = args.expected_latency
+    settings.multi_stream_samples_per_query = args.samples_per_query
+    settings.min_query_count = args.min_query_count
+    settings.min_duration_ms = args.min_duration_ms
+    settings.use_token_latencies = True
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py
new file mode 100644
index 000000000..2e190cdd5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py
@@ -0,0 +1,130 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import threading
+import time
+import numpy as np
+import array
+
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in 3 slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(3 * (i_slice + 1))
+    query_responses = []
+    samples_to_complete = query_samples[i_slice: len(query_samples): 3]
+    for s in samples_to_complete:
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)])
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size, n_tokens
+            )
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 0)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 1)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 2)).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--expected-qps", type=int, default=1000)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Offline
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.offline_expected_qps = args.expected_qps
+    settings.min_duration_ms = args.min_duration_ms
+    settings.use_token_latencies = True
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py
new file mode 100644
index 000000000..9325b8410
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py
@@ -0,0 +1,130 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import threading
+import time
+import numpy as np
+import array
+
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20, mod=3)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+# Processes queries in 3 slices that complete at different times.
+def process_query_async(query_samples, i_slice):
+    time.sleep(3 * (i_slice + 1))
+    query_responses = []
+    samples_to_complete = query_samples[i_slice: len(query_samples): 3]
+    for s in samples_to_complete:
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)])
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size)
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 0)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 1)).start()
+    threading.Thread(
+        target=process_query_async, args=(
+            query_samples, 2)).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--expected-qps", type=int, default=1000)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Offline
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.offline_expected_qps = args.expected_qps
+    settings.min_duration_ms = args.min_duration_ms
+    settings.infer_token_latencies = 1
+    settings.token_latency_scaling_factor = 21
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py
new file mode 100644
index 000000000..b564543cd
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py
@@ -0,0 +1,132 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import array
+import threading
+import time
+import numpy as np
+
+from absl import app
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    """Processes the list of queries."""
+    query_responses = []
+    for s in query_samples:
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        mlperf_loadgen.FirstTokenComplete(
+            [
+                mlperf_loadgen.QuerySampleResponse(
+                    s.id, response_token_data, response_token_size
+                )
+            ]
+        )
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        # print(f"Reported size python: {n_tokens}")
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size, n_tokens
+            )
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--target-qps", type=int, default=100)
+    parser.add_argument("--target-latency-ns", type=int, default=100000000)
+    parser.add_argument("--min-query-count", type=int, default=100)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.server_target_qps = args.target_qps
+    settings.server_target_latency_ns = args.target_latency_ns
+    settings.min_query_count = args.min_query_count
+    settings.min_duration_ms = args.min_duration_ms
+    settings.use_token_latencies = True
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py
new file mode 100644
index 000000000..76461a75d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py
@@ -0,0 +1,125 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import array
+import threading
+import time
+import numpy as np
+
+from absl import app
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20, mod=3)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    """Processes the list of queries."""
+    query_responses = []
+    for s in query_samples:
+        response_array = np.array(responses[s.index], np.int32)
+        token = response_array[0]
+        time.sleep(0.0002)
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        # print(f"Reported size python: {n_tokens}")
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size)
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--target-qps", type=int, default=100)
+    parser.add_argument("--target-latency-ns", type=int, default=100000000)
+    parser.add_argument("--min-query-count", type=int, default=100)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.Server
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.server_target_qps = args.target_qps
+    settings.server_target_latency_ns = args.target_latency_ns
+    settings.min_query_count = args.min_query_count
+    settings.min_duration_ms = args.min_duration_ms
+    settings.infer_token_latencies = 1
+    settings.token_latency_scaling_factor = 21
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py
new file mode 100644
index 000000000..ca8d84591
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py
@@ -0,0 +1,129 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python demo showing how to use the MLPerf Inference load generator bindings.
+"""
+
+from __future__ import print_function
+
+import argparse
+import array
+import threading
+import time
+import numpy as np
+
+from absl import app
+import mlperf_loadgen
+
+
+def f(x, y):
+    return 4 + 3 * x * y + x**3 + y**2
+
+
+def create_responses(n, m, mod=4):
+    r = []
+    for i in range(n):
+        r.append([f(i, j) for j in range(m + (i % mod))])
+    return r
+
+
+responses = create_responses(1024, 20)
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def process_query_async(query_samples):
+    """Processes the list of queries."""
+    query_responses = []
+    for s in query_samples:
+        response_array = np.array(responses[s.index], np.int32)
+        time.sleep(0.0002)
+        token = response_array[:1]
+        response_token = array.array("B", token.tobytes())
+        response_token_info = response_token.buffer_info()
+        response_token_data = response_token_info[0]
+        response_token_size = response_token_info[1] * response_token.itemsize
+        mlperf_loadgen.FirstTokenComplete(
+            [
+                mlperf_loadgen.QuerySampleResponse(
+                    s.id, response_token_data, response_token_size
+                )
+            ]
+        )
+        time.sleep(0.02)
+        n_tokens = len(response_array)
+        response_array = array.array("B", response_array.tobytes())
+        response_info = response_array.buffer_info()
+        response_data = response_info[0]
+        response_size = response_info[1] * response_array.itemsize
+        query_responses.append(
+            mlperf_loadgen.QuerySampleResponse(
+                s.id, response_data, response_size, n_tokens
+            )
+        )
+    mlperf_loadgen.QuerySamplesComplete(query_responses)
+
+
+def issue_query(query_samples):
+    threading.Thread(target=process_query_async, args=[query_samples]).start()
+
+
+def flush_queries():
+    pass
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--mode", choices=["performance", "accuracy"], default="performance"
+    )
+    parser.add_argument("--expected-latency", type=int, default=2050000)
+    parser.add_argument("--min-query-count", type=int, default=100)
+    parser.add_argument("--min-duration-ms", type=int, default=30000)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
+    if args.mode == "performance":
+        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+    else:
+        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
+    settings.single_stream_expected_latency_ns = args.expected_latency
+    settings.min_query_count = args.min_query_count
+    settings.min_duration_ms = args.min_duration_ms
+    settings.use_token_latencies = True
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024, 128, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png
new file mode 100644
index 0000000000000000000000000000000000000000..35663b97fe3ad1453c431cbf5c6155ab37c78851
GIT binary patch
literal 51192
zcmeFYby$?o_cu<ngs{K@OD`phf|N+t(jhFMu!wYb%hI)UhlJ7~AxJ8Mba$voOLs`O
zz_a>(f4<-E_5A++{qsD}Ue|ri+%t2|nK}2FIdkSU8wpoeyhlh&h=GA|PgzM$3j+hQ
z3IhWx34n(#sqKi1!oVQ5wUL#DE6d74;I2-VHue@67)p_^b@36}{Z!fdNeKyKSb0hW
z&8**|Q-CjUFt?0Z*@95yW-xelK{R27fn0Td8B0_}MIhG$)Al;(SqlQLIU#QVl3#nO
zBt7Z3=Cg6VK9VC~b*s(qwoi=l#_we$8|e=$h|x<(Y&^iEsOX)5vIZuWb1t@0V8>cN
zTX}uGHHJmWU1L)t=7+qO9y-o9ZogZjNQ@vWn1O16!pS@<YqE317%hn$iBuTU&o;S>
zDV0D-#;z|~U`Ur8NLWA8^PzJQxzN2Cww_w%7UDqK*MqMuv2$HWw=+VU?E4B12~2hQ
zjM(s`zY{D=XD1AXBgh)H{cl0I6@gPEGpdlVLyVjuC*j(=o*a>((KSm_GYi9ze5waY
z%18AAx1-;FJN1tylU`N@#Xd+;=y`Y)F4**1j9n$D6zCZ(U}j4BDXd<jOLu|+BZVbJ
z@jmCCrdf8(y&Zfou<@uNPZLNdOaEvjI#Op#pjK{)K4QsMrZ}2VI1)%F-GqP;sluo2
zU#myzUcz>23hh%>0#Y9`oRl4$N@u_A(mHG<BNTq%uU4Rxo{q(bYgz`=bS3Vh1vd7%
zQsh5-QV@;WO=qSn6*nYnrR+Mya*>NjVtP1~6xLg|t9oo9dugg4%Ay`a$ixM_ukc_e
zBx~6Pe<i%ZI1SfS=Nm9yI+a!;g~$j<TQtPr^9abnUV^-0!(~1kXb_f9qAsrK<;L+M
zBW%X)Vyx-F`SDV`0ppQ6yHqwiXN2mro=EC7=iV;Pm3G}g>o&Z+K!VjOi(pO&e#Fd|
zoyH}eH6`IiEY3bWti|$g#KTH7^kkh@SWv70X*P_SB`@ZX0xUS-G$;JiK(Rm{Ej>4V
zW+G8GER!@}n(;R`Qzkp(St@b1bnGV6d_&ctag$dSXU|=;ztZ*d&fquJuN5R$6H7(R
zm5Euy$XO$o<b37;q+K_M))&NK!}reH{lbqfY&DQ7)*Rw2FFD+Sjv7>ys;$wkT$_Qv
zm)^fH_tz(y!*lgsW=3L|hYty33t^4a<DYbPgydRjlvtE4>pTm+WH?{ozgS;@PbV7m
zL-lg#)#|SOi^4*}Hh81+Cl)3YY3$aWo2NTj7p|=WwEZM^3tB#RQJIBT>FnaXBC+e(
z#-tF76Evo-E+7<d`jSmtQNiqV7;?A@WBe-h+vT|=<?ED&iTYO!{#tRAfFf$>6GopX
z^RQ<r++4;ri<2^Ks}Ww0)YXJ^g@iCb3$$xHJKSambiuqM0YA8T@gc@NZEe!ubqzqi
z8;qiapYO6NMQ7D7-UySnO?!wpe1w;FQj%i%s#<qsSpw98*sDS=>oIv2NrPp;U~E$v
z7A5Rt<JY0iS*q=id5MPb*>j1@G3?s$s&MV`8`~wFf#V_fov|`8soN}^8RkOEiYc0K
zBo-Mq6@e(!7*NzSBAt06f^|9U0urPICXel6wNb)Lij5!>;V04yOkpldxYzfjkoJ;B
zBsf!ggn6&eVU_t?Xp@X&K3GNGLz7}0k0J6Zm+LX%Q#p?O>CI;!0R|C|WFL+_aOZgu
zo-NIjJNcF5gncF&Q;osQcs*TUJ*2uC!NoAxQ8-0k4tm)8zUoCe+3l!qHAy+$bs&X{
z=tx)A5@B3dvNNM3WljiBx6oqaq3CarKV?9~`JxfvaWG$d;-carP>Wz1WrHGziI{LK
z@h&}{Q_Vto%CRKMd}R0TY))4gVjf|8#O}!Qi{lB!Y-D_7GaGq-!m1J>q&um;5KD`|
zM#6??OL&X0G<m$ZsKT;ny@;wJwc@_cL!F1Zvjwlz_M&D4KKa*BO~2VyjAK9U7w8-8
z7wd~%Rb5qgm6CsJs!EW{rsWDf=A=w{F<>(wF`)0^;X&kKbn3Fra_Y5Bxs5+Q`d;eU
zo%D}9QTX&Y!7~vL8V~AE8QyQNQWXlO$AWFPK4Q)AzYyGG+r#{z5mK>~-~M&cZs<qJ
zc**zr{6m%Ga+?qR-!cpJ3iT?yl#Y`uKU3)qDnx(i{_&>Zd8vooPLWRK69b==SLLr-
z&b0O<W;Q<N>?F9`yLY%>h!T0!?PQL7sr7EAx<AQMG?HGBk$Sp6&b~df?dtC4KIneF
zRkwAvxwYxF$-eV+vTR)J{bYL^_z5uyYkYO4OM*+tq2_{I&O4qlqkN;X0QtM-c0h<R
zc92YIyMM?I9@D*yFq5!`u!r|{?(NepDUc|v#1zpuJP~ymmw<Vq?f~-<-^lP4=lB$2
z8Ddsqt!SNuTCC^nOl`?cSorjuT)!30rkK#$zcKsLbCPb8M)Q+vOwGKmXSaTusE9GS
zd&o?OSA)-ID0Y~2cx`=lXm;qD+nzg#D~kI}f9`W3;tFA@S5a+Wy<5#+OKT@M_`YX5
zPH=jr_l0%y$d?~i&9NVzMh&yP^?tiihgU~G%Q8zl>+|YyBc!qP)$>=AyOC#-XANg>
zcX`BeNwP^`;VR*(aU}(51${2bW{ni>{gcYO&(_?(XKW9-K60;c@!#xlj(o-Ws?^g<
z@SCmqRo`n?eMNtlQ}<!FpG)4JDI+C(1yPEEao#KIi#={-?v3l}Zls<$JM)XTGo4>$
z7ptdNQaOIQm2a?&QjLxp&Kqh~zNmcU^l+beKXzWo&)sj?H}xv>l>H*?V&v4}=w@+a
zO?T0AY+ob5>h9&8#O<i!Y#de0Jl~Z|pge|<vA(n-M1Yso+0Ixo!3!u<@?cZ5%7Rx4
zX>}lwD)BJ<Q`+f+E<rh3SK(EO*?5V!Kk1`F6;P>Rb)j1*_D~0a573b;o57G`o??lH
zPat0;2}ncPz<+eV7uC7)%+0$88J{7s{tP7efZ0}dyudo#+Qxka;l7(*UDi7vT^&c(
zJKh_NP~Y92x^}5^(O&UqP^708s&JkAexZ3-yR1*KNsvr&#@3`tto$+?Db)W+IeR>*
zJVTthLG;p3#d_d<O5Av2X{|O-A<etUZ?xZ(xB9olzWels#~O*XxCL*Zk_d$E&!}fg
z9?r29D0?NxsZV4MW%)uw5)zZDIaVlyQuVv{hF8rUhjy2$y>^g1m+ij=e?O85G=9CP
zrz1V_zI<aEbGody5hc^%YVtXPdj{g2MUhd<zTxL+|Eppys41oC=F;|(=2Edsbou3B
zDQA&(k%mp0u!;&BgZ52PMb->!Y|^h3=9<WwFf0RPNh|bo@*N}q0Y)J7ri8e?X7aXX
zzIT7;8F5apHt(s;KgiFqTo&*W(6`JOeIvEAWIwU%;Fchg;r3$v#fX9MXE(#Hrk}^0
z$8H;UI=e5L-d|MWauU%}zv0UKd~irgCN-X7ouZ`b_~^*!qXl0J;$yu-<aN@iYD>zI
z>S$U+>gzPW7X$j<x4oKDI=_Xhxa{<1T`Sk#>H>8mbmtH>A6?HM>^+w+W2|Is{Ni}e
z?djH=)(Le(pNhEx-$Bs<(aSG!QJYa=r#d^j<#!)t>X*DNl-6!zd+5^aeOn`2w+7Z1
zhd=47)ulLG)<6GvI#Czl;PL#)^R?N?S)$KUKNFh@PC0UN-h0ySam`+Or6F~De_i}6
ze&1YHb`IXxGYpyNv-@(UKIP+OwAm8&Vq?bPn&x_tBoO-uo5z3T;<Bf`B!oNMm*mH-
z>CIqI=Sk-~52c?mU$VVT{1jzh$X+M%DZZe+p(DN9KH90ce_S+B;H{X(R6wtfv|mJi
zJ)E~xb~}&UrHI$R|9MU#!~1Ba)TMYvT`^12=l<`ZejS~i!&*$Mw<DKXVy^QWzBjm)
zL>|-<JdC%)$DcbL3>`euns@|Xo;A-+Nq)JRdY$`_qaw^u!Y$Tquj4d*XHO`}aqf%A
zl4URV?T3xe$BwNn+Q{O&UrjzJpLge<ex*tA`om8fZ}zXnnndFyiT#j=njHm=@m}#p
zl8F+R;|&L#dp4H>wN^hye`X7LgnAnI^WDmCsP+%u{z{VicsF%1@aqRp0N$NfHOHso
z<4-52ZExQ>-COu|HMmllI$uAJzkjrF+RC#huseIxz9GBQn-V{IRqV&&_x{5EuHjI1
zJk1CD5*wSmB1H@dYBQFtt-(On5@Ecc#c)T4<;LjF<Kp9L&Ku#AU1%V@jOal;7`7(G
z!acQbF@-VS4vY8Q^XVp&<OApa`qlZAy@tzEh@F~p9tH5j#&&v&0TIMH&=chyYqc?6
zJSG9}I$*phez2cQ$lM{;Ie~%u#Szknu@HIWfAutO;ar+52>+txjymgULX4K<Mm9S5
zh_XN^TRwe?@c^9%VBlcVV&I~4nCM3mlkUIr3Ybs~?7!t$7#QI;7&!m7QAekLehKL3
z56wR*c2XDyKKhLu{UCF({?i(>Di`}dd8{OK9fpjytg<pX)i!gruyAy<c5?qnD_e#x
zAaGXFbHl)(V*m4CDr>PFp!rYOJVUr6o~ntOIXOHsF?TYxc!YFt{=)}D94U&<I#{@y
zK#&ghj&7nz36{StMA7*_#XKyKzfIijBv=qn;SgCTR}09KN4$@CStJP|5Qw;|xuvL<
zoWj54=r;)#Yj<~NQ63&oPtQl50*{<rt$6rEL_~Oa`FZ&HxzR1S-Mk##O_1D<Zmj<>
z`41mC3pX=Y8)tVLCr8L1z9yzl9_|t>EPn+3*YBTxS|DxyTau&Ozr#WgkmpYg58oqR
zp8w*FCKdlvDhjtjTG;E!**KtehL$1uL{M1#Z~OmG&3{Y$AC!pyrhF{G|34}JN6r6E
z`P|LIRo2M?Ez(``zdzT%#Q(GMUqW%7KO_Gin)pY}e@oGNmLwGC`LAy#Nf?Zr{E7aI
z^fq!D&(JB_$o~8=|JVlfpY%`OeF|F7_XGn&8beu5<{1)mx7pL(pf~ZCexEUhG7tw4
zD5A)lOwO*~K9FQFow^2{uCb5bJB+cK|3Yz>qi5G)ke%VN;FnD^y8_X#qi35{(W!A|
zW53U9MUJM0q+_rg|EMI<7sY752gh+9X)j&J(wOq`)D8IA*z$gR+mC<5hKwgJkaZCt
zj1?yx#hkE&lWNMkzATFMw-Ah_ZY)VgkEz&3=8U67j>%D#LjQz%2L1A<j9_TvKa?sg
zi#VxCo$D~Me+Y$xF6sYiM5d&Sli8-_I`z`}FBf`D4&EH>e?@Xe22vnYNZu0d{gIDm
zhO_SWM--Yngq_S8OSRv`Esx<J@tJMDB>yx56tbcF#VKO{=;J>f52Q#R{ihLqP>Ng|
z0T+AtaOkVQeMB=k7Wq56K-PpLfN&7|*Q&$Wd;<f6Z(qNfXjxcUsajeVnVXxlC8nft
zsBE&{tV;L|4h+=z8yXImdU&+u)z*IR5nq(yX=7ugKA=%|gM3mLc=yy2J*XseceuHO
zgM;hu3l|ah%>>SKPVJ(;i^COm9v%%y?Gk(>TjF7CY^>|{IH!%RZ4w!)Lf>kCvc*t3
zUuT}r%`a?W-ig!GQ!RacJLJdDpJz{jIx;drAZqH?b0ORD6936P4pFSFV6=*oVji!0
ztPjylw+6H}U9P{id|jzmt6({jX*0k;PDg@!lpp`&#}C)tDLzgvu9P2Tx~g7Ym(H7p
zIeq}fSF*ApOI22*j7y{}W+ZT4ov5fN^Q)6Be!B^-3^C6NGCg(mIIl9lC(9hdST=v9
z5xXC#&HJ4jc&#QX#<WS^9>xzItqnf)ilrCfS7vJ#=Zj;MctH2WJ|RoYb6}}6{9Spu
z+OMW#^$a1W`L*YnEV{Cb*Jpdl4zqOxW(K;t96$;@!T=N@b4_Y0)is@vV<s1(N}7uM
z1sf$LrIwaf5{rZfA45<(+M?_a_<rdUp}xZ{d0H;=ea{;g1hKAXCR^@L8hB=TaiCJd
zWJQn%xlO;11{F*jJ@->>z8CfDwj5EiZZD0swp=+cwqeq(Od9$Oi|<WL=&*7)x6y^R
z0V-j?^gla(I)yh?l~sIE;kAYcj2sG9w<MgcYz6++Zw<xN5gScpI~gmRgZ*5+CZq3x
z%yf%i;|I5=S)$KV9~s#01YTZVDySvU?wE?b+O0ZZDeUz+pZBQ@)z{;pf5g1jE;DQQ
zl=`n(6yis<!Bb@Pl7_)Ga~C9KS>*~zI2}wmE2{O`I~}J>jmDS91ENoz^pck5vM0ae
zIXr*<yr0ggiTUjl2XgVMtuG@K`$6&s8jUiJ+h!7x`+)ZD^AJ&C1ueNvsSWK%6L({d
zh=S#pGs`~Ci$3$(Forw>^UX4pw~^Nx<z;ce6$fj{tYhO-$=^$)g9EQ0a;MN5DFOtS
zKin=;w^m68Q*5NArVea=E!aC5n+)`9%EaX9iEKF#=^OBRNpS_*Uh=u^_|)VYBN=|%
zGuZDF7k+k0zhGD*M9&1h3>H6$KF>4yxX<nfM9<HpYDOrQtW9mUg0SoUe6yCeHY<ll
z_G63ew{IVs^~MQWh}e8%V;-!wI%ArUV&)RZNa8LDUnQTgfP>8M)jaAvt+g6WC7`(P
zu5F*GXoT2d5eb@HU$<;btz*XC4(lt)VSA0llD*BIJkEfDU$5eUgjJHbBx{ZlQY$`$
zQRFmAQMq@DarD*YiY+T|vTrs7*cW>|km7y)SM&?Dmfh!zQRE!sitFY|6}g60kq=}l
z(hR+oZ!dH1rpXbKXVvljpC%3M7^k`-DH;t%Xs046I0|cOw6guLY*=5@Io8Wi@fg~t
zhYNm98155h6><$0B4kJ^FoY!-<N3F{iTPc9H0gX8dZxr>Qo%cal6?B43074wisGT!
zzaYQrPOqs@#-y%z$9)hIlNf&(mlj_goyId?8<Q5_GB%351b@Cf-k25UWlH?6dc}-h
zK-tVBfjD8G=(c4t&;lQ_p~!=M;aot{DBvIy3rr7Jl@3>dR%+>C5;1%ph)YPq9;;PI
zl0;b1uuhvHU=_!y(3g0oDQ*r<Dhw>fjnw3D$SsHGkB20!idT^0P*o^SlbNn~H$4W4
zOiF9@4GNCq6C~OC4|0D-<bYK`41~!$h5<JoW%%T4KA;C1ZW1>}pKM=qN&^7P>t75S
zU-g$&w%BfbR|Fk=@z_zjgzbJLqH(@CoiuV<CAfbg^<FySW02x$<=s_fQJ>V+*Z6@S
zx<(EjLHGQ2w=@IE_Cln(r0+E6PjH>^-jVbyK1Ch{CV`RK)P>kyiE9ncOE3XMeQGsH
z|KoAeUoShq(8+ZnWfQ=73B>iryL0~6dtnS7+W72X@tDcuQ7NZ|1>fW0SKJ=FOuTFf
z@vB)7rb+zEzPeC)8*Kh%u&cA=(N~RCv!=^z$>dKeVr(Ko${Mm<wp_ob4rI{&LAi$?
zI2z5w8;Pw7wg~3GUSYbkjsm}nbyd*JJrKrUhWKUCF=EM*6@WGEGf7=c4$Zg=%9$ln
zz^A!G%oHGk0h{R>i?d!@lCCxvz+#3~cC~KGqbP>uJU=DSnk<VVyrvH$h+D6wD^6B?
zRPuV)XOb1$XQ?Fos*Ql9eK1#2DeMz7|H&qk>&n^~=}i8Z<WYV}0|`&yZ3Uvbqau7P
zk#y~1L~Orr-~U|f7>w{Fygq=xPXG>N-pams8ISeuF5$&983NXYtJ9q|e5s2dPWK*+
zcf;U6j)hlFr{`e;%Mt9!$5<|3rg-+QgAO8|^!(!Z-+Y#!o@5bB^^>(8Iw#+FXJ$O-
zj!q&&euS+0BC0qvT=tqR6SA`*c`ou+AV}H<yT)upCL1V)b8<NvFux93L1BjaB0Gi(
z=2vAEk`M>HHp!BvU0s8{^}T~a2C}1II8eYX-n+Sw3GJPg`Pb7_jiH!In3XHn3wJH_
za8>%9w{-ORGM6%jh=Eaw^Gv-SM6<4;*E_u$M0Eniz;^51b?eho1D9^fqL&26OCotg
zSnV>wP==9)or*>^Of!e?w*j{s0WYq{HLepVm>Ic#55E<9OO4gA=#h(p@Q%nXxk&7>
zN-;_+F>QE@%Yq(l<SxP^n6AdcS4Z!Dk3Ox2IXku9)SWVk9D)w(st%aJAQ2qZI~yX4
zem)QfAsu^k_(><z?N|!`<XHG8yGdHi+_^)gyMu74bh1ElT&WuD(lROBP!gG70e%?O
z2?hB(N#4-$1blh*RzAjqp$<{1y+9@p8QRv%VW*gKfFf)aB~&W3*ew!sN8NfM-x)my
z#W}lJiGNFPE(WUt^CO=mP*(a{4*8!HFzHSHYQ6KP<5^vyC+yuK>KdAI-F>_8yJ#4{
z%5s?WGF_~%_ocL?N|SHGncw-(^K9RQZ3+!>lN(vdJ}E;<02Aq(NhIu3)J^Nhzvdlw
z<w`&d3(5r0Ez9KD6I@Cg0&#Uj2<hJiB`JEruoC9dT29!t+mO^yB5??pe;?*rRB#*i
zvRJtTAW&CQmKQMx6>s9`@P{&rQ^CxUoQOWC34jD^S=fXf#IFLtl0D2zSn%HAW(BOa
zuEdF{-8xGYQUKC*w>z<e+aN%3v8(uw!b(*7rW7FsmIA@DM4tVYlnT9T(cK~jBlM*>
zn>Z2Z(UQ;=faV*Ac+=t%EFH+x6-YgigLN24AzW~fh`)k8y(utHxf;lU9my=lHe}aS
zjOma3!aNk_EA-PnxnI_ZnfD++izCPP+k6NvBTwP5Z%7-MCjxIY{sswr!jDE@7|o87
zbAERLC~DSUY#TRm#&j0MOWIa0cAn;3ba4#nn=&#p`N_S4HPaUd^ii+NCXf=B(iH$Y
z%(9Kkd39tY4-_QV+2rW^q8nD(Z~i(vzSfsmveo48O`37u@f&>b;xYm8MukeLKt#o3
zjqa`{i!g4tD{luP@Iu3>gJGSGCi`Sv!5(oPOZ$P|3aD3oEGS<1Zy<1hN|niV70g^F
zb@N?HFqC?x$PZ=+B;K(@#z5IB6!o-tRy#!c?oKNMegYoddZ$Q5#lC_sLp=e+A`Mib
z`0cs8t*0NHGQvSHjgO>!n)rsDP65~YcISVV-q=k5H_Uk9dj7n;vd<&yS7*VmMVB$F
z5zGy4{krsCIjQyrDO}=qiW(g3s^Ia%-2+iDb{qwHND9PKTsq>-xi9`#8Db!}S+^(j
zPT#5dM-QtZ$vPI@iU=ty%9$M}q$9koEwO>Z8I`+ySQJkxagtzySGv4p0$$?#M$L=Q
zqeQq*7=q)~e)MOm#$aU7(X;L(?JN_mqI`kuD=6Qb->V$Mizb<T*2}##u4s4I@+ct3
z|Ifa>Ki%k2P#y-O&n}<#dm#2wDv$3HQIYVn6S5-$s?rw>XBYtD#(X0%q_vYl8!kv3
z+k8c|+!REajZFw9CGPGIvjd6e34-_sh~(v!0Ns=el2NpzB)G&hC@whwMQCUT@zKh3
z9`A}OKre3!Q5g(n=50~~&|}G#l6!YzhZt*4$lzTC`a;zK)DLfG7jEmwaBjLnWSPHv
zw)7H(pj{IAO|PBI;djaGuQ@l<IW(HRgEXg%-K5!zYQgu8q|1?QO=0JY7(Gb(7!ErG
zG6aZacCdeUyB|iKr|wLgZR$IV%;@V+7nlGtufPHzygfl@rFVlyw>1p$t&+&ha+jrM
zikeQd?gI>-Dv0@zPXvBJxLbPs-67MR$trRC5yNWN^+ks`Owy2swMlL}ye)$?L0nj-
zid6g{kmkkV<n{H5!e8IUgd523bBKN4#kgDQ_d#b-dc&orW%*MwtQa1)7r9+F;r7Dw
z1z`9%)U4dS2?NAr(vc80as>{6Qbgbfv--+XG2n#3x@tR?6*a@)<duh^SMXct{l2It
zhwj$Ymzuugp*1^uWZEA`J%C`pz<LPp1RUN61be{Y-l*(lA`~&+c&8T-vM1X`PsE4|
zzTI26{jd})j*{kF3_h1<g3p&Te3Q&peMXQ?Jx{rpTfQRE$@>V|2gGHhxap#kSHHT<
z81?O$$c50sLhdf&@2qi9#Ps1)34$#sFI*EXz?T1Mnb9X8Gm_6qI9#-pX>NsRHH>)i
zYFt}`COhCVEx;y>pvDxZYF)+XcA4k4W0e)ve*t0Y3Qrtkx@^O{43+^A)52o=&ZkHH
z&#*3+El$t3!v97AUhv1zHPRy}C+O>Fz^$9MFVsjpZ61Ijul$O3@AqKq?`Se`st_X>
zh3bIm<;hjwGP5?(0Pr+0<oozis84Z7W85IT;&c`5Fpg(G1R9uu7^zb8v;)*wGMGIS
zQ{8|IQyc{(5h=)&c_<RSdh_^4Fp~ptmV3U-RQs>8GVzIAkaS1m<AnvBdj@b!+5i&$
zdnv$dy<TZBW1u7>m7Ov80q~xR2#{+)Ean~_P7NPb0@%)Ai1Z3uGwA}xbhpHG+uny8
zL(uSzT73F~JR-5Wi=tb=@<Yrh5G>ax%rYo(K4%uD@%z-oke|c9u%8~tbBsHhTD|J6
z6qCT+r38K50E>9i7fvScN|<e$;yd&f!i1DIKE%1+(g?Wv-&6$x4&t*{Ee7F|uA4G~
z9lnw<f(M95W5&pFLkY-WTIDy3%rg93GX)UM2&hIAZ-*C@pUV>Df6!KfkHuOz{vEp!
z9F%wBe>rGGiwUy?B9LcHr=c(?@p6bXqF0u4850*P<fV1UC^8(0kRTdJl!_o8y|)%?
z+TAIqiH~)CjovSu1Bll&@*HOXjNtl{UTju+fFbe|@-gZqflGcBGw!YTLnN=cyeZiE
z9vfao0t83Kk4;kA^w7>DuR?lioV16zmYhGdEg_VRoVWh^SEIumI_?RkFvov%cjk0w
z3FfOzCmO<r1tf4Y4<Sh-s|k*lp39iV$RkvV9scZlSee|ys{`Ax+LxcoR?#0X18~sc
z9~L=*fAdQ$)*%#r#|&VAJhc55I+43f#0X-;2)daJxJlDnV-u5>V-qqythpbAA7TZQ
zK%w9Vzn3uchTy(X9uN}_o=L(@M8`XSkboTx()tkho#HGCsF!KwfVQwF08JV7k7NOU
zO|Id>daWVS!qkI|SSAjku^)kSHF?IeN?h5HK6M;BapsN;^I4hszIc`%H!v|)2=6A(
z&Ha8HF3k(5=M_oAiB8*buheZd8Q7dC=n#k0{AR?n#ms~J4BKj(<t4QdOgP=8w=)QG
zFv1*h3q%0?f=|&#pv}4@{iQtK1jl%lceFy=$>WQ6FsSJ}gsCvZ7wDjI2_WSs>FX87
zy-b)T7|K1lV?le)XbdYT=_wm;xgki*PJ8YI+AvQjBoW04ebt0r?on<46EzPFWy4F+
zoRR{aZRFPLeGkM?st9ZYkiuS~qTo><$+#>g>Zf{9EbHPa-*bNV=S-af>2_pM_=jp2
z*$L!JLfdU`tQ^8W+JFQMarVYECY<2jKnk3JQ{YN>MKQxw%m!vqo}4U#lpIEulnL7w
zS_O(jLF_on`~-g8`fHcLsJ6B+=P6?#7!~pvWz|u8uN^1Ystg7;>fuXSWQNg8T~Cj4
zX(vd^7@HrP6egf#vtk5S<dPJXUgR-uxAhr|ZwfdBKlG3W-Y{1|NcR#7JLfv=tajUT
zTv#O-GY{)7)l7)n6L-LLF{q0|+L0|zCNq?8CQ-FWP*74D6m2#6V>BIR5(F`H=|I{j
zf;E+hiq9SA(0d)yP!J%HE~i8C0U&UUoJtC<9cG>;5P%ISih6Zm3>QZdLghhRI1xfU
zv?K!wE90bOn!@T|B%>&z>B(6n7zXL(qkqJvb%~PeNHMZZuI%Mi36lgZMJ{C};`W_>
z#P?d^26muO^^92Tv%qep0Ba_!5Wtoc+f|v-%|M0KhwzmXvVGjp$WIzf^^6^`CllN7
z8mJe*k{3YyE}j~%|LC!IM<nX!fwAWA7&5zr6AF=#4LK$1cna)5McqV&xX+y~#F)ec
zJ-G1{ESNkKKOnM7BpvN5W|Ch?da-3*5(b5iapRGj4I#{k;T*@P-OdXy!WQIsAO1R6
zsV{uDuXpM^C7w@Ca$Vd*ENGg0{Z1aTh(m=r0%r_vLwr=!OH3ths0s@SmaUdyX{OX_
z(qjS;(yk5NLcTR?;z5Z!gfbQBfYQh$oPoE67!h%&_%mVBRZ}8&+BsJ#oML_VZ~-=X
zZ5;(cfL3N!oUrE_H!tI@fK99&`cSnI`LD~GRqd3a5gjYu;u0gW{jzQ5pT1HtDk_oo
zYMb6f!a3)u5i$cy%Jfi)BwpAx6t09xO16(N$f&jEBmQ(X<XK`Y{2@+MerQ`8Y3HKT
z?YdKpoCpunxeueKPg)<Tfik{M4vnBD3aYJ)vSU`Ol7H2caQp3%t1IzQ&}QQ6>L?CE
zkhsT^1AjJh7f5dwdl>YYQYm}|1i)z$;Xn!x;m6>v>p9t_nv8s9TnRrsW_`LIh+x*=
z?hN4|0K^1SNB+)4Pf8lQVj0P+VBgFT4)skM`-8EP_y+WD?C}v_#p3r_T7!g9qwf^x
zqg^%{&lDj>Oe#@A5+|HW4AZF5;2q9K^91O?A)S|1g@_#+bFn&i3L*s{e3(7@F_bbU
zU%I`w(gqz|fN-L$@V)Y%g^(*Y;5lJg#BJbW=7A3B6W;RD52W!@L`CmtzWT^ivy7Ct
zNi1()7SEGz=@gJ@kbMRfZVCCFkQ2<0K2>o=K2CUiU3KtLIPFaH4OFtCS2bj_KKtw<
zhQG+DofQ**X3-arVh#oz(=A+tkx|s(V_{Q+1mt3lx@(P%C~x-5#v~n_C6-j1jIV;f
z#orMvP4K>op`0gR!^{ieu=7d~zQJ*FE`XEPgo%QAoP&lsYq1kE{Bbl=zobh6>D}@^
zlTVN^J0<x-ba+v|eC}Yynbml5V*Yi^qePHgL%Q#f$7%uNYJroCB;eJTiVvrCWHS#=
z@4iv}Ri3w_7VH63h)V|~7eFZVq@N1Oi#UKPb6J2^qu+3~KP(Kxf+em*O22?If><U*
z2}zG+8Va1Nd=Gg66ea`(ZHK8<!JS42sRxj%IUA;=TqYT1iljon6n5NhIiK+$t_0u_
za9fs-6&nYa?`uFrjPv~^esj4*eEyXe=JJPO9C2fKlLfya_*bOaNSzbxwJ0lP{S&6S
z191pW9~O4-v9f-6mF5tT?FM$JWlcTJ3xw!xuO#Y?f6pNY;8sT6F3D>$@$=s%+?yv@
z#drdBZ&K?^J$!d}h+AHY-N`EXctAXt<RGsSWxJ9LwT|ZtyRc5KJMf2mDS_n#++8oc
z4If3q%8~UzxD$IH{`5+7_!!z71jqErv<t%|BjOwAR?TjaE;v}|!vV}loYs74W+0bq
z$`h3)DO`4WB|Ikrt=vTuxW-gvs9?;p99aPdGgb<-3MLk;x&SVWk2@?%!xjOPzu$+=
z9xv3#q(bMV$q)5T5!!QOT)^L<V~?fxErKxZX(~d*R)bKb;gaD}0RBD{F%V!#W**iN
z{!+6A8F)C00q=Q9cSV{Jgs+kSV=z{@Yv=C6@^WR^V1o0+wd7_pJI;_~<Ko1|EOs%?
zbMpr(2dHdnBUbVywq+Eq`oUw2r@_?qFp?OW9KQR@s%^CSZSeLTk||AmKMTK@+ap>)
z!Wr0XHFw`PUTE|}j?z37#S2KMIC<;kUd>i;J3wu|`&5!Q@AkHjn!S;U?oZ%zua)m<
zKch@4kg+hLH>|_dFG{dz@h0~-GmrQ)ajq2JRhrg2D7)DHA84fwAJ#^&i`~9XeeW?A
z5X{KB#7oWzMU5bm2@F|uq`KrINHwsJShy7GSmslx(Rhmz58AhUvujGO`;4mT%QSD#
zxmOFMjlL?sshWTR;;SG#hCuqIhuMJkFZbjWB8H>UfJz}X`V(yl&MVLAPXruPDaApu
zay+0)(Vw8V$k7+~Oymf6U~G8_K=gr!oxb58sHXW(bX<9;el?%O)+S1w2jiavJPcz4
zPILn(Yn~|FXP-tTr2XxV#-HxE%_Y}OG7q8;p!(@iRQ-@HY*y^}Q(pcF0L>AYi+``_
z4;JP92T{si`PKXv$APT<L4z)tPyT^`60*^Xd$&>VQ~4JhQu0Ex)$#{4|AQ+Pa{L*9
z$gg?3e{d_tKe!bWZ2upyYU>Z?<Q>;|{ujmyN<pIzTx;QDV*fy3|38YAZUw|J(XMWA
zb;J`CaXkCCcl1HGkc6x(2dRLr_^%|EThu{*e^8I3s5|+&cF@4izc7}P1dg*izkt9V
z^>^J5FWA@C)<R|?{mG_=g%<1?!NG?c-T#7eWSfx~nxgbsQBi~q`z_bz{YmT+8Z=|0
zH!M|lmCZT>0|VyWQIuz*x#UQTe^Izjte`h<Xs7F43Q|&1`fkn_cm`9s>Bjv}quVAb
z4D4dO_nXg2L<>l4otj<$!SI4)WQ~v4hmXc&NtMy)k@!Iyc8%}lk&In*R8|`vrD5gV
zpWXzW{5g#B>zVP{{sZr|RiTa79&xYOv*-WS{$HYv^7!gsySpi8bx*d&bVYjm&y1LK
zFHQcLmXtxn{R0C@b4@ROo~|1T2c7bX{!o0smgTYYZhO(h#>pZ1QN+-n1xXmI6C)@l
zCdRRO(G3mOs=aw4KT+NcyZ<y}ZVy@G=Zv*uCRb-a!zFcn7*gZ%6AyiVAhF^sPO{>l
z^RI;$La~4^EW@}Anqj~0u5`_(2iBB-l5tc?hFN_Ni#=K-4dA+dvY(>twNePNlVj4o
z__u=;K@=aXRu@ZOWa#l3F<z`snqO|e{tx9!Bi*Xv!W~oTf|t?+X!NymLG)F92kBq@
zJK+t0UQ&~zlA$7H`BHn$Zs97d98vqARBRb%qW1tjQp^M<^t=u-2|+JKCveNm@8o__
zeKViz-sR`Rel?#P>eZMl8x@moyKq#=gib#6M7L!_PaZCBUUZ~zcv$dvSZu`Tb<KJ+
z>;IMfS=j%N(#H3=wvotNF;Dd`yOpZ^Cq_D!spgjEa;&dS6}QYb8XoP~g`LF)oFtxi
zHlq$>Pg~1!e#V+$E9lW=*aT@y1u6G`H+9QZJt5(}^pNwoEuOCz4U*Sr{m5E15%)^u
z$2T?({ShUzW1IfLkwlvnRSWyuP3uCf-OY@;TGx{k7m?Zd2VQz2{Gy&HP#jkRnTQwX
zM6xQy=YleA@Ct=!onSHDs=J+A>w`WW*84`)ob-GzRC+-)hBsfvG*nb@-puM3-VPDi
z*xK=3Dy*%m9&Wnu^K(dXt~IB_)s3lT@1Ob2kWFn()M~lCmwB%~(cj)f%h^ZAsjF_l
zmfZXL?<jXh9uG{++_*09($HiH!Bi9|X*J{NEaiT>A!bsVARlC7J!&pD88O4t*M>8w
z&n1t4B+wL0snO`O5OJsrSxnYC-ermc9zV`WGB^zn)m8c0$H)H}8P!~*&LfAOLW&Rf
z0K0|WMru(vDuzCB3_9-wRV4ZC-VK3`sA8W*F@zQ_h}3Au?c7si8YGM|A3^o=8#=8n
zy?h}$v${Hce|pkcZoXpXQSWHB);mERh0?XBB^Rd5<4xuE>{p3eqTWqI58jy1Z#(zc
z)SrL&<zDdYJ*MvDxV1rdpuV_ESPBSK^lTD6Z^O~uh^b;;{nVJJQa8zv0{isEDd)4!
zI!Gf<!4?qLlYgq|54#IBUT$@<E~B>-Wkwn#|ENX4`5mTws8DJ&D+aciO|HhtnP}a>
z-tgLHuG&Hc9}3j!0^L&P&>YBua~{`Ci60wkG@*F#UB!9w^rW7XwFS<?oAo<4?}aIr
z(mD?gZYX7hg=?1z2g%2$QcPixA(Y-={31=lah{&sF%^vD1d)@yS06Vb4@#UP50Fr1
zO4EWZ%UR!<W~)ETs>Y#X9#+tgXMg_k4an5a{}89=&D4PMvBgY%WL*!g=BW-qVBNfX
zqMh*ZD&h30j&yp9mT5oV*;()%p0~J9T<^7OmwEecWKwZ)AU2<&#7<M@Xj;7Pxr@j)
z&MT45!=TEaO<uL)Cu6cn$;qsLPP(~_T7F|s{>UN9YLkQnb%UhSEU)q|PCu2Y&NPY=
zA7>8P*PdubQ4iV@=_R}z^Z_~eSXu3QKQByxOlsj9Vpkrr&mAmz&v5LJglfRBs7gY|
z9;2q-;!jMTexz1K#FVP1n2GcCUIn!5Z(M6IK|eB?kBBaWOIt5}0qafQy}Ad6NL6N<
z)!Rr+Q*X5zj1LA#Nbs89mlLl<zK`KI9r(5Oo~V`Z;y^{_+f;h@Rg7=7OB|W0aLyrM
zf2H?3uclr{CIE?Or4-sGWwSU|p7mqW$ik-KQxoBMl_@f5!_Ulg)a4vMoCTNah2~`;
znC5fb=tJrWDtp%FF4ZI)q|B>>17pL7Bh!^lSc^Bhwqq^4Epu{;4;|dZCrEo9KPld;
z*P%7f%bL3B{4G*k-?N(cv|`I)KJ?72a{LE7!xg^dN%8Fo!Su6QMcYyO;^=SRv7`GL
zIyK%_?<DcjO-u07wM5@;;GAI+#BddxS*gE>_=PxDR;9Pz-yNGJ4b9!*d~+%!GieAg
zNW77wdIp8oe>W$1(C@q#SzG*0ulC){=tF+Tso%RV^^-GrT{G{egXjocWbxOeQ<+_E
zz3-7}DJJ{>(jp(!95pGg@T#VAzc6o3GHnFKf?2i4u_H;Dld?{4Ppj>@L8f&BapmTA
ziOotz5Y{Av`&2OjJYtpanulQ?A7hh*-$hb5%uxBs2E~jpl=?9}?B2L$s%eF!uh1@w
z{hTqMC-=kqSj+Ii6(%67e<JL$n)EDDXrO|l9gjS2KcKB<V@H|{90>tqqHl;e{Afu8
zcGi1t3MU`%)t0>Md04mjy)bWJTf5hO_?~?`*1I93jJp}Ab@^*(m~!GWtUd{rg&#L*
z-TWcy5Fiji>+v(rynw1(F0aZ365>8E6&a;kH=L%BC9r6{0S#On^nLjC$#N~Uo!sK-
zI`<KomSpLafIkqwCOJbz>V9m7G4nIY?+Y)6O(lMQv)_pQb}4SNi#msuF~3b?Bj|7+
z7jSr4Iwr*L!>k<@y89_$yw?{>uW9TdEnGgzXeVmp2ELpb$B;9uupp^<rB^<gve{JN
zS0XUfs6B(<vs^xMd=^LYR%(u7PH1h0^vbvDu*8CnRS{D4jc;WSyi>^L`qi;$VC3bB
z<jQNl=&z4;@7JN1f|Y%oSCKeWAXvyevb<n#^6l|h<O`WCV;h+9Tp)C=RE1WcCr$qd
zM=ngL!zChLr{7BdC#FGsS)IElsK}sFh|zO2!d~kO(e4#x^7?yM&WS3WI!>1+NiCG;
z1XD_LZ#+VaaDZtVE(zt2*eHL%SrH^xM43v9KBcX{O*aToI@>x8cW>U68?<Sx)|9m8
z3=AxND8(@ef)W~=^t)&9dY6CtUJ`Y_cvw)Et;7J$Kc|Ej#b{3x%<A1m7o{Hs2g0-Q
zPefA~38<iKAUu`UGc~R~hP4nDSn0^K->Dke9Lb~SO!73(y~)9=ZZ|`esX<{{IBD~Q
z{9u`EU4xj>)B+z$t6|geCw#ndu<`c?@E!|X3Ulu1GL;9=#;bLTC)rT>I_r<`O@BBd
z7I>95i6UX;y~C0@YMmZ+I$l-ZYSwt&A(X7J=f9wv>6%I0Ycum&2A#|_@2*bx#Cq!p
zO2l_xJ%D$MS*S(+Xhtrp2|gkGut&nKx%xr#=)L$C4U)|(9tBdQfD!K#g>6K^2{o0*
zq!+(tuWjb3XZQ$Dohh%v5Zy!qM3M0L)0KJ1aLxzO_zWw%`#j^ful2tHk0Y)~1t$i|
z@P+h!+&I*#eN2PTlI_3xdGFH;8#?d{mN1W7;z6=sq|%rqdvLp2nq+v7R)5kLXCr3X
zF5Djyi6Fj&@chCg{viLRy!|0gzi%!MR>)%eO2l&YVV`3si$*NgRcoPp-I<f`XD?%F
zjsq$5R`ejN?NMS|Fun_d^2f-P>>IW*dm(%HMcc5*92_dvl2fqNSZHOIwd^=vw($y~
z?DY9Qt+9-$P13gdCeB*ykyzg3@^o$$RVnOE9u6va5N;9i+~>ml1ACsc?8-go8^H4`
zP`THLi0gYpryeysy6#B)9<S=19rN(})wvOUT5=7wq=fH6ew*7d>b`@?{MxcL&`lZ9
zGNpBxd{*jdWpnlZvBLJJzR1@xH3~Ah2h8$2tq&e0DfN_52h~g$7sj%LUL3wS<WI!>
zmY+KZ5!G5ud^mAeLnN*z*3cW!r+x<`$5&{7-(Mc|`i*A72STwH9M!jVy`{b~VO3$S
zN-p=w<Sw5Qx+-Q^^?~^<QS7zPr@y7Wczx=wUwfJ(2kc`KV<aVTiWEH<fEU^oaXpha
z%r{g`9S%u5cS>T7;!t5~>UnVKCV{fe&LY=&>rE4r<<;ZOIPa1@Fy~eAeVt~dkI8WU
zwKmmsD#3_MQAn@d@ND3P=wr25-^HjrT*)Ek9<}g!@p{56r^Pm~z$iRF<~qEH=aY9k
z@u=PBqEfT2y&o|e;_GV`ZZXozDQy@9>T%6YP6cGguV?mM+1b|-S+wxk*B9wlCte>1
zI>kE@+8S4GMM{tH=`G`O%-z6R_qn2b3D@z}=|43@O?G*mSd|YDOn=9#(UyAmthxT(
z113D0Fx}VlbYRVwA&Izp<R(zKUIMdT6gtFF5)Lw83Cg2C-x{t}bej8l`n<>;o>2Q5
z0zXmngNgZ^^cYmy%t4>^nWfI7YB|*S-_>b+Q0HfEAZzfNEMb=_C>c@;q$6Ekxtb((
zOuYZ8qxxlOjwnCBz|f8?U)3n?yU(^qShc<c0y4nyXsmE!IQ*rUXU8*EUU6WPLPfZ=
zW^mbf=hA(JAX!F4J$+gEPpdqoK57~`M#0e$_s6;o#B2MKX|!uepj>mas7rdR66{P9
zHU4di)JcF}9+S+m^Nr<jhq=_NK0x~I$a9u-gU5QxU*@)Z#ilC3%`PS1Dsp5OPK9qa
z?dYU6utJoo7dhUW7FPo8Kj&ljr03-xYITO4Z728I({=m`>s;q9IAC@b=i%k*n@o5=
z;nlR2(p0p9Gfj@gHrIe$=E^|MC!ue}wC2t9V8_#0_#9^nD)`4Ru-$gdd;Xv+eKJZP
ztS|p9asWn$q}hMA$E!j2O<Hr5j|;?ey6O5cPV_YC)ef@CA^66Aa{A2z=&e3-raUx%
zh*?G)J-lOc6_ZF#w+KppW3^W$GCH|=3p<vNgz~$SSHaW1CwEk@>GsJ?*_-;#8$nD>
z8NTd;m$nc-jr3wxk8B{GMO;fwKml3zu-hcma%FF(dB#ZmWlsnjT^&*yj-0By2aB(q
z^p~x$*k)LYq`(YxZi~8Xe!3yVxMT0&lr-|yG?s+dzra-i>_V0Y>3SM8rChM#kNj*~
z?Bz`C+u(h+`1B&-%a%SrB}difS7)q7Ijrb+8^8E>#+&f(edf~a?ztD}nwKl58K|Fv
z0k=oEqo6uFvHX@@Q0)lleXe6NTQb*9iZ`-+`D6Mia&Yg;GAzYNprou`H5mnwyY{eQ
z?Ns$I0_sPcZ!Jm}5W@_%i8=O;{GsP{<N))I8RdgEMA|f>cD5qEUrL9n?030Sx_qV3
z1*gGzp!5XOw29qgpuj42P@eIWzq8T@BhZ5#hr!ovfV3pGpx*Fe8E|=x!gb}fGy1|W
zeZH&>otG&_3HgAo>J8y37*g!Qsz`^KM2sETQaXzP!4TIFFTs7^hGhEVf%svMfr?Ji
zCv`I>QTD<8)ZfangG{V*<)Rg0Uca_2sZy6yO62O+Z15_w_q3G?<`_Xv(WaQn{6t~u
z+0B)rhLO%c6)it#VZAji3{1lgTH;=5H|65Z8SoGnPU8i4g}0;lYh%y1`=vJ*aWOMB
zlX~d9T^1VG+gCVc0K0Z#?^h0L$N_Hmc)+Gza44-{59pPCns?(DMjXy2KV;*FS?(W~
zT@6E4J%^O1K+Rns-B+B6{d!VEv8w4Nh!UznF+IiQJv7S(r$+>=ihOah_a$SH>!otQ
z`;R<eTQJE`4yVfR1oMM7fPt%FY~5%&Y*+WGzI|a#`B2uL3fxPt;R+HQ%5Sj2!AQ?Y
zl}J2M+;A6R>a6jVdNrhNlfupQeVFPN$N4%~kZQAFIJ|`5NmnmAhQsvyV;`N3UM>rG
z&*wQjzS||v3%}#n;MSM`EJueT#viO(b`Gp7V+fU45Pvnv$ab*%B*^MmSFK_@7r|Uh
z7}|F4HuZsclgPHE1Oe{I@b9PxBvdPUWs5#s3)y?MBPZW(?@_Jf^l1y9;BlZLVC?4I
zw2X`tgjS9O8qm#0QbHt*kR3@BzGgWP5uQZU)j^A6QsDZWtDNCv4iZ%y_4?_8h-dH_
z6RuJT`X<nxV<mc7Su1KNi#1+xFoM}apK<x6klO_U1x>O`z9qN8QOE`$hf0~1jLA{2
zu%Vb)1}Aca03A+IDPVV5+8MsLmX+b7l_^b=wI*?;$tw(FCZ7;Hv_5nNe3HSLNvW%y
z6X&gx&VU)0t5#3b%hTuBW5?BP8CG(b8>AaSJ`XjstPg8vP_0gtqAwwyO{!&o(xxu!
z73&AYc%AJ2`Yf0^NhJV61ZWTOWiAiO2!ai@@;i-Tc(9%X2F4+Q#`z~2nH&>~H)Mnr
z6MTM-lcGeOl%<-OEo|Cj;SW=c+k>E37bfLj?@tQ!9wG=DlBPPEI%>PGqj<`#$hK!=
ztxDt+tPgbdgpNTggs|dK;Nl0j%4Z!Dn_XVwB^;L4zjS<ehHFFX5$Ib88TRtTi4Q3`
zl!#d&)8dtZp%e(|{BvE7DpfY#l>OSq$qXylRJ8mKjdv%7e>6|mJU6IVXr}JV4b{vd
zUS?h%P#ly(I}nMG$_V!DWqGoLhel`F>MsFqY7fS`?H$5_gd2A*`(l=w1?JXt7V;Oo
zExGj_3YHxY5O1Ig;x!v*<)3k#U_2<xqB75LmIxl`!6|K0M=<+GEKM0>C=76r>uvl#
zdT3{Pi!wpZmzjusA5&f~jKz*vXjIQ3lG9k}h)|^#PD$6L4iIb^yA{3X_f<0EH8k^`
zMJ-Q88-ny@F0&jLGR1xrXbSf>Mo|YUDx<&i7qXi!6(bRwY%hgF*B4K+&*JTy1-{J7
z0nVGNpAokONu>y<AC`B|1IlG6UKBwU5_;+Hb($Q&+empVbCM*M)RICRwdfB^WHk@X
zb$>E@*@u&-dz(Qm0W<{>pI#8ti0d`2vIu%39$KiaM1Lxnzi%F)DWJN6n`f|2XNO=@
z9lMX})aHh~+_sP${(A5@TSPBlLQ3$Nb;#p0QHgjnun+109B+(4PWu=e-uxvS541N?
zau{T#Yz^9vxyCPueFbnZrBB*)@7#U<Dxix_I;c&UbU)b|SBOGReqn3n$M-a-axZ{u
zC<LjNgudv8ID!7%6I4$etd0n&nQ&`{M#+mRH$N!vzrd1Vknos;DWaAEd2<bmw)(Pd
zgyN+*@)r@I=H+wR?@Qi1zf4q*BbVbNS1B;mww)mv-IPrfZ{FdKK9sdCdiIPDx0}u!
za?%kSzs%;_F>?<mEc9l4L@4$d;JHjkbAS9st==RQeR;wbwFKc^^)Vy>?Ywb!b@WZb
z1&ns5x!IyO>yOI~@7clTK5x<j2=TGZFd2D>2Is_?{Bm7=32<PREFSt)*>xJJh-X_{
zK{5y?b0&l}6Df#?25To)nosngqmNL&I)93QgM5M&m+8o~3`2oHC8gN63t%4=tM@5v
z`k;+g&?LyE2eQNiW^1QlKE$1ayZ|tT+;3UTl6W<b%^DfvK!MCeO0OSwdF4~xk3QBZ
z2^Wbx#L-0mGoXp(Eh)cYaW?O9iFAJ!kQ2chf6(PMb8Tx7ViDKmg)9pt*L&EBXbbE}
z=)925-V*I8r+I@}GAVG*n+u3Pf1C^M#S=W(oV0t>-ACW(F#lR1yL=;2B-(S-x1+ZB
z*9AEs<`aKMZH#6Mx#%nqQH*N`h}i)z%r_lhT#=v76L8Ub{hf_T)Y&toIQeq0Ix3+h
z5MZ}yjd8JupM5x;Zu94M7M0=E=I=5tQ1`juhIStF!w&Ab+}-vL(JoU92Eu5DZY-}V
zM=}5xG|<cTGT*w2MbrL}9cm1Ez${&Ez{(2%UOume8>!u%OT@$RR|j9vFamLT9-=`{
z$ZB*=fk6$RQ?k-o!jXvPHT21CqHjAbqmJtXGQ<$}&=c&yPkF$OLZcJXWtN}x9pU8=
z_bTI?2ig>lrgjf#Yzw<auxs;Jc%Mk24KlSyeS%H@L0=}@Tfr7<r=dmZUoN!YdRSS?
zC(H93C#oPXVyI)7V+dj%{=)5OWHXU=N9rsdnHYqak;pouex(ZmDR-L8Jxwu|DD_UH
zTQ?ApT?j=s?GB=xp(T%5ai-xZ(`DE>zRRKF{r|<@Uk1e$bZww$m`rejCAe#Fhu{R4
z-~k4=!GZ;M3-0a~g1bX-cXx;2?iT!R-uL^eZr%Iy-cxl>o%179X3w<j)vH%`uYT6E
zn81#fV4w9|^pkwuD6GrKhmZ$L#eHORUQaJWQ$ckdz#}NpiExC0fV}-sQ-+|2J2p4|
zVF0cL9Uf^q@Y+U>o^x|R=!erDO1}Y%6i)X!DK=IcDe3HclD8$xKOfQaaMlEBMyy(G
z8c)6~UJKU-k@oK17Jw8&SE?SSUBm0=yMAo0luXex&JMz#Mq7SFBIRr$2F+Zfbw(P)
z+zNZ6F@@lw!rv)~I-@fc{1(2!PziAU(BIKo{trX>tdd&bjQ_z;@@*fl6hCOHRzUl0
zmmvzcS6pkhcMrg}Wzep(RXtJ_!gaxF#eBlijS+zGdU&3?4y6DnKbD9NBqYQqUspY4
z4$)pBCeG@%B2wMKKofcnf6Ij=p1kv(jFYTYR`4+7{D)!lC%%C$yaWCMs6K4`4L&L|
zQ-CD~8vFn^G{RQ*Y_L_V*Xt?zSVO){tDK0u7wORI6ZFggm1JG=erZW0?_6ggG!RDe
z0!h5X@)E%CZ|>vy6gruK^UM>mFWWyy{(x3utbLd5>>zPhnTuMe4_1WRDPc&Q$BO?)
z0_Ecgb~^=Qg|}eNQ)1w7p{i>`Rue?CR>wMHaIv9%=vd2z3$7))QO03nGJ9~Sp;7Ni
z1wJ5fz1KktVUo`z?sY}oWwwzLq+0kb+CnTD5cOWwQ*w#QLNY-AQMpbX^@|J1Zt(-h
zf(V0GiPZ-@aBnQ*s-|fDVH4ADI|eC=68Y$E#e5>1RHzh|PkTiH!pCTLp}lvM?Y9(u
z-HB+bNb-AnUMOu%y}irpLVC^w6q{&D#%c5Hqp${HxXrR*urXTj*%)$E``!vT;4l4g
zmM9dbXa|U--98k|&3K)&&V$YXC~@E?rJK!9W0D93;h>Mr&<uFnMad8aYW4ih_#{wp
zjHDm+Wkf*d`EqhG)Is=%C>-+^Nm^pur-gsA!svEEt>@1?E0S{ku`b%B7ib!OX9LDi
z%Y<=@vGEE^wFyqC8yMcOKdHv$3FoxDw;wK3uUlH-!_G(->T*u21A>tg-WASaz#5fE
zW>Y7>J&oBM`1b0JCqG6&#t1sLcJWed=IV!UsK^k8_j{sQ#_{Wwd|%Pyp|bUf8j+%X
zq$$|mfxld(y1GJRz`$^1t6}BPas>$~t~RUT03nP0)AGNx>f3pA7b$*13{pvNcykQM
zpx@1WklBOlbQP2s&+M#59)c93KaqKee|(Q|Swq77f!5aMvU*fV(s(BB-HudcyX;fx
z;`6iNjQF&lE<9!Q0jlDJn5PL+Xc{xM&U}2z>Q@l>5T4`>V~NAB<pEA#dqwCc+L8xg
zJSsU-loJqc8!<3-zSmQG76gLcQ#dhW3x7&hnB_oAa-}GFK*sWkh6ll*mw8%^@sMZS
z;R#c48x~PHN6nZRI(W7U*$<l|qc)T6<XK9p=8n43M}uR$n6*eSCBs-Ui*~afNzG!c
znBCh0xcl9`#}R03-@l8@*n0ngZ53~ctX~8AqSX)Hde(nHc)&s3=kwPh{bfGw>-U*-
z>g^K+^K25$az3Aw-`5E|l*m{#5kDWdlpO44Aq*3eN+_+I-b8lvEHIrVshA`8^uwu(
zqP})0gU?9%EjC~`;vs^w>_qVRpJ+b8%*rWfK)fufC=8>*hnM%9h5Y_ijGve799|y{
zO+0WWp#56s{xGOh%hy082J(p#A4TcS$+J`nQxyJG%(|!+ir>?551Pxihfj|32-rsP
z**3%34GQ2qcIB7*&ayx0YzFB@`y>yu-rF}wX_--hiyMkI-`~vVHpx#qPMAX4yNEyp
zh4;D0K`L?nfR?~Eh)S0(<{k$&`izDn4r@4oWB7@A$GQKL7X~3o5!T4OUq1^LB(#V}
zh1&2^{LnKGF*)BbfXj!P6^951KZGU3d6sN|4@YTOyK~z4k}7*t@V>+SON0o|>TtOc
z9tH(38fvsBbD;ClRA}QE@+sU;eK|B93ishfEO}&~ACf2Bzn{7ler_>0I}1xVgFi}1
zJ}QAOO%m}wf7p4aUIyTh>oF+X&tNpeh{Gr(6Yq-mu@c4f^nR8sw^69A`}fpk$7v*3
zADMAg#>-$bcD!)=h@lECqT`q&9z}P8$m7O~vWPqNAjXQ63Ej*~v#yXeVT88aEcSy<
zW5!ug*iKwD#I@+!9-ZXM?<{4isDZVt^ajG@_(e_&D59SO#GV%JOWGleSKQTc!AqDw
z`HHb<-9rBC!y~MBpk4fhg`$2W6__k<=->_v6SyIdzF+x8;VeDXsP4-GQWop-WZ{#e
z-||4=z#E+K&?)+(pm9YU`37X@fpm&!LH{-9$*}1S7;CW)(NjBfy{}<9_z}|Zi4TeR
z;I`!92mjocRp+dJ2}qED!HBPVvp-Sqn=x=2Gq@8^>k;qU=ki@TMAR@+YvgyxVTZ;g
z6)1o7!r|sKg&$78KcKM~zWEYcgL%``<2Fda^n>V7XZYQ{R4;y{4H;M~_o+Mf9X7K6
zOsu^?!Q{>Wy2Jgny^P%)w=XG+M_QS$SwXzL3nC&`Ijpa*hl9IU3nhd7nxjzT9cBul
z%taU82x50Z*7K}m1~<--o*!Biz9C|6ct>aUu;eGQul>!xGNHSbr7$JTXP+KBIh=pe
znd^o*oP92&zPb9qQFQfBb%j%y!kdEJEhH?0|1toi&+66n2!q{u=*JOyMYIuo<7b?#
zh9+o;m0|CTy_WZ+uS<abCXD?|&k|LhsWr;aFW>1~HH0STyf2W}e45RFUVTo)5qZ2g
zUu_;|oTfn<pod^jdgTJxMUxFKXG1>)b_I&^ydc9b*ty|wejgoG;}PZMs9QLTD&W5^
z@^FzI5`UtyqTr;z#0_Qh6DkNyebdu-_bH4_g(?a`8nhjB(MxN5?1xN;)g-IGhw^o=
zN7%=;IyUcX%coWsN@dyg&3F#e04!A&)M!e8Z!a|8xv$J$tTN`jj0BCd4Fk3tX~RZ>
zg80|gdBuTZNeDs+%Egy;yUVJ~pdYAf*%S(B7N57jl+3oeVtrDX1t|_Yd!nJtO>+8(
z9zkMwEB>8#N^Z21XTP9f4k8`7E%i(9#i3EG!a#l+S}%O(?+dGJXA^EX4{g?~|B4WY
zvh{aujU%E0J^k$C+_-@GH+}6OfuAT(CL#<^q%x>)19soQ(s%gT<gQ&|m#4ataEHAQ
zBmKZ|i9wD45={>9a3D?g;SSaByuHfbO+nW|f&D0Nqc0m$!AA7q9aC6kw`sent91q~
zXF_w<rB8#jbA$q+dAx;cgxXdm#|lK6Cnsdx2TmLVb$|@#Jrw09uMW=oRB{S;C_$8f
zs3=Ue)uqHKIUi#)A5pCF7t?qo+=1UYgrR7APrg|ncTi#9Qos?lj1jL$mfJ>^e9><s
zT_t(??aiB1ITx1S6E^+z@4I{EO~`!B%Vjq$e)kKo@O8-Yf;PnsQL(V7Q|T33pJ?d6
zZ;Jih{*@yFA51U!;Vl=YbPz~N$XS`A160iN>{OtDg9Z-{qV80tRiH&9;X)Vm5bkGj
z_FJxi64_n~lR5Lo=?zRWihiwvrMW}H%EHHBQb>b_qR(!MOXyd1e4qraS8&2%K?;}0
z``;uRpo`QTb!1rv1@@_<lOYkoO9$DVvF{5a`um|AZ11mrBFf0{5r*}RGA8DuxF<MN
zD?>NmOLUNXc`Fl#G|Ln6;coMz{_8|O&h8f3doLCQ61B<70b(R6Yt~AR4HcsNlnC;J
zL<p~TMXNFVFmPu1y64J{t0{j9e1Z4-NXpt-g+_1qwt)n5U6qgifV7!Qu3X;(u`0KF
z$2``>z`|-*JrM2d+4xTav}Yo1E+eg=PiQf{Y`@1oG$U8tM`3aYVVsSdZjni&ZBY>P
z6C6lgEmMbaz`nEdKM&*j)J|wv@&yH7I4FsE7&j=h0coHId#mP~%TDB|uB6{xyF(PA
zCL1J>Jg2Y4`)i!{?Z7OXqrhkVr0X<6pMPEa1CJd+<im2oBVNc8kf6i|j4l|L1}_$&
zn5@@pa^XdENYWo?h9jGG@dNqwWDo!QBW^2k=?aQMU;Gq&zUg;1KYU|(UY@i44MH%7
z2u~5spZsDzap6$-_k>LT@@&Rn#jt_SgNMmoKZ|>;i+i>`C0ri&VT~$|W6@6G6ZH)3
z&R0KFm0HDNg&_MItCr1%Hm8MJIO~0SH%kJE%eRK)@-=wx{`IF81(OAI3%`g6n#S@#
zrv9$C<+aC2(y;stwwxAsxjjlu&v|+G6IK}+mJiB|g2<gVcgaCf`D;(E_=wK~rb>ut
z>v!369MaO*zBqhV1al_I;{2&aAy?KQ8sCDsUuTYoz$UH%#nuiqm9M{hHwotLd0z%I
zu{=qZXGJ)=Tpo=%D`XStsi!!vb>i-k^-CDLX(A{h%;hMOdoIrqJrr-8=&-(j0P^Mg
zsf*bzzWy4vAtC1i+k3yX{kN2VN4Y7=6r6Eb`bf)30tv@oHg-{;`ZfcXX{N&ohMgoo
zJ698guAe_YewqFE@YC*7R@-_;j@(HovsV;$c_bR`rQcW-0j!q0Tje6gZ5t+p{l-i}
zZ{i2?OEzSJ_qtN#sB7K0cLUl%6d5eskI~~OG&eub*zO9(wW~a;MdJF3>ebyG7$LOr
z8lTepdZx|PcZ$14V;$%ZG1PvClrT;~s<eu^N}(JUMIxVO(uRhwtUmU4f45_U$g$ly
z5I|(TGL5(@0Q2}9*HHq&Fj}-jn0Mf<pMc9C7}&%YAgHY8a@1zlMv;Y{nLA`+Cx`hx
zxCSHy5DKl5N{|G<wo8NzT${NEtV~qYiGYxZu!wLEt#76Uw22MS)1QP(-k#0VRS~;r
z2wM>Ljp8{by|xM!du<i^pH}N63>Pd-=*=TOn6tJ^3g~HvZ6UFupASPw>P4eGsRWTV
z+nWs|gg~!N!LI!072VjO_0cE`(|AK~$dM*kvDobFk7cUiXWo*>Mhv$+Z=axO{&_7U
z{8}d9wG6S~f6$vko&S0ADyaYGNyPt2r6E9xaMUOGFKhbb^Cvp$;g~AL&1|<liT{G6
z6~RSvGeb)fbut6ZW*0No9@?rUI`Px|3;*8;8+V0mS#Ho$vsv}yQ3srn^|l<%C?Pp{
zwJqF7lK;W>ijw}wYZ3dN8ZlJTbGaRgy38b#$Rrc@>FN6%LJ4Y8o%I@H<l_$df`&Q`
zq2=-9!puV7N6-2H0Dq@NN!ZAhtE#G?iaiDm4;k_Uo&HF%jpTBA9?0?lTg}m8gYv;)
zq*P+eTtk_iht_)Fe_ucr%*x72LCz);Lj7T=`R+2uw9_x=BnT6tgc^yo`(Fm3A1nYA
zoG~EG>-4{$qox6*IOX-(pCA7NcE%O~5cCH9|8D>1vs5^M!TkS21!Ln#PAALU>*dwe
z$~-(gU%iixESA!(Qwbchv#}-gK9i#DxVgDCIfU*W7#SOfjNNV#8&Owm5oeReN2UI0
zXizUIE;dRP5HFSqwir$0R?^X71PD~(o12?S<NNUKz_6Gw$h82aO<`eSbg5?j!IWbo
z^!f2lNmZ4KTq2?-u=~C7gU;<XKqKPl@((4o*mrkkdYS?t{!}XDx+K<h{=&n{vwcuJ
zJJ=}}3cbjY5;x!jC^Dx*$)gV7YiepF4GpuS=h(*hilYPHH{2{9LMMLv1Kd1h+2Y~0
z^7O%Q3kbj;lTTtu8FL-Ja7ItOr;-k`rD;RM!NKDTjGIV(gGcJs(8ZcUR;l2UU%&J|
z^<35w_`rz0@kOyd?2jhDxLG=czWDgs<~4}%Op_0d3`PyaQV$%c&^ffq+sRu9MH9~t
zrnSRjjl~t0=I_=T$>j1qEZa>^u2PzpwT{lVa+bM?ENY}cPtw$@gu3WLv6&JkCzX)O
zD^&mXpi6RqA#0TOk~!=3ZBOY0=B9BCp$VS~(_<M@SL1(1-`QTuat&g5&^Eal4DdF}
z=?(C5ijb*ujhR2Wa%(t!t-27GQj|&_PU4rBqE-7|T%42jT#&}2L%lGlMoEbYU7Rjt
zwO-{*{v}_0({n<KOKS~%gI-Wxj_pZncG~(@NyX!vtwdOiOK5_$oK*&az#m;X_GuXt
z&Dgkv4^7mZ-hWW-Rpj*EhMR^=JF$!vD_3nVrdlPsASLd(s%33`pM(`8SI*_S)+}!&
zk$8U`A7+=1CEdK!MVz3Hr%F-j`&CRATPC_g-jJnf)MWl-rMiy{1;+xDN-l<m&3Gv7
z?j!5>D;#c@Dn&=DC3+@-ha717;SXTYc0(~C7^tIqA+sZwlN{yQwbwf?9}h?PoO}Xg
zrHA5^E-2}Ehb9l?)A?vPRN!*!b}9&p`N@tMMp(%N<*DKmrNqSijc9gub~O5Jn73~l
z34MmX{GHyPsynH58`D{+X;5lpz3RhHEQu?W*7FzZ!Ll*6w2W@do~bf-%bh0X-^iHX
zek0I7$K!*-+CHG+(a*X3$1IlbZcBMGPmtZ?IJ|!7XU|Yh!5>lVG6@IFbbD9HQ;2r>
z(L$yA&CxQIo6~U=f|$SU1EG_oe_3cW0CdY-JLY)-z`k!ml{w*d35;#}o8l;hCy9*#
zm{_!J2Pyc_!J6ciZVKKB{z|JTifuURiX4K34(t#Q9Voule1)RqvgzSoOtRh1P-loy
zmHV{@4PEB4Y|#B;c9p{rEcYL;NP_b68IDSgLN!)=HrE<@?YeIX3^Q#;L-Y;VOa9Lt
z=#n6?hK5FblpJEl9KLlcO@8X=;B@jOz5QpS17ut_M=dTILn)$@V28Jq2!ep^S^tIX
zFWKaP^0zSm`Imki!++Sqin*MPRqKZ5YRe-%H`wsRwWejmw3q4!HzNh;o1CoTEmq)W
zb03T}*u#=C6V25%WD@D40Xha1<FSmD|CK=0T!%=YQFQ={uIIJNzyL92z57)x&l=rm
z;9zxAFfQ5qkr7$ji*6fj%q21={@b{``OH{et2XWM+T2qkYR9H=L1Y>rP(}pN>4Lp?
zXQbuf24MkWGFI2WUPOD{%Q-NAPI*x&@1Ya>nYIyO2<5R!SCAqfKXRZ*g)YZ_Lv&?p
z*~%Q6eHHvJ2X8EiMFbHEsY!d67}%i*{gU3j1w=y$&{-v{G&zMrjm;-}g_FyZ149!^
zqBeq!R=SbV15?NYbFVF814bhUpIe$JxTuF_-4o4g^tT7&>u#2gg<qI+0Nag1vpleb
z!*wH1^jKJPpfTSk=#jZHe?Doo)+j~f@$bukvdFc}?Mg9e7v*{2Qwup@@P#x=uB{fT
zdW3^ZEiF?xQfch4zV#F0*(9-f*>K$jf9A_W4{X7+={Gv<Z(uaDH2;VZAJQAih0%0e
zx5T$Gj#8WT@)yi`l|*e63J}?F(=$%LA=LwlxN&U!80KW&jE){wRDjSstv>l9k-g^)
z`uuo6%5Aribn;e{blV8eW?VGyD@tbMxM=RihCwp7U7iF1tf{+zDTkaM@rMR|$8^kj
zW_*QHISSD5E!>y{!N~!`h5|0O&A;Cg1Y{jw|9+va0RPPVD$DCbIc55X-WOP*=Jm+=
z*spngP*6st)&NA!Bx*F$XYW_>kR%}hI2?MPztqZ8fL&w9V;s^0c=R~u;ZbFufYJ4}
zo2=geCtQhaf!>n<!~GWC7*uQvk|FswFkOQQVGR!$Vx51jh_h2fxearE{BPP-DWJAb
zXbs+PgMgOb{7h~yN^8e}_hjWb1}A*YUavyt1S;|kKO3Ju1#_~0ruLu%<RmD{`1cJE
zw8iPIM^!=lf3;W8rhBCXTwS#ioSd6+kfW_v9R$c)<L#vrmK{*r`j?-u6P+wp=p&ym
zz5qu>f!hW7p$)On8u{H4-Q?utm|VfDL;#@Ne7gZpfe$*rd|Ga>tIDlXBAUHhwTZrK
zO}{H^6Us=Rpvg38WtzP!n+t~S5saPPWl+350(igPvP>IH&&){Wu6C<PL`W(t$4<}A
z4rH`E<ovO(dKNz3g_b=}ySgtFwkbS3tUYSs9TXXLG&kJK)NF>msjjYue*NO}HG@wb
zi>5F>F0Na5=akSVN67oG6<9?H3I)ha#xlZ3Zu#AEOg8|W?5TY#u}SM#CY{(OQf_v(
zNL6)}(R4wyz*?-qH?db*Ly`cEFks#e#mWnq2^Q;UYSLLArz<Hii({$MCu*KFYZ{Me
zkIUxpH`NVIMGDe3Ien9sa7(SJON-yhV~fd5q>iPEe`O5Zu3NH-U8pv3Y8n;%3>xrm
zFVk+?I`7Mjl1=3ts0@JJ-#dz%$nNX^*(0-^$1C76ao1|HtGa(+kytm2#ddXJYxins
z)axvgUkSiiO-9q8dpe7QKww~=2r*v;3O?5$Kt+~YdYKk8$EyCvxq*IsZX;&Ws-K<6
zI_k;INopqRNl8<gQUX}eZ~FCddGmy3wJOCE7_@1D=(X5a-nswjnUL&=;zEVKuf<}s
zde`<61v#BW0=+KfB^F~euvW;d|2w(l9KO9@#N%Fa7yoWwW?jBa;liC<^?im`y$oFE
zd~Oi=nZo%;dOHT+o8a%Qxw&F^`1mvL*2p4@sb?o=Db=*yvWK-Hq+*!T(5-&OnCNJI
zul{~KVF`(*={dsql_ff(mkdH%=NF=62%ipr`4WfwABA#UvqQb$gV_2*cr9cfHj_BN
z@86+Av!xdsq2-dQC-4pDH<3fjccBz&+3@OkcwtN>vFCHk%M>Vm2hh12hKpXAF)2^0
zyydJ*kAWCOlx%EiPfyQBqi4?`Aea*0pglI1d*dz&;^0urXlflVq9U=im0ZnQR{^0-
zD2=(GnSyo)MaKZ4iCngu=Z00*w{o?jr~?DNt^>1L;gcI=$vj3?!zmn^A)(aH7!p;;
zWkWVFAb<`N3$$Dh&{ST}@|c<O(fIcmDG4|P4A1_!`}cdpYA))p{f$NSH#NdI2K>39
z!p&<|$1C_`9+wzO`?$nJd>0!pXDI@}yaaa%&}c+lbzkN6@>tGsl#9UK3@nwmQPj=^
zGK8_3?0ju_zCVQahT$ee``7OPJ3vRU@SgkJNsFyQcpc043T=;LSyqQ$i`2gU(+st$
z$c={!Zl9J&F^$%yBx+9AVP~1#vC&b*i$%I{l<Ucv30gz($T*tVp^?iJPKTl}Lf$&f
z0v3iZS91BtpplUgC0$)6x0|DQhtp;{hLmP@6HkA1h}1p>$Na_0rF8Pd6@B*yyrPbe
zwfWSJjfE<zB@NQI&Cr1altXsRVp%K(Exm^}T3W0DQTasTWtj6txQ{+hc+&3Ace}Yo
zjpE{km8zUZjd&JA(5*cX$85v^<4D~#NBI}RpD=~X{1q@hEZ21V`@w{K-zdZLM@=t!
zxi1@~sT2B3lG*HEWE#qxJ`R17oj==v&!GiT%cY@7TGYE5EIFM_J`HfQs&So~n02qW
zuddkE?Z=h7<hK9i@yTr`LCFp3z$~8bj>yk+kz3?g*i11OD9M-;reUI+YB9(TR3fk*
zLdW?4hU^|px9Ba^S;k5bxrl!MerR+*+)yWIk#D>Z!9=jR0MIWywyPe1IV+s=+$}pW
z0eZ|NjRLsgx2Yw`0X>av=2v@)F{!Dk7ap(SyQH!ivX6Yx#;2_w_i+&#<h4M|TQZAf
ziV1#RJG%`?#^;MII&j|8`+-KHogq=U9?X9|tt_s`wfw`EUhB{JtqaQgt+yJ*nDgG9
z`|;dBq%Z&&z(L0_AMcYnR%SvTvq*8(6%!L{9iw{(JBdL54t7Ay{=ZbPf6#z{IV6^I
z>sRtx;3TT_2Kf8|hXQ-z^;ilc@mKiu?AYDiX7k_QOTqx3mGq*tME@u16$zL(O=y@e
z|A}7(;-23DE|LH3?`wsOfJf*^`0FG7|NQ=gP&-0N<QGJILYM1Tav0#qdj!YlBD=kD
z$r~?WeFl|}zdHs(0tpJZUB7W!%>QcB(#jrtcT|;r@1eC37{~&8{9yt_Jo3*w-(742
zN4=EE7JfXgHr_yI&j;&YRBO|fcj?hIM%zXPmrd*qyY-k5V!Hy{Lsgf5>w!jxv&S)6
zAwym+e^A>4?(~T`re_u?L$Mg5#yoe}ZrbO*om|XbUN%kcI?<wTdJklLzgz-(o3&PD
z6#96Sl{qxg2&-OW?;rP-{&W*jkS`6rptv+)F#J^7Oez|n=E{tv{P>Y(W#x9qsb%z(
zQlL%Ql^IUxA9EY^!DB1eyPen0__mr&8Q%@~3dXBygC73npHAsoYA*tg0O!x-6^4gY
zE?4VtluVjizy^*^CXrEU|HKL~z(jn4ZKsWWyH3)mJ8$6ITKXP!61M)+xi3zj!?(ZK
zd4V<4_-%&qiUZ(6J3|hy%m@__gB~x|DrIFQyPdYECD2Qi_#c92=S*|wfEdavyi@zb
zQUG%l#0x|=Y)ZreB*YaI#3b0KRsV5njs3@|m9q8fD!6;o;GF0W_ypM9flJYU|0XD2
z1H=d5`g@w_0d@GjZU?vv`y#2h0UISHAjG(81RS}sP%?o60-o#xuVLyJFt+E6NI(;&
z|6bj6=h;#bKz%W&QK3%iKoeqJy8y%>l33Ojs4vFdTdR5KHMSE^;MEuSXc`0T-V(On
zZf$5|0ZmMt0e6D`x9oDPoh{%`**`dl86A~#<DdKZF#2l<ZD6=RDr}hkHE?)%`0K#m
z>SwW!2Zsj-(>zUEX|0ZC_*p-U#KrTV#`|k`u5Pn|U;FLk4i(Lg9>+Zk`gDQa&3(o@
zeNU(K%r)&@dpQ>3<QN{@WVd8aVuuK}wz0AE)$hh~PBl*R9+OMwXWD-x{^+t^w<2VA
zou3Up);*j&K6SM8@=3rlkpV+fO>&G-lAW#nO7l#Oho4io6GR}W%9Jmk88e%J)ih9T
zXA7M!m{5YSe~})d8gTX?<Y*MKwYzi`!|G{Y!!6x3b%>vyn;AUc?8|A`4_mof?qB|u
zT^oTwnX9D5s`w&ZI{FVWbu=wuF?+`LVhh2{+PwObT0Wi8us0%9kl}a@&}Mm<{;bkc
z6}b%4xNM#_{D_%>{aET;v0XlBTzovk9l$s}Z?kD+px*F7_ESQoArd8%V4G%-z7DZ<
zK_WkIJyLlK7I_!)z{t?lL!MACRzy9wK%-vR)?SNW2bQi}CJzmV#QP=wzT0>240A0e
zKhWT?Q9akxcmN<7{wp&Xz2&Ioc3D|jM>bVL9+D;}i})guhij_x`qQ!fk<8RYrp+rV
z`b<`pol#v1<@~Xj;9MhC4eOd!zTcyx@69YNIarrf8>DWI7bs^`;I`v1&<DYwq0v!J
z;j}!5ku-k!9n%&x^+yyPU8iq<S{o0dVsv8(bsc(UlmuM9Uj^kYC{kCf!tsfJ))xPU
z^F_k`{mG&#P2l^J*!x8dwu+=kZtewE>4A5f7B|c9SUq+ulG%+?Ff_78mEoy(5d0sE
z)=ttd9@!#?d_0;)%n$cVS=t(myZaTDAP^|Dn;^5`8rzf?O|zK`w$RG;*0M?W2VAt_
ze#nfY=PAETxixSx+sA8(P}0yK6)`in5DsZLCoEL7F3fHj=_Dk--s!sB$U8Q*=A!h@
z&vVLfk-5ZXVA)s~CuTFY;PN~FlVtDaXf@S%`w2aR1@%207Z=yuL3fim_mV{hG0{w#
z3lj9E*&i4a?2nor67PxQ>ermbZcbNd^5xTGq@=aDmTmI^e^tGv+x~M0Ldh-e_~N30
z_*B;0hcFW6@gR^8MkTy(*~o50#CtO{Gft*Vw9P-~VyteXpkxlGjFI!sbIcMSG=uWx
ztBA*gEaJgcZQii00wimcZ2Au;sdA;*B8<G$bI9%cfQP9{#T+^~jVo@2V;%?W!v^V?
z_{iZ@<bqJLPekTcasoDn6|^<lfEZ&`FAV@THD7tJv5-qN*L1iVr*;kd-NIDi)Hvkp
z1is1rFt;mtB-a+ajSyAfHSAKGQ{vWcWZZUf8St+>F5bK>#umk?yF@`-Y24L`&W8Vb
zp1ev-s`A{~S3MmGn~4K8f*Da~4dLnetmCQ5;S#oGKofdUlU!xL{qZ6?^)w2%dS@{_
z%>SXYTK)iU%$R#&{?nAnNOa`d<vSm~mgY#{Jo^Cga~c-md+x%Mov(Oia5*Vl(@c85
z37?JRTPVrzDG8_4h4ga|9!1G`EZy4fu}hj_>Pm-XQYanA=5W<rrS;b9x&COORy%L6
znSq9(w!#sfyUKI9(C#rVfX2M&z2;t(EJ~O9f&Yr*Ts}ZGz+nu=NXYL#NnA_nfb$tq
z6gipYJ>@hUm6N=Dy#l6wg?nI8=lBF(^_gT$qpiKK#p2BcJYpm%15?IHM#-6iol)jf
zcIueqa&6`hs+ZG{Oap$zP9FsbTmRrFtGwl9@AXALljm>J=<nA>SV<pcAbF<3!CTF{
zKIyfay!F=6Zy#=@OI^LDo#yy&2RCx*DCD~m{8$0)rDl>yZ*MgGz(?8qlKAqkhPjV9
zC+GvcPL*)pXp+(4A5mQ9=0%RqcK5R4(&#6=lQJH6p#%F9Ifb81H@%uhiaoY8$q-{5
zWj-JDN=PB;u?rHGjQSlEmvFWuZch?o5PrOcK&gnLZng4Q=HfiX;i`~z`4CcER(lt*
zC8xL2SH4S4U6B!F|B~hio}KIrpZRp!`qaqxIQ0TJ_x{R?m9nf?uA`KC9dfvwH{{a?
zZ$?CVY5Xw`i6P3dyMSh^7YIJAdui;QV`etUtLom7h8}Ku^dySQ;&o)Z?H+S&-3jth
zWq-?Zzmd_6{a9SiT3&kJPfA_k+|GEhicrGKZ2ZeC$hC{tdVtCEk)JZ>X{Uurvr@e5
zuD`Ud_hTFE$1>yeu!448>ozL+r;|D3EsuZqsqBM4YUxWL#;_1RpY(CJy@W{5&Yjy5
z?n??kJQ0tQ@-Gp*b(VGa3f)nQ&Hf*4$97K>#ia8};@1cGuan&LVCpl<<@ET`_n*0e
z2z2KzmW3v_CV$lm1v_?eufz1rny6qS!E2I~H*@ORY~&4wmjMnJIjeuE@gfhyTfPBD
zQp+sSst=^S4%V$?s845Y?3cx|0*<%4(0Gh_=<J{?6g6XpgPGn?(jq&+rHVbR`bK2V
z9Ns2Ey-8vk!x5GKm)AJ}q(7ODdJ1t8HGK?TdzS9<w8WwzIg)j5RG9liQjy_b0A9Z+
zP30N{j`jxVfg8cO;yQCH&Z(44+K4{rwtn6H-77h_!+B1t3E|R3n;b*u%9%}TCtNW+
z?+@^jFd<_T&M(!k)9mVo5eRR;E~(e?S_lf_V6M7v{qe#T7o?gjHor7Nh)SR{pWWuJ
zz|co1%}#FQ6M$xYw@1xD0rFYTdfK2~s~+yaw|+=kDVSon5D^C;68fnSV>OFYTv175
zoUBZ#*!+c`P8r7E^-*|zdVuVBZoqr}fnY=%m$z1*IJg}5>~bvCO1A!+u}(G_YYvgG
z+w(L!nrh`wF)Bd!Db?gW&^Tt1#Y>^iREi9CiEzT6ZNI;07ha)YnVBQ1J@kAUkF;kf
zl@Yj0W*RK2t?UP90hnj+S$|;ounjO?WPxdGc04Bdl;qDjnkK-&HbeU~pD)04$x$<b
z?E*tUG)0{0B3!qR5+M=LXFs;vMsMf9@P;nI`Q_B}Ix3@Hb8fPCpUkPjNL7}eNev@a
z56mI194CSOp+5+nd_>x)gD~`CD7*aJDukVycL;&rv*_meRcRTGf3=)fQKBEu<RJe>
z%xpNNQ>wI3DPIz3)Mco%d>hlLNj4Bfk^2z$hUJX8ig{deV~@$FG7ueOvk|WC#cMmP
ztbWD*K^x1J`bv_j0vIjD|M$_d=&NJ6Xws$YD9GiOmV!QaQYK^wC}Sr*D9<O>o|Mj`
z=rkx=|G56!`gkokcT_}obfMSL+H`^<lT95a>}YtGQ|QQunEMU8W#P6bZ!bC}++T%Y
zNg2z{0Oq3$mVuXeSL|ok-ye|ahjeCw-spG9yWg*Rnv@HFqVN<lVB&l7J63yK%zR=(
zik+I}ZFF>}0rzs?n25W-oW9ILqqA*iIEW{}QN`?8hPnPL>Fq*=w*a9EFXQ?Wz-(a|
zkH5_&_)-}?1Rmkj`h_$-xAG&E!3s{{w~iEdFAwFNLCfk^?sn`Jz{aAB_gmdO5go(N
z{p{-IWHoC&7EiO;_)Dt9zY6X+WabQZX1PqyH_6dtsFmj*z!8dR?Z^QFFLtOUyrRPm
zCK=A(qzaT4?TKfBX}>)x?>S2bUKu-jQqB%Qxjis8f<H>`Bc5^LQEhz;w;n=VVeMRH
zn7j~#_t77J`up+_k$0q~xzQKGVeZg%fv4$Y2Rihi|2K<pv@!pK0+=ur{avWf?ZWm(
z2e{E1U+X@~&+o|>P60cOiIFr&TzSh?vp{^dR_pU|TV<5wAEMCMGqM4)lK9H~SPv+N
zv}ZoMa&!eE5q7Fj{yyxOikgn){$=$pV;n+H58mK#obSV}Gd#;1yz>L_EFpo6%p3)|
zPI92#4pZC`aOApCa}cb)mB-P8b1;V+kffl~S3nomD)j3zev~~eYl4EZb5T;hUHHB{
za&f!nNGI$Q4SuXkv@PXF?*_IYm47v<D?$ysqJiN!Zft)_)$QhEX8nz|w_1M4F%&W~
z^h5(yiT?e_NR$)<kw*>VQ6R`cj-?&TGt!aY&%mi_mP_Z+KgGw7loIi1F{Z0CmG*3P
z(I;*^5m#o1(pUXCHrMlr1g?9X!iL|vT{zv(7ckuWg;%qney!GDD4)s17@AzspYS)y
z_*d$pckA`U6xKfccB;XPh6v{t<Yl3Gm*J|1<85wFjN|v`8=JHIKL0w{@jTiF)byE?
zK#CVcUUTv4T)i<Xpwz8~-(@I>N@drKQjqZ)M4xTHLYF^fuwBBNuV*0Wbj6mRqx#lA
z)gl!hX+$9Wy)xe_c4wI94Zdu><-)H43Qg!(gk@dy{b-Y*d8){OIFNrq+W~awCYtuT
z-U^Q`(QzGeh8r3uRT4+jcRo|lTrZ#YKbMueU%pJBZD=%QDE!SugIE|k1fp-;KA$?V
zvnuAJN2W3ubWrVMQb+LmI1qPQWdfCT*mMNekTX}k)pfo>=)g22#yXVGe5_ghdl1~S
z_7@V4hT;E1_jB(QV!r{!$REYqxM*Yd$UI#6seH>Q#<2RC9EzT7TZqu^f^D2$@CEb-
z(WuOzhp$z{j!%n4nAO~=OI3%G$MkzH&d7f@AhtcS7eHIz%hM0+aV(T%32>rc#)XY}
zo}1rzYi01gw0@a+c#Popy)@z&Uriw7;ToD2YC821j7J<1?uM_1hRO5`Bcs08Yx$Q&
z{8BGQW%V$>QYRR*OMWr_{kFX^QvVUy4>+?kx@S6qdr?MZ`!Oq|{4*h(>H_8R?RsAp
zGGNQ0Vx;>UM+;Qsl?1ZQ@ajL02p^uvoqsg3`~HZdAEy`3kZiBU<nFzP5gh8Y+jUML
zh&=Im1i{=8{2Y`$JxE!;y`E5nivyc1U?R?s59a~rhYVBge+z(Nr6LWDqR3XW&wj@7
z7+%08`WH|#O+mgxhkr{m%+$HeSI<lryVj^|J2Y9V7LLqUz9iaj0Ob!e!Kp$GlgQH^
z6yG8F7=UNz)?$1j@Vv*w5WIgf<^e?4ebqXV`2ScqApIE270<FO6?cEE=tsxVHnda3
zoOAip;xeOuooQ<EMJ1YaqrZnvv98CZ9CX!9V4@ZZ8~%H|dP{Q!#m2~3E>n+)cO`Ud
zf$;5`5t57l`fMP-_Wti?G`h~h?=iW@27!yf+S&#r?4cMVx)Drr-(dv$Uk?9W_bYZo
zxMe204I>|g!z+yM#%sA2yYkiHQWf|n7)jy+JNzqWmX|Feg*R{8N!k-L9(?(MT^w=f
z)q+0-kTu~>hO~mY&E@y7W_cTi+Yj$mVhVl_>04P0Gm<ZUQTdpVTb8hwEihJOum34P
zVbn=(=R7@H?sX%Aq<{8CJ1A%ut*Shf87Xw#YdnAObQ<oT?Pk?^65L5ufY@h06a0&U
zPo16?H(L{mO^yds$1@im=L`AthN1a=Ou7#K$G01|(fy2DGy;R#2MZ4Ks3^iQd|BQ-
zb{W$xqT702$dP4;U~#hkZz%QdU+4PgnO8IwiRIPUujjH=ZXo?^{GJoD2qg~|z*Z+I
zMIyuBLUYF?EXZntqjG^$@GL~fZJ!${caD&s)+I&1|8%m}bb5V6O&8<$`?*#7)Z>?w
zdc9~|>vOPp0YO*f-B{O~&T=jiS@^a~Wzi<bZ;Ku^snjcn-F>l0=C*10FU-0E)mR>v
z$Ha-rjROt!oqY8{y>6F}oqYz^j@m_@fCGr%7J#d*!%?H|Y0MY9zP%(*@Z%<-$ms3x
zbwT2IoGZfBI@;c7ew#A6nj?Qq5)QE|iqlHD29h;eddd2`e$^T>bd*eAt6G0CIX&{s
zF8RsqX*v~>ccECMkVxz29f$R!h5cOI8u@#L8aO=VN>`+gMJEm)A;nbqE4!$yy^n@~
zs7%|NCX{)Q;?l<B=zR*}5c_?dv*Rtd)D(2=Wao&@Zq)=xFqLfGNmESew7FP=zG_$5
z$bYN^pBTj{Zvcl1Dzer}O$qzRE4lLv^SHRY1(%NK2X)vb3w+oh(OiE1rvGH3j%Uwo
zeRiIx+0v@jIw@Zpgm8Iw8d(H<BTr*8AiQblERawB#imS*T#E4+b5b08|4FQ4uaZts
z9ODb|oTd?uJ*IE4DvdQVL2w4fE$04cMOu4*jDNG~MPq3NziPBT<%~h6Q>ky4){(6M
z2MZC>jlnmY(J&kF?w?26;^KN~_2`rC-_4!Ap0z&|a|GFsH~LE=s9Y>4hx3W*wc4jf
zw6lIT-t-6J9arW|I`_xqdX5SpEM2T`&TCAge@d`f-l|H%<Z+Ugs?2ZMC-!Lmh9%R-
zUCjA=H+~qJK)0OP0iwzd4R0H|r;Aqq(@;#OTG@w`R@r5K5;TIhk1oCC_CB?5*69~c
z7q_1dL~Q}yX5#>2Q8Cc<Gf=AZOArQAxn6qq!!c>cj#U6DsF-29<Q<xry&%5}nIn`6
zy!!N2p5ANLtg*iVdtE$?@q{L3a&*2zUpWk{(2;$rr$Rho-P~k7=8nrm5sNHlrxHeq
z5&vu@2wh!UUS^m&Og8;x<KS1K)yPj=<*E0fEf0iZG>!8u`=y&vC$qj<uk)c<&ftSs
zpgBr^=+Mj_w_lpSsCi30x(=q#osxwHXz0XDKV7Hd+YL?iCj``I{gK<x({){yI2R<L
zAwzvw*P;CJ@R5HjzhKo!_n9Aht&4>oW0M6(HxB3JT=N)t%*R<g)4Suua??lPjKLrI
zc)!CXroCYXcG!)jyaJGde}Fv?b+l^7lcH5`JwL^!qZlOR$F3?w?DWmW4S+AHi$fq+
zI>eL@F2=(&Mq5cs55MLqCN);JHWvtH`4~KOl_Cl1{lJ4nif*+<#$GC4fHP$!j<IV@
zEx<$M;5M~0TF};vJr)%0-;0fPBpvKQ{g5-{x_*5$k9#}F$@Uxsg`Vc`@2Kgz6QKx_
z;kXB!j=-Nk>teSsrRE#5trD1g{~sMmXlI46l4X-U2C3|mghnCtQkYdF&!zSR`rp+M
z{tz@@7h8>V5|bj}#MeRh=x}{N?M5!gl0ocHqV$topbfIHef)maTOq#i*UzrVC<k#T
z{aU=ZM$g3h!^KFEe|2(oNw4{oitWg()}fc%lHq!AL<HpXXWqo(>sE0$S@4bwJ2`9E
zuS={VDzlBo{hc&_WbW+Fv-OQfRiu5JWUmR~bsT#%Vpu}aFM&|$GKZu}u`2W|rw~~;
z-)_ogJ~GrM$6F0WbMugfus>B}w%2|R;ZN=hH!CPIpBmC-XCCDxuENFj?4AFT)xeif
zeSKL9-L~b9lg>@5;r#p=C!}X1r7U+JAuP#bW~dQ&EZy{|4|ndTi$Sk^<rM2IiI~&E
zo`q{CqXj3e?dC_tq`2@9qap8k5+UIrNHrgP{}mZ{fZ((%ru*Ud2%?P_?F3WXzppnq
z!GvkD%NZ{mRKd<E-gEP_t5{Qcm9~|6sro_f?ldM+gnJ~o0rf{4XUBJUNqdzwf1r)?
z2CyF+=k<4`)6dB~b1!73tq+_9%y`C<RXSW3CnEgEHb$}%SKGUOR`48IB>o_ofYBkx
zt91WC9pfP!_con+@-GT9A97209peqkEd#zw|J%#lr7JMtqx(w_?k2`}wl$zP7PUNG
zKp!~na5{{y6&C{6al&J&cAJya_|c1xrfUpKy3Tv_)~frrs(^P6z2S%Z@^|RLH>oRA
zDfM07S^4LeA^CWbL@?;{i+ZE{qA@HuqKM<Cv@RTui}#Da?$+9XVusY9%kv3~4fvpT
z4D6_cL?jsWB-BixDv9b~auSe?mXL-HBUn^Vz@3bln0SOL!W*962FA>#0S(RP?ku?7
z-JL!4Gl;CB`(4iVwy+S;|Np1L|KDY(<^@(|3<S`YfxK-@0ds3S7>=jMmtI-M_;@~{
z*}Gi>>!@9u$EE8nPHPBos6+*aYY+VQPaWq!=B^&b_4l2$k?cNVMS6NYCVIB%!rg0g
zJ~6~g173Ls<oE9b3@ZO1e&TziI{+vhGx)KkMMU%szVjR{U%SswH*BxB$+4kCwvQ}G
z|J%RtuV@)3TorI-KU!PUool;0YOvGqzjAX|y4Rljsg>K4y{7iH&Fw(6O-zpt{yA`B
zf|-wxKwMnBbEDG$0$|eCF;lFdui!Lcb93|gdTU)A931$T7PrIuUFo(%4fo>Kf38#3
z=<|rQ3WeV~dt?%Yjf|;c=33n7+=Zr4!rL1iNGk^FE%p(Ci|)FY>*HvdHhc~3m+Q!4
zt|!lF@C`MPy7c9RN`hbgfAf_dr#mGP_QcnZ4V!-W0s|vW68y`%3LF!x)85P3%Q;V#
zg3LL@xVmRD*Yt2IFt+p8+9@*<bqS0g9jhMG!AT}f0!Ac9pp3&DB+25-E??_>S2~e?
zvY*V$=27f5!nT~OIir%lI>pl_Ff+5F68L*WsjhmIXQ(%V(EK$YWH>$-V{ToDm=IW0
z2pl4*x7zG-^Vb(u!_u;J#de5+(`KWavqymv#$37eyVmP(5JiMQsyJ%qzsN{HWN270
z2oT8gv?=g~=r3k4g`U4nEi5ev_)Z@^%G-ZiK3x3&;6m-$kf%*ZaZyniv1Pmr(6@%x
z*lnPFT!f>wwl#sfEzEkgNFx3cyWLS-O@(Vdpj+Dm_*G&+e`OJlmK&ykJLA9iILvz9
zgdSHDmW!Eq)|w!Uin{IMSwJYZ!rp~lD#fY`=H)Fr^!GsSM^lji<Oy~IxyLCCHVQq4
ztUM3k-DyMKY^dn!S_7FF>vW$l5=U=<#D^U%f<<|G;$*6dtB=bNg!x_it5o<Pp<m#e
zlNIx&dh2>1-{9g~RKCCBF2}WVi*Yo{CGwd9vq07~Aqk?J3JN{_2eoWW81x|cqxs4%
zS>je8Y3WTrdE{p*HEoCZbh}Y*E4)gl)76b{K7imq?y)D+gUF&@Rq*iWxPFZjY_V8l
zW;J;B6N$0HZj679N<PEF7Zzo3_Sg5_pWa}4Y^x}AKv0%adH}!m%~AE(B9J~)9|@b$
z9>i%pjFaf2Z96~-NT;IiNXN1s9T;hRAC+pL#2raHDB>Lw);y34?4ZoYa<OIrceY$t
z&<W)zgs!Y&B3r_oR6<}megF{xLZwv%^h!Yrj8+~H09#yKTpdW{+FvTXTx|mIZPr_@
zEYo(Imo@ah>LZYT6{Awx)(Tn-F6ktovIFFwBLBDD=WU(at(9h*XG<y%1!k(CSF5=<
zjVNomVz&4K@MWpn?TM<U=F(%YN_7{4WIGRVPxY#erSKK*&p;1Zs~D>)dqT0702%By
zZZ6IN&OxnNG0xdyH7169YdVP<#U_jjPEfHi&>l}lJAll#Tx<2{rN+mdH;m-p@w}aD
z{g<trFE^VjgMGV?c-<C4LIP6^&ov;4j^<;FM)G&X>*cAe@UXmXwKp7JEy8$491hSj
z2Q+5~IuP0TGcY*EW6cb1ao)9*+O-V87|{U=rqz%=(<|Ao5w3s|<PHA~5fE>#bCHXq
zbbCslR4t%WAfOeb_TWlV$#w(gt71TN=7E~@@6<q834$DWK-*xT8t~_j5DJ)Lq5J_T
z@FW3fw>C^u=QhTBdxot_1`xv$P{>GPF$+byo##Uv4>k<CsXS2bCTHcwL#ll-qtX=(
z2E88vXkLy$kqS_0BT%6+(ENqG5GcWm*r-wgl;EFm5CZg(9r%%0(<%HWrJe-I1U8@v
zSs1UAd|??8hbg7$VvueHX}43jYyx;8`wuWW#&>63s7)v3t@qzIa}yS66W^5gxjdft
ztrY*TYS_lQ4LGhNkX5--Y7j9aO!Ioaa~x*rn?GCcDz#o~HJedkd^IB@WbLWR0BCLh
zKrWfJrzN;SowrDd5y4`qZsawgtS=mD-Ro4N`kFMJ=g2A3&e(LsdV4c{fA6{}0n9dU
z_&DG8K4LG6!&x8;Eealczw6DB`hSP0pscKJG@ZBUwz0M^OuHI<G(eqc{}~Sefe~8*
zEt4s{2hpjO)g<bAE^s)k11aXlvqXj_N*4D>K=3$%eS}VF>5hwLjz^V!b%m58fC|+L
zg@<~>e@JLQTBtrgX*zY>Y`dNX6f;Fi{|g+?4+O}sh?xq}Ul{O`?X(>-ceRS31l_fM
zR&@9bU^Y5q*YCff)Uymxd#!{57)Z(qMRhzt<ang<z>^wW;J>L^45D2a6~ceieLwiP
z1H#{)Vv0A22bU=EQSyqB2mu58CVeqNB*wdz+`@qpME2t|48r_6-<29C@RddLH%&kz
z!d^APPrW^lLhqe@V*owKg_HzlD3qFZ>OBI)=w4PeR0s|D$(KKsU9hAuhs;+85Wt3m
z@(h1*0FO(v@;!iwq9G;(nx%F3UsDIE{u2>;Bl>F-crBg?4|sl=cHy(%y(FNI(ttJ2
ziWmxBz^F+3ENmPkgbqy4kiF}RaG;Pt!gndKC<!_6R^oa^o;W~1^*>()0&4CCyjLyt
z@!5ThxBsqkxFnc1z#Cj*pcOA8LaEna9~y??V}KUr{jqDzl0zB+q8Y8y1PoUp(6_>%
zZ=^neEMX!-01aMD0E$`O1*|rL>FrU?-N~X10U;3Z=7YY*daHQHLVv^i0bRr@GFdXw
z0%pW|i4IUo8mKzn`*(SYEbxLB--<o~ieLqlv&Cdkb;GDIxYNx3=%a;h8thHTW8Z&p
zcz6uhAXOC=quSb9RRDBaY%`E_0|?Zdc=X_PkpcUZfE>^MepzaAD)#^&MXxx=!|B4<
zaALt#AoZx)ZwHT$Jb)1Ph-UwZlJtSA;Bk`YPF9*M0gbH#GP=5L$E%e9CeHD0J8`K%
zG250;I(b$Vcv(v^)ni~*=3tnXd2_I{<1EY-sT5lPNvS!3C=1P2_XZSP)^4lDeX3Mu
zec|uAMQB7oE6xHlL88A5;MvvMZ;qEtc%4pa8tir#mF4-j1)Po-xARa>|7^Zb<#Vy^
z2~!V9hy*0)PCpX^jEXqO4AfggDri1eR_nCt(r2Wn^>ztJchCgOX}_oN3j02~nvx%z
zuPzD{0)+Y#n*$cTK;gfI7Sy!0)A{eVVi}QmFNfJy>^Ex6rb>XN%ki+Zei29<Oc&=p
zY=r{!&wS5J7qDZh=#rSss`#vye++sMmuk#nF-zD+KX*AE&Fjnao%=Z)FRB3Ahx+Yi
z6r*DBCbI<FCxmJ|&ihUbAHZf+?Pj=*0z$B$PPdN%(rr%yR@vlmuDrhKv~?+58y6x*
z3@T;Sc~##<bzp=zz$q9NmP>UDlesdMuQ_Qu;E4ZxHJ{9pay%%^U$p5XG?7W5kM^#G
zbtid{ZU@-_ykVet$zPS;Hp;B>3wncRKmz1y4)fVR)Rs{m>40!JozL{RhI=3O@gS70
zZP$QV(il@<0E|&hl#n4Q&vu;ha)?M3)5*1g;ouLz9pTuCKn)qo;9qeHzm4_gruToy
z1eg3^obwhV5d=4*ResCBk9oK~wFYuXWTf&y3m*ZMJC2cItOW9g6C2m8MKU>{95J)|
z-;01$KRKCKkVP?pbb_2kr*ji^zDp1~#5Jvjy0!dS72D|<W9cKXOIr67Y>o3K49HXP
zl59uZwn9+_Ux;~IgwFW!t(~RWjaI(eMWaiPhZ~2n&jkFgHExr9^68P*7!{wxH=Y?`
z=*mtxY|;0AfU1p0s+vw3hh%Z96gImt<!iq9!s5<#qLpt(3a+-Wih*X=%La2{faR`H
zOgvbK1^&#rfcLa{%1HYwan-lyLAr8#B(6sL{SBQ&r4T>X3Z-u$x9Q*#XMwpOF%mAY
z^I$KKO2ZmRlm4Sz6qn67%&m$ql*?Jl2#Cp&(xlX*to(2yg8*6ADuW3X@`OKA8Z^mp
z>E%B?d%PU02U5k)Pe>43LUpB5!Sum$HiHy;J?FLJw4es^c1$kkMOZE-yQk2ZfFSGz
zwMi-%ncvE3tOyVr_tNMPq1U}^$Db;dx4^g~W;$ZQpzpy0Rv_Y6Mw<X7ka|3r9s>ql
z5}4;QFyEpK5FlR&O7%m8B!N<-F4fzHK&iC!n-4%K5}?$3P-e9ir5?9k&@ehsiW(?o
z^34wqqk_y?-SC4?J776McBV*y<S^!>pO2}4O@j?k5n-QzJYY;Owon$xfJgK|MaKGw
ztue1_g}FD`|IBAg;HvU}x1l_7z<d=s-_5f_fLLjfiU8&;1=teLYANZ@#d`Z6>xRpL
ze^LP3ipkPsI7*nIu&=zYOZDrzF5yN(`~)<%4yR7~|7!0$qnc{kb|FSc1cVT20@90s
z(xeEQfKsKYC`DQj5R@Y2p$LTDJA%@s2^M-sN(e=|6qQ~=6-263frK->&-<S5d)7MV
z|5;~!YvoUpy=P`mxyyB3GaE!Gr&jb#p<e-*?ry~){C_+8l`*^dmws0BdE>y|UI*bx
zk3Z5G695&V)Q*FLpT@v*ly9A@s)mX@=znI+O(zBl#`k>@y9PI)F!%`4f;d|65Z(vb
zxkVs8s-*nh1Pn_Bj13slArQfX&aa3&Yo-oiY>1RnK)m;c(9qDF+D@4`z-o<w?EwGs
zs=nZ@xkaamNjJw^mADkpQb~z5;I9X!L6CXVYmeE`V5HcjsJOWA^feIHA<)9c#`sJU
zFXqoyun6dXn*cnR-uA@9-D=+xpI=k;HQ=?by_b9r%nOQ&q~Hd~fpeQx&_XipVLIik
zuW&oC3+D6{=JjrC<))3EV=F@iYoC_hROH6Ub;C#BswW6x!^)e^f&shYmv3_Fcard@
z_8X|<tG)j?yAy4}7DMd;ZPXvjdhniKyY9rxYqoKoJ!e7ZT%F|P0e8HF9VoM%E~U(q
z<E6b}qvtF+ZtX1Aqp?0aJo3K^D_*+k8)ktD{N?mo>HXtir5<Mx7F!e}xO|<=^?XDJ
zh@H`-t=P3mmwuY3wHGxcfuBu<-x&f$gMb#8wY6X-)6x+$->DzHDHiYEczz^K5r`S5
zri%S}x`>TC7!^0w;`66|{SM~T@mO}z%LE^QRb&K0;kDCOKd<#l_S-#87%vF`g$s?o
zcp=t^!%bhY|Dl@(tCS*-sQ|{}lYrBWgPB0!1}$U=-W3j(4z`<XqbQw6=D$Ev@T`Qs
z^nudHI3+(Xm)cUC!<3wRbPIZ2;3>zea0LyxBRe%>0&E{hyTv<N7H5I?BfnyQ|7bCc
zj8s@$ER6!Mo!)!U^S6%=CyU)lk#qf6RyfAtp}zZ&AF{~lID;pzITv4D=f1J-U~^Tq
z{#mXC`~Z1DJIKuQXWj#g|J(_2BVhN(fJ>OV7Sk7$VDq8S7~0t~*RNtJbkArJ$ywn8
ztgLYeaiyRn|GXOHb%YwjOPFKn*4>8zU9KZ0V5j&nKV(f2ucLot%fB}G?$E{v0*2Z}
zQf~69xJ7%^_r8DE1m<OUOg9*Uau88iW#`ou@q?&9>XsCks+(RAdNKBMYasj>F0LI)
zfk8pE!C2|FEh7szAbE2~DrMb2?dBmDhG#X<!dvVD70R5UU<O?O>rN-XI{wyPaXV<m
z>DtCMP!ZCgg`1&%WMK-YE3iA{K8;i;Fa*~VAE*NNb{*XICHt+M6%Qx5FzX#DUPv%7
zK*Gmd12{5}ne-SnhAVV}Yq-2*LM;;f76&d%s6&NLEy*o*{SQwxPFqoK1pTcF@&R-v
z)IJ_6Vzd9uk$a#R+_V;9l%bCA_M+ZQrJHvB{7xFN3H3111n7a+nW~ekZ8~s+Fw-wv
z$-J0F*Yg_>({6fI7sXCTJ}~p`JQlv)JrVh+-~uEQmtEUUah`4S=h5Ebe#wPzMwaiO
zh03Fqx*DJp9^EP4@!4h)DFwp$HbxN-LSbF|$DET&h9oipF&dg(Y0sUfZeJeik0p!e
z28M8vQ;j$NVNHh?HlSveeQ0kS#Mw_a45`G*PsMDYOB>kVS4D7_KArb0NZT&nMdM0L
z+}EU}uWmB6-@yy5Y!-T3@L?8rXD64VEB6WLp*`g!{ME)ePuA<6-Y?|u+A=855YM`-
zeNbJmxth`I&_I$)6Z0`v9?KjufE(yN{zwo4OG$55{;@TD6rpyG7z`B|BR(8t`<1;Q
zB*fsKf4VTAe0xa`H_&(=TptVmt)VZqL<wpA%abIQ89-hm3}%M-FkR<g`|)kxdwQA|
zp392*OkWy>hm>3;*KjL^iloHe(FG%YpUg_+wVxG~(ZdyPP#y?w@_vwd3k(xP(4q_~
z!c&Sb(7?ZB7a`a9k^A)mXxIe>mPkGfi~m_u6UggT4BSA1<5fPGoq1&41|sA=Fp<p_
z&>Bi`)1UX(g24nnd$%3<HWiG&ev%)bJZN06YTjiq?3eu{7&Y(`5UK_cTaPvW&m;U_
zM#1U&*;SA=>H%nImy-4o_I4wP)JL<IJDxv$_)zbmiM`aIj;odc$Oqa#e*Bo5DP;C}
zt(8PGNUBsNj*mYoC@%g}sQm#*ZG33iWHB_M7jx=%?pa7lNevQ0?M_J8zT*2wd018X
z9kY46&wc&&S`#(&JzyS~Cj4XaEYg<-#xJi<)pu0ekp}d9O211NngIyPzUpSC_gowG
zy`PhxB6nAZdSc{pO*G7Z<z&b%SpJ$?VivfTSg|&W`=VE5@MC<e-a1y8L;P7GyO(iO
zTfF^vZS%?xoUJ^X)OmQxE1jL-n?)0KIJ05+yy@w$kz(!1rVlOiN3^Ab^Za9YB!v+G
zd$<RS>_0xEBY+t;CHRkN30ht6?)!M6+OFD4W4gwD+Ow6h$;_H{xVS17J-%L^xgyBA
zGF(*8?&XbU_5Aul+PT+59{bx++GF|haFLgL(jw+?d`Q^7o%P#lAYWt-)_oKn7&@*m
z`lmWW!rOALlkrNjb(7~RImlTuGQ3w{U`3}#XUdLAV{4l)9e&oaC#CP7pUBjVE-cEP
zy}N@={Hbh+(Mn|S<l21|8qH>vWbBp~og&udxw~RvQuo)cvZBu&@DxJ-cnTw7jOMoZ
z!&<l92sIDWGOI{JZ29{lhm~UMe$(MbFWQ2&$uTX;^@*B()L7LwE1)A0jE*WV7_<-=
zcOo9j_vn`#<EK3qWvVR)1Kww>N%PXzYT&SWG#uCEoGht8K%vS}+aX<OWYMY;6;Yla
zE-g;qNj%ycFKF@_v~t~|D4lN=w7t!n@_5zh`=Hxn-q$z1c4PEiQbt&;-~3Xl@rtz(
zsmh=grd^{221E592E)nrY?SRoW3N=1qYS}se^QDJs(HK14=2>$QCB!2`%94_=?iWC
zzaD{MtmfMgEv?C+w(ZHMEdmL}=qUW%#Aia?4f>41t5lCZc|YZq6&=kv>soy?Rh!qS
zel^0@MCH}B$`=*ZV`uhBXqktybCu%K(kouhyf;j8o;Kx0GS#h$;ZA;Nv91ySZ0CqL
zRUscr+zyKP&~h+bpp&Vlq~rSSrKZ2L$(02yV7W~GVYwzBk<ZHw?Fr6%Ze+0RwcTzC
zYr`aExHs$D^&W(4Q`fdD54R*vy#XN^v2Lguvu>AU?o?!ZmzP=Jd*g|KEUYc{VNZ_`
z277pk)ymt*PRG*Pdp>nkUeeS(K!Y*F5&P4YP@0hQv*Tb`Dpg{2tUk7$6zoy9%=>?C
zibZ@|of`EM<&nlWF~y3mKgYi~95>UNNVirRFWDT5;jon!^+gf+aDvy;m7}$79oBEo
zg!-S-GE*Iub9GV$-9ngXNulRG-HSarK0fx5jC~brMm6ID328|`{}68z6aDzS^!{Oo
zYxMA)wFZJ)eMTkq)KOMDgu`_yheL;f@Y^x9`E7TUm8b`eev`bDGPG{YYqI_|hpCCP
z*zNxP1Oose${^rOlK$aP9E$wb#wsSRbbGk}_{+;~<>ubKO5hh=L`&iEf@KPUpIL?m
zD_IMUfVn!t`=HOd&)|I~*sqjX?-B-8_bdBM>STL*ur!*G*K2+N!`goN4@*?UAm)2A
zG8gQ+H|L`BX>mh4F^3`S3?+?Q&krwEDlX4m6#wDRHI&!>Pg|4R&kT0P`VpcrAbk1@
zlXB|I{K1m@FV{3Td@gyzVQT!LwB(};;$A<f(^!ktCV~C<%U3_vCePgL7uxaej^x1M
zF2{lc24%jfJ{aBAF>&wSk0AiEx2`w6^m~!RzI_=iVc=N*n~wy>NZ#FNb-npf{j=br
zFh(`+TJwzhj)=?cft0zbH-@&=KHMIBU|#uAv~J|%i63%x!JNmgQ~lj2yv)`Lc<S>&
zs0ya&-G59|<E!M`Q|sYxI?JRyxjPP*L*^JbM9QQFbD~^r_38yIH<K?jVmH0pd<Jvh
zv}d9|8{nM|6Dy`9+=>&59&I&m=yRzQoH6p`7=6EtDuriGQ|8-ZrX{}|ff?cQ&rV)s
zEDapg`eJEPpA!`|6f$X{>{h)!zihpvTJ5^c?_RML!#=T!ClLp8-)dJ=;eNZyU-8^Q
zj&yZS8cdUXc5DPMX-V-unkKCYm$)yg-=0_b@nL`RXHY_QyH@J)2QiiY+ags@S+JLd
z7@ab0b#6L0x&E|to8KK73)-5$qffms@nZYO*!K^*rzzNz-sE^WA-I7M_d_*qkoV>!
zG!|At+AZE5Uzp9diHH;V`2E}buV1tmV_(j3HErjHuJ^^PpGmt;tXjUzuBzOBu|epO
zo3HkcBOxJ&A$nr9D#<MB%}enSk4Nos%j#@|#zP9wdbh`4rS}`%SCh~uP1R1si~Eyc
zYew#>!|xPn%nY2*J;Co)8#lRd^kT(xK1^ozfasM|gbrSnzyGwzZ)>uZVes2hjZz9j
zKl?%#Yu=knu>(%A%>A57$v<#1-%LFYohFb|Om2HGGDYhJywbJ~uL(F8=!v-Jxb5x|
zV&ANae^}u3VB|qpST|bjlbTNE*VWU!hhIovrY|X**Kya$6RpaH7G7Ym!sLDMCKW+f
zT^J9G{>~U*mulFa?M2_5zS$;|%4n(BuAlR+BUA3UQfGr0(fEr>vSH@?mlP}7X?OiZ
z@gf)6>BX^Nq|Qlz*TRJoG*>P+6;EK@-j50U<F}XjCbs(@d+)FPbnDMRm)Q?m<o2EX
z9?ALH>f$5%wj+tH5n9O02%1ply$6~gdEqdR4}@}8vWIt2MGU3UgHpofm-Eju4S#WE
z8U37O3Vg2Vpd_?jCH8g1^BV{5{cm(~qJG+Y9VDS+ixccCDw;gZc}=b5nzOH#Jf0nk
zc&>RAy}Rs`h!se^m40op&_<n9w*E?FYp-@VgeGkITP$h@x|ueZpwM^Fa((@Jq^jzE
z?yGTLOx}CJGs0Sn+ovN_G4T8fz!hxCD}ziP31cR51bVkC_uX@knoR$E*}ke#xBM$f
zMQ85d7B6Qe>P`e}+KX86=hJikxEsdqef5%SKRY<0iZj`xruSV_knfuZsjhwsX+PtO
zcnbu_qXrj#=S4Ox>Is)G;M^+*<|~r8Rdqwta^H=MMmXfWL27Z~7#jP|4mPYD8s+Zw
zk#>kCgV-l|(r|;zLiD9yAD~Y3AbZ!Th{F&NN;4{GjOJBAeq7?Ka*e!_j_&Z;pQ~`_
zx?xBwBUBNT!J5+5LdV~>G9T++`?U27heOosiRaF`K}^aJ>1Vw`YL1@u9a`}ZuN0<P
zH2M5cH)6k;@rL!yi%3af#;%rU@*^r5TeQj}lY5_JWVX*S=}B=XHdw6YM2bdFb4c@h
z{;}LSOwb;5uA>K;A2e}UxagM1m(RY!<lDSE04oBmGq-hqH++tn#>FHun5{%<V=I+i
zi+Ck+I={hp!?eaPM~8rmCu3iGQ{5cN<F2&tR%BM+PW&R@TD$H_Qf5-!pk-Z$cCxOV
zeX>{hzHyl^<tUeL|4a_wmNs2($6Y5;mi<Kg*tYG~=wCF0mZ^h8CDC#=vC^I2F9;}|
zHec+xMz!Uv_r8c~xqSYm7fA;3G9cX8|7W=I2^tl8P;B_A#UocjhDWk0Z26j!FfB2W
zgXO-fYqaai1KWmowl=&MVIARv-??;s6OW?3I){~Xb`4k>CGGxv{k;Ce=Jgy$#I*2_
z6{63sxvSPY!EY-U!xzc+`Ja?m`kbK{yEj;1{8Cz}#G$Xgn%^^Fr+E32ltUp(_z_Ll
zeG)eSGc7m%k?PDj!wecUPSh@&GI!NC(v?pQe!Je8{4AGAZ!i4Pg1pV<U3sp<>*9yA
zW%t$7e0`Y<*kzPbCf>;?Y(y%Rb;I=u7O|1}j4#-R?(X+0=d-3&TyKtszI+$5d0%^#
z8RZPpduNwD+Nv0kGFAK|WvZQov|pPo2B6ePajeOc`OM(XYZh9G*sT_g{G(&qkq<Ac
zv4f3P>rXEYMoNB?t`<#AEqoqUUT5>`f|Z#`y=&}r@}#^cpGL`Y{xXkQ*4TrI%AOus
z!v`%@*H<}(R~f7H{}roX|LKn$$a_W`KQ2XqJa3HaAMN>{EcE}cL>gK8fp+u_ad&T5
zqomxNN7dR+ap*R8spTNVYwv?<q1V#I`mIA-`{M&AE}D(7vfebA{(S9>X8>fE28b3<
zH>)ofxvi#>uQl-CJwd`%&L&gdUYpsWc3<jfq!$tq!}U6Z@7M=i#t^V^t7?%$q`Enf
zVzi0?PFas;N3LUX<JIv2ZqG>+0_4IaI@4yAoo~EX^s6!8tVfU1=@r)VuWHUz>4CHL
z3JMDI^V?NoFIo|ZeOA&dmGKN#zIp(XF%19UQve{3N5LCm86c5-rt$h}6@YCYfuru&
zU;3~&zgIJR@Gu2r)e~hMdsT?mP(uRF+<E~(?Cew5nG61nQPaTyoWdDXm}m~bwa>cc
zy8ofM&st?eMFicSiE$5P1E<Iy(d3b#fsa^ni8g?aM*u9WsPp60OYje0*|G*8s2Bli
zQbm^P_VhX^uxIS44*2a<d~kj#yJ<WbTpvwd@B$%72mbX>&j`)Fga18qb_}Ruz<t59
z9T6<Sr;H}wZDr1VQ0OQ3dj=GD@GZaJsqjG9)d8v(ze6<&fImO&kvQe7Xz<|Uu)@?U
z03_5)jl?kk!d@G|JQ*o_Hm4}E@XS;F)B3?>=UcPU<Ef|Zg%@BEg6F}%PTm=j@ptgf
zvlJ{`5Jgb`+1d7PmLO=Mvp1iHGUo=U|Hb}Y%$_>nAl`?7`qe=F{+p4b00R#*9Z6Qs
zvOldqq~MW#04NwPcK8VJwvwRaCzj*c|BVu!10^|KJZ$^|`eB@{OpeaK1=PmSKJ<tA
zJye9lCys%e4gzk2nHFY=gZ}R{lz0erK79}u%HSt&pd!~|pWXNm^?{26)W^A>ZZwae
z<3XTWt<sN!`Jf%C+_@V7rY;8@u@ztbVG)>-whX8>pfRBX_-17DK#4sNE)4HQ4F$q3
z00^s4@J(NrEZ|<8f_mxv$v_DZ$H#m$iV!{Z>6b;^anFIb^ga@h{RQZrPY=$}`=1BJ
z0M7gKssTh#v&r^5_W&b!E_=m?_R^;V!+VhdP$rguMCpNP4jBPZbXd-39MI}BE}*1-
z!{r*c(*TCDbLAHF9KhoA=v0@ilIcsq;pT2(3X8}%zlS(l;b&0T=*|5}i=(}@3gKe#
z=A7t@5AW^u7=TecrZcE@%)WWJoZG&%^Yfc7O@Z76`wvwQ6rrh{V(#n4fW$nq`;saJ
zggIirfQkqywOe(nbKpge*0~m3F{%50kS=Zi$a(SogMWk9A_k@S3~?Z?83WRmqU>z0
z4|S)ov{IHj1pGMsL04dYd#i)vdHEaBIKLP4hjs{mu(C#Tlq2l`u&p8?N^Xl#S>pvg
zjClX?qcb@D$STR?b+^*Lia#v~h(*Ahd97N~$&UbH5<33}peq2<4}fm)iBkpNZg?t7
zhx?tj^DP*;Jns8v|Gx6ZsfJ-txEx3UKtY_ZfG=W*7Xp`o&q6%-7Hb->by~{*CioTq
zzB2Z7+U`Wy0TvEasm2FV3+1=x{=V|X>FDI@TPB>A5~d&o<{U^rPFn^R&Z*$wQX_ZT
z-wi)S+3zDh1pIyFCGhR{GQ${n*3*Uqbp90nzr}-(l=)*m|M|tSO%Dh82G7Y);sScx
zx=iHR>Fn9#FDA(){+DdXKYk<$Q0YThxA*2N$6}D(-CZ}pP3*UB*a3;adPkEpsg!+3
zl?&8$?&sby%|%6Ux>GEY0m7gN<YMiOrVVVvhkms{@^rME8*8^Q-8eijfB{EI)d2RW
z&vR!vTeFZY%V?BB0D0_zp3T%j{b^%I0(#gTH>;Ej;F<MM!gXy~mBjceI#05Cu&f;)
z!)_aZ1E3lY<|3<Imb6&`k(%M$eOcOOfb$oq+%Tl(#r6}UQjGr<(}q#h!OR(OPE8tB
z@+nb-U9&Z-08FYu(p3{<1rd$VXS_&qrDuR(t9Dy^wDj(ku>M8Vyu1=k{H;>iOF+<I
z{V&%2(#>%SYUUYeH1d-D=TqY0O~1?aIejzi5Q(&LGA->eX+t>vA+ttHnxVC834#wd
z`JR-I*4qqn9}!N7W?sbI5qm(JRa^Bi3g`h&H=no}imn8bw+<`w40-pf8otkXS^Z<t
zq|xCbBkNPy-Dz>B$_ICmxL__*6gUn<`^J`%yPfx<uvwYYeEV2g$E9E1Nkytd=TFH^
zs3?V4NHx7!eqORrx|Ho};?6sC)jq+OpvA%+W$9sQ`YzL_F2k_}o~fmN*017m$P95_
za<i^ZE{R=IDl8VrEauz7JuNNLJvje@jn6a(X2h8|HlMx9v(2q99+;H<AF>CCh}0n}
zn-R~ULL3L)EQHU#-q^Ot&txTyd#S`$^l{*+dJeZ+^nC2aQpQr>H2h_BN$x=Yb3)DS
z39vddtso>XR3lAv(>5Wgyk~zFJtkusO8^H0HntNaRm@%kunJ8mIe9Bm<lEFl2=yTj
zPx@f7V1{l38gb0`WZ!q<C7*^}XB?Nb&oMZZ#U;QAsH|!e=)7fsF*({T@m<qa>g?;p
zJ2Vm9fbQcr!sl)5>(Or1)|Z;urtS1O-y;I6Z~MM|1BWa21>@_dds43H($!ajn)onW
z9^!uD*Z43%U)4}g(c((S=(!o2@D$}^Elxp&KIrIo*A#V%Cs5I!3>Z3fx=KM;(I?yJ
zDHKpteai4!|J1%KIv#6wlw-1IUPg)D(eSFA0Gd2~nV2-(hxxT!9lM3p^F;c}29vEJ
zm&7CGb2}X&>^rRiFWYrfPy^p+S=Syj&8;%!ZL&wQF30x2Rd`+HJMwe~-A}nb+E0k0
zs>e3r99&R_h5kg%wQ113)fOG+j}Awg95mO?bD$<oHXe&Oe&#UdvGCjiRH)O@OcVMr
z7wxQzSLML&ASN+AJG^_3g?kNZ0~;$R4#ru$$0(=2FCtlI7x%5riaW~gQJQVB5K5Cc
zj^~NLjv}cHHC9_0S6iB**IF54bPt7E_{oI>l7ul)T5#qO85k#&6T-|&{>Jy<RL{^H
z%c*#mD{an>_&ztrnLNaI5-Pfa-~`s)VPu#=-*^_n`z-{jKx#(D*@U8CIi%1H!QAs}
z`|DFj6+jol{r>aXwqneTcDq7cb|wycREJm3;vwU5&<q{a@iyLR{iqfoPCUdf@0(9T
z4G;6vu`C?lZ0%N;-c7^4f6p!^jE>YOpM-^OaD+saJUA(}v~xlMZ{lwMYA#T$?P!-M
zLuzv<_QC;CpBvB#f5NT<T~HF%oYEq!X8zNu6xFT&=6-ECI8bvaBhIn#F30TVyruWn
zJk$vBsJ*MKrR)N_44<^+H0wK_Wj4w~cR=r-MXN1mL+Q2^EwN{XW~6pvYtyf9L81qw
z=a&+iSSOaSog}-_4)NP2s}6P?Bo4X1-lus68>O2};_mt+`328)o#Rl|g|wq*fYye&
z;A^e+tr-E2P1+=d+hn|@o3!bezkYWKOc+{cEGnJhEzmD<o0GyG{*?v}kECkRls^^g
zIppzV9&3pi)cy4#t>y8>$ZG87J5dOOf8<bn0>73s6<kn2A)dkG2&M-Wz*#e}KbCV_
zHDHx`mRLWng0Z+5g=1z4z<a>1MFonNs`;-55)GVGSsu3v-8l>(U-Oz(n*1~S93n#g
zia0BF?c9@5IL9it!6E5sBngf~wKkjT9eh3t<(UsayH&hQXfkhaPxfxwTuSA(n2{I7
z7e5X^dC`nNIoUNkS%h4V^hlnwIu3I(#%!LIwvZp`8QtU%fJX&7(NA9X1r&Aqmvptw
z4AD?YNIUZ%7HPX=3l*bP8H;x6Q=#mykt3!-1^ig7h2Ok6;PV|A7Lsr~eEaN2M_)wm
z__MI^wEP|_@fj-1V=taJ=Ma9}h-IT3<*1>ZIG7!^Xhr24fgeu1m(~K&kw&u9%Sid#
z)29q^O+fo_q939GUaQICG^|CY^4&X&o9vK-G+R-5IBr`~6^<0B^KB?Zc`wAPrh6a+
zVRO(kjAT(jgk>WCX|sjOgzRK$!)GagiHNABqYXeSwq)8PMVgMLHslaIM%<2cYK>SJ
zKWjBzTjx34X0T{&6s=-*kr{DCzZj)R?FIW9g>2@TK}LPbclnu@B$urW(<^$ksJCR!
z%5oMmx0aKR*Y@>(EXnP>{Y7R1D5{M)xBcT8l5xHPoUB9(O5bm2NJvqViCc<=5<Aor
z=%Zt3{h1KKwAj{jC>Tvn>xC=aI;8@F0`{hH(;o9Q0vKVvLdTkWVm?aLWmz<$Eu$R<
z$F!4ktR0XsYH8F~8&cZ3kIkB+I-Xoy`@kv3uP6$6f;@Q2YKGdxpIHZE9HH|R882Y5
zFK`2PeUCVnNle)m-EF+N(;RW9?Z+#@`XKRL*{e^H^$pmAvN9~&{7&lwA;{0QR^QPM
z0qO=UTj*#<w@GIFVU%c}KqY|>ll`LdR!0Iq#`5l@Ks(xvWR8@DolAe{;-=-d*qgEj
zRNQXliDaMA&q!rFK468WQ*AC~`r?du6|(leY4;?F)>vm^hf&9yw`;=45L;_JDq+#+
z`6ZMT<Sh4W;H5PS4*#HFN*J#_a`S+T6R`jvh<cUPZzz#+PIwLG4OQ>UH@vJ4fGs)5
zCAXJ|@`iN3S|Cy!5oHTeVPRxpa^n<=3Y1%oC2NN0IdZacmhvd!d{^K4+F7hRD(#f9
zaJ8sNm*01|Jp+YE7tXNBI%csl9V5^OoOG+L+|){8p6aZ-2=PPjA1T(x7=a3|tO`-?
zu7l%P>uy52!Zk*<)fr337U#D4<N;M$m)de9lI0jK-2Nx}xp%7*|4b{)A2TF$V=NTO
zei$rnTn0zqB<_53IIfv;ORJ{{)nbdatK4p(60Nm}O`s{RF?IPOwZ0HujSyS4ICeVf
z>?@7ko8NP4Fy|n=IHD6>o%hJ4KEhDAGqyPPavgI4z$mWKNYO$ADb}6oxpG-1=eh6E
zL<kIl{}ZaQiQS)apOU(oUt9Z$!O2`GVkVl*iSbzHWJky0;~MSBChZjIyTwZMhwBn#
z5PI_X-eBcRFlOtO%2*f}j8<*qPB}W&;ylDCY_jrXAwj3kPpQf5c#g@0t^*ao-{RNF
z#R;SHZ?{(4@%J;Q`j&1yF-!d>OL;Hk3RD^z^;r<eW2%285r2kO!_qa220MUa=R{U~
z&Qxywc`nUfDA$^W=$uQ>m6}=!2>T-R(cb>Zxvp#Qm{HnX;JqiUjAf=ha(iNI0{yh<
z@{d8N<dP{Hp5?teq+j3vgDnJVzdFae0Mvxey03O3?Zc?C;@S|&$9RF9pac51@M*hg
z9m|gg8J8cg&KNX554$qoMnE5p^|MI^U22ChK*AIpTY|DAv6SylRh;2MEqBZg{FD{b
z?+&8IxDDy|I8$NYfzEG3+n|)y?B&ndT;PDeze>3BRp5x{!0Bi+uqIY>a-zd&c5QyY
z7V9`8ix;ST7^DT!gNo|onZH4!@LCjj76I!9s6*!1fk-4?VXp;knNJVL8_%oJixpqt
zLZw&nh*Z7+TG~3lz|sN6HOXkD0Uj}DH?7jgyR0nK{!B;7sXQ~J8A`O{o>Tt%*-|Gz
zflXtu0phCPASFZv&yVCnE@-lfD+?m*imeW)qpCY6VTGFF%|ZBX670f$y%UAbz@Dy?
zrLm@=xJvu&8D6oMR|e7CL4MqomX1)*!zZcd@?mEbS(<Sx9}A2_G_owsq>GohP?o6g
z0=7Z4y2+KZ?}k(5F2Gkcv%j7f;M!DC=(K_FjJ6)o(K5F(hLA4u`&bws#1BOo8?~sy
zDCchFcjMBd#I%n)ze}`ql?5o=ixpw^$FS{Ca!3dml%nS`HxgZP{Acd9lQdPGzqW3L
z;LRx?zZg_z`h+}d7c5FRvUs0}jba+kTjlxHg6eKmu;9S8qhii@w`?en@?w-XozF&z
zYvJi=^U2`GzDG5_zxveX72~s1ulv9Lz!AV|T@78~CFi;O(vWTd_B(xS+valL?|D}L
z#Ghyh!Qxl(h~qDW<LI8dbQkolbC*KzuktV-`l-spBAfcd3kN8*#>1aOl~yFII#{G3
z$HWa^f{D&8{Bq_mbEq|L>5;BCPw5mmhp<nteXjo){uai1)DlGfDuHaJO(sCzwX2g(
zaHW+Y{>eq;tYtS1ZYvs3t07^Yf4|GY_jsP89tHadC05dq=a?M_m(J)0g=NLEA8+1N
zWJ9#7S|(fG4Lie{LX{$|8)Phh%V0+9$pwN<k|u4ob-JYjOzu{=f+Kfx7i9}w6XWuP
z!rSXyS(oN>efLxa&70vgBo(nwI?>b-wgH=4d-3tJl&cnp;y5QrAQmRK_d=fT;d2h^
zWB7U0l{paq<&3z<{<ucYBZgo`ls1umHM#i<qo@NintmvdZH=&fg;L++=5%b=^y9bT
zy67{Mw9zVho>Q1JPNJ$^`9%ON({>kXnd6AGQbOt;3&nfJTisb9kk$y3DU$X)e`#JR
zdn=FQ!uc3VHJ>A&%em1^OMW{G)(0Eal9^iQo<GthMVcMASUFmTn&LYhVDDEo^i-a-
zedqq>xJ5&**hYu!TVP?;4YZUFqtty6m7ytu<C23>RkvOU@%=5}J0}~bPQQ>KEuj!_
zGt+L=4=wA2&paML%_|5ZF8Rok&$R#59u5=pk+qPK7Q-D>U49;j=JFRu-BzO$62-9T
zgk6Ft=1&@wA{b-T(R`ZdgK#9ITsY<(a7fdCk}lhhThGfLjQA2qH20heOVKH{Ok%6^
z7dcz>x>4x90W@%SUbVN{llU>PHehLunCyIZaydTzOg$KYUA&R)PaTCHF($V_OE5yH
zn&Zj(5{<`yTR`|n0_flYaPmw3T%(EQvD9lECtDm-J8ax|o%@)jixiLVP&k3?vcAG`
zTtHVz2_c3P@cS&&jSy1`%DVGJkw2i+@k>bI`tkB1*EbUpIxgLa=P)Mj7J)g9A5wso
z=m^Eav%bs*nv8}LqjEpkO6(}7!6>=a@>SXgA7JynyF6#qfAr!e5284a62dc&7X)IE
zEy_kPCtVg!0XoZ9qu`x2IW2G+mWwz`Hcj%cZr(f^dFQ!9*u%Qajz62#R?~?x+8n%#
z5F6@wz$+l2sejVpd)#3yAn~P)7X6qbMKr1bYiy>BF_b2Z;%bYyPOg$ujny+;tBwJ=
z?6ca@z7+Yn&ttNRnYJvzl^VQPh_jXp&kDwG63s}9W|z21p>@80l6;+-lPX?7`UY(3
z!h1h{Z{OkZXYx0%i=Sa)65DH9ultpH1!Ji*X{}7r?kZU>8h|pyc+Wq&P(X&DP<qYz
zWsXssx-$ZnninKGP0@rrE@O>RAF-t4E$1tL<Rc(&b@&Pyc%M}o!_fd=NdHSLIzsE1
zbJ7xHsiryc$1c7saa{JEV%#jX?%p=UU$oKBsfCW<9nu+28)+)*MrU^%UV`vCRH~&j
z@TM_>Mo~xkB$F&d{KO^M4W8ZZL>S6G<{-7AVvIyFdC=+yMqEAu<Wz$N46^y43yMn|
z%hD@61J35ORZ$l|J@|=slxS4GyOh8I4Vh6D+bvjn35^OySW;+uadDDDD}EiQOr%?F
ztDo!ynm|3qmwKnIF@{iTC(IVvx990qlF@2@u~cUblb9|*uK6>}dhwmBb9~IhSs@$J
zFUygsj_92C{+fRt-e#<8`Sf~<i2oZ~FH5$(;*3Ak@gL+F?{ON4Fwed{qo}|vdk<3=
z|Cy$+jh8jn%p!-zj(4KP;c!cU8dk*wgHr3klq|?|X?!+?1t?%YlbVjLM$V6Mt+@n{
zSwk5WJ4%dMkCuE%)-rq%SmV%zAbtmq^x5ULR%yEhCBI)q(<o@+W&gZrZo5-S`6o#G
z=sidXGw6xPUr=|*GL6jSOu%`F@+O9-l*bwB93VI12aC}>Bdv-*uDU$FIdYZmP9HvH
z`W7`TEZrR0dp%G96VDZI=(sn1k=(++`!?;(g>q#(mG{Is%0vD*<h3sTPaKi9+E7(y
zu_r+xKlPxoA9Rbm+TJN3ob!9%_b}6;^Mz(vnCjX?xpmVty<T?BP>wI`LD$VJ2<<FF
z+<x{jO5LV|v$MBE=*~f-Y_Z<IWdzP`eHMyBI?V?81uJUGxUfgt>ciA;w5rpKwfi^G
z-Eb)Jqw!IeHSfd~KCxm;e=GJ-t;sTGw4?0D+I9giHtl;LPYRpvE6dPQJgTXeV2z;{
zYsk`q$|WnP!kNh_n>qbnK6RXZb5xN0*kUDwtPhRI|5{7Yk%Ddx#{52AVpp?dRPnHN
zkyuqh_$^BsQaBfkfm>xJ2kh4ygXtO>+aC3~<K|awqWuuH2-qq(uDi5rrqfuq{@rt>
zmt;LeRT1ZzK~>93mT!Q|HivKokvl0m&fMm2qxp(Tcw850eX&Jc9Y3R^cW1bzN*;Q)
zdG;dv7*%s75zDHX*Ai=+Gb!!ATwdF(!^CY2LdndY2g$D^t-p(foJpZ^li2MDheS0F
zaK)fn)sbSPE=LhDhvSGaaYiKu^Vw8pH}O|63u#~O+x$ZZN-YziYA#ws1lq9JnNB#e
zp#j^C?@jdo)OzRSmkorS4+AxWaFa*d-t_T|A=9Z=q_dQZ&eGXja_{a`NPaZeTX!8C
z3V~FN#0TBwX4KSidZHz*ny<Ra%SlfS_egI<wsql`Yos6{n~p`{PPpUWJ;lBd9wN^;
zjCqWXn7~KRmLkM^n|C?kp<^a1c_tQvr=_|^2Mr4J%I)0o9$tQ~x;dx2!pvj9bH`Ew
zc7s@&b*6Z;<GeQ+ZCeCc42`AtabLKCHv{)va~EvBP1G(5Y19?REPzvR>^eS(0WT$Y
z43nDQgbcEg=c4UY?tYr`n-4<LLJ`?ZvdwO7;n8~^BC(p_70P9{-KgUqYx8f({C5Dk
z!HZeX`g$VwhV0w*Nsd52$H6p7eUkY$ZMuTe6MV|m5qi>&&dK+T`1T_H+uV4U+^b_}
zn!`Q_UY$eoUT+03+KD@pogqVeB@I(MJO(NS5wzZ!4E%h7%euqT5PBcNM?iQdJIqC8
zJ{5+XQ)Do2K?RW?8KC-DT)cS%k?#=#kT}|6=v{~uM1=z-X0gAI+rK$-{+p9yHg@&x
zCI7Ue$xg93*0rywTS0N0FPBtxUmVBSd_@<WD@(!OIf;T(tOnktzXorZEbYBTEnQna
z9^=H#X`Ixm5+F6-!k@p0>HaS0eP=3$u11O^@1IhQ;Lv`O+}%pKDe#R)gAe&u6bfTS
zxE*rowlT2@-(Wezm0xu5{Sq~Tw>_L=>vB#&Z9H7kRD!Q9jfn9Vo&9B&%u+LQa$Yq5
zk1Wn36-8a^)4la@g(SY(n&foEta15>S;yk2bK%pINHUkXK<uPW`_|nyEfxbr<C|7-
z7sj64oX%Ud()bj#b>1=0kHXiNMFM2Vnfz}X&#@}{OFj!g8j|@ZJ{5$>_{Z*j8cZhq
zq11`v5<9$nD}cM1it8;J!k2UF1bQ=oo&43_G0V*qsyVexr=zR5gE^knTpFdQnSR;Y
z5XAujGWzr}@E%_BdS-cDBh74t0sLhcvKrwHFNQvE6U8AE;Q3LZ!Q_>JFa#%yJn`Fk
z@;JGj*toJmbU<^tGn8B<eX%0`X<T!2D5{jJFO*O=75QU8P$SpZ?s*@@bGzHJ7<Ikg
zGBPviV|++aWw?n*-;7f5Otk1ICh7#CB*Eq}c%IPFq0Y+2vl<I?%m{Z5O0d;ApfL9r
zUEI7{)l3~yI*U@)R>1Ua^*G()*rMUUaCA`n_*v=han+7?sKeGA(=pEG5@cj&Mo-^Y
zdaat)?A|mW{&BaDO(2Xp`ofNyP-=+$w|h`UlHy@jn=1k$$Q>L;$99pR22Sy&^|Yjb
z(nD#tvasbaAA!w5lwvQXRGfH5*j>6wxES<70Fql3tCV0DEL|2z<JEQ0fc<{ODzku%
z82eK;;L;x54oqz|)+CJ5Ke0E&kD;dDnz5Agy)IL{!s;CB;;2rXr7tr*>_b1}2FJa3
zzR6qzVtkwu+CEG2qHL>F9Co|jbS&j^K~yM5fS`DyR+FqO?810(D!m77HKvn|f-X^F
zSM$X*>`e_i+h?Lg&BF#-qL61~d4kx^M7@gM8WeS9Ipew&yrln86xVu^PUdt2AAk(u
zCD*Bl-7*K;`CD=MtHnDTe3)13%?hgmbT1JH3Vq+-&;~%Iy5Zf&7W}ew>GQ1Cqfqvh
zuM~<5;0!{14=J4;iHDbx>LO^3E0u3T=*TlT@AyJ523*=5i&Y3wdoi1vjzY+-=+-1+
zr}Yl;6sw9N@V?dl{dDthDF0YI8?7;_BI_#laLZi7dwWkea}T3tgzAahr==yJZ+&J)
z`1A{0HO0{qP~J0IDu|_4k`v0O>4(ppd^?Qem`t}FB6(<j<A6bX%Sp<CKqYn5IyTXr
zTuLiH+nYRpON+3hk-7QqDiJULwDn}OO{TJow)$}X?G|p$-fW(9`;QTXF+xD<{3%r>
z2=>GpG_w`Yf?b|r3W9*}%J#TFe@bKd2Y~It485}#i%)s0AV4$z&rSc2cErI6RY^#_
Vh{?rhaPSA2y0W%X@hwci{{qr(<EsDw

literal 0
HcmV?d00001

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png
new file mode 100644
index 0000000000000000000000000000000000000000..3ed6429c01294f35c4fe61df11c0c92001dbc462
GIT binary patch
literal 36510
zcmeFXWmKF^vnWa+1b26bA-KB@4#C|acyJ#)gu$H<B#=P^ArRc%f?Ejg?(RNklK0);
z-s|2!cio@ooLQ^qX{oC2uI{d`uBs??Re20lQdBrNI1EJv8BI927d5a1Mt%jWF;$&9
zfP+JSXD=<St|%={t?uq(WAA7U2d5DANe@Y1Yk&}9l$@A24xg)l(n>cGlY(vd^2MGx
z9eoHG*Al2+nID5%W+GFWS3(<IR`#Ct_1n%mz;&Cxy2iAu3BK%>ih|@!;FkZ+!}e%4
zkL|M-x5o)O+~+{!D0-|Zcxp3a>bL}Ci^9UMJc?>B;9YYN6yA4j4bYc1G}ysee}gu+
zG{5+fYwV@#`sDH09*t>6z47AxoA-hrIX1SW7t!I`l9-YR;Uu+p*@_4h@IjP4O`62i
zJr4MSMo~U2u7$V)81wYK)iiDB?@2!meX>Ewal_ir2y1cd&p$(XtH)(V|4Q;V%DN;Z
zaVSC`r&%lL86UCi{Ve9ZGIjVFT=uYwU^TQiTWEM}%LdER+B7te@O84{c>~Y$*u<mD
zz}QEuyNZyw*C}$nEawq?EuTafltPNJePVbl-xAb@H>maKO;f<7(5A>UG9PJJLSiuv
zkchFH&l__!u*sy!*+*ldboY3wW!A_e*WO7L#h?mCVUtO==u@L9tIs)pQjOBP10GZr
zIHoEEr?OC7mYiNmLNa?a&)RWN1z!if$yZ2Ehv!0kTLRQ@NADrUZti!-%hTr1j{zU1
z(~uR5n&Pw*^qj%F$wVeou?#1N_mvzfUsy}uy)_D>RgFcZV#Q{Zdwmf4W!(*FBcjnf
z4e_n+1a^XCDydisni)1};V^|iJ2n%;H_$yjBF))gqo8auQBhSN+e=>@)K<hE%BrrH
zQ^ul=aO|oK5)cOFNM-HbD55Rbz8>a{PQCYb9j|iVqij}Ke_*CYikxpcXkO#kQV?8)
zXYPLmzgjwhKB7QEj?--m4}cGrq=&0o^Q8&RhgV0w%8sZVEaK@WC1)egN<xDGv#|0c
zDIeLWvKT0@Q_&%kal2H@jTL9+Exr}Zz0lSmg$JGP51;Km$%$_!6$@D@qSJ-r(nYPw
z_%9-3^*o)~-J*w&VBB;DMx5WiQv)g4F^SR|GkIb=s}T|?x5v1%?!JFq`)+6zWQ4Z(
z%H40B1_WmnF)WB606*G*blKe%nq#Z>&AMb=S3B&E;%0^Wc6&vAF3Ds7V31AzrtTrA
zFg!GThchN`dSzOk#9`aDb*`Ik<-s;sD^UC}zpcK9&@!Sz_wc1RrbE~M3p|l{K69eV
zd{ohvCJ1_28I8+X=-DQa@|VP;n~pfar<BI&2Jglo&3FRjLLvY^rGK<lxONH~t9jMx
zj8w;Fq%XVb8v#850XX~>(yfC7Hp^4851gaHQ*4|_)aJb%9g@HGOz;DCD1}k$p<l{{
z7gTRQ3u1N5d5JdGs26t=V8I6{+jV`hL4Fg$P!W39@PcC%>w^?AF~VCZS_OoU=AXh`
zzbJR|a-t0*G320^!Z~!lsz7u^YVH(w#hwghbcN4)LDXUGO0gJLQiRv?Qf!rCR~{P-
z9>*4b8<|cs9Z9zyeoGyqK#UvLLuaq>Dmg9^M~EBE;C%{BNg_r+e*x(oiO`2E$x)i4
zey2^EiLe$a@jPNBSuYK|$yXFn_c^S*s46l{d2_qkHOMBB?9wdbuRS>oBOsC-IWxa7
zFB#@zUc8~OG~Z6=*$%C&)Mup_>MEEeFU4o+`(9yKit{|CSBY6l_V6ChO?b5D%NlBY
z&qr5Eaf0knj-LXn&1b@o_(250kvFSm$h;r8I+IrASFtrw=D_w~T%eEz(;DX*@1pV-
zu#XIFl2i?YUw3P|+%U~3Jv)Ol?J*NS-a=GDR4YC1K;otXD)rCgh5~p^6nimyjy=IW
z)Z&kmMTKQHh1-RMWvOM1x-7aZdJFm9Z;qnpd1`~I2<JW@%EvQY4Dj?14T$u|Z7Of7
zx=YArzEwuaq1SW=Trd-)7!KMGiVYfhd3m9EnO(W<(_Z=R6YL{Rj(wNVhDuK53aige
zqG$_wk$4f+X82{^r^@BejeoG;tAU^AHsm{^KYH;)EwpScuk+Wc!|>F%$#1_K^3Ie#
zmfHUqn8+$HC@?7VRk%pDsV6iTl8gEAbLw-xPO+EFL7{Frzlnc}cd2*VwdRr7{7y~w
zL87OlXP4)#Fq&80LDr=2o4(ytPyR3RW|AvX5-KN?4Eyu@?w%f=L!LK#b$i#ld%M27
z3<oMRC6glGXF5BG`Oz`y5-PLY65T@2G*)D?zjBP5<(ZWP%R*Z_kweW9LZpg2gF+8p
zQDJ0+TZA`;vtS%xoRF=_Vajd97Lqve3p-7U0e!$w<mJc-93=TgF1a|0*o`<_QWt?X
zyG4h$?{F<>xeQ#~Ckhr)EJz(cTQ>DxrrW2HEU}Kiv8wAmY?wnUr2P1E*ix5Mjmv*H
zZiH@RYkOgMVfcZ~ku8}un(f+XvA#h6Uf;%`tkSXau#&r))PZm4d+&Zc-`sqkp<U}}
z)6{)y+z*xL5!y_@%$>Sdb>s`Q3#1GF-n`A!&Bfk2-ZO_$*W%ZW*O`YLA~~24Okjjk
zgmV11{IvXjw~v<1c%1_?iig@;p1(8phuzsd%iMx?yIiBZnZ1jBeEBBcS>5-4qBD{Y
za=Y>z@mN~(^GO-~#+4r}&lm5vvAx>sQR3OWt?Gg0lYOwf`aIwLOM10(ZX=ax$)j|K
zc8qXr%yijQv)r(p-G$`@{UmN#AkZ^#Js|Zy>x$tv{C4!p>HKMRXG?F@XZ%Dh*cNIG
z6?-0&Ux+7+UFN!XdoK$oU~VKSPtC(g=jvcCpXiG%@a^@kMujz}1jzQ3Csm9kqBia7
zbq}8msk`8&*g}F><`Q{ym>f7Yye@1H%n;^;?2qk?1EDa*TgF=>;o`{?O2#H3XyiU;
z>;rdiXnXkef)X;swzcv3Uemmjp3Jw4u(S8v(Dyt{uPo_Xj;V~t>6`5Pps#wkKl|WT
z=cct0L?KU3Bv9tQ`1@Alta{xDZx`hw-Zg!TGP<HM1SBxPt_Yb-F3k|7X%xN-RI(fV
zo)SNqR9vmaQ9$xFYJzk^ac^Kx<hOrcM4XvOo5zP8a59Pj<GgB?_}L<TzM}8Pc-85w
z;V%Jz(8Q$VN~R4wfmEZPM<bh7&clammA(fcj=RoBzDIVc_vW8g4Rj@^znAXJy_hSh
zZU#$rxm(mnvdvTbeZkA9X4naIc04Xy3~5PedAfUdM{+0MBfM^WR?J+eRj6j4Ca9!D
zPoecxSoUR}E-v{vg{CU1DjePf^sOCG{}D<ZtWT`3Z!jys<~yIeH~;(RZ;nyd^h&GV
z>b%puY@2l+UmhcyjIqxW2WyVghfW@eLKz;0+lHeig7qGzJuOQY%oiRzP~AhrmhZRa
zh|FlDM4wr+>QB$Ga3m&E>{1kzo!QULYOJ~1^lKWNq8^g3l-p9ymB-Q=Q$M8z8V(xy
zJ@;uy=spTousRqlxR-Bb>S60e>MiP%)VSZgKGKmbp)99tZgR%(P}%$3KCNo%U$&SZ
zFeE%EeAg5oy&D~VrF)=P3ayc9So5=1*m{oZB};P*Xpd^&8{A$UsWp02m*RBSpi^@-
zT^H%(rNghYwGg#{Rxh!X)RKS2l%4(EhxCYb;m$V=q}O+RyHxbOwWQ>R_{6|8bh_W6
z=~{Kx-`8xnE!=Qt-syqlVF>d*!Z`v*(CF=5Z|ApAwuk`Cspq#(L%rRX-Cw;FmSUSA
zeini9(uUFxNnG-Vq)%j6(Eam+GRBL-!F)gYG^%`ZBaq`N=-1h@jiSd*)FEDi5o7(L
zScc#Ee6d^6ysG>caev0g;Q?LUgR|-vwwa@MUqsxOcLJUe%h9}u#5gFQM=t8SolKp)
z(pot9jIUc4XT_VIW<TYyFqMUyih0C&9Ccl#9~=oJJ1;f~t=aUkJ^$FLzi@7E(*hMi
zk6Zk~{$FoukJBVLgVe8@pH3b`T7=`p(E~we8eRF#3BCzt;z?q6lZ~g$NA`C-)wWY(
zOArCCFdvg3u4maD<$<B+<79~%=<My_@f1h!E2wWJQ*F^j?d4TR=2sVtmE-%Njq=px
zhGnFa^OdW1jw7DKg{RIP>4UzMgt7agK-$3Xw~o-pGv&!Ne}p>(1l+O|5fFZdxpZ|E
z9H<%%&X5$&6BM2ktGA4Zgs8D>hJ<sgrtfP;j?V%2&Z0=Lw>tBMAYA5%Xg`MkPaJVB
z;+*5-ZWV?qRv!TdB7$Wwav%bNiwYbyAN;9-Fz0x?z4__|Ch?&Y+~=a#CpoAzT_WAn
zaEMLL)ctTPQRhMTD)B2fk~kqqw{1|OFDi+#n$9~A*yAJGT3^vdMFs9PERPKL@&zdz
zA}sd;mc(C>{U<N?0sx2bw;mo2F2Ww}<-ahhu<xH=A}syE`R9v}91e#BE8)TtC<p$(
zpkLJFAp9o}hPA;-X-O+8!oFIT?$*}M9(FFCk&G%tunH7c1p^N_I6{U$>4l;u{V9z8
zw7s^zr@qP?VM`Y$b_*+)x7O?+C)Yo8;6y>fu&k4{rv)|0$<f(E7$ipf7eW}8|5MFD
zOZ^wb(?N_@Uqzi-+Qr?PnxCDMos(7^m71Db)ZNNPSW`ysUvyYWjMmQ6(^Z&*!^g*m
z-G_(W#od;JOGrqFgOi(so0|;=!RF!X>}dgFbM~P7hsb}?k+Jr$bhmf)w0Ci){zKQ|
zt&5kZ7%lCej{fueXPnj``~U9A+2h}7!6wM@r-g%yos;7~v|*^Ce`<x*?LpR#1~T?e
zFrLBs5a$sT6#WbT|7!W~9{&fX{(ocg^6>vp%>U8y|HRbsuy&Vrae{T}DgNKf^)K@O
zZ2T8dl;h9L{|6`jVe{Ww7@x&aMLGVnX5y&2_Mi;dGLqZNsA<E#uub;&*9H3rz|x;D
zEUDV-#pDgc!AZg?%1CL0UK}pzC7SfP37jI(hW1C%NA_fS5E6W2723I-t8yF=(iYOY
z>%<Cj1yu|wVpg&Jau>248VpfN0VKa2rIvb$OPz$56jC1S!L@p}4yUg^yR{^Xb?`QA
z2g0+q!t-=;yeA~GZJxrYOq+;I4<98-%h`oP{1MFG#gM+F_7XQlh8hQeOrN_Yqwr6G
zm;hFaL*?40{@bSTGI`t`^Dkl|ax5yWBo{{PfeoYchn%9F5YoS3a)_`}J}_1&5=QY4
zE&7@)gnz;E;bA2WnqG35e+PkUIQpmKKVTXmlDPBVuq`oZ{}yGaQP_6=As*H7O_CbL
zNjs7YDe0e~CxTNw{=t$l*QQPc^S|y%5B+!C3cw+u|C-@zYy}`0!P{Y}f5#1*9{K;@
z>7l|SVNp6_k_)=t^4JWg+Z@hTj@TD{nZ6ZMhSH9wepw-KFR4<#yDC4dvSlKI&GTc-
zPUH`#ljYVCWscU#!^OtxxmpK3n(DpZMFJn0)SP?c$n;V+A}JOD$AWZLD{X-z$4kwb
zM+^07D>!pKQFxiZ)g)RoI4!y?-je;~+#JUZ`5=jFn5_zadblBC2r~}=DOcN!ykk`P
z?5XMhhU9zjP%7I<lh?s#l}vs<5${90!Z#Tctfnmu83Hb*8nGLF2@=IeHrD0jFHDIM
zz>EEZwg=NcMz%*XGsXO`QyROlGlV>Bc%0^IXqBO6aUb=T-NiGIF9l$$8Tgv2rz+s?
ze7mxb(t$GIv=eBva_zJ-Z5(j+lc~0kqwV(XML%6a%2<Aa_<X5Ja~(k0%6q}7o=?x%
zP1fUj&X(vWNRw=I&=d#0Kekd@6-)F~@}dV%jh98Y)NZ=m%nUg`q)U1ugG*^ta3gm6
z@nXz{y=ga(+lLrH@*zNWd`FrN4#rS%f(ef?v6;odo23y7pT%DRn{-sn1?QRVcbQ>U
zsy1oa0p|nT!O)xFOpd@k<q>Htu}R0phD!bOG!t)wi;<Pbk&zLBmBlQNQK2w(HJfIG
z85ZgHNOevNb>U}cX9>YAck?rC)1`*B-$8bfgGI;r_7<aALSe&Y?a%kyaV`OtH1(I0
zrX<&Q+EBVg@N^M3DLXJ98@gZq{5>&4jA(h5A<cNg7_=4^);}tGG2CnVSuBIKTpXUG
z<rH`}eB64oygg+0?2aM;@mcmA`GAOq>WiYdpeOCu(!l31&AUCZ*yw?lYvQxyHG)WF
ztbQDLCI3-O;iW1&Yzdf%#y;vUoFT^%EZ6n$8UOI0DZc;Wz-LssB($OxVD`99l)3WI
zy24^nlIh%3+;}x@lqt}%hA}enRaxXCifDe21i>2HmEK9tQuvC4#FMAmOzk~vMaLy-
zgXfE#rrgl$e$1Qx=vJGVzp7a{yf=~92OSW<dI+Eh<;;OExi1bpcz2;c1Db?hNI-ut
zdd--Oz;>qzFdFzvU>Rh|C0D5W#qhq_qC0%MxCrvD?fg?YKGur(0*1iic*{0lIdf0%
z!SQP*Vbhxjbk<tR4PG0GgTN~&lK2MHA@Is{Y)b7RsH)W6ZAmOSt?BqGu-Wv+)8ycD
z#8&U4Z4c~#PZ#A0Q|1~Bcc}##<`CxhU<|<Hidwxk>|wno;ju}{J}yCPW+b4lk+=<&
z08F<ePO{Z6P8fTP@Qvij#>6Yoo|gJ8TwN@_doqdsZnLT(dH)S_rshN%#mo17Z7??n
zv!z$_SX<%~jaO~V{gjc!v4HoHuPJ<3n}b?#*wW-$n<c?qXbZ*I!N)p>>zzu-4Asqn
zL`%P7=KUC6&{swNo0Z_;3z%%K0FzJs%naHQSuqHr<+;)oQpClPBh)mlb2PX+{$5h{
zt_}Lo-ro3lJ|q%h(pR!_UoJktAm8n<Dym(4eK0#Bv=R2n;dJ^WKdyaRK7Ri!OGwVS
zcJfRSOhcqU<NGap{#$`gwnld4l<#iS^W@mCZHqwh;COVHFuvDT?W5yGjUaHA(gRil
zp4ux?@SS7QxRZ#@Ib?guWY>kQb1=Z(4@dQB=#4Y&&sQV0Gp0sBP32TpW((FaJ?k9R
zf!DZ-i7$TwLlk%uLyCi-<#9^*kY7|*AIt%)Gnd&PG}`M#%b{(0_0RS(nsR`BnAq^v
zD8pynGFhMnZk8?IyQu_EA~6-Li+~H-PbD378mFJTdU@^(sURO+bFt<*U#*Ee9r;n}
zT8&eN?wHKjj@2woQ=OB2dw<nl5^z;w=JdR&_FRCSGEOsg_`@a8VUFzJyZqZ<MG{YY
z$@-TtsgTBbj`cK-#iwWLYh4UMkDHmjBw5}x>WcyFgG!{<Nb@vajY{9@aBHPKY>#D|
zS)m1wp*nPT`XKcEz}iMbA3?f6Fopb9yHF6T%?_Gj6smhL*pd6>@MGqpmVM^Z%*c!K
zv*or6g&B#}GtXPt)p4MuL6I=7TW)I7r5a>dqm!%Efh)o~{rx?&Yif(@dRG`)%-ci?
zra2h}B}N8B)nB)>2~v)%!8audVZmc!H@st_7jJ0Dyc}lMi#C&zdk~ZeLo~6KbiQ4?
z17uKTbs@r28r94kt(Oyd4VPvj2@=mBMeiK_h3JH?U*Tt`+%>Yyv=TIePZzo004hg6
zj2o*0)P8+Ot3+;81Xb!OtW7^RWmjoO6p%m7Fx>(m^1ypGFq6MJ)nqX>0ty)?0i`GY
zhb~!i)MT-P(&|AE$PP+&Bz&#Ubz1OU+DJE+gz^5mTi`=|u>XME*KbLZ&BQTW)Lxe6
z=({{y<%uDU>y%c^Z=&9>nU{+1^D@<J8C2;eyoI0!-`i&)_omk>d%M>5*_I_Xd#^lS
z*9#=a_seH5grPGuQ}YQ9jHlQ;94ET@Qzj2}_#%%9V;w<t8;MtJB_8Hp6@QfaLbOd|
zEXp+LtZF|aW*|rT_l~LFz#$F0bAevFFf2jx>Li)nB-qp?3?M1!UwdewZ6^<Hc_{$(
zTmXc`TIQ}Ih?y!n`a}IpSF*foe}cR1o)&rv9>Y$4kW*tqguJ3dOtMcl_M92?%8ZCX
zjafrdA)R}I5mAu_&EwoJfJtUsDEgNeKGnoIx-vP5hM|l9!ouJM@>FX|Ymnw}O19f>
z_9pV6&KmRH$;2$(W4Oz0IJfHHRI5T!a=oAURk0eu+uz<4)jLzxEx;Ouq`abUaTqk9
z(F^3b4+73(4r#295)4mqE4Oi52J|EO#BZdmptlK7eTD4B#282I^b$R>O<-@(z6g3v
zLS*d<JE!bz#lx<=grSv_c%zvKgzweHNA)f`UK?;3yX9j73>R3jO*TgA>-UP%YVcN_
zN&06CXr_5tryn6|w>poaim~~o^(4@mG9B;QYed$aN)Rhfg(`OH;#9V!Jn+atSz-XO
zm?VY{lj^nuX;4(g{kXPwU({-LHU{5azQZcndXwhwg+|7jt8<X9X=|c+Ey^8}wQhYj
z7@ohld|4<!(6xr4@v4QIrVn~w4t;$?i!%7M=rP7dg_hZOmJPkjmc>z5!q4;vFb;w>
zr(R8%@A+Sgv~7q)cNJpeFah&dnQ=B{e0TH3H6-=S0=@P)4i+*5t-_6CBi3cA@ClRi
z0+NKjT8WA(K4}Ylbrg+E1M;b^Xk@WEdnvi?e9~+wqg!>kFLZfrn$2&1z5;z(`MQGs
z5|!jm`)8!!7c_7Te9X!7@d-Pmw8=UyXdN#b5V)nd-)`$mdmmZBtmmY0nZo~J6$PvC
zl+)ZEJ>F2gzqhl58n5S#oH%D;`dS2be~f!MA|*0$(%wg|P6DCi7$HOi4~_Of<D%P=
z7hMNHyKR~|yY7Z7d6Rj8pr_~42npj@ws5R<$;5pvb4zxT-+y?J=G}hK+*XoEGO@ju
zS+tFmr(J%R>+IB(J1Sk}G$w6%Lk<l6!j*yLMQO-#Z?UM{J+5(Mju9Je{d|b^M}|vd
z%3r>%0AGi31mprhgNOcYJN}<%T93VCbJZD<F)|c~iX53i(K7w=0QtMe9f_64X_qJ6
zGuO5Yk@jkk!Lg-~Kjec2*8x@+UR?_trm+@rO8)pbOkJAU?duc2t_13z1fKZEe0oz{
z^&RK4pZ+njo`k^|exdiGJlFF>JJi28#yt<e{l>q&9w0VYgi9*>oQoCs4Pa&!jvkw#
z3AFR+X%vh8II$sxLRUG<97qX=wex4*$|AiHr9taVL3xR<$F4_3PFObrVOk%WL2-Ml
zysUk9EWG^m{*@@=C_=GJXhZNRQg8*pYMC6dILPW-U_tD7Er(*anq2Q0YF>C_9Wm*?
zhl&X@dTs_9K5z2nFL~mN4vTm={8!O#)Yy<z|AF+KNJooka<JLJP4@F%cC<#v?}$dB
z91dg*Mg?gRQRk~&H14S~-_skIDu`j6ggtkbIAB2I;UOs&eB%?WcYjwc@lgF!`nOq?
zlfQEd|0|5pucOih!#QhfnLH=HWBy7KYabeLKKn*C;OKpcu#z^t-Y+H7uZwE;<b;yo
zz#c_c#p$n<?SQ<;UyZ@YVPF8xV5FuzKRy`Isk5rI{mQ8Q>Xb}AUe1otOUDXmoWh2v
zlB82Uz-LNQj))V$S#_e{j!zpFN^Ee29JkX&xB5$+9$!%QD@EYcSo>{EJAg(dl_xAx
zkeM}?dyMv_(#Nl=Qm;!rDRE1ha7n}DqS_voPT0Y68g!~&73*Oy*KEp_pW08++ST`S
zvzU)zx^O$Q(;gGIC)Q!gchFQm7CzHDm~RZzq6^&Fp(VR`|27Nx(?s~E>o}6R$4fP+
zr^ceIAcR&H$hlofP4gs)o5K6{H-A?DE8h$>WK6eHxe+{5CWKB?5_oE=O;hO#PFcB4
zOXJ?*YOaKu?tK2SetmyrjHH9U3F>I7=QS$!b7_20tqt7obppDrMo)K1RCh_5X>Fgo
zIg6<Z`Z&3XVhL=JH7wxLBSZx(&qUwE7L7_j`Spn`Yuww7o?*k-jx6o*vD1n&IZo(R
z`D+9(8xr5YEFXrkyvqYW9#JVoi~r~Y9Oz|Ay{5EP&}(@*-WW=$ThnvopBe-lh@9$Y
zO=(ODYdY2u=pbwS!fS(0WF3HFFoC}&{^^eRO`|)d>Ta^U)k%+7>4EnejYbzf9ir0O
z9?`@E(iL*9#(MknLA$Bw==Xw2Edr*WKJp-smLjvePiD+76R2Zv{>)Ly40eGAH)7NN
zEX71*-YakA56rzy4ab##?v2{4QPF6-8*yQ$wD&z7m*^@ymW4x3l+O=@ST%pS<}qp4
zTTHCrEOup^7-t$Zf*#t_xD~)NL4@Y0@y<i3(4Vhepmnhbou{h<pWR)E&L3uipJvWu
z#V;z_wv5`oP9}_-k5!z#tD}ORQE@=L9()TnDRB^-x^{Phj(3#&E2e@WeC26+-`ddu
zPBcO8*5A^ly(sIp3>Vxvgwo}KpL^+gP<+5PVw{|%c(0UAkgo0?g{pA=XgYTosp9~+
z=$P{&IKYH=ZsfajWb4DWYoIu3Zt=%@V3N8dtJfg*3-xd(wA6`$3IJXht&~k<lzX0e
zA|gZh_@RM@esn7yxHL}Uv0H)_LVr>iK5!UZ6XPGI#_Zl+a*`Ibmxk)5o__C$rHp(!
z6Ld7ABr*T=^>B&l(yCm=?pQDQN-st}j9yI5?OYu@&aDPzQGe~*N(-sr=|rpv)EDkP
zHZn{vNxB&n@4nTDDsuiD^t=<?hW*(;Y2F+aOIZTMcnt|K|Dh(z2kvc<bZzmVM-PUS
zi*ZsioinWbSuo~6pe8zkmCf+chtgq&rwF#t(1;CjFyms86Sx=l`mBp-%lLgtz<r&S
zNFCr{2I^HHao!xHcBL}dWAY`5fT`b3jnj4W7ufbFNp$Y%)V!9{kQ3~xf&$ijq(86I
zPg~|FaX$wP>vyC&(Q5>mRZ#l^Ti8*H0#|+^$GPW828?CqaE65dPRc%<f_=damz~Ra
zVVgo=M&9rO!0Wjv{Kj>XLEPLqx?J9*DC}I$amk-bQ4T}tPpVQ24jMk0-vA~uvbnX&
zI-rzICW*TNZpI$@sQEPaC!XxPVT(Mn+kj^F%|dEE(~XnI(G&bqY<SJksg4>?nN4Xf
zt*T8%ZFCNL!Nd*+|4$vU6jAmci*ke79{~58rY}G6Ce?U)o$u5lGtsJV@CjUI7Detl
z*y)PZinp_DV{-HG&(ae3mmTx8gAL;b9;^Bazj&XC(kc2!nnA%<zUs=|-LkyFm-4~d
zqJDj{V4e}W3u)+xbd?3V9aR4UTU=CVmUj7_J`FD<X9wqVqScbJ^9^4xRpZ}-eW(f^
z2YAQSANfvMv}ywL9htqAz%x=wb1ZoQO{fe?_Gg^fxLbDZa{6ewDAjZR(suZ<Cn-%g
zA>+?5@l^vb(G|hM6na;0^1W*svf>+?Lg?^xz`#4E)o?uva-QmTs5c8P-?)MT&4Bjp
z56$gHk=?TaiM^6oBTo<po`TpO5YH)4ov&uE&0C65R1<U_j@|0EApb!l^LFjSZU49^
z&?Z?TcF1%??P*etAi&q<x)l4f_!NCyh*cp~?by5&<NeSAk!IPekNilaZ@kbU0;V2O
zOyQ0r29&r42WVZt>c$&|%qev&`$TIJ`uda5;TOvCz6fK5v0QyT(>}1(PhtaZH0pH2
zo`)^`=YL-v8L+bm;{7yEFQ3JVh4HH*n)zL{X+v}#PU=4DmAXx-eOT8=w(X*49bkR{
z&b-m`d1^Pf_noa58uQZ`71RyIXwGt@^0W!`?g+?y{u00zO@^{d7Spa`my(*eCW&=m
zM@jnAMgq3gcnaR&eBMMDz|k4d1IiL6V?VZI6RJh-gt!(2&+r%_|6yua8WuP3UBv{Z
zV>7B_m_Zy)NJBzIkMz{@FVW`xVlLwTPLC??BouDV>_g3?h0G5knp0`sA@xGJ(EB%S
z{qrD}+(x8aFB08a_J_b@Tf1Vkz1<k>Vs(zLPX*G>_ZAe1QNEEW16%U2{e#xlml<)t
zYl&%(WkJi!EoFD4Aa!K+Og{H_qdb~A0ZXQNt!YK#&hH1Z9|v%0-}&R}bRKXxWKV2-
zq2`DVg%={swJ$oJd3rv925HQT3ZlmC&<@YB<7bP}JV-}kkWd?J9nkP9>6qb^6kuDH
z`WxZD#h}n#FQlf>GJkVw9M{aKvWpkSyik2iFwI2`?0LySl7P>SyVk0?Do<ClpQ~}0
z>;iTtc?I+~5uCbyZQ9GAoQT9mA`MNXNOtWoZI-yUl<=PA;ePhEKt3%4MfK{5QMn7^
z05>vp=%R2WpOdvgL8qK9`;_FGVFU@2%Vg5hfM25c$&>PwnQtlYzqLd7@Kn^Z+oG}Q
zb88udg@e^#qPekc9xRuke%iuo|M0v*MhX4^l412TMGKS3P*%lhT31xBYQZ<fV(?-G
zmVgy1&s_nOxvY?gVTs(Yq^i=CqVV)3rem~f(O>eO{g5%3)gkyyU8_!y9axW&G}HUc
zq#VG!#j9MEmo4-&=C2TBdbL6jzQ%BZ=#k&g3~E)n^lZ2~$JhqKi1%=a{5GY_0!b4v
z>DP;~%yta58DI#_<H^chlastahkwYASDxkhL>~eGzwlMHDjQQE4;cc=B=#1zb8eVq
zBj+}nt;B`CFo$##VWK1XyYOAw+c%lzMfyf24#2H(f%Z*F9S=l#b^C<-B^3ytuLtI7
z`YvInjlC~mXNKG_FR>%?qV+Pzu?acecRiSAaai8z$}&3k?{C#s%vy8O@}^b>a^$Kh
zB19$#z<3c#v88OG8QTJv%Q?18xHFzeR+ZBa(6XXdd=I~^qcN<I$_@2HRtd(b<|5`;
zHzp-)yZ*A0_mWxlB~AV>&{YQa_S`Lm`$wQ;m#6>)n?p)L{RtWCtP{`To0&T}dtYF*
z?1MhQ$w^rK2gRm>YGl|%;i@m^zO3$M#I3L3J%%lKjhqOmna#=amQmVN&?QQ$OMUgl
z8N~Y2d0`@|d+M7I*2LW?Kf9^lGIl4y&<Ev}$pKdy`K#pOO2AdnE38IpM$4(-n~Vrx
z&US92?hcJReiNgfm*Oi$tRD_Gab*sF?6PVGwpGU<bfD2SIA}v4C_k(Cmo=gIPr1&@
z9)p8xZNQ!P8e)Y$&WrNUt0i$HxyC>Gh4);*4zv2}eM#CB03yvx`H*u}{fU&9)@q6i
z@_G0MOZ=zIG7xgy9LKAKE-P!a5{)*K%zLkvE)9I9@(yu)Tt4qGBx_M%X1}&wvV5*;
zpe!$>o$fjY{5trID7X@^!%vCd{M?k#w=QlEO=-w$zfO5wd9M9V0efwiY3n;j3lPlU
zfM3Z4oIF^h;)q|L3a-jE54#2E4z~EYRisSqEO(|ve-w_zd+wF2{1_z$u!=?wh=>LF
z737xp&Z!<tSv|LrzDW|q<uSDiBfrAiKGwoomy6v0RmOEOye4O}o#q>v(~PRgz;blP
zJJSC7m}B8L<ZESzvRiwCFF@0rvMb3r(d<h^EJe?Vxn$L8DF-Ph=D|<57Z;NO>y5QH
z?#=g~XZ|%w@Ss*3W?oBxSS2oaB((Zv(%6jcckt<f`R|)164_S$sDK`3Mj_`4&xd2r
zrC@LDm$dF!KNXVHF-5Vc3KSGI(^Ev$Bo|s<Y|0hWNh>uSqiBSWcAhEd58_i;?=E}i
z3f67~fjg^nQkW%{)4K9kJGEolC@7`t?jt8bpCZ$CdNLh}3Y70?Su;F<uA|bG*96PC
z;>0YER|BAuodOBh0(D=M-VLU&#aNL>P#m~Y^U#1bM3%9K4q(Ihh5S&s<4yCksT-}2
zB_CEz_s0pjr-t^ow%PG%$Y|i-jOa9CUfPG)ZF!U%x)EF*i|Jx&L85}(^66DfS4iw-
z^*egSjvW%)1hF_YAGl{MsC$MYf0-Rktp~Z&;XQfEbfk~8-`kH#Z#axednH96!(ojX
zp0`$Pb@=Kp3}@5f{Z)yKba)B55`co=obIY(4H_dqOjrn2m$Gep7w0P97+jWWp%yR8
zHyZ7hn#x$^mXsLwE4-|%SoWpDNBv5j^;O&y)g_+kgZZNjZpxQ+AsS$hBu&*fif1`N
zldre{?%amU>?&Pw>gC7<&{n(_(li6q*r#i02{R3zmu<fq2%N{J^`esfL5%k^R^3<X
zel_T-n}j_jr;b?$)wmrHj2K@L&T_{Is3}&x>4@$$M;Ar+(ilYElu@GK80!|iOiI(N
zeEagWuD>~%ef{e&i;%-udMpXoUU}1N#pG7M*g3%wEU!I}FI1v(a!FjxP`eA))!BD-
z%y~4H%cf$kE5~Mms#G{7ZpmmQA2nbvN%fuFI~B+_4Lbf97kUztB}V7P#A=fD<zM@D
zRSoC}O<l883$4yiwP~2FLeSulRUa))(|oBe=45H-x$NP!8!C~(Vq}gJjb50%M)mTQ
ze53+&Ql5m4uO}yP>;YQ$p{U}Js!<h3Q#9!0H9j{-anNOUp7x}D7Nk7eFyVPO|7@48
zJaVes>`3z@AGubeRqRPOb|C!#w6t*pGtGC)KlFEj=D{!J^$(C12ga-SWehj_GcQD2
z9NbmXa9wWvS5Lgi=huEDqIBoK{i;U5fx0N4dHSy2x<%YeoP@*iyHDv@Z|T^53FP`w
zaQTsR_T(PA9qRi7x^E80<^7#}Kk%wZoVK>k7-puO^;;ID_p$u3-0BY#`2zc*rQv!`
zOu_Mve>{tIk5xDSx(p*=8qrYk|2KzT7IyAbcSio(&Hwlw+vFtSKhkG+rsC1^2Ez`P
zZtB0@Mj7*OIEmH=SXY>JFs!?wa!t@z)fvy(PX3e|hCklXKRt#@bu_B<36Y1MZRh+U
zQR9a_d63kCLi-;ds%sU_>G`X2S&!9~OrKXqs*&R@!hH{`EhS@Ptp5^;>KKsAY2#f`
zD+iG|Dgwz^HGR$g@qTtx{Bd2fa@x}U3o0E4bAhVUWnD4-GXiupm|?UV-h)5+Uyto1
zOcBE1>oN%bzXbjd<3SN??$KOz(Og@g|AwuHtA53(++Tk?u`hD0{bv+BMqaHVRW^wS
zUpN@#510${?mX?-QTK=O9QkZ;j!=5R^w<UOLn>?^mBYhZ_Qu+)lKC4}+nud%ry9A~
z6S5gzzP@~WssgD3U#}uy@lrKCNsrKBiB7(;>*X)vvk13R{3j|vFjg}Cegt9@>~W`~
z#}7R*k(e|lk$7m#ssekjwBGPhfuLprER=!Y8_jz^*jyLbKd}}a8vWg_-SHj0OADW^
zjiz~(*_(5GZ~TL58t>;F!dtPJCRUl1?QWI>)h?cIo@A?!qdtxNur-@zg>XS-T!;Me
z{sajSUm=&yH<jH@TqN*cup2J6+!O`1G+y5y-1`kH+QGCTF{x?~*hxCw|6|M!dE%Qf
z{%st1H8s+_d{uT{y^(6so<ccaAq~M`gpcY#mZnDeA#JW`N0GDn^{-I4Cx=~Gm@gP|
zz1}CM{bla12TT|Vzf1v7zfdw&`on;nu|LKVY;{~iB!DmT-#`U%`9Fjt|1Y8cJ$Mu+
z2NWUR5Cxr@S3_4rY@i`e){`5mWDBtr9)^G7J}~j+uD^zkbZ$$=-gJZzyTr~R_a#uw
zWD#zNeL~HkqQ0QLK?sS?;7sRc<RtqYE9UgZ5|Z_<0pnF}QcyyMdElW~MLGJ{!3=TY
zVp1BEAC|iDa=-;DJfEBA){*Nak<2&_CpLQilS`73$tW#M=F%kLFp-zUY1fGMC-cok
zu|5~0M<WexmYF<7BhH1Y>5EStfYv5kXq%0Ef4~Fe%Shc42Ve8e5yhke4l#nc;VPf)
zeyq*lZB(cqy9+4^vUX^KILKn5@a^R=gu69^-_4!J;_#N!O0IA%4Lr>Sm7~kmaAZ7*
zSr8Byb2j{)MVNYn=t0PLQ8ePn*!$AvOI#t)&1J5ZuD7|^pmpn~&BQFqh_x-`_i(1L
z2QS7&gQc(iNtw;lH5+0??ScTm$o1jt!F>>a!yTuPf}=CB*v_zUNd}+wyxYPcnH(m3
z9#P9RQRpzGznd%<+<@<Mjlsws9wTCL9GOt4{S`0D4@9{qFJ9dFvOxdODjZH|+(rwc
z4K@aV+R70OG!Ey~Vet|nF?=G20A7M<f0M+U=n;pZ548$r!J|4O$p*IZg%*@PtO|is
z+eB!XV~zph$WFGPVy*a+ir#GgvJ7sEw;t0qjUx;fX<6>!9d02MYXZp9Cvkw@=s1T=
zjvt_J7N@3O(|02i`cynjYEF!CnVhup>ELOax?c)*90Z8%Uwz*`4Q>yyP_a|ak&U(1
zHM%aZ<JzPPItL)((%mGV&U+Y53#gTuO)cI$;&Jsdj|i9qIK=w=>Zh1w*=;;M|4jx#
zYb-~78_W-k_y3G2rFkl-Yq9!<QiA#Tix#reG4u8uC0gdMQ;HKum&94T16_V9|MfC%
z4%6z*=<U35B#$l@s$!aL`kMH;449M}L~e!&D?EJ$-k#^%CITE&$oOKzZu0iw-bFmK
zhQD1`yr|Kd+*;p17R%?vt*4unmBpZkqqyvwC7+cX5kAJM)>oO!Q}nqMR5Q4t35l{C
z{?5SzfKs(NbxIK%M~*rApc5qlp)5CpsuRDl5*~LhWp9eG`I0_018@0h&4WZZT94|R
z04?rwP{B1?_Hu);Xbn@sey(q!xC^V-=l*AM5&rqO<rd?Ubj|v<$>H&djc@s2*RFM^
znKg(Kdaou#1T6O;bj0FC^>J&ltj~k<wAC-Z9;)+N<Z^5XTqj(5Mj)m7JJY3QHx=(%
zn`K%@u61g1h<-|MLTItCV-)dbFpuSyvTM3w2`%%3TWnv$s<yII2^x`Li5^AK&uVK3
zMe@fWg`o4S2mIM9*MrgBAMTvN^G}zKM;jK4`Rq`yD>dj+xtUU{d@#NFtxZc%UGHAo
za?70AaYzK$l&|Ii;3?W~(dewip|lLgX=WYbeVylbWpVOs$F#q@w9p(pcRw}K2N7+G
z(0$k`+IbB@=+jsTstqoTn5Ox2zo@@5{_gK?<n6>UK-7GF6ff&pK|&Ohsv}`JF{uDZ
zCuvv1{E9c!Si7pN^HIC5mvU^|WI>5=8KSUR5$V5H79|BNk{2r?!m==bldV}jUN3K_
zu^!P%8%6y&gV*`30jMoF=-_@kginp{cRr*nabvW#S$TSAn>eCvH%(TAq07mma2OEC
zr8|_J#x2x4z3fp?fjvIG+jmcqsZ4QPBxw~h@I6I8H*icVUMW!Znr1@7CR@B!h?ux6
z=FD-aHGj@p^lYWz3=b7z`{v8kn=iayu|vc$3$s{(3wR_|Ev6ay4}h85Ly6fTdOmMO
zxoRdg7n|@!+c)B^i3_6*K1SwEG`%|pUepvy`pKhvNgB40VU*-AxDBMghTNCtO3_3u
zfilRBHl5Y0z)_8avusN4&qBV2idGdnNJDH%zupZeTTJTU86BdbR3|GoCx+hF9SPI7
zcPxnlQSq#OdwD#Hh8+w0(!pEI?wfkJrI<Hrb*G!$dWjLVHO<yL@yXz72PmY9iBDqF
z)~P0@)%0S9oO?wrTd?@jKx~_1aXP}nCTNC>M(*PGti_K6+~$<etqm&h`t$+2EE$Jr
z2CdkNQq6DMx|O`E?^<mZ!nZfnmZtRmO=#r(%dYdxqIY1viU%Qj*kOd2L<GUCcW_(d
z45;tlCZj@)6A`~wTkq7_?7#u5^$ADQ%?RUUm6Ei>tdk7BO@xWHyD^2$Xwj!6B04~A
z<XBym%rz!?c)Co*l*L&$chSNuerdY12(+uGqz4&zrp)%tPEt~B<{tNT@nlf^G4A5C
zoX_JAs!jAXQA8+k?dQCqSG6N+RPx^mRB*V?zzdY(?oXR2l!@DD`~iy*43X7|<o=0$
zPNHQiZ~<i2dWjFz(M<vGK$aQ`0e=2QO2tJ0SOlpQjYn^%rh%Qpj{;pR_NW~z9MURk
zj#ubi%hfJsfW<%zHINoDy{n;MU=V%Mu>H2|Hmihtcl;0QfS;?gt*xu@K2|c~y<E^s
zgOTJ6TYO`dIcnm;+hUB8m^`N7p6A87jvH+DZCu<eqOokUC>6Z!$T|EfQ>dnur}KjT
zHWAQuEF%PJ@Oy_J&1TrCurF($L{ID_S*m*PE97yyCxD;uTnn4Di_H@Ck%as{Zg@b2
zc*akk`(ijznb?J`kuzS`#gyo2&UkyThEs+e(%eu<WDqBEfn*>?`7n=cEd>rCk!XDy
z3AIK)@OF%9UG=iScZ_Nzl(Sq#atP0Bv@<243LR)#gK`U+KZC3|xOqn;97cs{<Tq_8
zzew0xWRz!v_Dif)>QP2!*D7e{gZXhwusK+N9LVzQ5RjuO1YWD*8b(OMlt@V-&jFw;
zoaP!1=@ySz3baT4w7<Z1;KW=%Ev%6me?T^ZkX!ACpLYs>*!x*le5K4SyUt_uutpwM
z$3b^LK#KA<8*_nNZi7p(wsWUAsUoIm#TU2ix}t~)Vj`~!Z#Ju#v>jOz^=K#HhK?u^
zYM-F4vu`f;Z_iZcNwqc2@cRyBZbqf>bxo|Fk-Tgm!edL0Qe?y#k)cq}qEn@Av|m`!
zsB)#7F_z>IuW2Ol(qn>%D%E;$SGG`by`D}W$0jDn@_Yj_j8cqk)-Pk;U=0mc`Lv0l
z71+7|NYgAD@84nc1)m%%&Pz(_f_)UzK&yhMi;PxJ5r9moapr7Z!YKzLJI+o28bdZ?
zz_X2iN9)%R;5B8r<*THf4CK{{MK0*q0#j|ZncrvS71SU57S9n`@P)t3o2&G(<uV*D
z<|!JrFEe$usOC)J3S$RFP)|Ez+=Z?0n6T{s)^%Y=a*IG8NV@TLv+}1E=pCN_L6%uw
z8bCikJC!A{b7<q1cutaZZaOA(Z)#;On;01+%<evvH$+!*8$0o8ahSy*+|1xgfy=JU
z#8u<cC1?-2-H}Oi4IK9HPykG1NaA37{BBtl`ywuWiUzb5rDYDpGqj>#A{j92wLT_E
z^GgJ{+{booob!xfkQ~~Hn*!sVu$=rBo6E6tu3qG$(6poNu)p>%_jpV38TzCT+YJTD
zU~hT=UrMFTQ26#UpIYi=@ZexiVWLgMgar+4(v%b^Ww}hh0>5J<h9g1N$3{aJe{E2O
zBc+uMY~=<KagZ{IO(A0#MbjsfsT3vr8U~sp*-80XL%*NHnK;Zt21)STKvE_S6W|k=
zFPa6Y6*RN6TBfqaWUdRkT+fA#vq!SZ$M`w1P9V*Qo6MU}X*F!M=z!dnXfi9_Fuz>h
zDW%R--sXs__D9S7`jd{>EMBUgy5*s<FQ!jeB1P!uPls`rwej^3L>%>Py)0i-jiCFe
zTmJ9~;J%OCJTEXyypkx$@`z5|G<TeE==1pvI>(Wha=9a6#M{kU^70xK6;0%cpx?eT
zVWNcO7r!8BkM5BvOj4anPi7Pi$r}Y7Z+2sRB-gaQ-8&t28Qr{MKV}&eZ+RI_aQCUZ
z<?QKPWsJy|1C?7uLLnq8Y&-zk)1)(Eo84AVj1F{AQ0e~!i^bSP^}o9K;BU}mIrzAm
zv>Tj(MAJTHA$n%B>^jP4eUd0;u`HBb@3`SH(pr)Yxb?%lZn1b3wPS|f$hkUv;V{~i
z(?2F?7W*9hZmu=VbzpO`1Fv!f^JMAKk3VwXW9WqCoR2bmi44KVppM~?_$Cn+G&DNK
z*4|F6vAM2~0yHGxZg-f<|4Gaf6v*vj{yYJJ*qWV%mS2;+p(R1SjUv)ZRi`yLQ?hs&
z{3^G0`1JivDs_!_>V_{@`<f+&c+GW2RX8!#ar^P1;5Ob<sO%V-=r+EePTL#c!Fw^3
zBL#Io%AsSMyKp{+qwF0IT~<yD?wF}O3)w1ows0Mbxyk_MilH$<0hoMiTnYN{vv*?0
zmzs;-qEj85@&X<xchl!9^wQTRk+#Z>@|}BD9PYgzBmS>Y2M9Z=#*=7~{l~5k7;3#2
z4q?W?7v-oYXlJ@uaT74cs|!ZTq`wOSaQ6&#=VTE~pUfkhR90liCMI&`dg~V>$N8LZ
z#*`=t_Uf~qEAM1$RqPtvS`M;=(HDX!O;ZNuc((cUBHk95qvp!a*LX-SbTz-n*?Or)
ziKUh_k+1GsBYy3%><)`e_4OEI5B$DXLdP)Oe9oPFHE|u550Fqg*GCjGBYk<C^FtFw
z1^0zoTq*mPdft2Ca=<S0br9`gJx2;N!i!vu!WISyM%60G9!J<gq5zgEl$d7IBQPr_
zi_C?=9*=ZRBk!jO=7}sab`lR9Wz<eg1SLdw48hd+zFSM)PgLss;T?9U`$m^?%wufS
zO+t?EhgiZM$Cho+rBFUKkis^OP-0Y&DW%M+2<7J!Sz?faUU+9$PNgoy6l#32S9HSs
z>b;Ot5Y5Y;Bg$~Hv51h<aWu8G>lM&)?MN0UdMU*YQy|kjH~X4;PFKP_>&Xs%K%nm>
zMI~x0-JDUXv8gD4@=!}E<+@jfd)VPAl;#ot6`a~<q-n+)o^>bTiV&2;%cXR&cFt{Q
zkxR4J`_;jJEENBU6Yc)I$F69kL79`Ei~egIs3theZsxx~v~6TOyj^o*Su42MW{p&z
zTX#>GZNr|vRky;02fuNgHxeam5<ZluP&epjHH?OYiHZ}hYpqS`ABu`a&-pfa<Ok{U
z;i{V=bNf*vCn13+0opY3Rx&R{No3snc+6M^cT!KO!(<oALENSf;NFLywp0L%aTY_c
z+}<()YTS9QWa4;S3_C{G{!ysTihbar6fwt^Icua*@K~03vv~*Jn!V=v%$Gn7Atdmn
z-vdBJqhPGhS9k~MB=9&Y?{?Yu*9fArKN$a}Qi-9)9rGy6HU=l8=IaSn33ZYHxo~7W
z_;qgQ^ByN7sMJLq8ovF3`c`qnEa*4-I^pqp2I`(hj%Y+9PT}jSEh=!VS!Cj3Xl_3V
zyC|I8kVBfWfvY{<=MzxfsC*K|x+XCh&`%AqleX43cYMY0zPbX0bzdrW{;-Q@hC=T1
zaxKg3z8MhttBpzeLlT_)L-5S2XEfTwId=5q2s2P2F+mH-gHE}TpAMI~u{`pE&&g}B
z`^CC)KGZ`=5MMRh_uk>`2`!BJh=99r_==%=6hQOKx>63b;!s!4L0RTUf_ftc)iyRG
zlN%l7b}=K{>6T#{KHC2@Gv0pZ1fiP};PHG*k&ZV_1>}h!u^_qwTUaEha@oc(`RtOD
zW|d1kBy*oY5?Ylm!fzbfmK#o!C))Vp)gKz^bG;Op{zwh|Z}<lvD>Z_HSsMm$N`gJW
z3gA-^rB~j}_~v(NGD246fb#~$$|TYDMw?E=pIy>6rM74Z+<Rd$pK+BuDKaWd!{}>S
z(0@udGhR!w?jv>aNNctT$aoE{59K{0cYZ=#IcVljgdjq9V}>#fqymsJ3={-ipN^*z
zKGlW!fj`WnO*1yNmRlb{9K<h%TYoy_q#MlQvy81(%Ub+k0UwAZ@y~_Tr6}3myJW|$
z%LU=1{w$&D`i##F)B^Ah76l&1XeL>2!<DuY(iGL3fwV4t+|EFWx3F!&yW!NnZpTUa
zIdQV()F$>T?mQ39HV-C^O@l@bFn=nsqFiBazPl!SOJo4o%4sDgATY7H080@mg@vQz
z7AG)s{Lq36AKTDBA$sRb0UMQst{#%>YQp6hy`IqcUT}?t&17=5>TZXBC&JdDY*z2-
zQVSN_?P6rM9j<^-#iceM7Mn;o!J9TxUk`pfq&quKyy01^q&1{W^{wDawg6S$Ag!Hu
z6}_6{R8JJ)i0DHlld1832nV)6EGxKP;rtNl-e5amEW02dIWYt;UBy{k-ys{29{j+G
zCNzw=8QdiU@&wDgWRJdcTBJo7lmu=hu|@Iq=G(-k>n-DMjL^^mu3dhdz22Em2zr1$
zJZO^WYv`nkUskf>lWy6qwQuBQMt_hvKQ6>hzA#InV(5dH7bWyGd6P0VI*V1|3!OM#
zvMxu>P(;mf)h|Cz$_0FkLmcsMW=Te5y>+YH#qRd1%y90kN2|);)!)4lz0sdDlClH7
z?oZ{el{XK$0I{bxpF#n#hH0%$YT{l#C?tlnU0%o-0|A!<<>aUcqX2*Ml<(T--cr$9
zc2WS82ij{O_UDn{Xn%uCgj@|mU{s`1fAZI|Q+I$Ycw3z2TfK>^<=Hqf{8XY{{oM1D
zHPFZeXbWk-K17SDdbc8Q>hl2L{ItBp$b0f4ti-M5Y|#Og@W{3#jqQ+;wFoK#?8S9_
zWM{#j<3c`k>I}~_C0(nS36uXe!k>FqW$H6JB}-FpHlU3{SWGSO<SV_{_tBo{;%eiJ
z5gn4hNo$ImjQ#qtf5R-k(h=bCdwK3~K)#KpJ3xl~AoM+h6v|O6-8NbRxmz(5GRu-h
zE=SRbY|H%^tMU!x1D^d~?Y(7NTwBmK7zhLiu7Ti<ySq2;!6mr6yG!Hl4#C|C?(QxD
zg1ZL~KFvAjnV&E-?_3YxxVrY<y=vDgx$9olK|3Sn7ycb&<=5%mgt_j9F(in`B8U4w
z8u{@(LwG%Nfx&o?71=8IxLUCr38<YM$U$(`$nwX}M64r92$Ud`vo*XEIVfOB{pp4%
znKR-dM-Y)UD53^iA0d0JkEN4w|2a=1&e&_1rRwbdWC{!>jDT?(xX%NWB7omFGU|oA
zabVpE*G|W@KkdbCjH=HsB($;tDJo7|HEerQoOZw2)y>j+d`@95zM-V{<d+C344rn9
z)7^gvQcRi=MXeFkxH|(pAsi8HwDr~tvBE}O>8U7*+<l{+cMTqLd~FXp8$y1wnTid_
zMLDxqV0;8%+-^~v(-xE?J+aS`&+mfzVp8le2MBg@OMWsaj{!{iq^(LsZWp`Jb1!{p
z6o?@PL9shua=g=~=_CKp*;cOKJ4C$QH~E8tOKc)yi$#JBF_T*vlXFJjh+~Nk2>kIL
zY-HdNAAhd~YJ9KozAEm%{W;8e{2SM(iK<usyfE_%%!m`JCh5myBP0SbxTL{Un7a`1
zTmkKi<BE4ipFWEOwh-)DzPwipHtBvH2rpZikNex#FyGHiFjNu<00vL}YQ$|7m^z}a
zDN!@+R*G%oLt>aAc%zdQ&jnkJz;>YmEBumsmN?}+IfjWv1|xcpKO@HXAk_N8Q~Xx?
z3L4f8dNwoz!X;iK?BRF}AwTFCv|J|Gu|>aF4-|YuNgN3?0KpyX7{93(Ls*y^)eUXk
z_Ag-`NJZ+SzW1;V`X_8-<o}66iJ3U$s<qp2zyl_T4v)nH7?ceeeprbNy<$thfs5tv
zpfa{_rGQXc_Rrk`;%2X^3FBh8D<N>!LElV3Mgav_K?W^lfzxj86<VOXq#6+2u9R6F
zg}YdVk`XD$^|ieo`9i)yJZ2h%hnj9B12Ty_Sq&k4Q(VSX3K@uDEjL}LBDrxMq0P@c
zE7U?gn(Yn$2zO@v<u^w5QXbl;N)Z*f3C>OwPj}e2O5O&d&bxJ$wvM|CYv7Z|I))-#
z6mM2k9;@;96=aJqh6WN67AVZgc)hTw9;D(qtpd$#NG!O7y$3CXQPZbfUm2K_`K#t%
zD`0JLb03r>vMDlLz-p@FfzJ&R>aQ091GJh(qo29p{gG)^e5x{r)G(np;32Ly1uzNP
z`6#awVEVrq#-@!)^cI^SS_liEjur<tV>`UQ^)i-hZc~PZb_Yv5GA?<5z(;J_?k2Rh
z5#hOf{$BD#sF9H}NGlZRO%Ma?<?mKt_f~{0L+W7A(mVP!;$jWXWTV_=Oxn$I#h?(H
zdPJZs-gGK+z%89&3h)LCcVxPQrVJfIM(>HLrI>+V=)haM;IkBxG*HLPMUDtJ_MFQb
zlalCLPx^yMGnZ%jLJVCn2)t9mS|W+-C*s4!;VKr<SMSd@^{>MY@~MhBWjbhE2~s1%
zMMG`Qxs95(T9M|gHMd0;-v7lDFntPLS9C%kq=L=9Bv5;Lo)=3A?TNw5D)o{t|5?MD
zHqBVE?2Jp<So^8>=?9)$y3dt|`gUkcvF81>M{a+dycy${QB0k+MpJP~{y1Xsw7OR+
zQNw!^r0X#K?u<LM-7TU9N8}(pcOW4gG9u2ED%o`=>fopRGVS3Qdr=pOZdBB|Y^-pB
zB3((D$KETLvFF(40wPJz;Je*Xg}`PtPhP8#nd%5Dxe@%kJW-r=3{TsG0dD4tpuV$-
zblrBhE$dvxtzH))P&<QInxhe~#0V~IREoNAbRZe)CtT!4qhFsswI76#m=Af>r+nrJ
z_A?}W(R#0l6W%N0TOwva&9W@hDtx?*KghxXxF)P$`1w}hZ(?eLA<Xx}zSV+a2@w35
zxjN$=0d?9p05kT5)}`GtO$J&-c3mvAgu<Wad0k*yF35&jlK6V;+Ic5awB2DYB50aG
ztE~b=hng<>;6QmblP6-KDB*cg21@UBbaPlcHjtDq@1mW(I^kl?o+|F`y(5QZcr=#0
z2B;7HME+Lb)NT%<0EmdDDV-zu=}^|D>~U(sPCS(J0HB7p4{GCag`v*(VXekdB?a!(
z-Mw~)H&L1KnnMO_IMWEe22GXkZh7;r$M&Eb6u>S>VsBM0$n*lbV$-aIJv;6yVwoR^
z0n81$gLdcvSdfk4lnui*EjC-C1Am1AV6zVf1QU3T2;3IeQ;pDei-}kzi5C%VZS^-@
zRZfkW`t{Th#T`e|Ma*)1_h0Lpp3@~(o3c+mH3fk08V}>H^WN%{P9UQN)DZ7CaYfzW
zf3JnH&T2exJ95i;P5>thq3_88mHFvqI1xb%A0gH@Vgx-BbXUn~gvwasc`oe0@0R$z
z)_lsJ9)B@Vi5ezQCu?*`%d08lVO`Zp6F==;xF21G#teipx)}i3fx*C>yK7L*DXPS(
zxWo=n;ae)!Zr>KwVKs+;O(N>o0b<&H-UaX>tz5vt5M{$a?%^?4R(uxrdf`DAR``<J
z>cRAB$OyMl6iqwJK}ma9UY1-;Lw0oMaSV6V-e~<K4HVB;xJuF)kho?0WF4G}HtNyU
z9p^|B-U)2HZmj~=X&;$+Xfb^OZE4>sa7B_>Sk$cDkO2KkNU>dF<SQ>518>z}65o1p
zZ%JHIm<WoFd@Uh&&X)}ywUKV-rl6y7_ni*9c$3U8$^rooGN%LF8AI5a7)!C~<9)cj
z^<_hRc%~>^cH@1khOr|Pl^l_w07_nJ&mLZ{gNKSP4iV4Aus>B+^XTLa>f&r;$uKX&
z@MC?O>fiIokS0%x3o`P97Nv)D_Wr5Z?5I)F^*8W{1>w65QW>kB*8C=zd%$z_kf`2g
z0bou4{X5zA(C#n_xH9<&>VGsj1JH!0KQw#VUhu{AO48cd-gh#;+Wm0J#Jmbdx`?hP
zipxDYj!UnWI5TkvRCuZh=+J7~Lg+6%)>Eu(>BD(@phE|_I-i>VvZd4?<3tbg0`Xq>
z`e=ZDW;4LYbA(PC#lzu02p8){ev9KHkfcC@g{R_<Fd-Uj;(h!<$j?g&W9nYeE9hyM
zkOR0}yN;uxz))HTGidcYRzF^Q6z2Wt6{f9*i>STa2Q9i41Wot$R!I`)>mwuPF5?E!
z7`JnV9&a;hoeog)2B+L{{%LP)q1dGvoBmEQ{t=4EiFX%-Wvj7_U*|TQVPKH$aZq4{
zi24C4(9bQQ!+qY9CAvb>ae~)Ca3oRLVz*bwm)K#aDr9NHXDbSIR7ZW-UPT1njTbgl
zeDrTT`P7c>8-$8uK+m9h{S>(V2cYiX!KigShWD5tat0Ge{7U1_o;#WBXuXklQ+Bxs
zL(`NbfYWe4DB;eI`Kd|5yeJS;f3J3~>ocGRksI?W3GVg}mI|6-G%9;+K@*7neE!tC
zRall-(`3$x*XF%6=_cHFRTrz?!kbsW&ExrbC*ax5VI5=ZPaOzJN_Ken^cxgb!MC}T
zUND=qhP0zx!Gl>oqEKh903TM=qi!n}ZnlAIBH*A$0p)&xrWxd+-kO~Pw|{??d=HB{
z%4*2unISMs3&H}45KE-&7b;Y=!x4*tIH@qAYN9#(hh@-Pwa}%)Aq~>TIKO{YEqg(d
zf@ZpcD!(7-6OM(I9qqiM67B(nArJavEVvqRzYcE*ql+aBB7NC1-Vw2Y{rHKI$(9u4
zrWENpoZz+Y)pILK+@`AQ%|=R~iSqR!mgh<@_0NjC?xV)K2l{$Ji-gbmSm<W#=NR;d
z&pgz|or79uXF+$spg}9!heo%B<d$9*wu>Ng_h+9kayMV{CMb5zCg&f=y+`O@>)Kmr
zO8eW>F}+#3+{5<^YYLo5E=n=9H?<|3Oj#2<exf6AHNr*7p-`uMFhO3)W;dL@45!K4
zxVw)m4;GX@c-qVX=exv_V!(V>^*maEY{U~wMrcXSzU>cZ@qc96HAe%y$a!<b`v7ZS
z^-<v`x$t($u`bStut10=pSz3L;n#MXxXx7sFv8}|i75$EzXA5hb|%`6V>2w1BQ5XL
zN4Xk>lkG<w;NevSNJC+r6~!=;5X5igLYwq|fi}KIY!F2A3FJx^7FSS4{}i?VgTafg
zrN%28GC;*%AY*?Rp+=w}*skbF+M8cD3nC3l%q*@S66b>ZKH^l)z;Wj@RQ@GXmv^KL
zfkPZNg||IDEbpqS=WAY>XS|9&KYn%$M=|!s)|0{1phml4(${o{`G5iB{H9FT7k#|w
zTdOCYE!M@>E=Ze=k)Gp>UPH0jvu*y;(}j9bz5UpfLk}u?<2JfT0CX_pJwo77<8>9i
zL!IuPdp3<bqtPiM7vp&BH;)$$ZJ=9kz)GwW>Fz2$;Pc%L3T3>I)cFjd_w~BMRs9gO
z<#k5f0h^7)q}QE`H$m);vkvCN$`@5P`+bTBH`jcwv)={YTm-tZs<T5BfPw7x2`>F1
zT!6?K|8>u^;<%^h@ZztJsQPvnMX6njlND6^u2Xt*41}I`YN8TL&GS2FJa^V3o6&CQ
z(au|Xt+&#5`#&LFuhnJ~IMT5{%1_TEy7*F~UPB$B*h;l<-5ulA>p_wH^q~ePVYAp#
zAhEdldJXpNfd?2R_4vB30&|`rU~ZYXnTQxJ&8h*)pb`p-G3_M1)7F~P8MGpj?2xqn
zS=4e$O-K+FEfR>br#~-$^UUy1=5Lb9bj9l{wSnzfm~|p~xHM((!G)SQ1hFMID8<eb
z2$f8-0>$>AtV_VoA$+-d)=IrZ7IX%xX#!<lb#Tz}>w0;@_78JflWdv+65f6JUlquP
z{%n;<K?VVz42s*;CletGfjEzg9glM63O{<~>z4Jj?(L4V0gwl`cxU|9E|}JgMxW)F
z<ifAFIr|1fIpb!fhc=Y$0KUEo-SEZkKhI`K^kCvRzDy}IM*%x|tv@gQ9dS1R)!l=7
zG3or`YM2Yp#x>bGc~{}prg-r_J5??bIz)68etBWqIF0FT(stYKwS6>}pOTI?6~o;R
z;j*ljC@Oq%<2zRhe+EkI=f<|X522%dB}|1XBoG&IGv{@-{M5M*=b((|MSkMUl%u!v
z*PoF~xEec<_*zp0MelTPWS{V=(Z(*seTT50e7nce5u@K8yE*+yLiKy}z+e33Bp2Y_
zzF{d~W!m=s^6P}BI6=kr-NqBTxsU~KL-r6~Dd_iBF8$f964;f-4w{iRSi~*oGfT!z
z>xD~7!-elPP4Q`bXfx;<_m^Y%`*V{PB|_V+0FRX(%Xuo^y2-dlu|eVkF@SdISYJ-W
zjBwPTVM)lHSh7F#bcLN58Wc%MzHBy7AJGY5@fNI+80yWZrD7K8zrLyvC^3(j?i`AC
zqacilzAChnLk=Roxvu4`E<i@A`g}6nujEVjX!DGLEbM<3z+tTZ8f$<J3Cw%dT?^;O
zK*&daw2?`Sw<lqF%Nq9H+mH4w%XEtX*52t+_E?m<^3J+T@)?o;9ylfxu(7(V*<4o9
z8{707x+7w%W98V~l*Q_GK!8~XUDE8um=Smk6>wNtAsXiEEb0DI$s@G#J$|0{NgML7
z=2t_;7Q%Q48jme`IIJY!ReHpAU?&`TQva;Q^ZKLMJ*atP275${%8_d^Q=sN(f@&oP
zXV**{pT-Hsk#nV7nDUtFn%b`7#R6B*nRjIw@1>RiTlddNJHvj^Q)J3a@y40G=3JzH
zNPiF!rh0dsx9&q8@;@=H*Nl`EeYIfE{2Kt{TyGTY4|^HGrwH21fThl9r@a=TL}d&F
zB2$nT9hdpkfBda$d$Y1;zI3thEZEj+Dr4zZM|~```K;4X)g*n}Fi)b;7vK^j^05Gf
z<p$y7p~J_^0)~0yXw|a`$CNq8IH|C`figf5-B3OOsf$H7cN@MRWida^5(3VG{2?O_
zrIFzYG$ScOXSPFvT9Z=tX5clBQu^`KmptWmI~C1`1XSeq@U%|h%QAq3p@u{GZwK)n
zkqUn&eD|T-qp3Fn#GmO}fPPFH!yMWAyvlo{fo#a0gDR1_-XXi+4@%{3c4zwvL)OPW
z^;|wG9l$)4t~ReaeCxW1M5H=`Urr#LExn$ZYRuhgaVZ5E^F~r!OMG7loC$6z!q4H(
z`MGtUk9p4w_#A!N=5doBLehI$IpTu<lWr=_bRi#1uG~oSA?&OU@qOylOnHj^2^q2e
ztHI-@lh9g5*qslq^5YtqDumz1OU9sp!ixz<tfSF?l^8_zyu+YS&x?vtT}6s0f|V&i
z2#CCSEL2PJPBq9128_HDeqVyX<9<ST!Nrjv4p~ZBF)`d`RZ>i{HvyLEMGWZf6^(($
z)6>oLj{EtAeIp0|V7%R6jNJF9oH!gQSEPvE{5!3jc=VleBDPExhKCeA`kJss`Cwr3
z9L1n7sZZ-EcSq-D5|2rW$LY(?2xpMk6W=EZCB_Qo9OG|GgP9A)iQ09`^kKp->9pcc
z#=xkbiQeIV#tV{pq$x00HF!WNWMBg1<2%tG=^bSDW0qoApyf<+sT+2)7?!!iv6Tr{
zKTnHN+&ZWc!+*akK=z-z>Xd2Nu}RMoWVF*Aoz;;Vhwf!BQ)2ocS<b%`H;HH8zw(vh
zKb%qT=yx6Xz3X5^`X83`4cGgfAf>=}B4hZ6sXcuFW<@*0Qi64z*q-<gQ#$8;J`eGa
z|Gm-wgF5mdZKz)Jw!MB8cp5V~+ZoF^nQZ={!RRa}K6U-?%Pgjcv~;nuI`_8Ma9kvl
z&LDgWyJuh0@zvH1w10cK=V;pS@!UB8k1OCf?2P8ZHm-5KC&^v&z3`rZS%2f0Ku6}C
z9na#dOk;NG)eA}Zm!@wgMO5SXmTl6vel`)LroH$)?76<C<$cEH4Gww9(*&NoGy5BF
z4;$&?gdWYyU9P&0o8A7MSjsN+|B!dL#5Nmhi`jO_BW>wBwy*YAyPkM5r0F}1n%nGv
zXIXWiM)RG){iEV+8+Myt(<JphJed?wcI%n%R@NOVsyCkcsdOTnrI_pD`8r*r-t_+V
zo16yl#In@r;2#DtczP2ycz>V=(-QtSckEF7ebR#h^>XolClS!ah~UmZ#AE+6KkX3}
ze1T7*)ZM@IVv$V{m~zacKYaB+gVKCNHj~g+-oO(H-`<v=Eqp(GV0?Hl8qd7pT$PuS
z&KLbYkxqj^q@<)M5)uZnD5=YCgTlNGwQ~gVdfaPADYvR!Ki55cw>uVIRJ)u!oM$DQ
zYFW6@UGRp&YSDKZqk2qY1zGY<;FzS$1g}>&P*sMTm7S@QO1S4;u|%~OI_`fvcGlm&
zOfH+|cmv2;!}^qUbm)zZjq5Cz=&#TIu<M9Hg?>OKf)wdyf>cpaxq3XQhVP`Ip~2ty
zA_R1d#AG0a6!C*a@#pdT0C$GuH!h9>0T6@=?LH+D1ZH?gK0v+y_W`x}BW&#S`G?`$
z8(10d=%}cXKYwZv5Rk$TH|*^%(a_OJW@Z%7aBy&>28=aAyYYqe%5)Hb46J0Nq}Uyy
zp`j>VsX#&qG|j^%1wm<8z{$x;Lh_<&7AK$0aAk9W2-d;Al%_>^|IiT3jr({c>@U9$
zH`^;e`OQ^2N~)@oOG+gCS-w{ti)v|Qf(d@sC1Sr);y_sBfX!q7ySqCnPtUgb#f35d
zKp0~uCz-|M>ZI*$z{vM@bxArE14BdQ-27fB995P~RJ$WldHL#wQl0ry9XxKQZ+R-O
z7j3p}iOI=QVq(EKmG+5PkMTet{0sYM2{>(%`g+r;Q6aLr$?Gqq?8K5aciSGi)ua>h
ziM_phiZxE9y9axc4i1Wf*<&)Hf|MkiU(>R<998*woFsTWv(Pa4hEg{U%yMibeX_0A
zT2-!;DfFT#N`o9M!|j)fSAEQyW+J5yj{RH3OxS)pUmf{R85!{c7p;F&$0^J>Z2!`4
z;kT0&H{+`F((tdC)iHN*X&leOwUwb$-o{~rutgGr((>5)nzk@6b3Cv)9hz97NfO6x
zx~T4|rI>Iqh8`neB4%;DKf&T?L9C9Cb@T}mFL!M(&+cTn;%OT*EhU~FM}sTA%c*5t
z!8G>#I%{@qfqpH^nwg267RS7Fg=HcoiAGJzWFL4ZbK!skH*~zl&D>^pSvM1LGj269
zGLevb5~!%FL>9PICG{A9L8V?N5(unN@E%4GZ>}h1ftyG#YV8PU_4v^gJx)q3cJ95Y
zprO;Q7$2`lDt0iodHZW|mQ)w-S$!pAYV_}64ejV;;sh>-lu(bN&=dyx4tA-1gNx@`
zceyY;M2gODZsv&4=p<|8^VmFGkC|eVze<QdXC3YeGHq>>BcaW&riGD5ulB~AqPdbC
z$8#4d&Szt%e}DU}4Ko)t&g^()q@iD1U}{92HgJ4MBgJ=5F@IgFCJruU7G|jNttXg&
z@XZ~D!jU!7YBW^^tn0X@xPh##i*VD7g&4xa7a)uj8N`CccaM{#XT9zc@dR8X$i+e;
zsXA-ymZP}#cWVY16KUjt@Oz7CLdzc2+6hRvE8Inw<9BDCy1_%_Mkg;kPCh@h>gpUx
z?bp3xT3nqQ$1d}8qz%vP&AI2yv>pi8YK0doH0X|wjs{o9cA}o+U_p$Ka^fbn7GbIB
zE1HudJ@WTsK~?OMQmN%de+P;JQ|c*0PS}efF#$uQ4JhWJf?qzKnDZ;y4vvmemsZ=x
zZq3+hWa<);m5&+qC7*H2$6j2<$(qWIydtkBwH{NqIBm6)mKaKyOV(D6%sjDM#b##W
z#oS+t15id`Iq}m(!4uEtohqBed4lOyT6m}^79>fD>BSnYmT5(Ug0z~rFiTLl$a8}v
zgrQooO(!$syf1uzoJ<_=9xo6a5n@jt5|LP_QY97VcuIy#sk(a^8EXG*e_7Q%d=`qT
z8VFMcX3&&UjSLDI4MmGP$yIjmc)u^18M32esi5*KcwqbG-q=XS=;ULJL@9E_z@O<@
z`sVYv;1%?%lzSfrk|h|U>wVcelUr(yqSn~59@tbM%)57K5EYc3otq<}qpNgyxH86>
zrG!10WJ?R054%>{tA2g_uGsDnHFUsMQhHgb`y%q_K|ez7{CPypc8D{}Qd((hXrT2E
zH&xil953V$q3~A-!`i<xbXcvG62iiPk!FJa1Jd&bxgeLR{j)l=uMyDui8RQBtOPN#
z+&SdvJ9SV5F%|(`uB+_z{B#901SUZwxj~YF!g2Sf?y|uRrmfclYhrPHDjl7zig%s0
z;sy&>bx%oZ&AwR_v41n1VY>}3J2h#K=MDTPvuo8OyA2co^5<R}pCkCb)5<NXjq!N;
z(wlxgkr}e7(xgQ@Ad`}`ujS2i9K$YT3POS;ZJ0KX(4d2ZvsT?Q-Y1UkRtBr>Pb@22
z=>SmN3Qp2|(4aPNzHPwZ$9<jEa=Qi{mOHcQBpw^vtO07cEZBGlRMUc(<EBj@lugX(
zAmwgzCyi^T8P(fdy8wH}#*2$wc%Q*iA*zeq+$wtMczq&7z3u68xBeiLlpnFvrxpxd
z(kIIPl4d@_<G6Id?o$5PX<7?qI%>Fys=KSU206@B&^||h`wRdo^Y|m&M4s3}Ui|-F
z_WuEu3>D(z;KaT9L*Nj6jlaCKFD<Q1*f?LKLBpis(e)AySMykRZ|W?PT7TZNL}Own
zo#KD0NK8(YQdTbL@9&oY0t1$P*6oSih*4otp(6Qj?#IBnL1a{vw6%8S>Dp7#Oo*xX
zQ6Fw~SP4IGlVdSB*h&}`91B}jkd9|bd&9=vIb#Nb_JP~<Pv@tniTOj(l1h#dH}|*F
z_d)Qp6+>weSG#G|zQU@BRd?z;?=GNi<5{M4C7qOmLydX%%W!+Y$jH=GLU{P+k<rmB
zH$q=h$lOT3nUH+(!qQTWs@vOAeyzmCMb(kfk(A<``zmBxVZxORGTTc7NqPB{{(h*D
zvG25)mP?9+z=|13GvY-ef9lE7x#B(fiw=sUB$e5j8LB=K!}!EZKw<gVUffVlTtF`-
z0iknQt=#L)oo(S=KU2%%Dv5oqBdO^OK%=cui@^IvL8H+sZRqJQ2`a2Hlw3J~d-0>)
zsd`~UW7F95bm(|S4r!s~;;5lKAH`UN6ff4>c<HBy&9x0lYv;lnt(H8JI!cYiEV8{l
z<16gj9ukwDXK=zJb7OkB{KZK`RZX`Vq)G*Y>_bmtVWOeZsUxLYnUoi=Qb)wsw==h@
znZA4d@d>(UD0<OEGFw2N-FltW*m6-qUq82?u+Y6dMgYzKo7g6J8mYt8u4I-W7kv&s
zFQuC2BPkDuyzGUg(az!%nW=r}(6(;-G+or~f?_d#J9<(Gnb^cd2D|k)AWcgAwgHR5
zwHt{q!e_AS`O(e)gxult(yBE>U@R*eb-H@rd4r53z0CLBNJF8Nc3iWdWA1_b3kn%3
zDk?J*6LEfizQfC-pOdTe@X_E`b3RwiR-RQ-77}d&pTGK4s(X8QE2mAkIgC88&HK~G
zC%8r%PZ!c^_o5@iqjA5VA*r-#6`7gq$>@EmM8w70oDICZWGXQ_96HOjo%2+wyEuv9
z?3u0(qs5K(Mw8;>;{o>e_Vr47`^R4(Nm;w};9oKXzX>0s%q}dD-Eu4PEX@P5%KF13
z%@sR+5_b3LR05<D54jv8=Y|L1wH@lJ#JHW#PgfSC(W;9E+XuKz)`<iJbQP79CZIUU
z`_n@GtYN#EJ{P!vBTkmM<!sBIs=0-El29r^!km$UO(zCsbs{uQ+Tf9)lX6zs<5R)U
zqY7mLeoAk?6jB;Pf7%gUD{j}cv^2ExXksG4qG=&N5Ei#J6Af9KO@ZnU$4@x$_4A%g
zs-Iw<3V%Li;t4oN&}$#Ywv?5OeNG+*&&!?1A=T}`%a4qol7dL1LTwI&_4_qAIEYaC
zvNV)4e8h5KwD}lYKsTOz6&|%R^Br*Mh(3?&AkjMII+~W?jF-07bC-6MFJZ<a%GiIA
zV^e^Y!>AjsmXv|IvaA%vz2&WN00SVUW0OqY-75_-j5abg9rI$7?WGMvMn|o|he%zz
zJW^3I5El<&ZE758ah!0tl!}RrmX%N(g``r~DIoug+Vi#H<fQ{IO=JF$lr`e3CAnya
zbAijxCi!sHWNA=3iu_T5Yk|yRdOhYN)yz-!!uVWAQyGy>8}T1)PXWqH^Xbm%&fHz>
zRq5Z<>lEja7!cW#jxmykxmY4VejpPC!-;~Drs(-ei|U4sllJ2?kP)o}Th#-!bV{m%
zhNfm2xU87kwQo;>Rrl}i-u$7}>ZGx<x`1<h%4I%x5%GngG>s34WlWz~C_Rs^h@Z!4
zY9i{$tQd+vfzhnkDrRAa@nH8RKG<eIS0r23Kr;?dpv^M@KyC#KUVtl<i*GPx!5$A<
zYmW!xVaYJ(T~o0*9XMxkpQ5Kv&CC4~=h+kkrG3AD&%ntwHZhheYbBba6CgriL5D}h
z7At9fh1h|1*Wn8!At529pcrc2pUarZ{zR6$Jf})a^<K95!McGBVjmh0j9R}JY7m|%
zME@^+3i}<1OF*#rjpL>DVy4Et63%lalB^>TwwZk*EG&#Ex95*H2kGeB+H`omF3jq#
zfE65~JpwYpqn75Xrn4_8Cc^c`DLibB-WSLO&z`W|{oPUU&}|tnfi{cHmWJ3jVWnL{
zPY@HT|98|H+1I}0X*p%Ju_@onU{+SvW2B#9%oHY*mDVF&VSbSaF~ggJIT+x+cZ#lu
zf@-Cvv)T_uZ{gPysyA0r&q-A>tAw{2vWBvqp}pIjXJvY=R_YUWn}pNMfqF9Z@g)N&
z?p5$Cg<N@3J~|B-?~{x=lZmepmX7s@L$+~mw4-mbc5b_3-Ruv@-+|>774d5K?c~CS
zfT@m#?^3EsZMbnTBHclP+RlZi-i9&#VqvH2FX4$PNiA|3FWMR^WS%y|Z<AlB-c&jE
zX|dMXA1HFcX_W&0>C_5)kV`lx+ys^4mUou*l{^rfAT|v?mULKuKFjAmpCe`Ez~KE8
z;P>xG?x?2Ob+WlxjoANt#s80Vl|cr?B_vF_ykvTj36(j7p#lH_BqU_i)QZR?6uxay
zZxpOKa1*ZMN!2oD{HWv%yAjB*cZuU2MR@5USTHHM2y@yOR55nX6JTRn5vvz9i><$8
zN_q;jPf|%@qeCHO?j80{94&dUTro(0QW=aRH^#j>(<mx7VP5k---tSu@95jAov9vS
z=A;Vg)2eJ4e>YI*)af6Be-V(8C3QF0TKG9B4f9FF{POb-B(%KR$1^x|4Gec|q)RAn
zpqYvo!cUf9_B3s)5fHUG7$Qp!&B7ugSRMw4D26k+9C`O6JjKPtSYs3t<P$sf40g83
zG%rlk2_RN6JRZ}%9>%RuSHSt<&At36pVNne;Aqv%QO)h=2QgJ}Od!gSwRifxKOPom
zmUK`;iIRJdv{`ce>&Nj|?)ZKFB+FrT?xero%Gb9Pq80}%nzvYlhso6rf3G9Ibq@Wr
zjL>O8>~EX7CrfJuj(b<4M{0_=I5_Og@S@Vn%+lXu>96-GaM8k}!q}vEvhw`=f<uBM
z6}LraRvlI7wcCjTKfajUz>sBR#Qu0710y?SBLe(BRpVgaHhOTZlBdPzQGH^JUQXI<
zr=LtpItg#={n=yLO3AF)4?29BTwE~rML%f-;>@blOQ)66d&xuA6bXcei3u*s!o<!G
zD9EJSM=iCi+T#>rG2_cgLZ7&KR-mDh3a)tNCGF0CAMsdJJ?iv%pH(vF?WN{q8gNHB
z-lhplMQ7n#2VB1vU@yogLS#+5U&oV@voGdYRdoE{Yi6~klb!wTq?$gN9e<sLFV<2`
z6Q}1C1j;sVkDiWKR5VtYG|=eGQ+Bvcmhu<`=Kskq=sY;aG&koaIhf3hpQAY0^HM8D
zxVs-=lu$h5A_CqR<l4cr<~);)6&R_s813yI*C+nE!J8d`KPb$t(oCPqUgYIA5_$wK
zziuLp%$=kaiFdOHWp|S6@}pA%Kl<O4^ENiIOHW)H&U&3U9NBF()y^mgD>JeTmWTor
zc;wTTx#}uPM6OckS-Ab8vC+g-4>7V8aNah-L-UDs5|Y5Fab2&0Mq&oVwXN3up62go
z_}1DBWLvnKLN}E(+V7n}2dsm!pmTcnx3!ItG$lk1hR&$``35H{N-h0S|GI)Ay5^|1
zv#$AftC)zL{?Wkr=G^deG^~t7QO!|Hj$?6T;QYEMB}@ZUGEB*m!dR70H%{qiREI^E
zykNN`T}n+DN=HKCI9p(!KlB+|oB@*|oFs=Z)?%Wqwra4QR*ycind!^cF>#xiFLc>(
zMw0;L0L_ww$vyzTRyMj@6O3v3zdc4|mhb{h<vG3rUiO>1Xv)WyR;!)s)jzIctWaV2
z0~#zl_-1F;0Y53qcDu!W+3TPah-u;J=Kw~^X%mNnkG(Qf+S*k{D#Vt`ul4ng&?b&I
zlX^D1UWZB#mtC*AEnY_fzu$(@>wI?Dspz$hx%K825Fd6r37@;(=)O$4jyA%$47VbF
zZ!lCN*FLkps#o*<;o$pMx(OG(J)Wt88e1DofU^{h)MPIz3et!$k{$5Kr?6aHCrNvB
z<s@6(%>l6s1KT|I;&J#EXyWjByd?BYKpM~_KgLf*Ju(eKKTE-gc2(>HW5<$cQYLoY
z8_`zzzMm&yl9fY^2I|nF$%He~EiNd7A*WX}F)#BYocLUm-th`6vNBQ@?m#Z|jx8Jf
zmMp#nv+)d#2!}!uIP0;y5F`6n#RNjQiEg{qx8<T~IT~zOV4w^~4i5>TZ?pVY3;B|E
zmyDLW!j)1;dwm-YMIJ|@7=onx#zk^Ns_sKeg>!?Jw=Igq3RP(-i*%+v==kYH+us(U
z==gm*L!FF|ESR|J%G?e0q0BvX%<|4Iefwu7${e@cPO!NtM9f}{#wc<l?<VsgY4**8
zQrhyGi|g-5h@$fiJlFC!!%V(K>tAJjt<H+zJQT~s#Il-UQAQ5==3d%zj4NMnK}Eb!
zRhG48#P*j%PK9=tq^fx}8wtypo3nNcLLRTVn5pmU&gwB6yS_(-<;5~7^jP?~7nBZ`
zFn8A{7HLE8pI*-?_q&PFNKa3X1>eNIb>9Y?qLx2(OtE{@i<>pQQNL#ci*`kzPY*B(
zDkq_DQ8~}*JW7c1MOn?5Qs3^^-$F4kDj7|j#&i5j^!BTkCYNjbVD_dd2up<(GkG6(
z>lN*%u-H7mm<=&d{UBjQ`6`n<VzD0Lh$W`4GzWe5VG&Qjg1<&n@IvSJ?R{Ncg~740
z$?m6@fnmgrDy}XDZKp49`Wep&)6b-=sUNp`)YY`fF4}Kt%xAKx$@5b%Xj8&(GmJ;-
zfOFT8OK{ed5IP_JH1zJW?$^svLwMg^`milEP{R$Tf9WxlKVubv%Ik&S{5vz1^o)cH
zDI+JDP@D}OCv6_zzyW6~$?3IO2&s578|D<bucyg%Hz0&hY%&br)=6TyWQ~!STb`c;
z+6Nzj1P4LQfi$%Av2qls+Xy~^MCmWB#%o2^ibW<|sgGD?m=HzIsrWOOxZfiBFxf}~
zcwhhC*+x0eQJ>xPoT{tV3eh2_)dS7U;hAMR?c~zZCtVwHJAmR1SHr|g>uR0X0x}v!
zIxzTwuRw~+Gl#l#kBK!etABOS(9;8<G96(8b$gQ2VG(Mv>WwTNv~tcQ{sH<nS*s}B
zuuP`->+$@290kM0t1Y+UY6;YrYN&bg5dO7lM^kZUo$Ugn@`RylhPS8+>hBy}OD++(
zEFQUbA`rPR5AwDrY)n`4*9<tT0`2v!Z7Y&y>tBu19UG}`SQq||PO*@&*3*j0hKTdO
zDf>JUzL~U}627EIH?!Nskx8eV3{y_DKiiV|B_RI1I=}F#beyE)rXFR-=Zq}U%SLV(
zK_y{LJ>(;E;|~~)_PI>MTi`mTbyj_7dwcCNI^)aW(<W%_S|)!ZWfg!stBYSwOF(qk
zjra%#Utjh1;j_KBG6q4A)uPP^p3<p=JPYdOqc>?@e_z{7&bQL4yR2YQ&u==om!sEh
zlc=>9;uxH6QfPIb%Da-8@$LO-w?6T)>C+>U%Z?j1loXSBBuYCchPLoF|8Z?#OPDT#
z-2d|n{8rWJ(x`HydU2!<KrxiW&$~k#DfDMDNhcEUR~l7Wb)sDEMLW*rVdrI&g+1`t
zV;Xu~A1>*X9d)07w~(?HN4dgO*UP?w(MYtfGC@T7lCuxLZy0qSo_rY{^;0=gZM4&*
zvGl9j^N`5I=Xp+|mVB*bn2`lu8$6t?+30&2=UB@!_^#fRJBIX_mz!^WsI-;nA0D9N
z^UnL+kg#HUr4csv)qRDl|1M@TWdXH@H`bfLIedc)__YjEj7KvPzbYO~Z(<>Txi=Oa
zaF{CcB(ptYuJ=LMcS^}@HCjmU5$xmwI<9!Qo^!uD+I@;#ed}Kogx)_Aw#B20mkhM!
z@3v_+6wq~p;=?Xt6^la3&tDl`Pap1mnMlef?)lArN2w*Lq&jn84=kvvjfOm}vsunC
zQe~B#*w--Pp6DwoUP>kyy=)o>xU*C&31P%bsvTwo$OGHL2F~!_{ptz363~%L^k>P?
zxoImNl;yNtq%rN0g}xtcs}V*tPOQ=}CAYy}+&JMPa<mm?R^Y*}|J<Vb*sZDdhtI{a
z<tYs1?$r|6o=2E0#6CRRu(+==ueUk#LCpiH7S!{>0$jzhhIssX+5X>e!VJX`O7+)}
zu@w8fvG+x2iw0pFKdAeKa|L3s?dZuy9pPuN@f#$6o*uh&(KNZz1P4fSL{llVssn8F
z?Up%{s{73$!cndyHIc?1yei^aX!wSY!F%LsCJN4~E>%@M<4jphsW@AFp-4ml$aY{b
zk<F>oI>8q$JqHvV`sr!idyKZ=d}+j91l8>>$TkgnqKPD$k?QpT`Bh`>I3p^9DHF(U
zwQRFbG!4&wV(bLjp?%Q3;<Sdan9+j+y&j+1*Dd{_<9oXxiZeuiqZSDht+tL4FFljE
ze?RKVS;OdrrM1#w2HCw21|?AV8-!CGQA%ihV9N^qEm3>fnB3+v>>M|q1d0xJt#HCT
zI`&EHGz5qY64mh!PhME;bbFHq<K)V}o6VYeeljC?`jgX|nm^jg<SHpE3Knmr&db*Q
z__ka#Pv3+O7@F&>>qfU6Yi0WkxqmJwhdtx$(r9i3q8GON<AA3!gpx`e;S|UIJ69;O
z#a&uPZ^1=da1{JCyU^Y<RW<~S+?MPEhn=B#3{aPar42KxNKGb`LqyO5EE-<vb(`aY
zho5u&y}(to*eDf8@+VqhA%YNOfa0)#Ij@9TL2RYTC0hmR9NFPaWgJlJWcqNA6W?Oe
zr|G}VhQ-&%gBroo*48?1b{J&)l7(Z;g^H<m<CTi{cD?bs3}sZ=Y?|O9X~Ui!z5Mme
z&;odYIU|gMSs?G(PsHfPc6WQ*>N%XcBs@%a6Nx%>?vJm_!hj%x-BQ5t6>NwiS@^3(
z(~}ltEO!p=w_FKwfc&qB;vX^YOT5aUA^dA*Py@#eF5jS&WSdV&=*=6dj;QCOmn`rL
z*H~*~z8XBv+`lR8-3D0s{$+ARQm~z6W6dmvVh*JDpZ59N@s^L@e(g{_NGY0~FlKt#
zcmc;z{6Ah05fzB!RZ)w|RxxSryaD6yWq9v8F-&=t44s-FReX7BL+VBZSj6|J_~YPp
zCceQ@d(yamKV6^{Du&P_0nY$Mn;Fakt97BT+P;9evcNJJkq1VN(QCmV#K%Fz$2f{?
zriY&PIoHY%(7uJ=>fc6iq*t{rB4HgSlL>>Bi=4#TMU$2!Dpo=_A~fQ_xvpK{jM0<B
zz6z&&WaQz0c^*?#Zl6t^6p|5mOz~yiZMyXjG42psyKKxxa2HM)_YQ__Gky~02>nJ!
zFo#OKlq+jpcog&B4*A3)_0z)ze-3pqS6+Y_eUkye91>lJhGz4L%ejRkCh90Sr0lod
zfjAU2?YLM2B<1$^uE$cQCf?VZ>pD5lWLRiW_Ssfo&SAulCEC`>#g<QzD_S!n+JM7j
zoWN%k3bZ)U8aqrrBibu^-KHMeDuTrpr08j_60W!RMJ`$uflORdzOlHNelHKRV9{+7
zF8fuZfzRJ*r6K4}bWbEO%$k`=bRGH<+P(VF3j2iX8%VT_xz9}-^bq%nBG{&4X_#af
z{bUuTa6<`rgMWy2_51o_BVoN<ZLeo2n-v`A*tTM~sp6)2bFUUQtp0ThCXLQB3SR+R
zZ%1bv*m)if2N4PTKox!zWHEz;p1`LlcNB7AkrDA-BoveYIFg&m;^WsCqLQhOzu`<=
zy&X8~2A&&kYn1GZcNi&EOfwItM7N7<CMd#7lqmk{sM>gZdi@K+AiH*Kl%pKbol4N{
zZ3H+vl=D|zgX4qMm|{6W=bx*}CHXRXGIud?%~Tk%5JdPVKC&-y$^=|yzy|-4&X=_}
z^F`b3$5E0TTmxXY9u&QpI_*+s9YqUF8u2g$Eq>SYh_4xTvyi(CPcS2w0Sy@~>Yqfd
zJ-q@H`NQ8}aNyr0_Rl$x;iuFNUFXACPI}ly@3tb#SKz!2r(rk3$L#=;AudGSpi1NE
ze35qX$O;q*>`DC4UoqYze}I$=c6{pj9~nkBF1-6vSx|rCc}np8Jp^+E^~rUyb=P~o
zn0)DX0ws)0>AvKKeF>vu9bFiK`reO#><DsElFcG~4>FeEAk$ZcW-1eM>WCQLF$J|8
z9F!|V2f5B3&9>gixNs6+bqIC$F%vJhl0~$dPl2lI^?@KC_qriLt*%bS#SF#J?Yy_u
zo~G1)eDg;TiWZ?6imjWB!2^O!(mLQSb~;ls{2eO0PM9V?_%v9cBTLcscE-RO{}+x0
z<^%fZN1kj}N6252U(vVi7X2W7>)yN#KRZ!TkPivj9Tpmp-VUS7<MsxW5c}-t-gS_$
zeH}yf54WE{ib@Lco@7+>zvZbiBc+6IEQ09S=DKKq2!8d}=J?|^?<Q*i3i&3Co!dhi
zR)WNc$oF<-v%|VA+km<T0spi14v`_@yDsJWc?a&~yB3&osB&9eLW4(MkZoU9I04qb
zU2?eHD>#_jiTJd6oDvi)zmc;n6-HF)y9)|JiX4PU3$zF3EdE&i#SaHp*CNlcTT#kw
zvY*7n@zT#Sl`-=ba3d~Sj(6-#|8g?@GO3p&IIaYBFnEr|;B{>)D?T}wne86>Dh_lc
z-g;U6&ko8I#Y$@eS@TGh%bQ3VXFtU49uXK}=L+GVqdDWc;dALHs^(`n^+w?+0ISMN
zp`bvLmZ{1p7z;nW5kwG}aBxp7qv`+>_BoS<dsG?#-SNVnL`Z1yUZAQKN1(GsYDzbZ
zXA$4sykkJ<eyKV9{8Q~1xVYf{W$~#^qfLg3Ji+`m&{4|2%3PR>Pn0Nazp-uK^t}lF
z1KtlyatlOp_v`L+R;OW;$TBg~@Si6%St_^!$NjvtdfK;Sg5-#NXs9*J1FZY%m-ZG`
zE62VuN(Jui{K=I!`uiB0KB<=GZVbf|OhqUXMn+AfQZXfBT6b`4yki7u;Pv;?)M@TF
zns||XI$no#<Ui(;F-;B)K|&%T-L2J|v*9_EDuD6|Gm0>`4M`T@rfv8!$YyC^?$=tA
zl4p|LonIT1Ow=l&$9yXE7a9$JuNv`{!{L&f8?CsnD(Cm>%jK&R^Irj(#i&3*3pe}b
zUYMS(k37$fhbuY6Wb;Pe>{`kCWmL)ogYX|5EkcmIjC3$kV~2jP8$`i{Ie18i@(0{n
zN#BsMEi4bL^104&y=&#}T`P_<u+1-o*e$@Cb7JS$Zb~ZeEg<?aH9tRuwtyjVce<n%
zOCWYp4GM2bGD1a4|F%U+J$iDc*1V;j`LFvBjzY;w>_1nsa<Xi3cGGbYiwa*eWq!D(
zjs;p7vhJ^he=@l{%9POB8{EFwX=Q;9t(W51ANYM!b@AM%))-^2O<1+jI3fUN9Sr((
z)eCzXP<7p#`C%WH+nF_ZW~pS-GOFB@MJtnOC@2nxd`!p=_WomG|HWaY0hDrHgp7VP
zD1{_bgz*X(gQ@V4`wQFu_Sl1<Lu-(g8+JTNB2_6+5+xid@8g4?%w<-3ur)7WV)2G?
zjP>vkLBFjA8;=1swzx?ISgw4K$%j_XkMth}VSTf1U+V|#MDq?!F6)uOu~O9SV3@h!
zL6ogc3EanAY$dkjdOtUm2n?RmKCfu}Rn#IxawpOyW(=pt8e+=vtLb9P*3*8{q!;@$
z*xs8cu48O`h~9=SiEDphsm@QL!Sr0qXOQ2QGb^f@gt`@qC~~}oK07~rn3IK)FypAT
zBTPz3kxuJ`bvaux?p&h?aQmjAK~{YZrgoS}m{n0XXEJY2`f6tJ)of^lw6Ypr{dVIh
zgCr-F8`K@27+W%f3mZHQwbZMdpKDz&>P^+FVTb9^B3%lw`1Oo?B9~gJPw(P7rovWB
zw2P$}5Kk!Pa(HWiguEUV8q2ug-v84Y#$}wH0aiQnj5N3Sd!tJr4yHjH@-7Y-t^mWp
z?EU~&USxb`PWCL7D|3F&6_L+9!*J5#V!^>ZCMMdX<4cTveIQsG;dGB{U=c6bk_`r<
zLhqwLT^!04qMYh9+%Pfw{J`*M(%?zl#qR(g@cGZPG`%X&0u(_K?6Ea)vQ#wSh#V4Z
z65?hM09Li#7=M2V^nDmUHBJL9HOEbCF~zcZb?hh<0Y+Q{kswrDF|9L;;C?f!&U29t
zMn^><?eoDZXDju-z`t&Xr3OOm<B_RDg1rW+Zmzla5xhU<-4tYQKWE&Gw&V2lIA6%l
z+<Jks>DvcT5O7EYQ%J%O$}c)~t-<Ll9f}07k469ZT%o-Gr;eAhm<qDqVsH>-UCbKc
zwi7EfSAf?xZtI7%t3<6f1wV}1&a1In>7gh*z+V?$Bb^aghn&O$M;DI$B)&baTaHMg
z+?-*;85g}pk;oAdd^+Ye5pdH7jE09!+k|(Y;rE<|GOD^k=z%BdeWCp%iUD1;Nurkl
z<PoKv2JPRpv`^AQy(bi!v2cjjmW;W2=O0YQD6<T&7ae~}<DW8Af`dMWJzBa_LGRN)
zWgeCh5Z=}q?qvm>!^)wO<#LHqxBpwFk~aKVR44q|B%AB4Jmj4Y+7AwZ(Ln*X!6d;a
z0~of)l^UrU=CDZq_lkFg{GX*Oa3k~EC0}(1y>}qV)i0%~vAR4ohKW(7t?=%n1m~;b
zjW;p0orB?X?Y4r7iqk5;-r=_srQNRBD_=EMD?b`A;1)uco5g7l^m(k$kyet<pw+39
ziBC}aNc(yG+(z&RIMRV}d^-0QuB!>#cG_~a>Q3A;GMfimm4~U~=MBs#Zuq2T58>Bq
zy14hA=-{B3`%{){LG4ySq`XY%p#VAH4<-~q1V92X8VCovACFKJmXsXQr2P8(ac^T#
z6&{#oJM0@b`Zk9(?E4FC1xyOv1OGCv!`?9_L=L!FdxY|hKOu1!L!-_pte}8Y-Zf37
zOX;=JqUs0MBKdm*6}}AgXX~HNH2}p9>`Ea*5^>P};TQ`tf?L_pAD#IVz-?_X%@{KJ
z103CEEVJ;xlw;UWU}lNlk4e-2q`OhU(s+s{68{pKdj!Ee3559vng66Y!P0)hqX=RD
zu$_sp!F=Taf9@Q~f2ASB!P0gTX~h5b)S*6t>4J>k4p{z^-ULfSB2lsYM~2oT5(L73
zI`$&}PZ|#_-A!y9{NFMVz{}_n%$oX7+7C>XhUY98kNtNU0si1+V3%B{{qLgxPk7UI
z;;@x@c|a*CDWikQEJZCXyhx9$|K6Ql3duKJT?Wtl^Q6{RZZ40TlARSX88Ziu+hn@$
zq-<;|$@JRPR8&-QwmZ)MzHLD$JQBOTQA{q+y9%vNFAeZ7lyCm+Tm>{dU3BSo`|j-Q
zL=$$y|7&%Au)pac<Kp7bP*I7W?k`-uPOMQ^!q6!Ya=9k6zWi(TA5j0GL!hi2FO=hN
zcz*}&bE@h5Xa6FbNMO5b>+`#||M$^gvQ+G}35?pl)W0V0iVcCbg8wNqhU6dOi2pK}
zN5an2%&hkBhH35ti=Q)Ydg1+R<m2xg(&e6{mVY1IjW6a0atVPr*!lPPfSwC@vrZsY
ujVJwU<>!oGaUKLtLh*l}Rv*~?2BY5np_P1yWbFg^Pf}D)qzb4P@P7d!Kz)G#

literal 0
HcmV?d00001

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn
new file mode 100644
index 000000000..865bc4d3b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn
@@ -0,0 +1,33 @@
+generated_doxygen_out_dir =
+    get_path_info(".", "gen_dir") + "/.."
+
+loadgen_doxygen_sources = [
+  "doxygen.cfg",
+  "doxygen_footer.html",
+  "doxygen_header.html",
+  "doxygen_layout.xml",
+  "doxygen_stylesheet.css",
+  "loadgen-integration_diagram.dia",
+  "mlperf_icon.png",
+  "mlperf_logo_horizontal_color.svg",
+  "README.md"
+]
+
+source_set("loadgen_doxygen_sources") {
+  sources = loadgen_doxygen_sources
+}
+
+source_set("doxygen_html_generator_script") {
+  sources = [ "doxygen_html_generator.py" ]
+}
+
+action("generate_doxygen_html") {
+  script = "doxygen_html_generator.py"
+  args = [ rebase_path(generated_doxygen_out_dir, root_build_dir),
+           rebase_path("../..") ]
+  outputs = [ generated_doxygen_out_dir ]
+  deps = [ ":loadgen_doxygen_sources",
+           ":doxygen_html_generator_script",
+           "../..:mlperf_loadgen_sources_no_gen",
+           "../..:docs" ]
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md
new file mode 100644
index 000000000..d5cf5fe18
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md
@@ -0,0 +1,34 @@
+# Generating the HTML docs {#ReadmeHtmlDocs}
+
+This document is generated from inline docstrings in the source and
+various markdown files checked into the git repository. If you've
+checked out the code, you can generate this documentation.
+
+*Prerequisite:* You must have [doxygen](http://www.doxygen.nl) installed
+on your system:
+
+## With gn / ninja
+
+If you are using the gn build flow, you may run:
+
+    ninja -C out/Release generate_doxygen_html
+
+* This will output the documentation to out/Release/gen/loadgen/docs/gen and
+avoid poluting the source directory.
+
+## Manually
+
+Alternatively, you can manually run:
+
+    python docs/src/doxygen_html_generator.py <target_dir> <loadgen_root>
+
+* If <loadgen_root> is omitted, it will default to ".".
+* If <target_dir> is also omitted, it will default to "./docs/gen".
+
+## Hosting
+
+A version of this doc is currently hosted online at
+https://mlperf.github.io/inference/loadgen/index.html
+
+To update the hosted version, submit a PR to the
+[mlperf.github.io](https://github.com/mlperf/mlperf.github.io) repository.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg
new file mode 100644
index 000000000..fc05853d1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg
@@ -0,0 +1,2495 @@
+# Doxyfile 1.8.13
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all text
+# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
+# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
+# for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = "LoadGen Guide"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_logo_horizontal_color.svg
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       = $(MLPERF_DOXYGEN_OUT_PATH)
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = YES
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful is your file systems doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 4
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:\n"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". You can put \n's in the value part of an alias to insert
+# newlines.
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding "class=itcl::class"
+# will allow you to use the command class in the itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
+# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
+# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
+# Fortran. In the later case the parser tries to guess whether the code is fixed
+# or free formatted code, this is the default for Fortran type files), VHDL. For
+# instance to make doxygen treat .inc files as Fortran files (default is PHP),
+# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 0.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 1
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = YES
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = YES
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# (class|struct|union) declarations. If set to NO, these declarations will be
+# included in the documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
+# names in lower-case letters. If set to YES, upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_layout.xml
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = NO
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some parameters
+# in a documented function, or documenting parameters that don't exist or using
+# markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong or incomplete
+# parameter documentation, but not about the absence of documentation.
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = $(MLPERF_LOADGEN_SRC_PATH)
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see: http://www.gnu.org/software/libiconv) for the list of
+# possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
+# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
+# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
+
+FILE_PATTERNS          = *.c \
+                         *.cc \
+                         *.cxx \
+                         *.cpp \
+                         *.c++ \
+                         *.java \
+                         *.ii \
+                         *.ixx \
+                         *.ipp \
+                         *.i++ \
+                         *.inl \
+                         *.idl \
+                         *.ddl \
+                         *.odl \
+                         *.h \
+                         *.hh \
+                         *.hxx \
+                         *.hpp \
+                         *.h++ \
+                         *.cs \
+                         *.d \
+                         *.php \
+                         *.php4 \
+                         *.php5 \
+                         *.phtml \
+                         *.inc \
+                         *.m \
+                         *.markdown \
+                         *.md \
+                         *.mm \
+                         *.dox \
+                         *.py \
+                         *.pyw \
+                         *.f90 \
+                         *.f95 \
+                         *.f03 \
+                         *.f08 \
+                         *.f \
+                         *.for \
+                         *.tcl \
+                         *.vhd \
+                         *.vhdl \
+                         *.ucf \
+                         *.qsf
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                = depot_tools
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       = *
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             = $(MLPERF_LOADGEN_SRC_PATH)/docs/src
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# function all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see http://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
+# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
+# cost of reduced performance. This can be particularly helpful with template
+# rich C++ code for which doxygen's built-in parser lacks the necessary type
+# information.
+# Note: The availability of this option depends on whether or not doxygen was
+# generated with the -Duse-libclang=ON option for CMake.
+# The default value is: NO.
+
+CLANG_ASSISTED_PARSING = YES
+
+# If clang assisted parsing is enabled you can provide the compiler with command
+# line options that you would normally use when invoking the compiler. Note that
+# the include paths will already be set by doxygen for the files and directories
+# specified with INPUT and INCLUDE_PATH.
+# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
+
+CLANG_OPTIONS          = -I ../third_party/pybind/include --std=c++14
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot o=
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
+# which the alphabetical index list will be split.
+# Minimum value: 1, maximum value: 20, default value: 5.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file that includes any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_header.html
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_footer.html
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_stylesheet.css
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_icon.png \
+                         $(MLPERF_LOADGEN_SRC_PATH)/loadgen_integration_diagram.svg
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a colorwheel, see
+# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use grayscales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 127
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = YES
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 50
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see: http://developer.apple.com/tools/xcode/), introduced with
+# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
+# Makefile in the HTML output directory. Running make will produce the docset in
+# that directory and running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
+# Windows.
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the master .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
+# folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
+# filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location of Qt's
+# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
+# generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine-tune the look of the index. As an example, the default style
+# sheet generated by doxygen has an example that shows how to put an image at
+# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
+# the same information as the tab index, you could consider setting
+# DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = YES
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# http://www.mathjax.org) which uses client side Javascript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want to formulas look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. See the MathJax site (see:
+# http://docs.mathjax.org/en/latest/output.html) for more details.
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility), NativeMML (i.e. MathML) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from http://www.mathjax.org before deployment.
+# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using Javascript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see: http://xapian.org/). See the section "External Indexing and
+# Searching" for details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when enabling USE_PDFLATEX this option is only used for generating
+# bitmaps for formulas in the HTML output, but not in the Makefile that is
+# written to the output directory.
+# The default file is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify :
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
+# generated LaTeX document. The header should contain everything until the first
+# chapter. If it is left blank doxygen will generate a standard header. See
+# section "Doxygen usage" for information on how to let doxygen write the
+# default header to a separate file.
+#
+# Note: Only use a user-defined header if you know what you are doing! The
+# following commands have a special meaning inside the header: $title,
+# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
+# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
+# string, for the replacement values of the other commands the user is referred
+# to HTML_HEADER.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
+# generated LaTeX document. The footer should contain everything after the last
+# chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer.
+#
+# Note: Only use a user-defined footer if you know what you are doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
+# the PDF file directly from the LaTeX files. Set this option to YES, to get a
+# higher quality PDF documentation.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help. This option is also used
+# when generating formulas in HTML.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
+# code with syntax highlighting in the LaTeX output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's config
+# file, i.e. a series of assignments. You only have to provide replacements,
+# missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's config file. A template extensions file can be generated
+# using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
+# with syntax highlighting in the RTF output.
+#
+# Note that which sources are shown also depends on other settings such as
+# SOURCE_BROWSER.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_SOURCE_CODE        = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
+# program listings (including syntax highlighting and cross-referencing
+# information) to the DOCBOOK output. Note that enabling this will significantly
+# increase the size of the DOCBOOK output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_PROGRAMLISTING = NO
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sf.net) file that captures the
+# structure of the code including all documentation. Note that this feature is
+# still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of 'which perl').
+# The default file (with absolute path) is: /usr/bin/perl.
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
+# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
+# NO turns the diagrams off. Note that this option also works with HAVE_DOT
+# disabled, but it is recommended to install and use dot, since it yields more
+# powerful graphs.
+# The default value is: YES.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see:
+# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: YES.
+
+HAVE_DOT               = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
+# each documented class showing the direct and indirect inheritance relations.
+# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = YES
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = YES
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = YES
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = YES
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = YES
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
+# hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
+# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
+# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = YES
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file. If left blank, it is assumed
+# PlantUML is not used or called during a preprocessing step. Doxygen will
+# generate a warning when it encounters a \startuml command in this case and
+# will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# files that are used to generate the various graphs.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_CLEANUP            = YES
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html
new file mode 100644
index 000000000..0d7b177af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html
@@ -0,0 +1,26 @@
+<!-- HTML footer for doxygen 1.8.13-->
+<!-- start footer part -->
+<!--BEGIN GENERATE_TREEVIEW-->
+<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
+  <ul>
+    $navpath
+    <li class="footer">
+      <div style="flex-grow:1"></div>
+      <div>
+        $generatedby
+        <a href="http://www.doxygen.org/index.html">doxygen</a>
+        $doxygenversion
+      </div>
+    </li>
+  </ul>
+</div>
+<!--END GENERATE_TREEVIEW-->
+<!--BEGIN !GENERATE_TREEVIEW-->
+<hr class="footer"/><address class="footer"><small>
+$generatedby &#160;<a href="http://www.doxygen.org/index.html">
+doxygen
+</a> $doxygenversion
+</small></address>
+<!--END !GENERATE_TREEVIEW-->
+</body>
+</html>
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html
new file mode 100644
index 000000000..91d214b95
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html
@@ -0,0 +1,49 @@
+<!-- HTML header for doxygen 1.8.13-->
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
+<meta http-equiv="X-UA-Compatible" content="IE=9"/>
+<meta name="generator" content="Doxygen $doxygenversion"/>
+<meta name="viewport" content="width=device-width, initial-scale=1"/>
+<link rel="shortcut icon" href="mlperf_icon.png">
+<!--BEGIN PROJECT_NAME--><title>LoadGen: $title</title><!--END PROJECT_NAME-->
+<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
+<link href="$relpath^tabs.css" rel="stylesheet" type="text/css"/>
+<script type="text/javascript" src="$relpath^jquery.js"></script>
+<script type="text/javascript" src="$relpath^dynsections.js"></script>
+$treeview
+$search
+$mathjax
+<link href="$relpath^$stylesheet" rel="stylesheet" type="text/css" />
+$extrastylesheet
+</head>
+<body>
+<div id="top" style="display:flex; flex-flow:row wrap; justify-content:flex-start; align-items:center;"><!-- do not remove this div, it is closed by doxygen! -->
+
+<!--BEGIN TITLEAREA-->
+<div id="titlearea" style="display:flex; flex-flow:row wrap; justify-content:flex-start; align-items:center;">
+  <!--BEGIN PROJECT_LOGO-->
+  <a href="https://www.mlperf.org"><img alt="MLPerf" src="$relpath^$projectlogo"/ height="55px"></a>
+  <!--END PROJECT_LOGO-->
+  <!--BEGIN PROJECT_NAME-->
+  <div style="padding-left: 1em;">
+    <div id="projectname"><a href="index.html">$projectname</a>
+   <!--BEGIN PROJECT_NUMBER-->&#160;<span id="projectnumber">$projectnumber</span><!--END PROJECT_NUMBER-->
+   </div>
+   <!--BEGIN PROJECT_BRIEF--><div id="projectbrief">$projectbrief</div><!--END PROJECT_BRIEF-->
+  </div>
+  <!--END PROJECT_NAME-->
+  <!--BEGIN !PROJECT_NAME-->
+   <!--BEGIN PROJECT_BRIEF-->
+    <div id="projectbrief" style="padding-left: 1em;">$projectbrief</div>
+   <!--END PROJECT_BRIEF-->
+  <!--END !PROJECT_NAME-->
+  <!--BEGIN DISABLE_INDEX-->
+   <!--BEGIN SEARCHENGINE-->
+   <div>$searchbox</div>
+   <!--END SEARCHENGINE-->
+  <!--END DISABLE_INDEX-->
+</div>
+<!--END TITLEAREA-->
+<!-- end header part -->
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py
new file mode 100644
index 000000000..4065d7bd0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py
@@ -0,0 +1,37 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# \file
+#  \brief A script that sets the environment variables expected by doxygen.cfg.
+#  \details This can be run manually without any arguments, but also allows a
+#  build system to customize the output directory.
+
+import os
+import sys
+
+
+def generate_doxygen_html(doxygen_out_dir, loadgen_root):
+    os.environ["MLPERF_LOADGEN_SRC_PATH"] = loadgen_root
+    os.environ["MLPERF_DOXYGEN_OUT_PATH"] = doxygen_out_dir
+    os.popen("doxygen " + loadgen_root + "/docs/src/doxygen.cfg")
+
+
+def main(argv):
+    doxygen_out_dir = "./docs/gen" if len(argv) < 2 else argv[1]
+    loadgen_root = "." if len(argv) < 3 else argv[2]
+    generate_doxygen_html(doxygen_out_dir, loadgen_root)
+
+
+main(sys.argv)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml
new file mode 100644
index 000000000..1fc5a9cb4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml
@@ -0,0 +1,211 @@
+<doxygenlayout version="1.0">
+  <!-- Generated by doxygen 1.8.13 -->
+  <!-- Navigation index tabs for HTML output -->
+  <navindex>
+    <tab type="mainpage" visible="yes" title="Overview"/>
+    <tab type="usergroup" visible="yes" title="API and Usage">
+      <tab type="user" url="@ref LoadgenAPI" visible="yes" title="API"/>
+      <tab type="user" url="@ref LoadgenAPITestSettings" visible="yes" title="Settings"/>
+    </tab>
+    <tab type="usergroup" visible="yes" title="Development">
+      <tab type="user" url="@ref ReadmeBuild" visible="yes" title="Building"/>
+      <tab type="user" url="@ref ReadmeHtmlDocs" visible="yes" title="Generating this Document"/>
+      <tab type="user" url="@ref todo" visible="yes" title="TODOs"/>
+    </tab>
+    <tab type="usergroup" visible="yes" title="Test Coverage">
+      <tab type="user" url="@ref ReadmeTests" visible="yes" title="Build and Run"/>
+      <tab type="user" url="@ref LoadgenTestsBasic" visible="yes" title="Basic"/>
+      <tab type="user" url="@ref LoadgenTestsPerformance" visible="yes" title="Performance"/>
+    </tab>
+    <tab type="usergroup" visible="yes" title="The Code">
+      <tab type="modules" visible="no" title="" intro=""/>
+      <tab type="namespaces" visible="yes" title="">
+        <tab type="namespacelist" visible="yes" title="" intro=""/>
+        <tab type="namespacemembers" visible="yes" title="" intro=""/>
+      </tab>
+      <tab type="classes" visible="yes" title="">
+        <tab type="classlist" visible="yes" title="" intro=""/>
+        <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
+        <tab type="hierarchy" visible="yes" title="" intro=""/>
+        <tab type="classmembers" visible="yes" title="" intro=""/>
+      </tab>
+      <tab type="files" visible="yes" title="">
+        <tab type="filelist" visible="yes" title="" intro=""/>
+        <tab type="globals" visible="yes" title="" intro=""/>
+      </tab>
+      <tab type="examples" visible="no" title="" intro=""/>
+    </tab>
+    <tab type="pages" visible="no" title="" intro=""/>
+    <tab type="user" url="@ref ReadmeFAQ" visible="yes" title="FAQ"/>
+  </navindex>
+
+  <!-- Layout definition for a class page -->
+  <class>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <inheritancegraph visible="$CLASS_GRAPH"/>
+    <collaborationgraph visible="$COLLABORATION_GRAPH"/>
+    <memberdecl>
+      <nestedclasses visible="yes" title=""/>
+      <publictypes title=""/>
+      <services title=""/>
+      <interfaces title=""/>
+      <publicslots title=""/>
+      <signals title=""/>
+      <publicmethods title=""/>
+      <publicstaticmethods title=""/>
+      <publicattributes title=""/>
+      <publicstaticattributes title=""/>
+      <protectedtypes title=""/>
+      <protectedslots title=""/>
+      <protectedmethods title=""/>
+      <protectedstaticmethods title=""/>
+      <protectedattributes title=""/>
+      <protectedstaticattributes title=""/>
+      <packagetypes title=""/>
+      <packagemethods title=""/>
+      <packagestaticmethods title=""/>
+      <packageattributes title=""/>
+      <packagestaticattributes title=""/>
+      <properties title=""/>
+      <events title=""/>
+      <privatetypes title=""/>
+      <privateslots title=""/>
+      <privatemethods title=""/>
+      <privatestaticmethods title=""/>
+      <privateattributes title=""/>
+      <privatestaticattributes title=""/>
+      <friends title=""/>
+      <related title="" subtitle=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <services title=""/>
+      <interfaces title=""/>
+      <constructors title=""/>
+      <functions title=""/>
+      <related title=""/>
+      <variables title=""/>
+      <properties title=""/>
+      <events title=""/>
+    </memberdef>
+    <allmemberslink visible="yes"/>
+    <usedfiles visible="$SHOW_USED_FILES"/>
+    <authorsection visible="yes"/>
+  </class>
+
+  <!-- Layout definition for a namespace page -->
+  <namespace>
+    <briefdescription visible="yes"/>
+    <memberdecl>
+      <nestednamespaces visible="yes" title=""/>
+      <constantgroups visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </namespace>
+
+  <!-- Layout definition for a file page -->
+  <file>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <includegraph visible="$INCLUDE_GRAPH"/>
+    <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
+    <sourcelink visible="yes"/>
+    <memberdecl>
+      <classes visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <constantgroups visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection/>
+  </file>
+
+  <!-- Layout definition for a group page -->
+  <group>
+    <briefdescription visible="yes"/>
+    <groupgraph visible="$GROUP_GRAPHS"/>
+    <memberdecl>
+      <nestedgroups visible="yes" title=""/>
+      <dirs visible="yes" title=""/>
+      <files visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <pagedocs/>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </group>
+
+  <!-- Layout definition for a directory page -->
+  <directory>
+    <briefdescription visible="yes"/>
+    <directorygraph visible="yes"/>
+    <memberdecl>
+      <dirs visible="yes"/>
+      <files visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+  </directory>
+</doxygenlayout>
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css
new file mode 100644
index 000000000..3bd61261c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css
@@ -0,0 +1,1629 @@
+/* The standard CSS for doxygen 1.8.13 */
+
+body, table, div, p, dl {
+	font: 400 14px/22px Roboto,sans-serif;
+}
+
+p.reference, p.definition {
+	font: 400 14px/22px Roboto,sans-serif;
+}
+
+/* @group Heading Levels */
+
+h1.groupheader {
+	font-size: 150%;
+}
+
+.title {
+	font: 400 14px/28px Roboto,sans-serif;
+	font-size: 175%;
+	font-weight: bold;
+	margin: 10px 2px;
+	color: #135384;
+}
+
+h2.groupheader {
+	border-bottom: 1px solid #879ECB;
+	color: #354C7B;
+	font-size: 150%;
+	font-weight: normal;
+	margin-top: 1.75em;
+	padding-top: 8px;
+	padding-bottom: 4px;
+	width: 100%;
+}
+
+h3.groupheader {
+	font-size: 100%;
+}
+
+h1, h2, h3, h4, h5, h6 {
+	-webkit-transition: text-shadow 0.5s linear;
+	-moz-transition: text-shadow 0.5s linear;
+	-ms-transition: text-shadow 0.5s linear;
+	-o-transition: text-shadow 0.5s linear;
+	transition: text-shadow 0.5s linear;
+	margin-right: 15px;
+	color: #135384;
+
+}
+
+h1.glow, h2.glow, h3.glow, h4.glow, h5.glow, h6.glow {
+	text-shadow: 0 0 15px cyan;
+}
+
+dt {
+	font-weight: bold;
+}
+
+div.multicol {
+	-moz-column-gap: 1em;
+	-webkit-column-gap: 1em;
+	-moz-column-count: 3;
+	-webkit-column-count: 3;
+}
+
+p.startli, p.startdd {
+	margin-top: 2px;
+}
+
+p.starttd {
+	margin-top: 0px;
+}
+
+p.endli {
+	margin-bottom: 0px;
+}
+
+p.enddd {
+	margin-bottom: 4px;
+}
+
+p.endtd {
+	margin-bottom: 2px;
+}
+
+/* @end */
+
+caption {
+	font-weight: bold;
+}
+
+span.legend {
+        font-size: 70%;
+        text-align: center;
+}
+
+h3.version {
+        font-size: 90%;
+        text-align: center;
+}
+
+div.qindex, div.navtab{
+	background-color: #EBEFF6;
+	border: 1px solid #A3B4D7;
+	text-align: center;
+}
+
+div.qindex, div.navpath {
+	width: 100%;
+	line-height: 140%;
+}
+
+div.navtab {
+	margin-right: 15px;
+}
+
+/* @group Link Styling */
+
+a {
+	color: #3D578C;
+	font-weight: normal;
+	text-decoration: none;
+}
+
+.contents a:visited {
+	color: #4665A2;
+}
+
+a:hover {
+	text-decoration: underline;
+}
+
+a.qindex {
+	font-weight: bold;
+}
+
+a.qindexHL {
+	font-weight: bold;
+	background-color: #9CAFD4;
+	color: #ffffff;
+	border: 1px double #869DCA;
+}
+
+.contents a.qindexHL:visited {
+        color: #ffffff;
+}
+
+a.el {
+	font-weight: bold;
+}
+
+a.elRef {
+}
+
+a.code, a.code:visited, a.line, a.line:visited {
+	color: #4665A2; 
+}
+
+a.codeRef, a.codeRef:visited, a.lineRef, a.lineRef:visited {
+	color: #4665A2; 
+}
+
+/* @end */
+
+dl.el {
+	margin-left: -1cm;
+}
+
+pre.fragment {
+        border: 1px solid #C4CFE5;
+        background-color: #FBFCFD;
+        padding: 4px 6px;
+        margin: 4px 8px 4px 2px;
+        overflow: auto;
+        word-wrap: break-word;
+        font-size:  9pt;
+        line-height: 125%;
+        font-family: monospace, fixed;
+        font-size: 105%;
+}
+
+div.fragment {
+        padding: 0px;
+        margin: 4px 8px 4px 2px;
+	background-color: #FBFCFD;
+	border: 1px solid #C4CFE5;
+}
+
+div.line {
+	font-family: monospace, fixed;
+        font-size: 13px;
+	min-height: 13px;
+	line-height: 1.0;
+	text-wrap: unrestricted;
+	white-space: -moz-pre-wrap; /* Moz */
+	white-space: -pre-wrap;     /* Opera 4-6 */
+	white-space: -o-pre-wrap;   /* Opera 7 */
+	white-space: pre-wrap;      /* CSS3  */
+	word-wrap: break-word;      /* IE 5.5+ */
+	text-indent: -53px;
+	padding-left: 53px;
+	padding-bottom: 0px;
+	margin: 0px;
+	-webkit-transition-property: background-color, box-shadow;
+	-webkit-transition-duration: 0.5s;
+	-moz-transition-property: background-color, box-shadow;
+	-moz-transition-duration: 0.5s;
+	-ms-transition-property: background-color, box-shadow;
+	-ms-transition-duration: 0.5s;
+	-o-transition-property: background-color, box-shadow;
+	-o-transition-duration: 0.5s;
+	transition-property: background-color, box-shadow;
+	transition-duration: 0.5s;
+}
+
+div.line:after {
+    content:"\000A";
+    white-space: pre;
+}
+
+div.line.glow {
+	background-color: cyan;
+	box-shadow: 0 0 10px cyan;
+}
+
+
+span.lineno {
+	padding-right: 4px;
+	text-align: right;
+	border-right: 2px solid #0F0;
+	background-color: #E8E8E8;
+        white-space: pre;
+}
+span.lineno a {
+	background-color: #D8D8D8;
+}
+
+span.lineno a:hover {
+	background-color: #C8C8C8;
+}
+
+.lineno {
+	-webkit-touch-callout: none;
+	-webkit-user-select: none;
+	-khtml-user-select: none;
+	-moz-user-select: none;
+	-ms-user-select: none;
+	user-select: none;
+}
+
+div.ah, span.ah {
+	background-color: black;
+	font-weight: bold;
+	color: #ffffff;
+	margin-bottom: 3px;
+	margin-top: 3px;
+	padding: 0.2em;
+	border: solid thin #333;
+	border-radius: 0.5em;
+	-webkit-border-radius: .5em;
+	-moz-border-radius: .5em;
+	box-shadow: 2px 2px 3px #999;
+	-webkit-box-shadow: 2px 2px 3px #999;
+	-moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px;
+	background-image: -webkit-gradient(linear, left top, left bottom, from(#eee), to(#000),color-stop(0.3, #444));
+	background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000 110%);
+}
+
+div.classindex ul {
+        list-style: none;
+        padding-left: 0;
+}
+
+div.classindex span.ai {
+        display: inline-block;
+}
+
+div.groupHeader {
+	margin-left: 16px;
+	margin-top: 12px;
+	font-weight: bold;
+}
+
+div.groupText {
+	margin-left: 16px;
+	font-style: italic;
+}
+
+body {
+	background-color: white;
+	color: black;
+        margin: 0;
+}
+
+div.contents {
+	margin-top: 10px;
+	margin-left: 12px;
+	margin-right: 8px;
+}
+
+td.indexkey {
+	background-color: #EBEFF6;
+	font-weight: bold;
+	border: 1px solid #C4CFE5;
+	margin: 2px 0px 2px 0;
+	padding: 2px 10px;
+        white-space: nowrap;
+        vertical-align: top;
+}
+
+td.indexvalue {
+	background-color: #EBEFF6;
+	border: 1px solid #C4CFE5;
+	padding: 2px 10px;
+	margin: 2px 0px;
+}
+
+tr.memlist {
+	background-color: #EEF1F7;
+}
+
+p.formulaDsp {
+	text-align: center;
+}
+
+img.formulaDsp {
+	
+}
+
+img.formulaInl {
+	vertical-align: middle;
+}
+
+div.center {
+	text-align: center;
+        margin-top: 0px;
+        margin-bottom: 0px;
+        padding: 0px;
+}
+
+div.center img {
+	border: 0px;
+}
+
+address.footer {
+	text-align: right;
+	padding-right: 12px;
+}
+
+img.footer {
+	border: 0px;
+	vertical-align: middle;
+}
+
+/* @group Code Colorization */
+
+span.keyword {
+	color: #008000
+}
+
+span.keywordtype {
+	color: #604020
+}
+
+span.keywordflow {
+	color: #e08000
+}
+
+span.comment {
+	color: #800000
+}
+
+span.preprocessor {
+	color: #806020
+}
+
+span.stringliteral {
+	color: #002080
+}
+
+span.charliteral {
+	color: #008080
+}
+
+span.vhdldigit { 
+	color: #ff00ff 
+}
+
+span.vhdlchar { 
+	color: #000000 
+}
+
+span.vhdlkeyword { 
+	color: #700070 
+}
+
+span.vhdllogic { 
+	color: #ff0000 
+}
+
+blockquote {
+        background-color: #F7F8FB;
+        border-left: 2px solid #9CAFD4;
+        margin: 0 24px 0 4px;
+        padding: 0 12px 0 16px;
+}
+
+/* @end */
+
+/*
+.search {
+	color: #003399;
+	font-weight: bold;
+}
+
+form.search {
+	margin-bottom: 0px;
+	margin-top: 0px;
+}
+
+input.search {
+	font-size: 75%;
+	color: #000080;
+	font-weight: normal;
+	background-color: #e8eef2;
+}
+*/
+
+td.tiny {
+	font-size: 75%;
+}
+
+.dirtab {
+	padding: 4px;
+	border-collapse: collapse;
+	border: 1px solid #A3B4D7;
+}
+
+th.dirtab {
+	background: #EBEFF6;
+	font-weight: bold;
+}
+
+hr {
+	height: 0px;
+	border: none;
+	border-top: 1px solid #4A6AAA;
+}
+
+hr.footer {
+	height: 1px;
+}
+
+/* @group Member Descriptions */
+
+table.memberdecls {
+	border-spacing: 0px;
+	padding: 0px;
+}
+
+.memberdecls td, .fieldtable tr {
+	-webkit-transition-property: background-color, box-shadow;
+	-webkit-transition-duration: 0.5s;
+	-moz-transition-property: background-color, box-shadow;
+	-moz-transition-duration: 0.5s;
+	-ms-transition-property: background-color, box-shadow;
+	-ms-transition-duration: 0.5s;
+	-o-transition-property: background-color, box-shadow;
+	-o-transition-duration: 0.5s;
+	transition-property: background-color, box-shadow;
+	transition-duration: 0.5s;
+}
+
+.memberdecls td.glow, .fieldtable tr.glow {
+	background-color: cyan;
+	box-shadow: 0 0 15px cyan;
+}
+
+.mdescLeft, .mdescRight,
+.memItemLeft, .memItemRight,
+.memTemplItemLeft, .memTemplItemRight, .memTemplParams {
+	background-color: #F9FAFC;
+	border: none;
+	margin: 4px;
+	padding: 1px 0 0 8px;
+}
+
+.mdescLeft, .mdescRight {
+	padding: 0px 8px 4px 8px;
+	color: #555;
+}
+
+.memSeparator {
+        border-bottom: 1px solid #DEE4F0;
+        line-height: 1px;
+        margin: 0px;
+        padding: 0px;
+}
+
+.memItemLeft, .memTemplItemLeft {
+        white-space: nowrap;
+}
+
+.memItemRight {
+	width: 100%;
+}
+
+.memTemplParams {
+	color: #4665A2;
+        white-space: nowrap;
+	font-size: 80%;
+}
+
+/* @end */
+
+/* @group Member Details */
+
+/* Styles for detailed member documentation */
+
+.memtitle {
+	padding: 8px;
+	border-top: 1px solid #A8B8D9;
+	border-left: 1px solid #A8B8D9;
+	border-right: 1px solid #A8B8D9;
+	border-top-right-radius: 4px;
+	border-top-left-radius: 4px;
+	margin-bottom: -1px;
+	background-image: url('nav_f.png');
+	background-repeat: repeat-x;
+	background-color: #E2E8F2;
+	line-height: 1.25;
+	font-weight: 300;
+	float:left;
+}
+
+.permalink
+{
+        font-size: 65%;
+        display: inline-block;
+        vertical-align: middle;
+}
+
+.memtemplate {
+	font-size: 80%;
+	color: #4665A2;
+	font-weight: normal;
+	margin-left: 9px;
+}
+
+.memnav {
+	background-color: #EBEFF6;
+	border: 1px solid #A3B4D7;
+	text-align: center;
+	margin: 2px;
+	margin-right: 15px;
+	padding: 2px;
+}
+
+.mempage {
+	width: 100%;
+}
+
+.memitem {
+	padding: 0;
+	margin-bottom: 10px;
+	margin-right: 5px;
+        -webkit-transition: box-shadow 0.5s linear;
+        -moz-transition: box-shadow 0.5s linear;
+        -ms-transition: box-shadow 0.5s linear;
+        -o-transition: box-shadow 0.5s linear;
+        transition: box-shadow 0.5s linear;
+        display: table !important;
+        width: 100%;
+}
+
+.memitem.glow {
+         box-shadow: 0 0 15px cyan;
+}
+
+.memname {
+        font-weight: 400;
+        margin-left: 6px;
+}
+
+.memname td {
+	vertical-align: bottom;
+}
+
+.memproto, dl.reflist dt {
+        border-top: 1px solid #A8B8D9;
+        border-left: 1px solid #A8B8D9;
+        border-right: 1px solid #A8B8D9;
+        padding: 6px 0px 6px 0px;
+        color: #253555;
+        font-weight: bold;
+        text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9);
+        background-color: #DFE5F1;
+        /* opera specific markup */
+        box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+        border-top-right-radius: 4px;
+        /* firefox specific markup */
+        -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px;
+        -moz-border-radius-topright: 4px;
+        /* webkit specific markup */
+        -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+        -webkit-border-top-right-radius: 4px;
+
+}
+
+.overload {
+        font-family: "courier new",courier,monospace;
+	font-size: 65%;
+}
+
+.memdoc, dl.reflist dd {
+        border-bottom: 1px solid #A8B8D9;      
+        border-left: 1px solid #A8B8D9;      
+        border-right: 1px solid #A8B8D9; 
+        padding: 6px 10px 2px 10px;
+        background-color: #FBFCFD;
+        border-top-width: 0;
+        background-image:url('nav_g.png');
+        background-repeat:repeat-x;
+        background-color: #FFFFFF;
+        /* opera specific markup */
+        border-bottom-left-radius: 4px;
+        border-bottom-right-radius: 4px;
+        box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+        /* firefox specific markup */
+        -moz-border-radius-bottomleft: 4px;
+        -moz-border-radius-bottomright: 4px;
+        -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px;
+        /* webkit specific markup */
+        -webkit-border-bottom-left-radius: 4px;
+        -webkit-border-bottom-right-radius: 4px;
+        -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
+}
+
+dl.reflist dt {
+        padding: 5px;
+}
+
+dl.reflist dd {
+        margin: 0px 0px 10px 0px;
+        padding: 5px;
+}
+
+.paramkey {
+	text-align: right;
+}
+
+.paramtype {
+	white-space: nowrap;
+}
+
+.paramname {
+	color: #602020;
+	white-space: nowrap;
+}
+.paramname em {
+	font-style: normal;
+}
+.paramname code {
+        line-height: 14px;
+}
+
+.params, .retval, .exception, .tparams {
+        margin-left: 0px;
+        padding-left: 0px;
+}       
+
+.params .paramname, .retval .paramname {
+        font-weight: bold;
+        vertical-align: top;
+}
+        
+.params .paramtype {
+        font-style: italic;
+        vertical-align: top;
+}       
+        
+.params .paramdir {
+        font-family: "courier new",courier,monospace;
+        vertical-align: top;
+}
+
+table.mlabels {
+	border-spacing: 0px;
+}
+
+td.mlabels-left {
+	width: 100%;
+	padding: 0px;
+}
+
+td.mlabels-right {
+	vertical-align: bottom;
+	padding: 0px;
+	white-space: nowrap;
+}
+
+span.mlabels {
+        margin-left: 8px;
+}
+
+span.mlabel {
+        background-color: #728DC1;
+        border-top:1px solid #5373B4;
+        border-left:1px solid #5373B4;
+        border-right:1px solid #C4CFE5;
+        border-bottom:1px solid #C4CFE5;
+	text-shadow: none;
+	color: white;
+	margin-right: 4px;
+	padding: 2px 3px;
+	border-radius: 3px;
+	font-size: 7pt;
+	white-space: nowrap;
+	vertical-align: middle;
+}
+
+
+
+/* @end */
+
+/* these are for tree view inside a (index) page */
+
+div.directory {
+        margin: 10px 0px;
+        border-top: 1px solid #9CAFD4;
+        border-bottom: 1px solid #9CAFD4;
+        width: 100%;
+}
+
+.directory table {
+        border-collapse:collapse;
+}
+
+.directory td {
+        margin: 0px;
+        padding: 0px;
+	vertical-align: top;
+}
+
+.directory td.entry {
+        white-space: nowrap;
+        padding-right: 6px;
+	padding-top: 3px;
+}
+
+.directory td.entry a {
+        outline:none;
+}
+
+.directory td.entry a img {
+        border: none;
+}
+
+.directory td.desc {
+        width: 100%;
+        padding-left: 6px;
+	padding-right: 6px;
+	padding-top: 3px;
+	border-left: 1px solid rgba(0,0,0,0.05);
+}
+
+.directory tr.even {
+	padding-left: 6px;
+	background-color: #F7F8FB;
+}
+
+.directory img {
+	vertical-align: -30%;
+}
+
+.directory .levels {
+        white-space: nowrap;
+        width: 100%;
+        text-align: right;
+        font-size: 9pt;
+}
+
+.directory .levels span {
+        cursor: pointer;
+        padding-left: 2px;
+        padding-right: 2px;
+	color: #3D578C;
+}
+
+.arrow {
+    color: #9CAFD4;
+    -webkit-user-select: none;
+    -khtml-user-select: none;
+    -moz-user-select: none;
+    -ms-user-select: none;
+    user-select: none;
+    cursor: pointer;
+    font-size: 80%;
+    display: inline-block;
+    width: 16px;
+    height: 22px;
+}
+
+.icon {
+    font-family: Arial, Helvetica;
+    font-weight: bold;
+    font-size: 12px;
+    height: 14px;
+    width: 16px;
+    display: inline-block;
+    background-color: #728DC1;
+    color: white;
+    text-align: center;
+    border-radius: 4px;
+    margin-left: 2px;
+    margin-right: 2px;
+}
+
+.icona {
+    width: 24px;
+    height: 22px;
+    display: inline-block;
+}
+
+.iconfopen {
+    width: 24px;
+    height: 18px;
+    margin-bottom: 4px;
+    background-image:url('folderopen.png');
+    background-position: 0px -4px;
+    background-repeat: repeat-y;
+    vertical-align:top;
+    display: inline-block;
+}
+
+.iconfclosed {
+    width: 24px;
+    height: 18px;
+    margin-bottom: 4px;
+    background-image:url('folderclosed.png');
+    background-position: 0px -4px;
+    background-repeat: repeat-y;
+    vertical-align:top;
+    display: inline-block;
+}
+
+.icondoc {
+    width: 24px;
+    height: 18px;
+    margin-bottom: 4px;
+    background-image:url('doc.png');
+    background-position: 0px -4px;
+    background-repeat: repeat-y;
+    vertical-align:top;
+    display: inline-block;
+}
+
+table.directory {
+    font: 400 14px Roboto,sans-serif;
+}
+
+/* @end */
+
+div.dynheader {
+        margin-top: 8px;
+	-webkit-touch-callout: none;
+	-webkit-user-select: none;
+	-khtml-user-select: none;
+	-moz-user-select: none;
+	-ms-user-select: none;
+	user-select: none;
+}
+
+address {
+	font-style: normal;
+	color: #2A3D61;
+}
+
+table.doxtable caption {
+	caption-side: top;
+}
+
+table.doxtable {
+	border-collapse:collapse;
+        margin-top: 4px;
+        margin-bottom: 4px;
+}
+
+table.doxtable td, table.doxtable th {
+	border: 1px solid #2D4068;
+	padding: 3px 7px 2px;
+}
+
+table.doxtable th {
+	background-color: #374F7F;
+	color: #FFFFFF;
+	font-size: 110%;
+	padding-bottom: 4px;
+	padding-top: 5px;
+}
+
+table.fieldtable {
+        /*width: 100%;*/
+        margin-bottom: 10px;
+        border: 1px solid #A8B8D9;
+        border-spacing: 0px;
+        -moz-border-radius: 4px;
+        -webkit-border-radius: 4px;
+        border-radius: 4px;
+        -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px;
+        -webkit-box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15);
+        box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15);
+}
+
+.fieldtable td, .fieldtable th {
+        padding: 3px 7px 2px;
+}
+
+.fieldtable td.fieldtype, .fieldtable td.fieldname {
+        white-space: nowrap;
+        border-right: 1px solid #A8B8D9;
+        border-bottom: 1px solid #A8B8D9;
+        vertical-align: top;
+}
+
+.fieldtable td.fieldname {
+        padding-top: 3px;
+}
+
+.fieldtable td.fielddoc {
+        border-bottom: 1px solid #A8B8D9;
+        /*width: 100%;*/
+}
+
+.fieldtable td.fielddoc p:first-child {
+        margin-top: 0px;
+}       
+        
+.fieldtable td.fielddoc p:last-child {
+        margin-bottom: 2px;
+}
+
+.fieldtable tr:last-child td {
+        border-bottom: none;
+}
+
+.fieldtable th {
+        background-image:url('nav_f.png');
+        background-repeat:repeat-x;
+        background-color: #E2E8F2;
+        font-size: 90%;
+        color: #253555;
+        padding-bottom: 4px;
+        padding-top: 5px;
+        text-align:left;
+        font-weight: 400;
+        -moz-border-radius-topleft: 4px;
+        -moz-border-radius-topright: 4px;
+        -webkit-border-top-left-radius: 4px;
+        -webkit-border-top-right-radius: 4px;
+        border-top-left-radius: 4px;
+        border-top-right-radius: 4px;
+        border-bottom: 1px solid #A8B8D9;
+}
+
+
+.tabsearch {
+	top: 0px;
+	left: 10px;
+	height: 36px;
+	background-image: url('tab_b.png');
+	z-index: 101;
+	overflow: hidden;
+	font-size: 13px;
+}
+
+.navpath ul {
+	display: flex;
+	flex-flow: row wrap;
+	justify-content: flex-start;
+	align-items: center;
+	font-size: 11px;
+	background-image:none;
+	background-repeat:repeat-x;
+	background-position: 0 -5px;
+	height:auto;
+	line-height:30px;
+	color:#8AA0CC;
+	border:solid 1px #C2CDE4;
+	overflow:hidden;
+	margin:0px;
+	padding:0px;
+}
+
+.navpath li
+{
+	list-style-type:none;
+	float:left;
+	padding-left:10px;
+	padding-right:15px;
+	background-image:url('bc_s.png');
+	background-repeat:no-repeat;
+	background-position:right;
+	color:#364D7C;
+}
+
+.navpath li.navelem a
+{
+	height:32px;
+	display:block;
+	text-decoration: none;
+	outline: none;
+	color: #283A5D;
+	font-family: 'Lucida Grande',Geneva,Helvetica,Arial,sans-serif;
+	text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9);
+	text-decoration: none;        
+}
+
+.navpath li.navelem a:hover
+{
+	color:#6884BD;
+}
+
+.navpath li.footer
+{
+	display: flex;
+	flex-flow: row wrap;
+	justify-content: flex-start;
+	align-items: center;
+        flex-grow: 1;
+        list-style-type:none;
+        float:none;
+        padding-left:10px;
+        padding-right:15px;
+        background-image:none;
+        background-repeat:no-repeat;
+        background-position:right;
+        color:#364D7C;
+        font-size: 8pt;
+}
+
+div.summary
+{
+	float: right;
+	font-size: 8pt;
+	padding-right: 5px;
+	width: 50%;
+	text-align: right;
+}       
+
+div.summary a
+{
+	white-space: nowrap;
+}
+
+table.classindex
+{
+        margin: 10px;
+        white-space: nowrap;
+        margin-left: 3%;
+        margin-right: 3%;
+        width: 94%;
+        border: 0;
+        border-spacing: 0; 
+        padding: 0;
+}
+
+div.ingroups
+{
+	font-size: 8pt;
+	width: 50%;
+	text-align: left;
+}
+
+div.ingroups a
+{
+	white-space: nowrap;
+}
+
+div.header
+{
+        background-image:url('nav_h.png');
+        background-repeat:repeat-x;
+	background-color: #F9FAFC;
+	margin:  0px;
+	border-bottom: 1px solid #C4CFE5;
+}
+
+div.headertitle
+{
+	padding: 5px 5px 5px 10px;
+       	color: #135384;
+}
+
+dl
+{
+        padding: 0 0 0 10px;
+}
+
+/* dl.note, dl.warning, dl.attention, dl.pre, dl.post, dl.invariant, dl.deprecated, dl.todo, dl.test, dl.bug */
+dl.section
+{
+	margin-left: 0px;
+	padding-left: 0px;
+}
+
+dl.note
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #D0C000;
+}
+
+dl.warning, dl.attention
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #FF0000;
+}
+
+dl.pre, dl.post, dl.invariant
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #00D000;
+}
+
+dl.deprecated
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #505050;
+}
+
+dl.todo
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #00C0E0;
+}
+
+dl.test
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #3030E0;
+}
+
+dl.bug
+{
+        margin-left:-7px;
+        padding-left: 3px;
+        border-left:4px solid;
+        border-color: #C08050;
+}
+
+dl.section dd {
+	margin-bottom: 6px;
+}
+
+
+#projectlogo
+{
+	text-align: center;
+	vertical-align: bottom;
+	border-collapse: separate;
+}
+ 
+#projectlogo img
+{ 
+	border: 0px none;
+}
+ 
+#projectalign
+{
+        vertical-align: middle;
+}
+
+#projectname
+{
+	font: 200% Tahoma, Arial,sans-serif;
+	margin: 0px;
+	padding: 2px 0px;
+}
+    
+#projectbrief
+{
+	font: 120% Tahoma, Arial,sans-serif;
+	margin: 0px;
+	padding: 0px;
+}
+
+#projectnumber
+{
+	font: 50% Tahoma, Arial,sans-serif;
+	margin: 0px;
+	padding: 0px;
+}
+
+#top {
+	border-bottom: 1px solid #5373B4;
+}
+
+#titlearea
+{
+	flex-grow: 1;
+	padding: 0px;
+	margin: 0px;
+	width: auto;
+	border-bottom: none;
+}
+
+#main-nav {
+}
+
+#main-menu {
+	display: flex;
+	flex-flow: row wrap;
+	justify-content: flex-start;
+	align-items: center;
+	background-image: none;
+	min-width: 770px;
+}
+
+.ui-resizable-e {
+	height: 100%;
+	background-repeat: repeat-y;
+}
+
+.image
+{
+        text-align: center;
+}
+
+.dotgraph
+{
+        text-align: center;
+}
+
+.mscgraph
+{
+        text-align: center;
+}
+
+.plantumlgraph
+{
+        text-align: center;
+}
+
+.diagraph
+{
+        text-align: center;
+}
+
+.caption
+{
+	font-weight: bold;
+}
+
+div.zoom
+{
+	border: 1px solid #90A5CE;
+}
+
+dl.citelist {
+        margin-bottom:50px;
+}
+
+dl.citelist dt {
+        color:#334975;
+        float:left;
+        font-weight:bold;
+        margin-right:10px;
+        padding:5px;
+}
+
+dl.citelist dd {
+        margin:2px 0;
+        padding:5px 0;
+}
+
+div.toc {
+        padding: 14px 25px;
+        background-color: #F4F6FA;
+        border: 1px solid #D8DFEE;
+        border-radius: 7px 7px 7px 7px;
+        float: right;
+        height: auto;
+        margin: 0 8px 10px 10px;
+        width: 200px;
+}
+
+div.toc li {
+        background: url("bdwn.png") no-repeat scroll 0 5px transparent;
+        font: 10px/1.2 Verdana,DejaVu Sans,Geneva,sans-serif;
+        margin-top: 5px;
+        padding-left: 10px;
+        padding-top: 2px;
+}
+
+div.toc h3 {
+        font: bold 12px/1.2 Arial,FreeSans,sans-serif;
+	color: #4665A2;
+        border-bottom: 0 none;
+        margin: 0;
+}
+
+div.toc ul {
+        list-style: none outside none;
+        border: medium none;
+        padding: 0px;
+}       
+
+div.toc li.level1 {
+        margin-left: 0px;
+}
+
+div.toc li.level2 {
+        margin-left: 15px;
+}
+
+div.toc li.level3 {
+        margin-left: 30px;
+}
+
+div.toc li.level4 {
+        margin-left: 45px;
+}
+
+.inherit_header {
+        font-weight: bold;
+        color: gray;
+        cursor: pointer;
+	-webkit-touch-callout: none;
+	-webkit-user-select: none;
+	-khtml-user-select: none;
+	-moz-user-select: none;
+	-ms-user-select: none;
+	user-select: none;
+}
+
+.inherit_header td {
+        padding: 6px 0px 2px 5px;
+}
+
+.inherit {
+        display: none;
+}
+
+tr.heading h2 {
+        margin-top: 12px;
+        margin-bottom: 4px;
+}
+
+/* tooltip related style info */
+
+.ttc {
+        position: absolute;
+        display: none;
+}
+
+#powerTip {
+	cursor: default;
+	white-space: nowrap;
+	background-color: white;
+	border: 1px solid gray;
+	border-radius: 4px 4px 4px 4px;
+	box-shadow: 1px 1px 7px gray;
+	display: none;
+	font-size: smaller;
+	max-width: 80%;
+	opacity: 0.9;
+	padding: 1ex 1em 1em;
+	position: absolute;
+	z-index: 2147483647;
+}
+
+#powerTip div.ttdoc {
+        color: grey;
+	font-style: italic;
+}
+
+#powerTip div.ttname a {
+        font-weight: bold;
+}
+
+#powerTip div.ttname {
+        font-weight: bold;
+}
+
+#powerTip div.ttdeci {
+        color: #006318;
+}
+
+#powerTip div {
+        margin: 0px;
+        padding: 0px;
+        font: 12px/16px Roboto,sans-serif;
+}
+
+#powerTip:before, #powerTip:after {
+	content: "";
+	position: absolute;
+	margin: 0px;
+}
+
+#powerTip.n:after,  #powerTip.n:before,
+#powerTip.s:after,  #powerTip.s:before,
+#powerTip.w:after,  #powerTip.w:before,
+#powerTip.e:after,  #powerTip.e:before,
+#powerTip.ne:after, #powerTip.ne:before,
+#powerTip.se:after, #powerTip.se:before,
+#powerTip.nw:after, #powerTip.nw:before,
+#powerTip.sw:after, #powerTip.sw:before {
+	border: solid transparent;
+	content: " ";
+	height: 0;
+	width: 0;
+	position: absolute;
+}
+
+#powerTip.n:after,  #powerTip.s:after,
+#powerTip.w:after,  #powerTip.e:after,
+#powerTip.nw:after, #powerTip.ne:after,
+#powerTip.sw:after, #powerTip.se:after {
+	border-color: rgba(255, 255, 255, 0);
+}
+
+#powerTip.n:before,  #powerTip.s:before,
+#powerTip.w:before,  #powerTip.e:before,
+#powerTip.nw:before, #powerTip.ne:before,
+#powerTip.sw:before, #powerTip.se:before {
+	border-color: rgba(128, 128, 128, 0);
+}
+
+#powerTip.n:after,  #powerTip.n:before,
+#powerTip.ne:after, #powerTip.ne:before,
+#powerTip.nw:after, #powerTip.nw:before {
+	top: 100%;
+}
+
+#powerTip.n:after, #powerTip.ne:after, #powerTip.nw:after {
+	border-top-color: #ffffff;
+	border-width: 10px;
+	margin: 0px -10px;
+}
+#powerTip.n:before {
+	border-top-color: #808080;
+	border-width: 11px;
+	margin: 0px -11px;
+}
+#powerTip.n:after, #powerTip.n:before {
+	left: 50%;
+}
+
+#powerTip.nw:after, #powerTip.nw:before {
+	right: 14px;
+}
+
+#powerTip.ne:after, #powerTip.ne:before {
+	left: 14px;
+}
+
+#powerTip.s:after,  #powerTip.s:before,
+#powerTip.se:after, #powerTip.se:before,
+#powerTip.sw:after, #powerTip.sw:before {
+	bottom: 100%;
+}
+
+#powerTip.s:after, #powerTip.se:after, #powerTip.sw:after {
+	border-bottom-color: #ffffff;
+	border-width: 10px;
+	margin: 0px -10px;
+}
+
+#powerTip.s:before, #powerTip.se:before, #powerTip.sw:before {
+	border-bottom-color: #808080;
+	border-width: 11px;
+	margin: 0px -11px;
+}
+
+#powerTip.s:after, #powerTip.s:before {
+	left: 50%;
+}
+
+#powerTip.sw:after, #powerTip.sw:before {
+	right: 14px;
+}
+
+#powerTip.se:after, #powerTip.se:before {
+	left: 14px;
+}
+
+#powerTip.e:after, #powerTip.e:before {
+	left: 100%;
+}
+#powerTip.e:after {
+	border-left-color: #ffffff;
+	border-width: 10px;
+	top: 50%;
+	margin-top: -10px;
+}
+#powerTip.e:before {
+	border-left-color: #808080;
+	border-width: 11px;
+	top: 50%;
+	margin-top: -11px;
+}
+
+#powerTip.w:after, #powerTip.w:before {
+	right: 100%;
+}
+#powerTip.w:after {
+	border-right-color: #ffffff;
+	border-width: 10px;
+	top: 50%;
+	margin-top: -10px;
+}
+#powerTip.w:before {
+	border-right-color: #808080;
+	border-width: 11px;
+	top: 50%;
+	margin-top: -11px;
+}
+
+@media print
+{
+  #top { display: none; }
+  #side-nav { display: none; }
+  #nav-path { display: none; }
+  body { overflow:visible; }
+  h1, h2, h3, h4, h5, h6 { page-break-after: avoid; }
+  .summary { display: none; }
+  .memitem { page-break-inside: avoid; }
+  #doc-content
+  {
+    margin-left:0 !important;
+    height:auto !important;
+    width:auto !important;
+    overflow:inherit;
+    display:inline;
+  }
+}
+
+/* @group Markdown */
+
+/*
+table.markdownTable {
+	border-collapse:collapse;
+        margin-top: 4px;
+        margin-bottom: 4px;
+}
+
+table.markdownTable td, table.markdownTable th {
+	border: 1px solid #2D4068;
+	padding: 3px 7px 2px;
+}
+
+table.markdownTableHead tr {
+}
+
+table.markdownTableBodyLeft td, table.markdownTable th {
+	border: 1px solid #2D4068;
+	padding: 3px 7px 2px;
+}
+
+th.markdownTableHeadLeft th.markdownTableHeadRight th.markdownTableHeadCenter th.markdownTableHeadNone {
+	background-color: #374F7F;
+	color: #FFFFFF;
+	font-size: 110%;
+	padding-bottom: 4px;
+	padding-top: 5px;
+}
+
+th.markdownTableHeadLeft {
+	text-align: left
+}
+
+th.markdownTableHeadRight {
+	text-align: right
+}
+
+th.markdownTableHeadCenter {
+	text-align: center
+}
+*/
+
+table.markdownTable {
+	border-collapse:collapse;
+        margin-top: 4px;
+        margin-bottom: 4px;
+}
+
+table.markdownTable td, table.markdownTable th {
+	border: 1px solid #2D4068;
+	padding: 3px 7px 2px;
+}
+
+table.markdownTable tr {
+}
+
+th.markdownTableHeadLeft, th.markdownTableHeadRight, th.markdownTableHeadCenter, th.markdownTableHeadNone {
+	background-color: #374F7F;
+	color: #FFFFFF;
+	font-size: 110%;
+	padding-bottom: 4px;
+	padding-top: 5px;
+}
+
+th.markdownTableHeadLeft, td.markdownTableBodyLeft {
+	text-align: left
+}
+
+th.markdownTableHeadRight, td.markdownTableBodyRight {
+	text-align: right
+}
+
+th.markdownTableHeadCenter, td.markdownTableBodyCenter {
+	text-align: center
+}
+
+
+/* @end */
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/loadgen_integration_diagram.dia b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/loadgen_integration_diagram.dia
new file mode 100644
index 0000000000000000000000000000000000000000..569089f243e4584e12134caf36d078248cb50af1
GIT binary patch
literal 1943
zcmV;I2Wa>oiwFP!000021MOW~kJ>mCexF|v(Z^*%?A+VVRH|0H`_NS@ZTA@&;(=^K
z%qE@-FZ<iqPC_9!7?Rj+kdZnS!8ykc{=V-V`y{;o@Un@tC!C}qiSH&rHzpd6mm~<|
z)!pRpufN@TlMnYd?}HG%W&c(Q+Gy;IIOXr|CTmK!Z)daT=Vv|IrHGP5kHSYi#k2oV
z6rmYwG@IPtXxg!X08zw`6-N=JNw|2VSc}mH-%S>1`E8Yu$2geet%`O_5|KoELebsi
z%|r2;%nC!ZlBE;Yw`heI2}a-AjT`Lc``wIhaZ)wB*^)G5P3Uf0Ytmwe|2=9`v`Sf{
zcy<5g6Q&d=Z}M&x2M_j|I@{`qZcwrc<GL?)g`)g&K;1Td{&Ik4n+9~v{dKwA-9oOr
zNUpm?uDdXuZ%INE6w<oO1tAeeaqg8SkGQ*^bcrH1#ZC<ibhrKxQcBMB{{cnmYX*=X
z{*Jgy(Ys2*;52wkEn2JuJNAFL6#A7shXGyBUj%(6x%HhO>rY`CE+X92`!J@1ncod#
zzB&Ukoj9D{bH?S?T7@X^u#N35LbQ4e1Du|j%;h#MmexhH*}3apZP)YC1Yx?3(C+jQ
zs*PG~p_qn@!%&I?u}m4G?JXs@6`By|h%ElWOF9sHw)t9<=huvA2scX-$810>6usRN
z2G?|$tN7Zfm>;S{shL+c$#7Ei^y48u)e5#LdZHEmM@NSc8_v`I-O+r{zq(=`{}z$w
z<r*bayVX9Dm&qX@XW@7l+)X|kWgt|q9<#=Rff|^zi&#V+Q&`1j{W`UoUM#AX6F$s%
z72$$Wb>E)yKktB{8}`&<f4aHD5#G#~B#G-$@(`oR%x3~uAJJaiydI|4I9#o1yX}tM
z=IEF=Bu3{!afz2L3W9Q^SLwZ>cl=f<eyLrQDw$W>v|WiT-zl}Lq(^^}07u#{+OG`2
zDW)xX(g}WQi&RNWoBZ<Q+5?OEq8;lViqnaf(j8|22HyPCKXCMfX}CnkWOaEDYLlz+
zEAou;%j)N}Oe;-bbf%^be7m2Uawv2nGdGMwK-A3@3k6p31~-43IDXtTM2V;5tVvzk
zLY>)x`@4$)=r0D2Xfd!4V&DlE14oI05`#uD_>y4lOG1`7O-Z7pK)e*VN(y930qiXW
zmQXE#=65kLffO;YM5=+K`=yr$l{}1E9!N@+KFHGt=<2qT2)Pnr_LT@=PJzgD0dk2j
zxJ1BAB7DIT(S1BWI$`jXB8XOmk2qegHz@h0#6Ywd0G9=ZN(v-Ofi*z30EA}?#vxm<
z#b*lwrN9*^1(uQm;ZndZ6OIEV1X8jBwTB>83hW`W0`X}9ILZrvDIqZcYAZp>L37zH
zLBa1hR&pRAFJRzTYA~1@{Ka@RKuN&F;8Pgm1~qVctAQ~!1&V>swE*gl>A`oI46ty)
zbU;T&<z-rP7TMKm%ubK0Q7}4=n$3yjw(7a5K=mS}h=Vyw67pO%e3*g|_B1_@aJ)LD
za-F&S;NpyqzS_f~oG`6v#Qqa)&9G6BD<xT_op$)NL9}Z|he4n_00Ldn2y~7S=t@GM
zdj^57Lf~~F5d0Vfdc6^7PkoUIk(Hru>SdKxc7{HduXN6dUXQ`jxEeq2nl>pA{;VK0
z%OC`>HUk<?2F1yC<zz_+_Io4P6}+b597EWZ2;uU;8XNcu!q)}i3JNbma25WhBnX4v
zH@W~{tO|R_5cVZP*gvDfzEa_<S7ED%3QL1<|JxIQ(8VNZ9IYlnOWJBud035`(P}m6
zN&vjapjF18D+_}|)P=jkG3Xp)P|A8zd034YRqIJtCZbR<2EXzR#-nE_?J?ND@Yc}1
zOsH&;0d3*Luvt6{bZEQ$ooB!iY{`NjxEbr!He+u;FQV|@@)HTJXH&LeE?&p3tRY}m
zw(TJ)S(n<m{9O1q5Ha;G0|jtu2`&9$0l8%`K4N}!V2^={R9T2{1}aTD#XaNXp8j=*
zj1i`MH(Kmg8~0)IkjT<)!_5j2!6;%PsA3{%j8vbBik8MnOZ^Mr95xpiNm#bn{X;(C
zJkCqbcV2Qv!Aqv%C273WFc%dujgy%A*9CeK;wA48FL@)2mu$sL(s;>J@zRyhlGlHa
zx7dUg$xC}Dd~F*-#!n+!+qM)hNxT2FVJ<3S8YeLgQUfaVQOUd%Ghf}#?{ibj+iH2c
zKJOI&NamjbD{X-zsl)&pLp;z$B-=qt=XZzb4s^_S(AN#Xb3+!2xu!RoTfVhVWR)rg
z6ds0Ua|P-U0#gy8Fe2;-_u7wJU0lO8Olcz=qP$ov=Vi5xob(RGjhynNR2Vnhs5K1T
ztNO-~+~?!T3O?WvYd~2tBzg^KnpLmmYe3oJG#JUInWGkgWG(_#C!>lzM!5(C`&VAu
zJQQ_xp_^uz4@dWn>~{XlpQ9FoR4UXw{}dOElZ$#6TT7s#BPLQrq=@L`<WoHKbMsK!
zvPtj4>+C7LLNhg22mH|rR`$qOSNHg%6&&$TR;UiFrG=x`O|EO*#A<)W$@v?*Ve7+H
dUb*O!zt4VH`2ME!%ft6K{{zrB$(5Fa002n3teyY>

literal 0
HcmV?d00001

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_icon.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_icon.png
new file mode 100644
index 0000000000000000000000000000000000000000..95321896d3e467b923909c3654a4260346df5b9f
GIT binary patch
literal 4632
zcmb_gXHXMNyG2n1{1g!clqO9CBp9Uz2+B(h0@6F7BQ<mqs!{|5i6~846eM&bMM@x`
zAR;9oHHMN%3%!>RAaZ%{ulwU0f8Ck0`^@Z~bDr7Rvwt?}v4Qp_)*Gx03=Efabu^4m
z>+!#Xndx*kspxh;E!TrIErU#8Zb6}r0j>-WU0_bGH+6j+-Cd1c9bFLq-L5JO4D484
zjfbZ2@zs3&08VYLr}rz)yNSCgFmgQeeHw9Q1eeE<(8)3h7k6`{og2>kvKC*(FKO^J
z5eV$fjChu2;2`1j#U2~VGnny)Xm)IQvn=Xf#l6~A8TAK>pG-{`EH+$Q!`>Q8h%vkq
z6rDWF5O?A4cmQYkJN~~7kgq0V{Ag5f`8x#@9Wx!a1SkQI%6R2<NU)Jh=uF~VRy+aE
zgwlhvPpIDH<HOD;bpulyOvRk-)Q8>@OxAA2Vq7bSB2YpbCM3ct@U0s2<-Lx1%cM>r
zwzF{wFH}(Yx}S{v)uUm5#9$ZXMUlI5f;lo0Tt}hWMgM@!;u4VO6+CD^V}JOla{PpX
zwtMN%ELZ8yvLK3y_CZ1Mq8jX$wO>9fYz*a|&=U~pAT!MNR7NXwpXU;@VAWjMJFl;u
zZgEB}y1V3IwpP_S?E_{}g=C*@p6=dW8BKXt5_0qFCW`32_A=9!RcRfH^_5v8v^_HH
z6vnCDp!d@sz)?jsi^)qBDn`P35;UO}ZWR<=WsEUExn<kR0SHZGq<e*h28nCJ<6s#j
zwZy#&r%z=tdR!fj9t33@*M)X!A7pAm*}LmoJwS!E%n1j~f{0}U4ef|HWKK&I0G+tr
zc3>U9_G^X4ym6;+U@Tq-eYo%Ir*L*Cs;M!{r^~um8-8JDSIRo*AwoK6t1Vi(M%Ozy
zBCY{!XrUFtH8<n%F^;b*og&h5zHh?+t&Z^!6+V2d;WEnu+RI@jA4z|V%AvR5u5=64
zbS2Zg@$TMWFM%kHV2ur{Izr$P{_(=T#=(tjV_qX%+u)xUI;nV`o9)_{r8fQCHlw3M
z>)1_m1`6%sIirN;1oQ!ZTJMXVJ<ff9XEANaLT5USAI!evn_ImV)w|KCJCj9s5RMO%
zEUoIGRYKDG4nMsR^nnaS@Oy60k)AmB@qrbI^I!>;k$eJ*Rc^OBh+uk<@e5q;gredU
zu@fU{oHe`eOw{cbgxuZjF^pQ0+j#9FCowRQgP%C+d0Rbs9;iQacd*Uf$-h$Vl{2r@
zcbZ#{lQetm=SQV-O)3RFd}f`D)7uwH;JaEbCX~&GKh{@f@`y=hZ#}*{5*9w;R4YCz
z_IT=GPL!{=z^Ys?%aDSb)(&qs8gx@Rs((?N`g7adRCvmd8!N!;@6`w>!W?Nam1=~e
zwjYUwN2R<@z3TCU_bz8{bV_&t=hzz){?UIB4XuRsBcX7<*Hpu(TCp)FzVRq3?iz`a
zi1J_=0_P{0=OGowqL1&*?b=D*r#saFS+-`vB_gUHP{{zM<W@%y;TjJVQWapG+_3kP
zH$j$+bQDP;v5@heExISBo)Edb;Qa!>{UxNk_JlnP`PnL{uy?Ih5N7)PST#s)OaJ(n
zeC_JXP1;ILY)2+nYUYlLa9AUI+smA=K}=e~vWL3O><4blSZlNb3AbA93!7x>RLxx+
z@EUiV9jZY`HN@v&5sgda*1j$@;ten{IOWPvXY9z{?<6`y>_@bNv>;{BTA$8wv8yVG
zsEHa{ps&tAKZ>X`!>oX9YN0$0_9oF_<zFWU4{x;vtELumwU!=%gRhS|6ETyad}~)H
zb@g%Zr;$`$Ax-%J+V3Rug9tWIt#rnl(g&t&D&rGjmYA1q&409PZBC$AGsuwB`}KpE
z6%?CB{kmkLMpH?CQyJ~~8-R;~DusB3*!*=eH1K3+>^IiPj?|_+&Lf|(41=CD(yi^4
zZP_sp%min08u!cIQK!|gq%3yQv~WIQf_pCsWb=wx{%(@*<P*T9HM{h~;Xa{@O;7cB
zr+&u8eBav6`x(C-Wa9~`|DtSx@kjA?7gx{8x)`S+stLlUFoQ-}tNY!#KK7|SXs!|B
z@ykWHp1ht6fot#VH|YvVG|=wQn_!QvZM>R*fBa2!JX!~o9gBOw(P@8bM;c45q;pU&
zyzy>t@0KgISap7uY9^-KwfQHc4TwV0^7{K$P;XDhO!wPTL&to<zlekyHTs)X&^$n@
znMI>>BG$|bCwUb^*&#8*%way6-jA@9mHwhlASDol51mA~L2Rg6yf4`zTDJE3Mbl<X
z+?fk(jWD--&I#|-MviZglG8~KB?o3~k+dVYd$w*`6wlzmTqhNZX4BKO*YkpmL4u81
z^k7OE+WQ4gT=Vw5Uc;*~87K4VcDh7;v4<y-yz>*IPAP2Rr*~gbH)f1**^toGpVIi5
z+!L`Af3>(Pp&;7OTh99}aJ_?lG#|YyMzU59X?(@zwJfG}RFs8mfZ(V2*UCrWqu|B#
z)^9d?KQ1qzHw4E<^!$v?_l0S}Tem2l1#*e~Tw5T$rJXTdDcG=3_`qGc;2ntfd7`GV
zP)36t5cFm|+9ZW;uz0t90v-ckqfcl(i*oQ>W|UYCnp7<t;xmz@HVPbdkTl@6@WEr2
z<qa0p@0;Z)s7YH8DZ&gV$<KMm;J(a@?R@Fq!B?lH(?Jx<l&1?4Rfq<UW{!Z)ndr(X
zk?8d!ia<QDu-K=G^@4TuMa&*EtjAAZk00Wa+XCQKa5Gx}9-D94>x&)sC8RIE^A@-L
zL%rY;exe>M6)NO3jX%~Yx&OwkCRzJEKij#{M%rF~qlP~2n?pCGWE-u7UMs50!T!c=
z(g3KJzRR_bk<2ky3z5;53NHbWB)h=~?L7G$=a}T}D*SXZfiri~bZ4lYM|$QF87fYw
z|GZc)FXdCE^xZfe<|8?19oHtw^s=8{VJC!ZX=c_(-|lv@GKbXX$h=Iu(pn=Rt9VZX
z*K}^5#iOtgBqH22J10yyOKQ~Iu&|26S2Xl-Lo?T?HG`S#?P_0;9ohF-w6N5EWF(&M
zT;e6$b>PQkAO7z)W_W828wsL3J!4E$d}3`P!%iw`Tl3}5x~P77p5uxpY0G=QB0b6*
znH4p|3nj(~Zsln8=9V=RZeOUUQdm>7*&+WP%Iyf$@v0#bYv_8pdNayUSvq{m)YU*z
z5Ip9&oOQ{?to3X<OldxA^^)C|NISQU1>Ja~eU7m{<Fe(hWaKg)QCJi%VRdKYt3$p?
z8hyS{qLM)0gza+ZjlH(xyBYpvsd*~ru=uS*CnbBtg1EE7n|&dx)+8+ty$ga6AV&9$
zX*Yq%0b^dvSucSSk|UE@cpx+^inn13w3LWV4n_o#c?t~`8_UbGu1f(7Q<r=JsK_Y$
z!0d+PkUj33_sr)R+r0&rO3_hD1qxjwf83?R4JLmFT!xl>Jz?>|On=xgIJMSJ*Om$C
zRp`2-5k8!fI=KXB=Wa$+nxFjVeyZO-U+wMCov;0Mx+v^<gO6?)2r{Ek>6WOu-CE+T
z23%sqZSSP3Zz^dWyi<=R(Re){%5~K(JkFAcoKJRjB|%9(0%a{%7OaTQg=fW6n4vQH
zS>mfsv+jrXve@Gp-Sj;7a%-Zb8p^KV9Q-K%!(I{{)~Ir(&J$!9L`?yBE{ies>^9Wd
z2KHT#vE?nFHC}0#5~=`aP%xcNY@NrtWSGh?q7>JBgrqaSnX(c^wZthc$DeE5-Ke8D
zBwk@2dE;}x>Nu4CZaC-AmC<Z8nz4!pdWPxRY3O?d%(>UNdVMO(aV#1V{m5P1DQ7+%
ztVB6}^0wGm6dJ!zoHHQizP2=2s_0jmV<~Q2-~51%T1v-20{aki$?ZY_5;4DPqM*Y9
zt0=)4NKL&hk5t+BB4in+q_N_GX{ico@iGSADlJocjYqt=Yj3;nM-gtRUK40YqQtz2
z<wiE#Nlm-iHBpWqcB^8+e0v&1qL!{(oW8nuH4{1+$cD|K!)4!mWT&RG*^-0?h%C@q
zXj`t`U*!P6ss}zA^<OxWMVHw~vqD>OV2<nU%IIGz_wdK$-RV0SInI(|d+O_6OW{&2
zotn*_xo5W}t85<W<4!5EM16dux7YjiS^rS+FAtny9r{wI*}Y(<Q}OK#1pC!n05SWu
z|Gnw9@-(e*Qc1<jIuPl9Y!=+-0g>r+!1Y{I0CkFDm37F5^PhF$JHu*5O-g8cYndQx
zLGxbkb<055Rb}p?F-cNDT*HqEt*kGu+>mlzJcWn-=%8XfdqOFqSo1t(gUjTA3hS9-
zJw^rWuFNJ>wYc#w9@N(Oxr-HBWB|1I-<N&njFR~_WedKJu-2JZ6G+$&xlPr!x=J#S
z!+cT{MG(Kotb*dnNX?MI=kHeHvCQA6%emVvpsE@N=<FYWW1o<+p5qr`a!bE~9FvN5
z)yAm0$U(@RE3MOcs+m7pNBW%t|1<mP+VmTUVZMU;%?SNR2{wyTTkgh)UOccG+2Q#S
z!9Go#LjP&8OXw(-zi4$ExW4R84d#>QvNyL>ox9Y0n9X6Oc~IQIcA=vv#B+TsMcc!%
znDl&;FjyVGND3qNNc878JAHrdRfXPdF}6Xfy&ib9+<c?At;_Ya^<ecT<cC~((6Nif
zJ@n$AN_tK>-!DJBdYnv~^k>f3=bK+-uQw!8)~jT@z9|dj3WiR3LJTjpYK_1B;V8>q
z{+Yt_y@<R0pQ<1**R+jS>X|sUw|lpltZmJh;x4$KQ;Xg}D=}RWz`iS@@GRQG=^d8E
z8@v#NQJ}lZ)#<O1lF^&@A{W;H)sZU9#cK*{@JkKaK}0@A4fqp<7Paj%%4KFjR&gQw
zo!%~+HxCLPotYw22roAOa7?P)XOYaXNQlIm$5H7bnVxE&^e$H_Y3(mt+gxi^RAnrW
zcwqIoeE?H#G5cyA!<kG6+f|&O?j4Xye_T0aF{g=Dxu=wfKzpQaeWpYSOy^yfG4xJn
zmtKVzr49}?=?KZ@pPS$-Vl=w(#nrm0!KckbL4&*;kx6%CnwwlXD5~)<rBYx1Ko^k7
zv@**N_RJW#-I!9ZtQ9M)sKg^x0xj^+Dddw`JNd}#BYq6fEFY&<xf!bPvq7K}jRb4G
zf5VgGtdhFDGcz7{nwI$`@9*t)Q@1hcp&qp|yvE~6+e6K#%5I9lAy*tGWNaTI$_om5
z=Ai*xsx>T&coew~O<9MBM&sqIA|EuIAm4G3<G7n`+!nG$*qCM?`b!)6ht^fEb)`|3
zZ?|#vUbosQnJFFOD6npcr?rI1ty&(6sKUr8dw&X8<yOtZg43a~u@c!;@$sx`eu*{s
zg&jLxm~b<V2<)4X$r%V0j|4PV)NYvHxfIh;Bej_ab~X&7PlWkL>?%li|A|cTI7STR
z9vwx5{9LBvly87=ZRWx)S8Lka)t_(HFYH35xPA2?;uQB?&HdWIFx6H&GP#L%9Y9$%
zh?60m#qK`O4*Im5CkW`OB(2DX<Gn$S2#0<=b%Y!~wNO=V>ula*s9BE|!R;M+wU*FG
z^%k=-em^e{xYq`YT@~5!<PPNc0TjpQbciSKEH3-{M>>(n3YgvRDi)Yn1ky!QbRxj2
z78AZZTw~?#dDhiTFx@zc8Fnz>U0;8tTjOM_;=W;Z8N!B{IMJD>=bl<FX@F^3Oaa|z
zVAJ2gd30T($r0tdV<#VwZEC-W69%Fgc)sE(*8<3Ojq*!jI}kX7pPtDyaf|7l_DvOr
zcE-Qsl=64{ufYS+N?BDb6GP%A!_)r;W;TYu;=hXi@8Vz8|9a1yoc#|&&S#NfSoEu^
R)2|)|T}=axkLnIF{{zz;5aa*=

literal 0
HcmV?d00001

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg
new file mode 100644
index 000000000..d932bdae6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg
@@ -0,0 +1,55 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!-- Generator: Adobe Illustrator 22.0.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
+<svg version="1.1" id="mlperf_x5F_logo_x5F_horizontal_x5F_color"
+	 xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 494 166"
+	 style="enable-background:new 0 0 494 166;" xml:space="preserve">
+<style type="text/css">
+	.st0{fill:#333333;}
+	.st1{fill:#1A6197;}
+	.st2{fill:#B4BFC1;}
+</style>
+<g>
+	<g>
+		<path class="st0" d="M242.3,58.3c0.9,0.9,1.4,1.9,1.4,3.2v43c0,1.3-0.4,2.4-1.3,3.2c-0.8,0.9-2,1.3-3.3,1.3
+			c-1.4,0-2.5-0.4-3.4-1.3c-0.9-0.8-1.3-1.9-1.3-3.3V74.6l-12.2,18.7c-0.5,0.6-1.1,1.1-1.7,1.4c-0.6,0.4-1.3,0.5-2,0.5
+			c-0.6,0-1.3-0.1-1.9-0.5c-0.6-0.4-1.2-0.9-1.7-1.4l-12.2-19.4v30.6c0,1.3-0.4,2.4-1.2,3.3c-0.8,0.8-1.8,1.3-3.1,1.3
+			c-1.2,0-2.3-0.4-3-1.3c-0.8-0.9-1.2-1.9-1.2-3.2v-43c0-1.2,0.5-2.3,1.4-3.2c0.9-0.9,2-1.3,3.3-1.3c0.7,0,1.5,0.2,2.2,0.6
+			c0.7,0.4,1.3,0.9,1.7,1.4l16.2,24.9L235.4,59c0.4-0.6,0.9-1.1,1.6-1.4c0.7-0.4,1.3-0.6,2-0.6C240.3,57,241.3,57.4,242.3,58.3z"/>
+		<path class="st0" d="M286.7,101.5c0.9,0.8,1.3,1.9,1.3,3.2c0,1.2-0.4,2.3-1.3,3.1c-0.9,0.8-1.9,1.2-3.2,1.2h-24.1
+			c-1.3,0-2.4-0.4-3.2-1.3c-0.9-0.9-1.3-1.9-1.3-3.2V61.5c0-1.3,0.4-2.4,1.3-3.2c0.9-0.9,2-1.3,3.4-1.3c1.2,0,2.3,0.4,3.2,1.3
+			c0.9,0.9,1.3,1.9,1.3,3.2v38.8h19.4C284.7,100.3,285.8,100.7,286.7,101.5z"/>
+		<path class="st0" d="M325.2,59.3c2.3,1.5,4.1,3.5,5.5,6.1c1.4,2.6,2.1,5.3,2.1,8.4c0,3-0.7,5.7-2.1,8.3c-1.4,2.6-3.2,4.6-5.5,6.1
+			c-2.3,1.5-4.7,2.3-7.2,2.3H305v14c0,1.3-0.4,2.4-1.2,3.3s-1.8,1.3-3.1,1.3c-1.2,0-2.3-0.4-3-1.3c-0.8-0.9-1.2-1.9-1.2-3.2V61.5
+			c0-1.3,0.4-2.4,1.3-3.2c0.9-0.9,1.9-1.3,3.2-1.3H318C320.5,57,322.9,57.8,325.2,59.3z M320.8,80.8c0.9-0.8,1.7-1.8,2.2-3.1
+			c0.6-1.3,0.9-2.6,0.9-4c0-1.4-0.3-2.7-0.9-4c-0.6-1.3-1.3-2.3-2.2-3c-0.9-0.8-1.8-1.2-2.8-1.2H305V82H318
+			C318.9,82,319.8,81.6,320.8,80.8z"/>
+		<path class="st0" d="M373.9,90.4c-0.9,0.7-1.9,1.1-3.1,1.1h-24.5c0.6,2.9,2,5.2,4.1,6.9c2.1,1.7,4.5,2.5,7.2,2.5
+			c2,0,3.6-0.2,4.7-0.6c1.1-0.4,2-0.8,2.7-1.2c0.7-0.4,1.1-0.7,1.4-0.9c0.9-0.4,1.7-0.7,2.5-0.7c1,0,1.9,0.4,2.7,1.1
+			c0.7,0.7,1.1,1.6,1.1,2.6c0,1.3-0.7,2.5-2.1,3.6c-1.4,1.1-3.2,2.1-5.6,2.9c-2.3,0.8-4.7,1.2-7,1.2c-4.2,0-7.8-0.9-10.9-2.6
+			c-3.1-1.7-5.5-4.1-7.2-7.2c-1.7-3-2.5-6.5-2.5-10.3c0-4.3,0.9-8,2.7-11.2c1.8-3.2,4.1-5.7,7-7.4c2.9-1.7,6-2.6,9.3-2.6
+			c3.3,0,6.3,0.9,9.2,2.7c2.9,1.8,5.2,4.2,6.9,7.2c1.7,3,2.6,6.3,2.6,9.8C375.2,88.7,374.8,89.6,373.9,90.4z M346.4,84.1h19.4v-0.5
+			c-0.2-2.1-1.2-3.9-3-5.3c-1.8-1.5-3.9-2.2-6.2-2.2C350.8,76,347.4,78.7,346.4,84.1z"/>
+		<path class="st0" d="M407.5,69.8c1,0.8,1.5,1.8,1.5,3c0,1.5-0.4,2.7-1.2,3.4c-0.8,0.8-1.7,1.1-2.8,1.1c-0.7,0-1.6-0.2-2.5-0.5
+			c-0.1,0-0.5-0.1-1-0.3c-0.5-0.1-1.1-0.2-1.7-0.2c-1.3,0-2.5,0.4-3.7,1.2c-1.2,0.8-2.2,2-2.9,3.6c-0.8,1.6-1.1,3.5-1.1,5.7v17.6
+			c0,1.3-0.4,2.4-1.3,3.2c-0.8,0.9-1.9,1.3-3.2,1.3c-1.3,0-2.3-0.4-3.2-1.3c-0.8-0.9-1.3-1.9-1.3-3.2V73.8c0-1.3,0.4-2.4,1.3-3.2
+			c0.8-0.9,1.9-1.3,3.2-1.3c1.3,0,2.3,0.4,3.2,1.3c0.8,0.9,1.3,1.9,1.3,3.2v1c1.1-2,2.8-3.6,4.9-4.6c2.1-1.1,4.4-1.6,6.8-1.6
+			C405.2,68.6,406.5,69,407.5,69.8z"/>
+		<path class="st0" d="M432.3,63.3c-0.7,1-1,2-1,3.2V70h6.8c1.2,0,2.2,0.4,3,1.2c0.8,0.8,1.2,1.7,1.2,2.9c0,1.2-0.4,2.2-1.2,2.9
+			c-0.8,0.8-1.8,1.2-3,1.2h-6.8v26.3c0,1.3-0.4,2.4-1.3,3.2c-0.8,0.9-1.9,1.3-3.2,1.3s-2.4-0.4-3.2-1.3c-0.8-0.9-1.3-1.9-1.3-3.2
+			V78.2h-4c-1.2,0-2.2-0.4-3-1.2c-0.8-0.8-1.2-1.7-1.2-2.9c0-1.2,0.4-2.2,1.2-2.9c0.8-0.8,1.8-1.2,3-1.2h4v-3.4
+			c0-3.7,1.3-6.7,3.8-9.1c2.6-2.3,6.2-3.5,11-3.5c1.8,0,3.4,0.4,4.8,1.1c1.4,0.7,2,1.9,2,3.4c0,1.2-0.3,2.2-1,2.9
+			c-0.7,0.7-1.5,1.1-2.5,1.1c-0.2,0-0.5,0-0.8-0.1c-0.3,0-0.6-0.1-0.9-0.1c-1.2-0.3-2.2-0.4-3-0.4C434.1,61.9,433,62.4,432.3,63.3z"
+			/>
+	</g>
+	<g>
+		<path class="st1" d="M138.7,71.3l-24.2,39.3l0,0c-1.9,3.2-5.4,5.3-9.4,5.3c-6.1,0-11-4.9-11-11c0-4,2.1-7.5,5.3-9.4l0,0
+			L138.7,71.3"/>
+		<g>
+			<path class="st2" d="M136.1,67.1l-11.7,7.2c-7.3-6.6-17-10.6-27.6-10.6C74,63.8,55.5,82.2,55.5,105c0,2.7,0.3,5.3,0.7,7.8l0,0
+				c0,0.1,0,0.3,0,0.4c0,1.6-1.3,2.8-2.8,2.8c-1.4,0-2.6-1-2.8-2.4l0,0c-0.4-2.8-0.7-5.7-0.7-8.6c0-30.4,24.6-55,55-55
+				c14,0,26.7,5.2,36.4,13.8L136.1,67.1z M146.2,68.6l-3.3,5.3l-8.6,14c2.4,5.2,3.7,11,3.7,17.1c0,0,0,0,0,0c0,6.1,4.9,11,11,11
+				c6.1,0,11-4.9,11-11h0C160,91,154.8,78.3,146.2,68.6z"/>
+		</g>
+	</g>
+</g>
+</svg>
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc
new file mode 100644
index 000000000..41f74b803
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc
@@ -0,0 +1,117 @@
+#include "early_stopping.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <map>
+#include <numeric>
+#include <string>
+
+namespace mlperf {
+
+namespace loadgen {
+
+double lbeta(int64_t x, int64_t y) {
+  return std::lgamma(x) + std::lgamma(y) - std::lgamma(x + y);
+}
+
+// The Gaussian Hypergeometric function specialized for a = 1.
+// Based on http://dlmf.nist.gov/15.2.E1.
+// Converges if c > 0 and (b <= 0 or x < 1).
+// TODO(ckstanton): http://dlmf.nist.gov/15.2.E1 says there are transformations
+// to replace x with with a value less than 0.5, for faster convergence.
+// Presently, this function can take up to 200,000 iterations to converge.
+double hypergeometric_2F1_A1(int64_t b, int64_t c, double x) {
+  // TODO(ckstanton): Is there a more principled way to pick kTolerance?
+  constexpr double kTolerance = 1.0 / (1LL << 33);
+  double term = 1.0;
+  double result = 1.0;
+  for (int64_t i = 0; std::abs(term) > kTolerance; ++i) {
+    term *= (b + i) * x / (c + i);
+    result += term;
+  }
+  return result;
+}
+
+// BetaRegularized[x, a, b] =
+// Beta[x, a, b]/Beta[a, b] =
+// x^a/a Hypergeometric2F1[a, 1-b, 1+a, x]/Beta[a, b] =
+// (http://dlmf.nist.gov/15.8.E1.)
+// x^a/a (1-x)^(b-1) Hypergeometric2F1[1, 1-b, 1+a, x/(x-1)]/Beta[a, b]
+double beta_regularized(double x, int64_t a, int64_t b) {
+  return std::exp(a * std::log(x) + (b - 1) * std::log(1 - x) - lbeta(a, b)) /
+         a * hypergeometric_2F1_A1(1 - b, 1 + a, x / (x - 1));
+}
+
+// Compute the odds of t or fewer overlatency queries in h + t total queries.
+// The binomial distribution is the discrete probability distribution for
+// independent boolean experiments. The CDF of the binomial distribution is:
+// BetaRegularized[q, n - k, 1 + k] where 1 - q is the probability of an event
+// per experiment, n is the total number of experiments, and k is the number of
+// events. An even in our case is an overlatency query, so q = p - d, n = h + t,
+// and k = t.
+// Sum[Binomial[h + t, x] (p - d)^(h + t - x) (1 - p + d)^x, {x, 0, t}] =
+// BetaRegularized[p - d, h, 1 + t]
+double odds(int64_t h, int64_t t, double p, double d) {
+  return beta_regularized(p - d, h, 1 + t);
+}
+
+// Binary search to find the minimum value h such that:
+// odds(h, t, p, d) <= 1 - c on the range [min_h, max_h] given t, p, d, and c.
+int64_t find_min_passing(int64_t min_h, int64_t max_h, int64_t t, double p,
+                         double d, double c) {
+  int64_t count = max_h - min_h;
+  while (count > 0) {
+    int64_t step = count / 2;
+    int64_t h = min_h + step;
+    double prob = odds(h, t, p, d);
+    if (prob < 1 - c) {
+      count = step;
+    } else {
+      min_h = h + 1;
+      count -= step + 1;
+    }
+  }
+  return min_h;
+}
+
+int64_t MinPassingQueriesFinder::operator()(int64_t t, double p, double d,
+                                            double c) {
+  // Given t, p, d, and c, return the minimum h such that odds(h, t, p, d) <= 1
+  // - c
+
+  auto &cache = caches_[std::make_tuple(p, d, c)];
+  auto it = cache.lower_bound(t);
+  if (it != cache.end() && it->first == t) {
+    return it->second;
+  }
+
+  int64_t x0 = -1;
+  int64_t y0 = 0;
+  int64_t x1 = 0;
+  int64_t y1 = std::ceil(std::log(1 - c) / std::log(p - d));
+
+  if (it != cache.begin()) {
+    --it;
+    x1 = it->first;
+    y1 = it->second;
+  }
+
+  if (it != cache.begin()) {
+    --it;
+    x0 = it->first;
+    y0 = it->second;
+  }
+
+  double min_slope = (p - d) / (1 - p + d);
+  double max_slope = (y1 - y0) * (x1 - x0);
+  int64_t min_h = (t - x1) * min_slope + y1;
+  int64_t max_h = (t - x1) * max_slope + y1 + 1;
+  int64_t h = find_min_passing(min_h, max_h, t, p, d, c);
+  cache[t] = h;
+  return h;
+}
+
+}  // namespace loadgen
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h
new file mode 100644
index 000000000..49b7a901e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h
@@ -0,0 +1,27 @@
+#ifndef MLPERF_LOADGEN_EARLYSTOPPING_H_
+#define MLPERF_LOADGEN_EARLYSTOPPING_H_
+
+#include <cstdint>
+#include <map>
+
+namespace mlperf {
+namespace loadgen {
+
+class MinPassingQueriesFinder {
+ public:
+  int64_t operator()(int64_t t, double p, double d, double c);
+
+ private:
+  // Memoize prior computations results and use them to bound the binary search
+  // range for subsequent computations.
+
+  // TODO: Is there something more efficient to use besides std::map for
+  // caches_?
+  std::map<std::tuple<double, double, double>, std::map<int64_t, int64_t>>
+      caches_;
+};
+
+}  // namespace loadgen
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_EARLYSTOPPING_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc
new file mode 100644
index 000000000..75fdc9519
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc
@@ -0,0 +1,98 @@
+// DO NOT EDIT: Autogenerated by version_generator.py.
+
+#include <string>
+
+namespace mlperf {
+
+const std::string& LoadgenVersion() {
+  static const std::string str = "4.1";
+  return str;
+}
+
+const std::string& LoadgenBuildDateLocal() {
+  static const std::string str = "2024-10-18T23:12:51.002440";
+  return str;
+}
+
+const std::string& LoadgenBuildDateUtc() {
+  static const std::string str = "2024-10-19T06:12:51.002446";
+  return str;
+}
+
+const std::string& LoadgenGitRevision() {
+  static const std::string str = "f5c8f17583";
+  return str;
+}
+
+const std::string& LoadgenGitCommitDate() {
+  static const std::string str = "2024-10-08T18:30:16+01:00";
+  return str;
+}
+
+const std::string& LoadgenGitStatus() {
+  static const std::string str = R"LGVG_RSLD()LGVG_RSLD";
+  return str;
+}
+
+const std::string& LoadgenGitLog() {
+  static const std::string str = R"LGVG_RSLD(f5c8f1758374aeaba26b2e84d31690111cfdf054 Fix bug: Loadgen ignoring token latency targets in user conf (#1874)
+976bb1ad9c7946be79507f3ff67955c27426af52 Set correct remote repo (#1871)
+41fa8aadd1ba0ecc97f6a519d8b42b04278e5f24 Add format files github action (#1682)
+518b454fd8647bfbd23a074e875e87353f33393e Tflite tpu (#1449)
+e0fdec1c7a75c98cfc194f13d62ac4388d419c8a Fix link in GettingStarted.ipynb (#1512)
+92bd8198d15411d7fb7d7c27f8904bc5a0bcfe7a Fix warning in the submission checker (#1808)
+224cfbf5c0e82cae6d48620025b7e1258ae3666a Fix typo in reference datatype (#1851)
+3ef1249b7f50a250c02c568342e0aea6638fc5a7 Fix docs (#1853)
+a0874c100c54cbc54fb743ac8bf9fb5fadc64135 Update build_wheels.yml (#1758)
+6eff09986e337ccf03f675c9f244d8ee93644e16 Extend the final report generation script to output a json file of results (#1825)
+54f3f93a73cc8ca5e3319ad87fb325e510574f56 Add binding for server_num_issue_query_threads parameter (#1862)
+c4d0b3ea98e6fe7252e50cb573f0d523da7979df Update docs: SCC24,  fix broken redirect  (#1843)
+7d2f0c41e5cd79c9178702867392e38f57953338 Update DLRM readme (#1811)
+cf5fddc5d0746bf3820eb0ab7294bbf709d788ab Enable systems to be marked as power only (#1850)
+81c2de69de4af90410cd1ba000fc5bd731bf6dee Documentation updates (#1821)
+73b02798219c794a735a7f2ddabbc3df9173352d Fix error with generate_final_report.py when the input CSV file is empty (#1827))LGVG_RSLD";
+  return str;
+}
+
+const std::string& LoadgenSha1OfFiles() {
+  static const std::string str = R"LGVG_RSLD(012aad77e5206c89d50718c46c119d1f3cb056b2 /.clang-format
+e173f4513f3c5dac1f0bea1473bb0a058e23f190 /=42
+d5274ff0b56e8d3cdb273174628a4461fca6f02a /CMakeLists.txt
+20a55bb946c2c0bbb564ced2af1e48efd096b3a8 /README.md
+5f6c6a784e9cd6995db47f9b9f70b1769909c9d8 /README_BUILD.md
+01f9ae9887f50bc030dc6107e740f40c43ca388f /README_FAQ.md
+32181da9e161c285f8fe46ddaa49e6cba2f9f918 /bindings/c_api.cc
+91f58bd79b83b278f3240174a9af747fc38aff74 /bindings/c_api.h
+ea4c89decad19eaf3217bfa2fb757d3b83a561d6 /bindings/python_api.cc
+53dba8ad4272190ceb6335c12fd25e53dc02a8cb /diagram_network_submission.png
+84c2f79309b237cef652aef6a187ba8e875a3952 /diagram_submission.png
+0cd7b546a389deac73f7955cd39255ed76557d62 /early_stopping.cc
+158fcae6a5f47e82150d6416fa1f7bcef37e77fe /early_stopping.h
+126e952d00f4ea9efd12405fb209aa3ed585e4b2 /issue_query_controller.cc
+923d9d5cdf598e3ec33d7a1110a31f7e11527ec7 /issue_query_controller.h
+6650091ba7a918f343b06eb7a5aa540eae87275f /loadgen.cc
+e00fdc6dbc85a8c9a8485dbcbfe2944f81251c4e /loadgen.h
+47f748307536f80cfc606947b440dd732afc2637 /loadgen_integration_diagram.svg
+197efc96d178e5d33a750d07fa7b2966417506ea /logging.cc
+ddb961df7bcc145bcd7cce8c21f7cf075350dcbe /logging.h
+ca17720f9c8246e821331946d893e830fc88f8bd /pyproject.toml
+13ad6d842200cb161d6927eb74a3fafd79c46c75 /query_dispatch_library.h
+e9187c8612bbdc972305b789feb6e15c26e96cfe /query_sample.h
+8323a2225be1dff31f08ecc86b76eb3de06568bc /query_sample_library.h
+a5ff7e77caa6e9e22ada90f0de0c865c987bf167 /requirements.txt
+34e2d2a44324cb07c884f92146ecbb8ef9d704e2 /results.cc
+d82500c326c2de83db411f1146882aa4692b419c /results.h
+13c49b028b22749b5f3c44f3d9bb489e8c0574e9 /setup.py
+18d4809589dae33317d88d9beeb5491a6e1ccdec /system_under_test.h
+c15c3e150030089a8d634bd2ad6d4b644002e613 /test_settings.h
+e21febd60f9b5bedd1fc81bb990f09c34b32043c /test_settings_internal.cc
+f1d5335b53ca610c30e0edc5d07999a27b5b4b9a /test_settings_internal.h
+3df8fdabf6eaea4697cf25d1dcb89cae88e36efd /utils.cc
+40775e32d619ea6356826ae5ea4174c7911f6894 /utils.h
+cbec2a5f98f9786c8c3d8b06b3d12df0b6550fa0 /version.cc
+9d574baa64424e9c708fcfedd3dbb0b518a65fcc /version.h
+eea9b9cb1a06cd1abe1bbdaee82f9af31527fedb /version_generator.py)LGVG_RSLD";
+  return str;
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc
new file mode 100644
index 000000000..c1abea9d1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc
@@ -0,0 +1,552 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Implements IssueQueryController and other helper classes for
+/// query issuing.
+
+#include "issue_query_controller.h"
+
+#include <sstream>
+
+namespace mlperf {
+
+void RegisterIssueQueryThread() {
+  loadgen::IssueQueryController::GetInstance().RegisterThread();
+}
+
+/// \brief Loadgen implementation details.
+namespace loadgen {
+
+QueryMetadata::QueryMetadata(
+    const std::vector<QuerySampleIndex>& query_sample_indices,
+    std::chrono::nanoseconds scheduled_delta,
+    ResponseDelegate* response_delegate, SequenceGen* sequence_gen)
+    : scheduled_delta(scheduled_delta),
+      response_delegate(response_delegate),
+      sequence_id(sequence_gen->NextQueryId()),
+      wait_count_(query_sample_indices.size()) {
+  samples_.reserve(query_sample_indices.size());
+  for (QuerySampleIndex qsi : query_sample_indices) {
+    samples_.push_back({this, sequence_gen->NextSampleId(), qsi,
+                        sequence_gen->NextAccLogRng()});
+  }
+  query_to_send.reserve(query_sample_indices.size());
+  for (auto& s : samples_) {
+    query_to_send.push_back({reinterpret_cast<ResponseId>(&s), s.sample_index});
+  }
+}
+
+QueryMetadata::QueryMetadata(QueryMetadata&& src)
+    : query_to_send(std::move(src.query_to_send)),
+      scheduled_delta(src.scheduled_delta),
+      response_delegate(src.response_delegate),
+      sequence_id(src.sequence_id),
+      wait_count_(src.samples_.size()),
+      samples_(std::move(src.samples_)) {
+  // The move constructor should only be called while generating a
+  // vector of QueryMetadata, before it's been used.
+  // Assert that wait_count_ is in its initial state.
+  assert(src.wait_count_.load() == samples_.size());
+  // Update the "parent" of each sample to be this query; the old query
+  // address will no longer be valid.
+  // TODO: Only set up the sample parenting once after all the queries have
+  //       been created, rather than re-parenting on move here.
+  for (size_t i = 0; i < samples_.size(); i++) {
+    SampleMetadata* s = &samples_[i];
+    s->query_metadata = this;
+    query_to_send[i].id = reinterpret_cast<ResponseId>(s);
+  }
+}
+
+void QueryMetadata::NotifyOneSampleCompleted(PerfClock::time_point timestamp) {
+  size_t old_count = wait_count_.fetch_sub(1, std::memory_order_relaxed);
+  if (old_count == 1) {
+    all_samples_done_time = timestamp;
+    all_samples_done_.set_value();
+    response_delegate->QueryComplete();
+  }
+}
+
+void QueryMetadata::WaitForAllSamplesCompleted() {
+  all_samples_done_.get_future().wait();
+}
+
+PerfClock::time_point QueryMetadata::WaitForAllSamplesCompletedWithTimestamp() {
+  all_samples_done_.get_future().wait();
+  return all_samples_done_time;
+}
+
+// When server_coalesce_queries is set to true in Server scenario, we
+// sometimes coalesce multiple queries into one query. This is done by moving
+// the other query's sample into current query, while maintaining their
+// original scheduled_time.
+void QueryMetadata::CoalesceQueries(QueryMetadata* queries, size_t first,
+                                    size_t last, size_t stride) {
+  // Copy sample data over to current query, boldly assuming that each query
+  // only has one sample.
+  query_to_send.reserve((last - first) / stride +
+                        2);  // Extra one for the current query.
+  for (size_t i = first; i <= last; i += stride) {
+    auto& q = queries[i];
+    auto& s = q.samples_[0];
+    query_to_send.push_back({reinterpret_cast<ResponseId>(&s), s.sample_index});
+    q.scheduled_time = scheduled_time + q.scheduled_delta - scheduled_delta;
+    q.issued_start_time = issued_start_time;
+  }
+}
+
+void QueryMetadata::Decoalesce() { query_to_send.resize(1); }
+
+/// \brief A base template that should never be used since each scenario has
+/// its own specialization.
+template <TestScenario scenario>
+struct QueryScheduler {
+  static_assert(scenario != scenario, "Unhandled TestScenario");
+};
+
+/// \brief Schedules queries for issuance in the single stream scenario.
+template <>
+struct QueryScheduler<TestScenario::SingleStream> {
+  QueryScheduler(const TestSettingsInternal& /*settings*/,
+                 const PerfClock::time_point) {}
+
+  PerfClock::time_point Wait(QueryMetadata* next_query) {
+    auto tracer = MakeScopedTracer([](AsyncTrace& trace) { trace("Waiting"); });
+    if (prev_query != nullptr) {
+      prev_query->WaitForAllSamplesCompleted();
+    }
+    prev_query = next_query;
+
+    auto now = PerfClock::now();
+    next_query->scheduled_time = now;
+    next_query->issued_start_time = now;
+    return now;
+  }
+
+  QueryMetadata* prev_query = nullptr;
+};
+
+/// \brief Schedules queries for issuance in the multi stream scenario.
+template <>
+struct QueryScheduler<TestScenario::MultiStream> {
+  QueryScheduler(const TestSettingsInternal& /*settings*/,
+                 const PerfClock::time_point) {}
+
+  PerfClock::time_point Wait(QueryMetadata* next_query) {
+    auto tracer = MakeScopedTracer([](AsyncTrace& trace) { trace("Waiting"); });
+    if (prev_query != nullptr) {
+      prev_query->WaitForAllSamplesCompleted();
+    }
+    prev_query = next_query;
+
+    auto now = PerfClock::now();
+    next_query->scheduled_time = now;
+    next_query->issued_start_time = now;
+    return now;
+  }
+
+  QueryMetadata* prev_query = nullptr;
+};
+
+/// \brief Schedules queries for issuance in the server scenario.
+template <>
+struct QueryScheduler<TestScenario::Server> {
+  QueryScheduler(const TestSettingsInternal& /*settings*/,
+                 const PerfClock::time_point start)
+      : start(start) {}
+
+  PerfClock::time_point Wait(QueryMetadata* next_query) {
+    auto tracer =
+        MakeScopedTracer([](AsyncTrace& trace) { trace("Scheduling"); });
+
+    auto scheduled_time = start + next_query->scheduled_delta;
+    next_query->scheduled_time = scheduled_time;
+
+    auto now = PerfClock::now();
+    if (now < scheduled_time) {
+      std::this_thread::sleep_until(scheduled_time);
+      now = PerfClock::now();
+    }
+    next_query->issued_start_time = now;
+    return now;
+  }
+
+  const PerfClock::time_point start;
+};
+
+/// \brief Schedules queries for issuance in the offline scenario.
+template <>
+struct QueryScheduler<TestScenario::Offline> {
+  QueryScheduler(const TestSettingsInternal& /*settings*/,
+                 const PerfClock::time_point start)
+      : start(start) {}
+
+  PerfClock::time_point Wait(QueryMetadata* next_query) {
+    next_query->scheduled_time = start;
+    auto now = PerfClock::now();
+    next_query->issued_start_time = now;
+    return now;
+  }
+
+  const PerfClock::time_point start;
+};
+
+IssueQueryController& IssueQueryController::GetInstance() {
+  // The singleton.
+  static IssueQueryController instance;
+  return instance;
+}
+
+void IssueQueryController::RegisterThread() {
+  // Push this thread to thread queue.
+  auto thread_id = std::this_thread::get_id();
+  size_t thread_idx{0};
+  {
+    std::lock_guard<std::mutex> lock(mtx);
+    thread_idx = thread_ids.size();
+    thread_ids.emplace_back(thread_id);
+  }
+
+  LogDetail([thread_id, thread_idx](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Registered IssueQueryThread[" << thread_idx
+       << "]. thread ID : " << std::hash<std::thread::id>()(thread_id);
+    MLPERF_LOG(detail, "generic_message", ss.str());
+#else
+    detail("Registered IssueQueryThread[" + std::to_string(thread_idx) +
+               "]. thread ID : ",
+           std::to_string(std::hash<std::thread::id>()(thread_id)));
+#endif
+  });
+
+  // Start test.
+  while (true) {
+    // Wait until the main thread signals a start or the end.
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      cond_var.wait(lock, [this]() { return issuing || end_test; });
+      // The test has ended.
+      if (end_test) {
+        break;
+      }
+    }
+
+    // Start issuing queries.
+    if (thread_idx <= num_threads) {
+      IssueQueriesInternal<TestScenario::Server, true>(num_threads, thread_idx);
+      {
+        std::lock_guard<std::mutex> lock(mtx);
+        thread_complete[thread_idx] = true;
+      }
+      cond_var.notify_all();
+    }
+
+    // Wait until all issue threads complete.
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      cond_var.wait(lock, [this]() { return !issuing; });
+    }
+  }
+}
+
+void IssueQueryController::SetNumThreads(size_t n) {
+  // Try waiting for IssueQueryThreads() to registered themselves.
+  std::unique_lock<std::mutex> lock(mtx);
+  const std::chrono::seconds timeout(10);
+  num_threads = n;
+  cond_var.wait_for(lock, timeout,
+                    [this]() { return thread_ids.size() >= num_threads; });
+  // If the number of registered threads do not match the settings, report an
+  // error.
+  if (num_threads != thread_ids.size()) {
+    LogDetail([this](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "Mismatch between settings and number of registered "
+         << "IssueQueryThreads! settings.server_num_issue_query_threads = "
+         << num_threads << " but " << thread_ids.size()
+         << " threads registered.";
+      MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+      detail.Error(
+          "Mismatch between settings and number of registered ",
+          "IssueQueryThreads! settings.server_num_issue_query_threads = ",
+          num_threads, " but ", thread_ids.size(), " threads registered.");
+#endif
+    });
+  }
+}
+
+template <TestScenario scenario>
+void IssueQueryController::StartIssueQueries(IssueQueryState* s) {
+  // Get the state.
+  state = s;
+  state->start_for_power = std::chrono::system_clock::now();
+  state->start_time = PerfClock::now();
+
+  if (scenario != TestScenario::Server || num_threads == 0) {
+    // Usually, we just use the same thread to issue queries.
+    IssueQueriesInternal<scenario, false>(1, 0);
+  } else {
+    // If server_num_issue_query_threads is non-zero, issue queries on the
+    // registered threads.
+    // Tell all threads to start issuing queries.
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      issuing = true;
+      thread_complete.assign(num_threads, false);
+    }
+    cond_var.notify_all();
+    // Wait until all issue threads complete.
+    {
+      std::unique_lock<std::mutex> lock(mtx);
+      cond_var.wait(lock, [this]() {
+        return std::all_of(thread_complete.begin(), thread_complete.end(),
+                           [](bool in) { return in; });
+      });
+      issuing = false;
+    }
+    cond_var.notify_all();
+  }
+}
+
+template void IssueQueryController::StartIssueQueries<
+    TestScenario::MultiStream>(IssueQueryState* s);
+template void IssueQueryController::StartIssueQueries<TestScenario::Offline>(
+    IssueQueryState* s);
+template void IssueQueryController::StartIssueQueries<TestScenario::Server>(
+    IssueQueryState* s);
+template void IssueQueryController::StartIssueQueries<
+    TestScenario::SingleStream>(IssueQueryState* s);
+
+void IssueQueryController::EndThreads() {
+  // Tell all the issue threads to end.
+  {
+    std::lock_guard<std::mutex> lock(mtx);
+    end_test = true;
+  }
+  cond_var.notify_all();
+}
+
+template <TestScenario scenario, bool multi_thread>
+void IssueQueryController::IssueQueriesInternal(size_t query_stride,
+                                                size_t thread_idx) {
+  // Get all the needed information.
+  auto sut = state->sut;
+  auto& queries = *state->queries;
+  auto& response_logger = *state->response_delegate;
+
+  // Some book-keeping about the number of queries issued.
+  size_t queries_issued = 0;
+  size_t queries_issued_per_iter = 0;
+  size_t queries_count = queries.size();
+
+  // Calculate the min/max queries per issue thread.
+  const auto& settings = *state->settings;
+  const size_t min_query_count = settings.min_query_count;
+  const size_t min_query_count_for_thread =
+      (thread_idx < (min_query_count % query_stride))
+          ? (min_query_count / query_stride + 1)
+          : (min_query_count / query_stride);
+  const size_t max_query_count = settings.max_query_count;
+  const size_t max_query_count_for_thread =
+      (thread_idx < (max_query_count % query_stride))
+          ? (max_query_count / query_stride + 1)
+          : (max_query_count / query_stride);
+
+  // Create query scheduler.
+  const auto start = state->start_time;
+  QueryScheduler<scenario> query_scheduler(settings, start);
+  auto last_now = start;
+
+  // We can never run out of generated queries in the server scenario,
+  // since the duration depends on the scheduled query time and not
+  // the actual issue time.
+  bool ran_out_of_generated_queries = scenario != TestScenario::Server;
+  // This is equal to the sum of numbers of samples issued.
+  size_t expected_latencies = 0;
+
+  for (size_t queries_idx = thread_idx; queries_idx < queries_count;
+       queries_idx += query_stride) {
+    queries_issued_per_iter = 0;
+    auto& query = queries[queries_idx];
+    auto tracer1 =
+        MakeScopedTracer([](AsyncTrace& trace) { trace("SampleLoop"); });
+    last_now = query_scheduler.Wait(&query);
+
+    // If in Server scenario and server_coalesce_queries is enabled, multiple
+    // queries are coalesed into one big query if the current time has already
+    // passed the scheduled time of multiple queries.
+    if (scenario == TestScenario::Server &&
+        settings.requested.server_coalesce_queries) {
+      auto current_query_idx = queries_idx;
+      for (; queries_idx + query_stride < queries_count;
+           queries_idx += query_stride) {
+        auto next_scheduled_time =
+            start + queries[queries_idx + query_stride].scheduled_delta;
+        // If current time hasn't reached the next query's scheduled time yet,
+        // don't include next query.
+        if (last_now < next_scheduled_time) {
+          break;
+        }
+        queries_issued_per_iter++;
+      }
+      if (queries_idx > current_query_idx) {
+        // Coalesced all the pass due queries.
+        query.CoalesceQueries(queries.data(), current_query_idx + query_stride,
+                              queries_idx, query_stride);
+      }
+    }
+
+    // Issue the query to the SUT.
+    {
+      auto tracer3 =
+          MakeScopedTracer([](AsyncTrace& trace) { trace("IssueQuery"); });
+      sut->IssueQuery(query.query_to_send);
+    }
+
+    // Increment the counter.
+    expected_latencies += query.query_to_send.size();
+    queries_issued_per_iter++;
+    queries_issued += queries_issued_per_iter;
+
+    if (scenario == TestScenario::Server &&
+        settings.requested.server_coalesce_queries) {
+      // Set the query back to its clean state.
+      query.Decoalesce();
+    }
+
+    if (state->mode == TestMode::AccuracyOnly) {
+      // TODO: Rate limit in accuracy mode so accuracy mode works even
+      //       if the expected/target performance is way off.
+      continue;
+    }
+
+    auto duration = (last_now - start);
+    if (scenario == TestScenario::Server) {
+      if (settings.max_async_queries != 0) {
+        // Checks if there are too many outstanding queries.
+        size_t queries_issued_total{0};
+        if (multi_thread) {
+          // To check actual number of async queries in multi-thread case,
+          // we would have to combine the number of queries_issued from all
+          // issue threads.
+          {
+            std::lock_guard<std::mutex> lock(state->mtx);
+            state->queries_issued += queries_issued_per_iter;
+            queries_issued_total = state->queries_issued;
+          }
+        } else {
+          queries_issued_total = queries_issued;
+        }
+        size_t queries_outstanding =
+            queries_issued_total -
+            response_logger.queries_completed.load(std::memory_order_relaxed);
+        if (queries_outstanding > settings.max_async_queries) {
+          LogDetail([thread_idx, queries_issued_total,
+                     queries_outstanding](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+            std::stringstream ss;
+            ss << "IssueQueryThread " << thread_idx
+               << " Ending early: Too many outstanding queries." << " issued "
+               << queries_issued_total << " outstanding "
+               << queries_outstanding;
+            MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+            detail.Error("IssueQueryThread ", std::to_string(thread_idx),
+                         " Ending early: Too many outstanding queries.",
+                         "issued", std::to_string(queries_issued_total),
+                         "outstanding", std::to_string(queries_outstanding));
+#endif
+          });
+          break;
+        }
+      }
+    } else {
+      // Checks if we end normally.
+      if (queries_issued >= min_query_count_for_thread &&
+          duration >= settings.target_duration) {
+        LogDetail([thread_idx](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+          MLPERF_LOG(
+              detail, "generic_message",
+              "Ending naturally: Minimum query count and test duration met.");
+#else
+          detail(
+              " Ending naturally: Minimum query count and test duration met.");
+#endif
+        });
+        ran_out_of_generated_queries = false;
+        break;
+      }
+    }
+
+    // Checks if we have exceeded max_query_count for this thread.
+    if (settings.max_query_count != 0 &&
+        queries_issued >= max_query_count_for_thread) {
+      LogDetail([thread_idx, queries_issued](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        std::stringstream ss;
+        ss << "IssueQueryThread " << thread_idx
+           << " Ending early: Max query count reached." << " query_count "
+           << queries_issued;
+        MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+        detail.Error("IssueQueryThread ", std::to_string(thread_idx),
+                     " Ending early: Max query count reached.", "query_count",
+                     std::to_string(queries_issued));
+#endif
+      });
+      ran_out_of_generated_queries = false;
+      break;
+    }
+
+    // Checks if we have exceeded max_duration.
+    if (settings.max_duration.count() != 0 &&
+        duration > settings.max_duration) {
+      LogDetail([thread_idx, duration](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        std::stringstream ss;
+        ss << "IssueQueryThread " << thread_idx
+           << " Ending early: Max test duration reached." << " duration_ns "
+           << duration.count();
+        MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+        detail.Error("IssueQueryThread ", std::to_string(thread_idx),
+                     " Ending early: Max test duration reached.", "duration_ns",
+                     std::to_string(duration.count()));
+#endif
+      });
+      ran_out_of_generated_queries = false;
+      break;
+    }
+  }
+
+  // Combine the issuing statistics from multiple issue threads.
+  {
+    std::lock_guard<std::mutex> lock(state->mtx);
+    state->ran_out_of_generated_queries |= ran_out_of_generated_queries;
+    // In Server scenario and when max_async_queries != 0, we would have set
+    // state->queries_issued when we check max_async_queries in the loop.
+    if (!(scenario == TestScenario::Server && settings.max_async_queries != 0 &&
+          multi_thread)) {
+      state->queries_issued += queries_issued;
+    }
+    state->expected_latencies += expected_latencies;
+  }
+}
+
+}  // namespace loadgen
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h
new file mode 100644
index 000000000..5668c574e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h
@@ -0,0 +1,215 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Declare IssueQueryController and other helper classes for
+/// query issuing.
+
+#ifndef MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
+#define MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
+
+#include <stdint.h>
+
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <random>
+#include <thread>
+#include <vector>
+
+#include "loadgen.h"
+#include "logging.h"
+#include "query_sample.h"
+#include "system_under_test.h"
+#include "test_settings_internal.h"
+#include "utils.h"
+
+namespace mlperf {
+
+namespace loadgen {
+
+struct SampleMetadata;
+class QueryMetadata;
+
+/// \brief Every query and sample within a call to StartTest gets a unique
+/// sequence id for easy cross reference, and a random number which is used to
+/// determine accuracy logging when it is enabled.
+struct SequenceGen {
+  uint64_t NextQueryId() { return query_id++; }
+  uint64_t NextSampleId() { return sample_id++; }
+  uint64_t CurrentSampleId() { return sample_id; }
+  double NextAccLogRng() { return accuracy_log_dist(accuracy_log_rng); }
+  void InitAccLogRng(uint64_t accuracy_log_rng_seed) {
+    accuracy_log_rng = std::mt19937(accuracy_log_rng_seed);
+  }
+
+ private:
+  uint64_t query_id = 0;
+  uint64_t sample_id = 0;
+  std::mt19937 accuracy_log_rng;
+  std::uniform_real_distribution<double> accuracy_log_dist =
+      std::uniform_real_distribution<double>(0, 1);
+};
+
+/// \brief An interface for a particular scenario + mode to implement for
+/// extended hanlding of sample completion.
+struct ResponseDelegate {
+  virtual ~ResponseDelegate() = default;
+  virtual void SampleComplete(SampleMetadata*, QuerySampleResponse*,
+                              PerfClock::time_point,
+                              const ResponseCallback&) = 0;
+  virtual void TokenComplete(SampleMetadata*, QuerySampleResponse*,
+                             PerfClock::time_point,
+                             const ResponseCallback&) = 0;
+  virtual void QueryComplete() = 0;
+  std::atomic<size_t> queries_completed{0};
+};
+
+/// \brief Used by the loadgen to coordinate response data and completion.
+struct SampleMetadata {
+  QueryMetadata* query_metadata;
+  uint64_t sequence_id;
+  QuerySampleIndex sample_index;
+  double accuracy_log_val;
+};
+
+/// \brief Maintains data and timing info for a query and all its samples.
+class QueryMetadata {
+ public:
+  QueryMetadata(const std::vector<QuerySampleIndex>& query_sample_indices,
+                std::chrono::nanoseconds scheduled_delta,
+                ResponseDelegate* response_delegate, SequenceGen* sequence_gen);
+  QueryMetadata(QueryMetadata&& src);
+
+  void NotifyOneSampleCompleted(PerfClock::time_point timestamp);
+
+  void WaitForAllSamplesCompleted();
+
+  PerfClock::time_point WaitForAllSamplesCompletedWithTimestamp();
+
+  /// \brief Coalesce multiple queries into one query.
+  /// When server_coalesce_queries is set to true in Server scenario, we
+  /// sometimes coalesce multiple queries into one query. This is done by moving
+  /// the other query's sample into current query, while maintaining their
+  /// original scheduled_time.
+  void CoalesceQueries(QueryMetadata* queries, size_t first, size_t last,
+                       size_t stride);
+
+  /// \brief Set a coalesced query back to its original state.
+  void Decoalesce();
+
+ public:
+  std::vector<QuerySample> query_to_send;
+  const std::chrono::nanoseconds scheduled_delta;
+  ResponseDelegate* const response_delegate;
+  const uint64_t sequence_id;
+
+  // Performance information.
+
+  size_t scheduled_intervals = 0;  // Number of intervals between queries, as
+                                   // actually scheduled during the run.
+                                   // For the MultiStream scenario only.
+  PerfClock::time_point scheduled_time;
+  PerfClock::time_point issued_start_time;
+  PerfClock::time_point all_samples_done_time;
+
+ private:
+  std::atomic<size_t> wait_count_;
+  std::promise<void> all_samples_done_;
+  std::vector<SampleMetadata> samples_;
+};
+
+/// \brief A state object for communications between the controller and its
+/// caller.
+struct IssueQueryState {
+  // Information from caller to controller.
+  SystemUnderTest* sut;
+  std::vector<QueryMetadata>* queries;
+  ResponseDelegate* response_delegate;
+  const TestSettingsInternal* settings;
+  TestMode mode;
+  // Information from controller to caller.
+  std::chrono::system_clock::time_point start_for_power;
+  PerfClock::time_point start_time;
+  bool ran_out_of_generated_queries;
+  size_t queries_issued;
+  size_t expected_latencies;
+  // The lock to modify this state (in multi-thread case).
+  std::mutex mtx;
+};
+
+/// \brief Controls the query issuing part.
+/// This controller handles both the cases if the user registers or does not
+/// register IssueQueryThreads. It is implemented as a singleton, and is NOT
+/// thread-safe (i.e. users should not call StartTest() on multiple threads).
+/// It is thread-safe with regard to IssueQueryThreads.
+class IssueQueryController {
+ public:
+  /// \brief Get the controller instance singleton.
+  static IssueQueryController& GetInstance();
+
+  /// \brief Don't allow copy. This is a singleton.
+  IssueQueryController(IssueQueryController const&) = delete;
+  void operator=(IssueQueryController const&) = delete;
+
+  /// \brief Register an IssueQueryThread.
+  /// It is blocking until the entire test ends.
+  void RegisterThread();
+
+  /// \brief Set number of IssueQueryThreads and wait for thread registration.
+  /// If for any reason the number of registered threads do not match the
+  /// specified number, it prints out an error.
+  void SetNumThreads(size_t n);
+
+  /// \brief Kick off the query issuing.
+  /// The query issuing will be done on the current thread if there is no
+  /// registered IssueQueryThreads or if it is not in Server scenario.
+  template <TestScenario scenario>
+  void StartIssueQueries(IssueQueryState* s);
+
+  /// \brief Notify the IssueQueryThreads to end.
+  void EndThreads();
+
+ private:
+  /// \brief Hide constructor. This is a singleton.
+  IssueQueryController() {}
+
+  /// \brief The internal helper which actually issues queries.
+  /// This should be called by the thread(s) which issues queries.
+  template <TestScenario scenario, bool multi_thread>
+  void IssueQueriesInternal(size_t query_stride, size_t thread_idx);
+
+  /// \brief The issue query state.
+  IssueQueryState* state;
+  /// \brief Locks for communications across IssueQueryThreads and the main
+  /// thread.
+  std::mutex mtx;
+  std::condition_variable cond_var;
+  /// \brief Thread ids of the registered IssueQueryThreads.
+  std::vector<std::thread::id> thread_ids;
+  size_t num_threads{0};
+  /// \brief Whether the threads should be actively issuing queries.
+  bool issuing{false};
+  /// \brief Flags for each IssueQueryThread to mark that it is done.
+  std::vector<bool> thread_complete;
+  /// \brief Whether the threads can end now.
+  bool end_test{false};
+};
+
+}  // namespace loadgen
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc
new file mode 100644
index 000000000..42b2140de
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc
@@ -0,0 +1,1345 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "loadgen.h"
+
+#include <stdint.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <future>
+#include <iomanip>
+#include <iostream>
+#include <queue>
+#include <random>
+#include <sstream>
+#include <string>
+#include <thread>
+#include <vector>
+
+#include "early_stopping.h"
+#include "issue_query_controller.h"
+#include "logging.h"
+#include "query_sample.h"
+#include "query_sample_library.h"
+#include "results.h"
+#include "system_under_test.h"
+#include "test_settings.h"
+#include "test_settings_internal.h"
+#include "utils.h"
+#include "version.h"
+
+namespace mlperf {
+
+/// \brief Loadgen implementation details.
+namespace loadgen {
+
+/// \brief A random set of samples in the QSL that should fit in RAM when
+/// loaded together.
+struct LoadableSampleSet {
+  std::vector<QuerySampleIndex> set;
+  const size_t sample_distribution_end;  // Excludes padding in MultiStream.
+};
+
+/// \brief Generates nanoseconds from a start time to multiple end times.
+/// TODO: This isn't very useful anymore. Remove it.
+struct DurationGeneratorNs {
+  const PerfClock::time_point start;
+  int64_t delta(PerfClock::time_point end) const {
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
+        .count();
+  }
+};
+
+/// \brief ResponseDelegate implementation templated by scenario and mode.
+template <TestScenario scenario, TestMode mode>
+struct ResponseDelegateDetailed : public ResponseDelegate {
+  double accuracy_log_offset = 0.0f;
+  double accuracy_log_prob = 0.0f;
+
+  void SampleComplete(SampleMetadata* sample, QuerySampleResponse* response,
+                      PerfClock::time_point complete_begin_time,
+                      const ResponseCallback& response_cb) override {
+    // Using a raw pointer here should help us hit the std::function
+    // small buffer optimization code path when we aren't copying data.
+    // For some reason, using std::unique_ptr<std::vector> wasn't moving
+    // into the lambda; even with C++14.
+    std::vector<uint8_t>* sample_data_copy = nullptr;
+    double accuracy_log_val =
+        sample->accuracy_log_val + accuracy_log_offset < 1.0
+            ? sample->accuracy_log_val + accuracy_log_offset
+            : sample->accuracy_log_val + accuracy_log_offset - 1.0;
+    if (mode == TestMode::AccuracyOnly ||
+        accuracy_log_val <= accuracy_log_prob) {
+      // if a response_cb callback is provided, data only needs to reside on the
+      // host *after* calling it note that the callback is blocking and will
+      // likely involve a memcpy from accelerator to host
+      if (response_cb) {
+        response_cb(response);
+      }
+      // TODO: Verify accuracy with the data copied here.
+      uint8_t* src_begin = reinterpret_cast<uint8_t*>(response->data);
+      uint8_t* src_end = src_begin + response->size;
+      sample_data_copy = new std::vector<uint8_t>(src_begin, src_end);
+    }
+    int64_t n_tokens = response->n_tokens;
+    Log([sample, complete_begin_time, sample_data_copy,
+         n_tokens](AsyncLog& log) {
+      QueryMetadata* query = sample->query_metadata;
+      DurationGeneratorNs sched{query->scheduled_time};
+      if (scenario == TestScenario::Server) {
+        // Trace the server scenario as a stacked graph via counter events.
+        DurationGeneratorNs issued{query->issued_start_time};
+        log.TraceCounterEvent("Latency", query->scheduled_time, "issue_delay",
+                              sched.delta(query->issued_start_time),
+                              "issue_to_done",
+                              issued.delta(complete_begin_time));
+      }
+
+      // While visualizing overlapping samples in offline mode is not
+      // practical, sample completion is still recorded for auditing purposes.
+      log.TraceSample("Sample", sample->sequence_id, query->scheduled_time,
+                      complete_begin_time, "sample_seq", sample->sequence_id,
+                      "query_seq", query->sequence_id, "sample_idx",
+                      sample->sample_index, "issue_start_ns",
+                      sched.delta(query->issued_start_time), "complete_ns",
+                      sched.delta(complete_begin_time));
+
+      if (sample_data_copy) {
+        log.LogAccuracy(sample->sequence_id, sample->sample_index,
+                        LogBinaryAsHexString{sample_data_copy}, n_tokens);
+        delete sample_data_copy;
+      }
+
+      // Record the latency at the end, since it will unblock the issuing
+      // thread and potentially destroy the metadata being used above.
+      QuerySampleLatency latency = sched.delta(complete_begin_time);
+      log.RecordSampleCompletion(sample->sequence_id, complete_begin_time,
+                                 latency, n_tokens);
+    });
+  }
+
+  void TokenComplete(SampleMetadata* sample, QuerySampleResponse* response,
+                     PerfClock::time_point complete_begin_time,
+                     const ResponseCallback& response_cb) override {
+    // Using a raw pointer here should help us hit the std::function
+    // small buffer optimization code path when we aren't copying data.
+    // For some reason, using std::unique_ptr<std::vector> wasn't moving
+    // into the lambda; even with C++14.
+    std::vector<uint8_t>* token_data_copy = nullptr;
+    double accuracy_log_val =
+        sample->accuracy_log_val + accuracy_log_offset < 1.0
+            ? sample->accuracy_log_val + accuracy_log_offset
+            : sample->accuracy_log_val + accuracy_log_offset - 1.0;
+    if (mode == TestMode::AccuracyOnly ||
+        accuracy_log_val <= accuracy_log_prob) {
+      uint8_t* src_begin = reinterpret_cast<uint8_t*>(response->data);
+      uint8_t* src_end = src_begin + response->size;
+      token_data_copy = new std::vector<uint8_t>(src_begin, src_end);
+    }
+    Log([sample, complete_begin_time, token_data_copy](AsyncLog& log) {
+      QueryMetadata* query = sample->query_metadata;
+      DurationGeneratorNs sched{query->scheduled_time};
+      if (scenario == TestScenario::Server) {
+        DurationGeneratorNs issued{query->issued_start_time};
+        log.TraceCounterEvent(
+            "Token_Latency", query->scheduled_time, "issue_delay",
+            sched.delta(query->issued_start_time), "issue_to_done",
+            issued.delta(complete_begin_time));
+      } else {
+        log.TraceSample("Token", sample->sequence_id, query->scheduled_time,
+                        complete_begin_time, "sample_seq", sample->sequence_id,
+                        "query_seq", query->sequence_id, "sample_idx",
+                        sample->sample_index, "issue_start_ns",
+                        sched.delta(query->issued_start_time), "complete_ns",
+                        sched.delta(complete_begin_time));
+      }
+      if (token_data_copy) {
+        log.CacheToken(sample->sequence_id,
+                       LogBinaryAsHexString{token_data_copy});
+      }
+      QuerySampleLatency latency = sched.delta(complete_begin_time);
+      log.RecordTokenCompletion(sample->sequence_id, complete_begin_time,
+                                latency);
+    });
+  }
+
+  void QueryComplete() override {
+    // We only need to track outstanding queries in the server scenario to
+    // detect when the SUT has fallen too far behind.
+    if (scenario == TestScenario::Server) {
+      queries_completed.fetch_add(1, std::memory_order_relaxed);
+    }
+  }
+};
+
+/// \brief Selects the query timestamps for all scenarios except Server.
+template <TestScenario scenario>
+auto ScheduleDistribution(double qps) {
+  return [period = std::chrono::duration_cast<std::chrono::nanoseconds>(
+              std::chrono::duration<double>(1.0 / qps))](auto& /*gen*/) {
+    return period;
+  };
+}
+
+/// \brief Selects the query timestamps for the Server scenario.
+template <>
+auto ScheduleDistribution<TestScenario::Server>(double qps) {
+  // Poisson arrival process corresponds to exponentially distributed
+  // interarrival times.
+  return [dist = std::exponential_distribution<>(qps)](auto& gen) mutable {
+    return std::chrono::duration_cast<std::chrono::nanoseconds>(
+        std::chrono::duration<double>(dist(gen)));
+  };
+}
+
+/// \brief Selects samples for the accuracy mode.
+template <TestMode mode>
+auto SampleDistribution(size_t sample_count, size_t stride, std::mt19937* rng) {
+  std::vector<size_t> indices;
+  for (size_t i = 0; i < sample_count; i += stride) {
+    indices.push_back(i);
+  }
+  std::shuffle(indices.begin(), indices.end(), *rng);
+  return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
+    return indices.at(i++);
+  };
+}
+
+/// \brief Selects samples for the performance mode.
+template <>
+auto SampleDistribution<TestMode::PerformanceOnly>(size_t sample_count,
+                                                   size_t /*stride*/,
+                                                   std::mt19937* /*rng*/) {
+  return [dist = std::uniform_int_distribution<>(0, sample_count - 1)](
+             auto& gen) mutable { return dist(gen); };
+}
+
+/// \brief Sample across the dataset, and ensure coverage of each of the
+/// samples.
+// Useful for non-uniform dataset (e.g. Llama2, GPTJ, 3d-unet)
+auto SampleDistributionEqualIssue(size_t sample_count, size_t set_size,
+                                  std::mt19937* rng) {
+  std::vector<size_t> indices;
+  std::vector<size_t> shuffle_indices(set_size);
+  std::iota(shuffle_indices.begin(), shuffle_indices.end(), 0);
+  for (size_t j = 0; j < sample_count; j += set_size) {
+    std::shuffle(shuffle_indices.begin(), shuffle_indices.end(), *rng);
+    indices.insert(indices.end(), shuffle_indices.begin(),
+                   shuffle_indices.end());
+  }
+  return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
+    return indices.at((i++) % indices.size());
+  };
+}
+
+/// \brief Generates queries for the requested settings, templated by
+/// scenario and mode.
+/// \todo Make GenerateQueries faster.
+/// QueryMetadata is expensive to move; either reserve queries in advance
+/// so the queries vector doesn't need to grow. And/or parent samples to their
+/// queries only after all queries have been generated.
+/// \todo For the server scenario only, scale the query timeline at the end so
+/// the QPS as scheduled is equal to the QPS as requested.
+template <TestScenario scenario, TestMode mode>
+std::vector<QueryMetadata> GenerateQueries(
+    const TestSettingsInternal& settings,
+    const LoadableSampleSet& loaded_sample_set, SequenceGen* sequence_gen,
+    ResponseDelegate* response_delegate) {
+  auto tracer =
+      MakeScopedTracer([](AsyncTrace& trace) { trace("GenerateQueries"); });
+
+  auto& loaded_samples = loaded_sample_set.set;
+
+  // Generate 2x more samples than we think we'll need given the expected
+  // QPS in case the SUT is faster than expected.
+  // We should exit before issuing all queries.
+  // Does not apply to the server scenario since the duration only
+  // depends on the ideal scheduled time, not the actual issue time.
+  const int duration_multiplier = scenario == TestScenario::Server ? 1 : 2;
+  std::chrono::microseconds gen_duration =
+      duration_multiplier * settings.target_duration;
+  size_t min_queries = settings.min_query_count;
+
+  size_t samples_per_query = settings.samples_per_query;
+  if (mode == TestMode::AccuracyOnly && scenario == TestScenario::Offline) {
+    samples_per_query = loaded_sample_set.sample_distribution_end;
+  }
+
+  // We should not exit early in accuracy mode.
+  if (mode == TestMode::AccuracyOnly || settings.performance_issue_unique) {
+    gen_duration = std::chrono::microseconds(0);
+    // Integer truncation here is intentional.
+    // For MultiStream, loaded samples is properly padded.
+    // For Offline, we create a 'remainder' query at the end of this function.
+    min_queries = loaded_samples.size() / samples_per_query;
+  }
+
+  std::vector<QueryMetadata> queries;
+
+  // Using the std::mt19937 pseudo-random number generator ensures a modicum of
+  // cross platform reproducibility for trace generation.
+  std::mt19937 sample_rng(settings.sample_index_rng_seed);
+  std::mt19937 schedule_rng(settings.schedule_rng_seed);
+
+  constexpr bool kIsMultiStream = scenario == TestScenario::MultiStream;
+  const size_t sample_stride = kIsMultiStream ? samples_per_query : 1;
+
+  auto sample_distribution = SampleDistribution<mode>(
+      loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng);
+  // Use the unique sample distribution same as in AccuracyMode to
+  // to choose samples when either flag performance_issue_unique
+  // or performance_issue_same is set.
+  auto sample_distribution_unique = SampleDistribution<TestMode::AccuracyOnly>(
+      loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng);
+
+  auto sample_distribution_equal_issue = SampleDistributionEqualIssue(
+      min_queries, loaded_samples.size(), &sample_rng);
+
+  auto schedule_distribution =
+      ScheduleDistribution<scenario>(settings.target_qps);
+
+  // When sample_concatenate_permutation is turned on, pad to a multiple of the
+  // complete dataset to ensure fairness.
+  auto enable_equal_issue = settings.sample_concatenate_permutation;
+  if (mode != TestMode::AccuracyOnly && enable_equal_issue) {
+    if (scenario == TestScenario::Offline &&
+        samples_per_query % loaded_samples.size() != 0) {
+      // In offline mode, we pad samples_per_query
+      size_t pad_size =
+          (loaded_samples.size() - samples_per_query % loaded_samples.size());
+      samples_per_query += pad_size;
+    } else if ((scenario != TestScenario::Offline) &&
+               (min_queries % loaded_samples.size() != 0)) {
+      // In Server, SingleStream, MultiStream mode, the min_queries should be
+      // padded
+      size_t pad_size =
+          (loaded_samples.size() - min_queries % loaded_samples.size());
+      min_queries += pad_size;
+    }
+  }
+
+  std::vector<QuerySampleIndex> samples(samples_per_query);
+  std::chrono::nanoseconds timestamp(0);
+  std::chrono::nanoseconds prev_timestamp(0);
+  // Choose a single sample to repeat when in performance_issue_same mode
+  QuerySampleIndex same_sample = settings.performance_issue_same_index;
+
+  while (prev_timestamp < gen_duration || queries.size() < min_queries) {
+    if (kIsMultiStream) {
+      QuerySampleIndex sample_i = settings.performance_issue_unique
+                                      ? sample_distribution_unique(sample_rng)
+                                  : settings.performance_issue_same
+                                      ? same_sample
+                                      : sample_distribution(sample_rng);
+      for (auto& s : samples) {
+        // Select contiguous samples in the MultiStream scenario.
+        // This will not overflow, since GenerateLoadableSets adds padding at
+        // the end of the loadable sets in the MultiStream scenario.
+        // The padding allows the starting samples to be the same for each
+        // query with respect to samples_per_query.
+        s = loaded_samples[sample_i++];
+      }
+    } else if (scenario == TestScenario::Offline) {
+      // For the Offline + Performance scenario, we also want to support
+      // contiguous samples. In this scenario the query can be much larger than
+      // what fits into memory. We simply repeat loaded_samples N times, plus a
+      // remainder to ensure we fill up samples. Note that this eliminates
+      // randomization.
+      size_t num_loaded_samples = loaded_samples.size();
+      size_t num_full_repeats = samples_per_query / num_loaded_samples;
+      uint64_t remainder = samples_per_query % (num_loaded_samples);
+      if (settings.performance_issue_same) {
+        std::fill(samples.begin(), samples.begin() + samples_per_query,
+                  loaded_samples[same_sample]);
+      } else {
+        for (size_t i = 0; i < num_full_repeats; ++i) {
+          std::copy(loaded_samples.begin(), loaded_samples.end(),
+                    samples.begin() + i * num_loaded_samples);
+
+          if (settings.sample_concatenate_permutation) {
+            std::shuffle(samples.begin() + i * num_loaded_samples,
+                         samples.begin() + (i + 1) * num_loaded_samples,
+                         sample_rng);
+          }
+        }
+
+        std::copy(loaded_samples.begin(), loaded_samples.begin() + remainder,
+                  samples.begin() + num_full_repeats * num_loaded_samples);
+
+        if (settings.sample_concatenate_permutation) {
+          assert(remainder == 0);
+        }
+      }
+    } else {
+      for (auto& s : samples) {
+        s = loaded_samples[settings.performance_issue_unique
+                               ? sample_distribution_unique(sample_rng)
+                           : settings.performance_issue_same ? same_sample
+                           : enable_equal_issue
+                               ? sample_distribution_equal_issue(sample_rng)
+                               : sample_distribution(sample_rng)];
+      }
+    }
+    queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
+    prev_timestamp = timestamp;
+    timestamp += schedule_distribution(schedule_rng);
+    // In equal_issue mode, the min_queries will be bumped up by a multiple of
+    // the dataset size if the test time has not met the threshold.
+    if (enable_equal_issue && (queries.size() >= min_queries) &&
+        (prev_timestamp < gen_duration) &&
+        (scenario != TestScenario::Offline)) {
+      min_queries += loaded_samples.size();
+    }
+  }
+
+  // See if we need to create a "remainder" query for offline+accuracy to
+  // ensure we issue all samples in loaded_samples. Offline doesn't pad
+  // loaded_samples like MultiStream does.
+  if (scenario == TestScenario::Offline && mode == TestMode::AccuracyOnly) {
+    size_t remaining_samples = loaded_samples.size() % samples_per_query;
+    if (remaining_samples != 0) {
+      samples.resize(remaining_samples);
+      for (auto& s : samples) {
+        s = loaded_samples[sample_distribution(sample_rng)];
+      }
+      queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
+    }
+  }
+
+  LogDetail([count = queries.size(), spq = samples_per_query,
+             duration = timestamp.count()](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generated_query_count", count);
+    MLPERF_LOG(detail, "generated_samples_per_query", spq);
+    MLPERF_LOG(detail, "generated_query_duration", duration);
+#else
+    detail("GeneratedQueries: ", "queries", count, "samples per query", spq,
+           "duration", duration);
+#endif
+  });
+
+  return queries;
+}
+
+/// \brief Issues a series of pre-generated queries.
+// TODO: Templates for scenario and mode are overused, given the loadgen
+//       no longer generates queries on the fly. Should we reduce the
+//       use of templates?
+template <TestScenario scenario, TestMode mode>
+PerformanceResult IssueQueries(SystemUnderTest* sut,
+                               const TestSettingsInternal& settings,
+                               const LoadableSampleSet& loaded_sample_set,
+                               SequenceGen* sequence_gen) {
+  // Create reponse handler.
+  ResponseDelegateDetailed<scenario, mode> response_logger;
+  std::uniform_real_distribution<double> accuracy_log_offset_dist =
+      std::uniform_real_distribution<double>(0.0, 1.0);
+  std::mt19937 accuracy_log_offset_rng(settings.accuracy_log_rng_seed);
+  response_logger.accuracy_log_offset =
+      accuracy_log_offset_dist(accuracy_log_offset_rng);
+  response_logger.accuracy_log_prob = settings.accuracy_log_probability;
+
+  // Generate queries.
+  auto sequence_id_start = sequence_gen->CurrentSampleId();
+  std::vector<QueryMetadata> queries = GenerateQueries<scenario, mode>(
+      settings, loaded_sample_set, sequence_gen, &response_logger);
+
+  // Calculated expected number of queries
+  uint64_t expected_queries =
+      settings.target_qps * settings.min_duration.count() / 1000;
+  uint64_t minimum_queries =
+      settings.min_query_count * settings.samples_per_query;
+
+  if (scenario != TestScenario::Offline) {
+    expected_queries *= settings.samples_per_query;
+  } else {
+    minimum_queries = settings.min_sample_count;
+  }
+
+  expected_queries =
+      expected_queries < minimum_queries ? minimum_queries : expected_queries;
+
+  if (settings.accuracy_log_sampling_target > 0) {
+    response_logger.accuracy_log_prob =
+        (double)settings.accuracy_log_sampling_target / expected_queries;
+  }
+  auto sequence_id_end = sequence_gen->CurrentSampleId();
+  size_t max_latencies_to_record = sequence_id_end - sequence_id_start;
+
+  // Initialize logger for latency recording.
+  GlobalLogger().RestartLatencyRecording(sequence_id_start,
+                                         max_latencies_to_record);
+
+  // Create and initialize an IssueQueryState.
+  IssueQueryState state{
+      sut, &queries, &response_logger, &settings, mode, {}, {}, false, 0,
+      0,   {}};
+  auto& controller = IssueQueryController::GetInstance();
+
+  // Set number of IssueQueryThreads and wait for the threads to register.
+  controller.SetNumThreads(settings.requested.server_num_issue_query_threads);
+
+  // Start issuing the queries.
+  controller.StartIssueQueries<scenario>(&state);
+
+  // Gather query issuing statistics.
+  const auto start_for_power = state.start_for_power;
+  const auto start = state.start_time;
+  const auto ran_out_of_generated_queries = state.ran_out_of_generated_queries;
+  const auto queries_issued = state.queries_issued;
+  const auto expected_latencies = state.expected_latencies;
+
+  // Let the SUT know it should not expect any more queries.
+  sut->FlushQueries();
+
+  if (mode == TestMode::PerformanceOnly && ran_out_of_generated_queries) {
+    LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(
+          detail, "error_runtime",
+          "Ending early: Ran out of generated queries to issue before the "
+          "minimum query count and test duration were reached. "
+          "Please update the relevant expected latency or target qps in the "
+          "TestSettings so they are more accurate.");
+#else
+      detail.Error(
+          "Ending early: Ran out of generated queries to issue before the "
+          "minimum query count and test duration were reached.");
+      detail(
+          "Please update the relevant expected latency or target qps in the "
+          "TestSettings so they are more accurate.");
+#endif
+    });
+  }
+
+  // Wait for tail queries to complete and collect all the latencies.
+  // We have to keep the synchronization primitives alive until the SUT
+  // is done with them.
+  auto& final_query = queries[queries_issued - 1];
+  std::vector<QuerySampleLatency> sample_latencies(
+      GlobalLogger().GetLatenciesBlocking(expected_latencies));
+
+  std::vector<QuerySampleLatency> first_token_latencies(
+      GlobalLogger().GetTokenLatencies(expected_latencies));
+
+  std::vector<QuerySampleLatency> time_per_output_token_arr(
+      GlobalLogger().GetTimePerOutputToken(expected_latencies));
+
+  std::vector<int64_t> tokens_per_sample(
+      GlobalLogger().GetTokensPerSample(expected_latencies));
+
+  // Log contention counters after every test as a sanity check.
+  GlobalLogger().LogContentionAndAllocations();
+
+  // This properly accounts for the fact that the max completion time may not
+  // belong to the final query. It also excludes any time spent postprocessing
+  // in the loadgen itself after final completion, which may be significant
+  // in the offline scenario.
+  PerfClock::time_point max_completion_time =
+      GlobalLogger().GetMaxCompletionTime();
+  auto sut_active_duration = max_completion_time - start;
+  LogDetail([start_for_power, sut_active_duration](AsyncDetail& detail) {
+    auto end_for_power =
+        start_for_power +
+        std::chrono::duration_cast<std::chrono::system_clock::duration>(
+            sut_active_duration);
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_INTERVAL_START(detail, "power_begin",
+                              DateTimeStringForPower(start_for_power));
+    MLPERF_LOG_INTERVAL_END(detail, "power_end",
+                            DateTimeStringForPower(end_for_power));
+#else
+    detail("POWER_BEGIN: ", "mode", ToString(mode), "time",
+           DateTimeStringForPower(start_for_power));
+    detail("POWER_END: ", "mode", ToString(mode), "time",
+           DateTimeStringForPower(end_for_power));
+#endif
+  });
+
+  double max_latency =
+      QuerySampleLatencyToSeconds(GlobalLogger().GetMaxLatencySoFar());
+  double final_query_scheduled_time =
+      DurationToSeconds(final_query.scheduled_delta);
+  double final_query_issued_time =
+      DurationToSeconds(final_query.issued_start_time - start);
+  double final_query_all_samples_done_time =
+      DurationToSeconds(final_query.all_samples_done_time - start);
+
+  std::vector<QuerySampleLatency> query_latencies;
+  if (scenario == TestScenario::MultiStream) {
+    query_latencies.resize(queries_issued);
+    for (size_t i = 0; i < queries_issued; i++) {
+      query_latencies[i] = DurationGeneratorNs{queries[i].scheduled_time}.delta(
+          queries[i].all_samples_done_time);
+    }
+  }
+
+  return PerformanceResult{
+      std::move(sample_latencies),
+      std::move(query_latencies),
+      queries_issued,
+      max_latency,
+      final_query_scheduled_time,
+      final_query_issued_time,
+      final_query_all_samples_done_time,
+      TokenPerformanceResults{first_token_latencies, time_per_output_token_arr,
+                              tokens_per_sample}};
+}
+
+void LoadSamplesToRam(QuerySampleLibrary* qsl,
+                      const std::vector<QuerySampleIndex>& samples) {
+  LogDetail([&samples](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "loaded_qsl_set", samples);
+#else
+    std::string set("\"[");
+    for (auto i : samples) {
+      set += std::to_string(i) + ",";
+    }
+    set.resize(set.size() - 1);
+    set += "]\"";
+    detail("Loading QSL : ", "set", set);
+#endif
+  });
+  qsl->LoadSamplesToRam(samples);
+}
+
+/// \brief Generates random sets of samples in the QSL that we can load into
+/// RAM at the same time.
+std::vector<LoadableSampleSet> GenerateLoadableSets(
+    QuerySampleLibrary* qsl, const TestSettingsInternal& settings) {
+  auto tracer = MakeScopedTracer(
+      [](AsyncTrace& trace) { trace("GenerateLoadableSets"); });
+
+  std::vector<LoadableSampleSet> result;
+  std::mt19937 qsl_rng(settings.qsl_rng_seed);
+
+  // Generate indices for all available samples in the QSL.
+  const size_t qsl_total_count = qsl->TotalSampleCount();
+  std::vector<QuerySampleIndex> samples(qsl_total_count);
+  for (size_t i = 0; i < qsl_total_count; i++) {
+    samples[i] = static_cast<QuerySampleIndex>(i);
+  }
+
+  // Randomize the order of the samples.
+  std::shuffle(samples.begin(), samples.end(), qsl_rng);
+
+  // Partition the samples into loadable sets.
+  const size_t set_size = settings.performance_sample_count;
+  const size_t set_padding = (settings.scenario == TestScenario::MultiStream)
+                                 ? settings.samples_per_query - 1
+                                 : 0;
+  std::vector<QuerySampleIndex> loadable_set;
+  loadable_set.reserve(set_size + set_padding);
+
+  for (auto s : samples) {
+    loadable_set.push_back(s);
+    if (loadable_set.size() == set_size) {
+      result.push_back({std::move(loadable_set), set_size});
+      loadable_set.clear();
+      loadable_set.reserve(set_size + set_padding);
+    }
+  }
+
+  if (!loadable_set.empty()) {
+    // Copy the size since it will become invalid after the move.
+    size_t loadable_set_size = loadable_set.size();
+    result.push_back({std::move(loadable_set), loadable_set_size});
+  }
+
+  // Add padding for the multi stream scenario. Padding allows the
+  // starting sample to be the same for all SUTs, independent of the value
+  // of samples_per_query, while enabling samples in a query to be contiguous.
+  for (auto& loadable_set : result) {
+    auto& set = loadable_set.set;
+    for (size_t i = 0; i < set_padding; i++) {
+      // It's not clear in the spec if the STL deallocates the old container
+      // before assigning, which would invalidate the source before the
+      // assignment happens. Even though we should have reserved enough
+      // elements above, copy the source first anyway since we are just moving
+      // integers around.
+      QuerySampleIndex p = set[i];
+      set.push_back(p);
+    }
+  }
+
+  return result;
+}
+
+/// \brief Opens and owns handles to all of the log files.
+struct LogOutputs {
+  LogOutputs(const LogOutputSettings& output_settings,
+             const std::string& test_date_time) {
+    std::string prefix = output_settings.outdir;
+    prefix += "/" + output_settings.prefix;
+    if (output_settings.prefix_with_datetime) {
+      prefix += test_date_time + "_";
+    }
+    const std::string& suffix = output_settings.suffix;
+
+    summary_out.open(prefix + "summary" + suffix + ".txt");
+    detail_out.open(prefix + "detail" + suffix + ".txt");
+    accuracy_out.open(prefix + "accuracy" + suffix + ".json");
+    trace_out.open(prefix + "trace" + suffix + ".json");
+  }
+
+  bool CheckOutputs() {
+    bool all_ofstreams_good = true;
+    if (!summary_out.good()) {
+      all_ofstreams_good = false;
+      std::cerr << "LoadGen: Failed to open summary file.";
+    }
+    if (!detail_out.good()) {
+      all_ofstreams_good = false;
+      std::cerr << "LoadGen: Failed to open detailed log file.";
+    }
+    if (!accuracy_out.good()) {
+      all_ofstreams_good = false;
+      std::cerr << "LoadGen: Failed to open accuracy log file.";
+    }
+    if (!trace_out.good()) {
+      all_ofstreams_good = false;
+      std::cerr << "LoadGen: Failed to open trace file.";
+    }
+    return all_ofstreams_good;
+  }
+
+  std::ofstream summary_out;
+  std::ofstream detail_out;
+  std::ofstream accuracy_out;
+  std::ofstream trace_out;
+};
+
+/// \brief Find boundaries of performance settings by widening bounds
+/// exponentially.
+/// \details To find an upper bound of performance, widen an
+/// upper bound exponentially until finding a bound that can't satisfy
+/// performance constraints. i.e. [1, 2) -> [2, 4) -> [4, 8) -> ...
+template <TestScenario scenario>
+std::pair<PerformanceSummary, PerformanceSummary> FindBoundaries(
+    SystemUnderTest* sut, QuerySampleLibrary* qsl, SequenceGen* sequence_gen,
+    PerformanceSummary l_perf_summary) {
+  // Get upper bound
+  TestSettingsInternal u_settings = l_perf_summary.settings;
+  find_peak_performance::WidenPerformanceField<scenario>(&u_settings);
+
+  LogDetail(
+      [l_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           l_perf_summary.settings),
+       u_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           u_settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        MLPERF_LOG(detail, "generic_message",
+                   "FindBoundaries: Checking fields [" + l_field + ", " +
+                       u_field + ")");
+#else
+        detail("FindBoundaries: Checking fields [" + l_field + ", " + u_field +
+               ")");
+#endif
+      });
+
+  std::vector<loadgen::LoadableSampleSet> loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, u_settings));
+  const LoadableSampleSet& performance_set = loadable_sets.front();
+  LoadSamplesToRam(qsl, performance_set.set);
+
+  PerformanceResult u_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
+      sut, u_settings, performance_set, sequence_gen));
+  PerformanceSummary u_perf_summary{sut->Name(), u_settings, std::move(u_pr)};
+
+  qsl->UnloadSamplesFromRam(performance_set.set);
+
+  std::string tmp;
+  if (!u_perf_summary.PerfConstraintsMet(&tmp)) {
+    return std::make_pair(l_perf_summary, u_perf_summary);
+  } else {
+    return FindBoundaries<scenario>(sut, qsl, sequence_gen, u_perf_summary);
+  }
+}
+
+/// \brief Find peak performance by binary search.
+/// \details The found lower & upper bounds by the function 'FindBoundaries' are
+/// used as initial bounds of binary search
+template <TestScenario scenario>
+PerformanceSummary FindPeakPerformanceBinarySearch(
+    SystemUnderTest* sut, QuerySampleLibrary* qsl, SequenceGen* sequence_gen,
+    const LoadableSampleSet& performance_set, PerformanceSummary l_perf_summary,
+    PerformanceSummary u_perf_summary) {
+  if (find_peak_performance::IsFinished<scenario>(l_perf_summary.settings,
+                                                  u_perf_summary.settings)) {
+    return l_perf_summary;
+  }
+
+  const TestSettingsInternal m_settings =
+      find_peak_performance::MidOfBoundaries<scenario>(l_perf_summary.settings,
+                                                       u_perf_summary.settings);
+
+  LogDetail([l_field =
+                 find_peak_performance::ToStringPerformanceField<scenario>(
+                     l_perf_summary.settings),
+             u_field =
+                 find_peak_performance::ToStringPerformanceField<scenario>(
+                     u_perf_summary.settings),
+             m_field =
+                 find_peak_performance::ToStringPerformanceField<scenario>(
+                     m_settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(
+        detail, "generic_message",
+        "FindPeakPerformanceBinarySearch: Testing the mid value of bounds [" +
+            l_field + ", " + u_field + "): " + m_field);
+#else
+    detail(
+        "FindPeakPerformanceBinarySearch: Testing the mid value of bounds [" +
+        l_field + ", " + u_field + "): " + m_field);
+#endif
+  });
+
+  PerformanceResult m_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
+      sut, m_settings, performance_set, sequence_gen));
+  PerformanceSummary m_perf_summary{sut->Name(), m_settings, std::move(m_pr)};
+
+  std::string tmp;
+  if (m_perf_summary.PerfConstraintsMet(&tmp)) {
+    return FindPeakPerformanceBinarySearch<scenario>(
+        sut, qsl, sequence_gen, performance_set, m_perf_summary,
+        u_perf_summary);
+  } else {
+    return FindPeakPerformanceBinarySearch<scenario>(
+        sut, qsl, sequence_gen, performance_set, l_perf_summary,
+        m_perf_summary);
+  }
+}
+
+/// \brief Runs the performance mode, templated by scenario.
+template <TestScenario scenario>
+void RunPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+                        const TestSettingsInternal& settings,
+                        SequenceGen* sequence_gen) {
+  LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generic_message", "Starting performance mode");
+#else
+    detail("Starting performance mode:");
+#endif
+  });
+
+  // Use first loadable set as the performance set.
+  std::vector<loadgen::LoadableSampleSet> loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, settings));
+  const LoadableSampleSet& performance_set = loadable_sets.front();
+  LoadSamplesToRam(qsl, performance_set.set);
+
+  // Start PerfClock/system_clock timers for measuring performance interval
+  // for comparison vs external timer.
+  auto pc_start_ts = PerfClock::now();
+  auto sc_start_ts = std::chrono::system_clock::now();
+  if (settings.print_timestamps) {
+    std::cout << "Loadgen :: Perf mode start. system_clock Timestamp = "
+              << std::chrono::system_clock::to_time_t(sc_start_ts) << "\n"
+              << std::flush;
+  }
+
+  PerformanceResult pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
+      sut, settings, performance_set, sequence_gen));
+
+  // Measure PerfClock/system_clock timer durations for comparison vs
+  // external timer.
+  auto pc_stop_ts = PerfClock::now();
+  auto sc_stop_ts = std::chrono::system_clock::now();
+  auto pc_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+                         pc_stop_ts - pc_start_ts)
+                         .count();
+  auto sc_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+                         sc_stop_ts - sc_start_ts)
+                         .count();
+  float pc_sc_ratio = static_cast<float>(pc_duration) / sc_duration;
+  if (settings.print_timestamps) {
+    std::cout << "Loadgen :: Perf mode stop. systme_clock Timestamp = "
+              << std::chrono::system_clock::to_time_t(sc_stop_ts) << "\n"
+              << std::flush;
+    std::cout << "Loadgen :: PerfClock Perf duration = " << pc_duration
+              << "ms\n"
+              << std::flush;
+    std::cout << "Loadgen :: system_clock Perf duration = " << sc_duration
+              << "ms\n"
+              << std::flush;
+    std::cout << "Loadgen :: PerfClock/system_clock ratio = " << std::fixed
+              << std::setprecision(4) << pc_sc_ratio << "\n"
+              << std::flush;
+  }
+
+  if (pc_sc_ratio > 1.01 || pc_sc_ratio < 0.99) {
+    LogDetail([pc_sc_ratio](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "PerfClock and system_clock differ by more than 1%! "
+         << " pc_sc_ratio: " << pc_sc_ratio;
+      MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+      detail.Error("PerfClock and system_clock differ by more than 1\%! ",
+                   "pc_sc_ratio", pc_sc_ratio);
+#endif
+    });
+  } else if (pc_sc_ratio > 1.001 || pc_sc_ratio < 0.999) {
+    LogDetail([pc_sc_ratio](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "PerfClock and system_clock differ by more than 0.1%! "
+         << " pc_sc_ratio: " << pc_sc_ratio;
+      MLPERF_LOG_WARNING(detail, "warning_generic_message", ss.str());
+#else
+      detail.Warning("PerfClock and system_clock differ by more than 0.1\%. ",
+                     "pc_sc_ratio", pc_sc_ratio);
+#endif
+    });
+  }
+
+  PerformanceSummary perf_summary{sut->Name(), settings, std::move(pr)};
+  LogSummary([perf_summary](AsyncSummary& summary) mutable {
+    perf_summary.LogSummary(summary);
+  });
+  // Create a copy to prevent thread hazard between LogSummary and LogDetail.
+  PerformanceSummary perf_summary_detail{perf_summary};
+  LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
+    perf_summary_detail.LogDetail(detail);
+  });
+
+  qsl->UnloadSamplesFromRam(performance_set.set);
+}
+
+/// \brief Runs the binary search mode, templated by scenario.
+/// \details 1. Check whether lower bound from user satisfies the performance
+/// constraints, 2. Find an upper bound using the function 'FindBoundaries'
+/// based on the lower bound, 3. Find peak performance settings using the
+/// function 'FindPeakPerformanceBinarySearch'. note: Since we can't find a
+/// lower bound programmatically because of the monotonicity issue of Server
+/// scenario, rely on user's settings. After resolving this issue, we can
+/// make the function 'FindBoundaries' find a lower bound as well from some
+/// random initial settings.
+template <TestScenario scenario>
+void FindPeakPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+                             const TestSettingsInternal& base_settings,
+                             SequenceGen* sequence_gen) {
+  LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generic_message", "Starting FindPeakPerformance mode");
+#else
+    detail("Starting FindPeakPerformance mode:");
+#endif
+  });
+
+  if (scenario != TestScenario::Server) {
+    LogDetail([unsupported_scenario = ToString(scenario)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_config",
+                       find_peak_performance::kNotSupportedMsg);
+#else
+      detail.Error(find_peak_performance::kNotSupportedMsg);
+#endif
+    });
+    return;
+  }
+
+  LogDetail(
+      [base_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           base_settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        MLPERF_LOG(
+            detail, "generic_message",
+            "FindPeakPerformance: Check validity of the base settings field: " +
+                base_field);
+#else
+        detail(
+            "FindPeakPerformance: Check validity of the base settings field: " +
+            base_field);
+#endif
+      });
+
+  // 1. Check whether the lower bound came from user satisfy performance
+  // constraints or not.
+  std::vector<loadgen::LoadableSampleSet> base_loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, base_settings));
+  const LoadableSampleSet& base_performance_set = base_loadable_sets.front();
+  LoadSamplesToRam(qsl, base_performance_set.set);
+
+  PerformanceResult base_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
+      sut, base_settings, base_performance_set, sequence_gen));
+  PerformanceSummary base_perf_summary{sut->Name(), base_settings,
+                                       std::move(base_pr)};
+
+  // We can also use all_constraints_met to check performance constraints,
+  // but to reduce searching time, leave it up to whether the settings satisfy
+  // min duration & min queries or not to users.
+  std::string msg;
+  if (!base_perf_summary.PerfConstraintsMet(&msg)) {
+    LogDetail([msg](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "FindPeakPerformance: Initial lower bound does not satisfy "
+         << "performance constraints, msg: " << msg;
+      MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
+#else
+      detail.Error(
+          "FindPeakPerformance: Initial lower bound does not satisfy "
+          "performance constraints, msg: " +
+          msg);
+#endif
+    });
+
+    PerformanceSummary perf_summary{sut->Name(), base_settings,
+                                    std::move(base_perf_summary.pr)};
+    LogSummary([perf_summary](AsyncSummary& summary) mutable {
+      perf_summary.LogSummary(summary);
+    });
+    // Create a copy to prevent thread hazard between LogSummary and LogDetail.
+    PerformanceSummary perf_summary_detail{perf_summary};
+    LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
+      perf_summary_detail.LogDetail(detail);
+    });
+
+    qsl->UnloadSamplesFromRam(base_performance_set.set);
+
+    return;
+  }
+
+  // Clear loaded samples.
+  qsl->UnloadSamplesFromRam(base_performance_set.set);
+
+  // 2. Find an upper bound based on the lower bound.
+  std::pair<PerformanceSummary, PerformanceSummary> boundaries =
+      FindBoundaries<scenario>(sut, qsl, sequence_gen, base_perf_summary);
+  PerformanceSummary l_perf_summary = boundaries.first;
+  PerformanceSummary u_perf_summary = boundaries.second;
+
+  LogDetail(
+      [l_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           l_perf_summary.settings),
+       u_field = find_peak_performance::ToStringPerformanceField<scenario>(
+           u_perf_summary.settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+        MLPERF_LOG(detail, "generic_message",
+                   "FindPeakPerformance: Found boundaries: [" + l_field + ", " +
+                       u_field + ")");
+#else
+        detail("FindPeakPerformance: Found boundaries: [" + l_field + ", " +
+               u_field + ")");
+#endif
+      });
+
+  // Reuse performance_set, u_perf_summary has the largest 'samples_per_query'.
+  std::vector<loadgen::LoadableSampleSet> loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, u_perf_summary.settings));
+  const LoadableSampleSet& performance_set = loadable_sets.front();
+  LoadSamplesToRam(qsl, performance_set.set);
+
+  // 3. Find peak performance settings using the found boundaries
+  PerformanceSummary perf_summary = FindPeakPerformanceBinarySearch<scenario>(
+      sut, qsl, sequence_gen, performance_set, l_perf_summary, u_perf_summary);
+
+  // Print-out the peak performance test setting.
+  LogDetail([field = find_peak_performance::ToStringPerformanceField<scenario>(
+                 perf_summary.settings)](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generic_message",
+               "FindPeakPerformance: Found peak performance field: " + field);
+#else
+    detail("FindPeakPerformance: Found peak performance field: " + field);
+#endif
+  });
+
+  LogSummary([perf_summary](AsyncSummary& summary) mutable {
+    perf_summary.LogSummary(summary);
+  });
+  // Create a copy to prevent thread hazard between LogSummary and LogDetail.
+  PerformanceSummary perf_summary_detail{perf_summary};
+  LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
+    perf_summary_detail.LogDetail(detail);
+  });
+
+  qsl->UnloadSamplesFromRam(performance_set.set);
+}
+
+/// \brief Runs the accuracy mode, templated by scenario.
+template <TestScenario scenario>
+void RunAccuracyMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+                     const TestSettingsInternal& settings,
+                     SequenceGen* sequence_gen) {
+  LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "generic_message", "Starting accuracy mode");
+#else
+    detail("Starting accuracy mode:");
+#endif
+  });
+
+  std::vector<loadgen::LoadableSampleSet> loadable_sets(
+      loadgen::GenerateLoadableSets(qsl, settings));
+
+  for (auto& loadable_set : loadable_sets) {
+    {
+      auto tracer = MakeScopedTracer(
+          [count = loadable_set.set.size()](AsyncTrace& trace) {
+            trace("LoadSamples", "count", count);
+          });
+      LoadSamplesToRam(qsl, loadable_set.set);
+    }
+
+    PerformanceResult pr(IssueQueries<scenario, TestMode::AccuracyOnly>(
+        sut, settings, loadable_set, sequence_gen));
+
+    {
+      auto tracer = MakeScopedTracer(
+          [count = loadable_set.set.size()](AsyncTrace& trace) {
+            trace("UnloadSampes", "count", count);
+          });
+      qsl->UnloadSamplesFromRam(loadable_set.set);
+    }
+  }
+}
+
+/// \brief Routes runtime scenario requests to the corresponding instances
+/// of its templated mode functions.
+struct RunFunctions {
+  using Signature = void(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+                         const TestSettingsInternal& settings,
+                         SequenceGen* sequence_gen);
+
+  template <TestScenario compile_time_scenario>
+  static RunFunctions GetCompileTime() {
+    return {(RunAccuracyMode<compile_time_scenario>),
+            (RunPerformanceMode<compile_time_scenario>),
+            (FindPeakPerformanceMode<compile_time_scenario>)};
+  }
+
+  static RunFunctions Get(TestScenario run_time_scenario) {
+    switch (run_time_scenario) {
+      case TestScenario::SingleStream:
+        return GetCompileTime<TestScenario::SingleStream>();
+      case TestScenario::MultiStream:
+        return GetCompileTime<TestScenario::MultiStream>();
+      case TestScenario::Server:
+        return GetCompileTime<TestScenario::Server>();
+      case TestScenario::Offline:
+        return GetCompileTime<TestScenario::Offline>();
+    }
+    // We should not reach this point.
+    assert(false);
+    return GetCompileTime<TestScenario::SingleStream>();
+  }
+
+  Signature& accuracy;
+  Signature& performance;
+  Signature& find_peak_performance;
+};
+
+}  // namespace loadgen
+
+void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+               const TestSettings& requested_settings,
+               const LogSettings& log_settings,
+               const std::string audit_config_filename) {
+  GlobalLogger().StartIOThread();
+
+  const std::string test_date_time = CurrentDateTimeISO8601();
+
+  loadgen::LogOutputs log_outputs(log_settings.log_output, test_date_time);
+  if (!log_outputs.CheckOutputs()) {
+    return;
+  }
+
+  GlobalLogger().StartLogging(&log_outputs.summary_out, &log_outputs.detail_out,
+                              &log_outputs.accuracy_out,
+                              log_settings.log_output.copy_detail_to_stdout,
+                              log_settings.log_output.copy_summary_to_stdout);
+
+  GlobalLogger().SetUseTokens(requested_settings.use_token_latencies);
+  bool needs_first_token =
+      (requested_settings.scenario != TestScenario::Offline);
+  GlobalLogger().SetNeedsFirstToken(needs_first_token);
+
+  if (log_settings.enable_trace) {
+    GlobalLogger().StartNewTrace(&log_outputs.trace_out, PerfClock::now());
+  }
+
+  // measure sut->Name() response time
+  PerfClock::time_point pre_get_sut_name_ts = PerfClock::now();
+  const std::string& sut_name = sut->Name();
+  PerfClock::time_point post_get_sut_name_ts = PerfClock::now();
+
+  auto get_sut_name_duration_ns =
+      std::chrono::duration_cast<std::chrono::nanoseconds>(
+          post_get_sut_name_ts - pre_get_sut_name_ts)
+          .count();
+
+  LogLoadgenVersion();
+  LogDetail([sut, qsl, test_date_time, &sut_name,
+             &get_sut_name_duration_ns](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "test_datetime", test_date_time);
+    MLPERF_LOG(detail, "sut_name", sut_name);
+    MLPERF_LOG(detail, "get_sut_name_duration_ns", get_sut_name_duration_ns);
+    MLPERF_LOG(detail, "qsl_name", qsl->Name());
+    MLPERF_LOG(detail, "qsl_reported_total_count", qsl->TotalSampleCount());
+    MLPERF_LOG(detail, "qsl_reported_performance_count",
+               qsl->PerformanceSampleCount());
+#else
+    detail("Date + time of test: ", test_date_time);
+    detail("System Under Test (SUT) name: ", sut_name);
+    detail("Get SUT name time [ns]: ", get_sut_name_duration_ns);
+    detail("Query Sample Library (QSL) name: ", qsl->Name());
+    detail("QSL total size: ", qsl->TotalSampleCount());
+    detail("QSL performance size*: ", qsl->PerformanceSampleCount());
+    detail("*TestSettings (performance_sample_count_override) can override");
+    detail("*Refer to Effective Settings for actual value");
+#endif
+  });
+
+  TestSettings test_settings = requested_settings;
+  // Look for Audit Config file to override TestSettings during audit
+  if (FileExists(audit_config_filename)) {
+    LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_WARNING(detail, "warning_generic_message",
+                         "Found Audit Config file (audit.config)."
+                         " Overriding TestSettings from audit.config file.");
+#else
+      detail(
+          "Found Audit Config file (audit.config)."
+          " Overriding TestSettings from audit.config file.");
+#endif
+    });
+    std::string audit_scenario = loadgen::ToString(test_settings.scenario);
+    // Remove Spaces from the string
+    RemoveValue(&audit_scenario, ' ');
+    const std::string generic_model = "*";
+    test_settings.FromConfig(audit_config_filename, generic_model,
+                             audit_scenario, 2);
+  }
+  if (test_settings.test05) {
+    // If the configuration indicates we are running test05,
+    // random seeds
+    LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_WARNING(detail, "warning_generic_message",
+                         "Test05 flag detected"
+                         " Overriding random seeds");
+#else
+      detail(
+          "Test05 flag detected"
+          " Overriding random seeds");
+#endif
+    });
+    test_settings.mode = TestMode::PerformanceOnly;
+    test_settings.qsl_rng_seed = requested_settings.test05_qsl_rng_seed;
+    test_settings.sample_index_rng_seed =
+        requested_settings.test05_sample_index_rng_seed;
+    test_settings.schedule_rng_seed =
+        requested_settings.test05_schedule_rng_seed;
+  }
+
+  loadgen::TestSettingsInternal sanitized_settings(
+      test_settings, qsl->PerformanceSampleCount());
+  sanitized_settings.LogAllSettings();
+
+  auto run_funcs = loadgen::RunFunctions::Get(sanitized_settings.scenario);
+
+  loadgen::SequenceGen sequence_gen;
+  switch (sanitized_settings.mode) {
+    case TestMode::SubmissionRun:
+      run_funcs.accuracy(sut, qsl, sanitized_settings, &sequence_gen);
+      run_funcs.performance(sut, qsl, sanitized_settings, &sequence_gen);
+      break;
+    case TestMode::AccuracyOnly:
+      run_funcs.accuracy(sut, qsl, sanitized_settings, &sequence_gen);
+      break;
+    case TestMode::PerformanceOnly:
+      run_funcs.performance(sut, qsl, sanitized_settings, &sequence_gen);
+      break;
+    case TestMode::FindPeakPerformance:
+      run_funcs.find_peak_performance(sut, qsl, sanitized_settings,
+                                      &sequence_gen);
+      break;
+  }
+
+  loadgen::IssueQueryController::GetInstance().EndThreads();
+
+  // Stop tracing after logging so all logs are captured in the trace.
+  GlobalLogger().StopLogging();
+  GlobalLogger().StopTracing();
+  GlobalLogger().StopIOThread();
+}
+
+void AbortTest() {
+  loadgen::IssueQueryController::GetInstance().EndThreads();
+  GlobalLogger().StopLogging();
+  GlobalLogger().StopTracing();
+  GlobalLogger().StopIOThread();
+}
+
+void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count,
+                          const ResponseCallback& response_cb) {
+  PerfClock::time_point timestamp = PerfClock::now();
+
+  auto tracer = MakeScopedTracer(
+      [](AsyncTrace& trace) { trace("QuerySamplesComplete"); });
+
+  const QuerySampleResponse* end = responses + response_count;
+
+  // Notify first to unblock loadgen production ASAP.
+  for (QuerySampleResponse* response = responses; response < end; response++) {
+    loadgen::SampleMetadata* sample =
+        reinterpret_cast<loadgen::SampleMetadata*>(response->id);
+    loadgen::QueryMetadata* query = sample->query_metadata;
+    query->NotifyOneSampleCompleted(timestamp);
+  }
+
+  // Log samples.
+  for (QuerySampleResponse* response = responses; response < end; response++) {
+    loadgen::SampleMetadata* sample =
+        reinterpret_cast<loadgen::SampleMetadata*>(response->id);
+    loadgen::QueryMetadata* query = sample->query_metadata;
+    query->response_delegate->SampleComplete(sample, response, timestamp,
+                                             response_cb);
+  }
+  // PerfClock::time_point end_timestamp = PerfClock::now();
+  // mlperf::samples_overhead_acum += (end_timestamp - timestamp).count();
+}
+
+void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count,
+                        const ResponseCallback& response_cb) {
+  PerfClock::time_point timestamp = PerfClock::now();
+
+  auto tracer =
+      MakeScopedTracer([](AsyncTrace& trace) { trace("FirstTokenComplete"); });
+
+  const QuerySampleResponse* end = responses + response_count;
+
+  // Log samples.
+  for (QuerySampleResponse* response = responses; response < end; response++) {
+    loadgen::SampleMetadata* sample =
+        reinterpret_cast<loadgen::SampleMetadata*>(response->id);
+    loadgen::QueryMetadata* query = sample->query_metadata;
+    query->response_delegate->TokenComplete(sample, response, timestamp,
+                                            response_cb);
+  }
+  // PerfClock::time_point end_timestamp = PerfClock::now();
+  // mlperf::tokens_overhead_acum += (end_timestamp - timestamp).count();
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h
new file mode 100644
index 000000000..84e02656c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h
@@ -0,0 +1,103 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Provides the entry points for a SUT to start a test and respond
+/// to issued queries.
+
+#ifndef MLPERF_LOADGEN_LOADGEN_H_
+#define MLPERF_LOADGEN_LOADGEN_H_
+
+#include <cstddef>
+#include <functional>
+#include <numeric>
+#include <string>
+
+/// \brief Contains the loadgen API.
+namespace mlperf {
+
+struct QuerySampleResponse;
+class QuerySampleLibrary;
+class SystemUnderTest;
+struct TestSettings;
+struct LogSettings;
+
+using ResponseCallback = std::function<void(QuerySampleResponse*)>;
+
+/// \addtogroup LoadgenAPI Loadgen API
+/// @{
+
+///
+/// \brief SUT calls this to notify loadgen of completed samples.
+/// \details
+/// * The samples may be from any combination of queries or partial queries as
+///   issued by \link mlperf::SystemUnderTest::IssueQuery
+///
+///   SystemUnderTest::IssueQuery \endlink.
+/// * The SUT is responsible for owning and allocating the reponse data. The
+///   loadgen will copy the response data if needed (e.g. for accuracy mode).
+///   + If no response callback is provided, the response data must remain valid
+///     for the entire duration of this call.
+///   + The response callback is untimed; it is called for each response in
+///     responses after the loadgen records the completion time and before the
+///     loadgen copies the response data. The response callback enables the
+///     loadgen to simulate response data being stored in accelerator DRAM.
+///     After the response callback is called, response data must reside on the
+///     host so that the loadgen can copy it. Submitters must seek prior
+///     approval to use this feature of loadgen (refer to
+///     https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#5-load-generator).
+/// * All calls to QuerySampleComplete are thread-safe and wait-free bounded.
+///   + Any number of threads can call QuerySampleComplete simultaneously.
+///   + Regardless of where any other thread stalls, the current thread will
+///     finish QuerySampleComplete in a bounded number of cycles.
+///   + Note: If a callback is provided, the SUT must ensure that the callback
+///     is also thread-safe and wait-free bounded for the above to hold.
+void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count,
+                          const ResponseCallback& response_cb = {});
+
+void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count,
+                        const ResponseCallback& response_cb = {});
+
+///
+/// \brief Starts the test against SUT with the specified settings.
+/// \details This is the C++ entry point. See mlperf::c::StartTest for the
+/// C entry point.
+///
+void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl,
+               const TestSettings& requested_settings,
+               const LogSettings& log_settings,
+               const std::string audit_config_filename = "audit.config");
+
+///
+/// \brief Aborts the running test.
+/// \details This function will stop issueing new samples to the SUT. StartTest
+/// will return after the current inference finishes. Since StartTest is a
+/// blocking function, this function can only be called in another thread.
+void AbortTest();
+
+///
+/// \brief Register a thread for query issuing in Server scenario.
+/// \details If a thread registers itself, the thread(s) is used to call SUT's
+/// IssueQuery(). This function is blocking until the entire test is done. The
+/// number of registered threads must match server_num_issue_query_threads in
+/// TestSettings. This function only has effect in Server scenario.
+/// This is the C++ entry point. See mlperf::c::RegisterIssueQueryThread for the
+/// C entry point.
+///
+void RegisterIssueQueryThread();
+// inline long long samples_overhead_acum;
+// inline long long tokens_overhead_acum;
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_LOADGEN_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg
new file mode 100644
index 000000000..17dd1b481
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg
@@ -0,0 +1,85 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- Created by diasvg.py -->
+<svg width="29.100cm" height="12.100cm" viewBox="1.950 0.950 29.100 12.100"
+ xmlns="http://www.w3.org/2000/svg"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+<!-- Layer: Background -->
+<g id="Background">
+<!-- Flowchart - Box -->
+<rect x="2.000" y="2.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="6.000" y="3.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Model + Dataset</text>
+<!-- Flowchart - Box -->
+<rect x="2.000" y="6.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="6.000" y="7.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Pre Processor</text>
+<!-- Flowchart - Box -->
+<rect x="24.000" y="6.000" width="7.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="27.500" y="7.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Post Processor</text>
+<!-- Flowchart - Box -->
+<rect x="13.000" y="1.000" width="8.000" height="8.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="17.000" y="5.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Benchmark</text>
+<!-- Flowchart - Box -->
+<rect x="24.000" y="1.000" width="7.000" height="4.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="27.500" y="3.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+Backend</text>
+<!-- Flowchart - Box -->
+<rect x="13.000" y="11.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
+<text x="17.000" y="12.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+LoadGen</text>
+<!-- Standard - Line -->
+<line x1="10.000" y1="3.000" x2="12.513" y2="3.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="12.888,3.000 12.388,3.250 12.513,3.000 12.388,2.750 "/>
+<!-- Standard - Line -->
+<line x1="10.000" y1="7.000" x2="12.513" y2="7.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="12.888,7.000 12.388,7.250 12.513,7.000 12.388,6.750 "/>
+<!-- Standard - Line -->
+<line x1="15.000" y1="9.000" x2="15.000" y2="10.513" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="15.000,10.888 14.750,10.388 15.000,10.513 15.250,10.388 "/>
+<!-- Standard - Line -->
+<line x1="17.000" y1="11.000" x2="17.000" y2="9.487" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="17.000,9.112 17.250,9.612 17.000,9.487 16.750,9.612 "/>
+<!-- Standard - Line -->
+<line x1="19.000" y1="9.000" x2="19.000" y2="10.513" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="19.000,10.888 18.750,10.388 19.000,10.513 19.250,10.388 "/>
+<!-- Standard - Line -->
+<line x1="21.000" y1="3.000" x2="23.513" y2="3.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="23.888,3.000 23.388,3.250 23.513,3.000 23.388,2.750 "/>
+<!-- Standard - Line -->
+<line x1="24.000" y1="7.000" x2="21.487" y2="7.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="21.112,7.000 21.612,6.750 21.487,7.000 21.612,7.250 "/>
+<!-- Standard - Text -->
+<text x="10.850" y="2.650"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+1</text>
+<!-- Standard - Text -->
+<!-- Standard - Text -->
+<text x="14.100" y="9.950"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+2</text>
+<!-- Standard - Text -->
+<text x="16.100" y="10.450"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+3</text>
+<!-- Standard - Text -->
+<text x="18.100" y="9.950"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+5</text>
+<!-- Standard - Text -->
+<text x="21.950" y="2.650"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+4</text>
+<!-- Standard - Text -->
+<!-- Standard - Text -->
+<text x="24.500" y="12.221"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+LoadGen Logs</text>
+<!-- Standard - Line -->
+<line x1="21.000" y1="12.000" x2="23.513" y2="12.000" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="23.888,12.000 23.388,12.250 23.513,12.000 23.388,11.750 "/>
+<!-- Standard - Text -->
+<text x="21.900" y="11.450"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
+6</text>
+<!-- Standard - Text -->
+<!-- Standard - Text -->
+<!-- Standard - Line -->
+<line x1="27.500" y1="5.000" x2="27.500" y2="5.513" stroke="#000000" stroke-width="0.100" />
+<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="27.500,5.888 27.250,5.388 27.500,5.513 27.750,5.388 "/>
+</g>
+</svg>
\ No newline at end of file
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc
new file mode 100644
index 000000000..807c1954a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc
@@ -0,0 +1,1301 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Implements a logging system with a central IO thread that handles
+/// all stringification and IO.
+/// \details Log-producing threads only submit lambdas to be executed on the
+/// IO thread.
+/// All producers and consumers use lock-free operations that guarantee
+/// forward progress independent of a) other stalled threads and b) where
+/// those threads are stalled.
+/// Each thread uses a double-buffering scheme to queue its logs. One buffer
+/// is always reserved for writes and the other is reserved for reads.
+/// A producing thread sends requests to the IOThread to swap the buffers
+/// and the IOThread does the actual read/write swap after it has finished
+/// reading the buffer it was working on.
+
+#include "logging.h"
+
+#include <cassert>
+#include <cmath>
+#include <future>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <sstream>
+
+#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
+#define WIN32_LEAN_AND_MEAN
+#define NOMINMAX
+#include <process.h>
+#include <windows.h>
+#define MLPERF_GET_PID() _getpid()
+#else
+#include <unistd.h>
+#define MLPERF_GET_PID() getpid()
+#endif
+
+// Use system-level TID for tracing. This enables correlation with other
+// performance tools that are not aware of C++ std::thread::id.
+#if defined(__linux__)
+#include <sys/syscall.h>
+#define MLPERF_GET_TID() syscall(SYS_gettid)
+#elif defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
+#define MLPERF_GET_TID() GetCurrentThreadId()
+#elif defined(__APPLE__)
+#define MLPERF_GET_TID() \
+  std::hash<std::thread::id>{}(std::this_thread::get_id())
+#else
+// TODO: std::this_thread::id is a class but MLPERF_GET_TID() assigned to
+// uint64_t
+#define MLPERF_GET_TID() std::this_thread::get_id()
+#endif
+
+#include "utils.h"
+
+namespace mlperf {
+namespace logging {
+
+namespace {
+
+uintptr_t SwapRequestSlotIsWritableValue(size_t id) {
+  // LSB of 1 indicates that this isn't a pointer.
+  // MSBs encode the id to detect collisions when a slot in
+  // |thread_swap_request_slots_| is reused for a different id and the request
+  // for the previous id is very slow.
+  return (id << 1) | 0x1;
+}
+
+bool SwapRequestSlotIsReadable(uintptr_t value) {
+  // Valid pointers will not have their lsb set.
+  return (value & 0x1) != 0x1;
+}
+
+constexpr size_t kMaxThreadsToLog = 1024;
+constexpr std::chrono::milliseconds kLogPollPeriod(10);
+
+/// \brief How many log entries to pre-allocate per thread to help avoid
+/// runtime allocation.
+constexpr size_t kTlsLogReservedEntryCount = 1024;
+
+constexpr auto kInvalidLatency = std::numeric_limits<QuerySampleLatency>::min();
+constexpr auto nTokenInvalid = std::numeric_limits<int64_t>::min();
+
+}  // namespace
+
+const std::string& ArgValueTransform(const bool& value) {
+  static const std::string v_true("true");
+  static const std::string v_false("false");
+  return value ? v_true : v_false;
+}
+
+char Bin2Hex(uint8_t four_bits) {
+  char number = '0' + four_bits;
+  char letter = ('A' - 10) + four_bits;
+  return four_bits < 10 ? number : letter;
+}
+
+const std::string ArgValueTransform(const LogBinaryAsHexString& value) {
+  if (value.data == nullptr) {
+    return "\"\"";
+  }
+  std::string hex;
+  hex.reserve(value.data->size() + 2);
+  hex.push_back('"');
+  for (auto b : *value.data) {
+    hex.push_back(Bin2Hex(b >> 4));
+    hex.push_back(Bin2Hex(b & 0x0F));
+  }
+  hex.push_back('"');
+  return hex;
+}
+
+#if USE_NEW_LOGGING_FORMAT
+const std::string ArgValueTransform(const std::string& value) {
+  return std::string("\"") + value + std::string("\"");
+}
+
+const std::string ArgValueTransform(const char* value) {
+  return std::string("\"") + std::string(value) + std::string("\"");
+}
+
+const std::string ArgValueTransform(const std::vector<size_t>& value) {
+  std::string s("[");
+  for (auto i : value) {
+    s += std::to_string(i) + ",";
+  }
+  s.resize(s.size() - 1);
+  s += "]";
+  return s;
+}
+
+const std::string ArgValueTransform(
+    const std::map<std::string, std::string>& value) {
+  std::string s("{");
+  for (const auto& i : value) {
+    s += "\"";
+    s += i.first;
+    s += "\":\"";
+    s += i.second;
+    s += "\",";
+  }
+  s.resize(s.size() - 1);
+  s += "}";
+  return s;
+}
+
+const std::string ArgValueTransform(const float value) {
+  if (value == std::numeric_limits<float>::infinity()) {
+    return "Infinity";
+  } else if (value == -std::numeric_limits<float>::infinity()) {
+    return "-Infinity";
+  } else if (std::isnan(value)) {
+    return "NaN";
+  }
+  return std::to_string(value);
+}
+
+const std::string ArgValueTransform(const double value) {
+  if (value == std::numeric_limits<double>::infinity()) {
+    return "Infinity";
+  } else if (value == -std::numeric_limits<double>::infinity()) {
+    return "-Infinity";
+  } else if (std::isnan(value)) {
+    return "NaN";
+  }
+  return std::to_string(value);
+}
+#endif
+
+ChromeTracer::ChromeTracer(std::ostream* out, PerfClock::time_point origin)
+    : out_(out), origin_(origin) {
+  WriteTraceEventHeader();
+}
+
+ChromeTracer::~ChromeTracer() {
+  WriteTraceEventFooter();
+  out_->flush();
+}
+
+void ChromeTracer::WriteTraceEventHeader() {
+  // Times and durations are converted from nanoseconds to microseconds, use
+  // 3 decimal digits to preserve precision.
+  *out_ << std::fixed << std::setprecision(3) << "{\"traceEvents\":[\n";
+}
+
+void ChromeTracer::WriteTraceEventFooter() {
+  *out_ << "{\"name\":\"LastTrace\"}\n"
+        << "],\n"
+        << "\"displayTimeUnit\":\"ns\",\n"
+        << "\"otherData\":{\n"
+        << "\"ts\":" << Micros(origin_.time_since_epoch()).count() << ",\n"
+        << "\"version\":\"MLPerf LoadGen v1.0\"\n"
+        << "}\n"
+        << "}\n";
+}
+
+void AsyncLog::SetCurrentPidTid(uint64_t pid, uint64_t tid) {
+  current_pid_ = pid;
+  current_tid_ = tid;
+}
+
+void AsyncLog::SetLogFiles(std::ostream* summary, std::ostream* detail,
+                           std::ostream* accuracy, bool copy_detail_to_stdout,
+                           bool copy_summary_to_stdout,
+                           PerfClock::time_point log_origin) {
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  if (summary_out_ != &std::cerr) {
+    std::string warning_summary;
+    if (log_warning_count_ == 0) {
+      warning_summary = "\nNo warnings encountered during test.\n";
+    } else if (log_warning_count_ == 1) {
+      warning_summary = "\n1 warning encountered. See detailed log.\n";
+    } else if (log_warning_count_ != 0) {
+      warning_summary = "\n" + std::to_string(log_warning_count_) +
+                        " warnings encountered. See detailed log.\n";
+    }
+
+    std::string error_summary;
+    if (log_error_count_ == 0) {
+      error_summary = "\nNo errors encountered during test.\n";
+    } else if (log_error_count_ == 1) {
+      error_summary = "\n1 ERROR encountered. See detailed log.\n";
+    } else if (log_error_count_ != 0) {
+      error_summary = "\n" + std::to_string(log_error_count_) +
+                      " ERRORS encountered. See detailed log.\n";
+    }
+
+    *summary_out_ << warning_summary << error_summary;
+    if (copy_summary_to_stdout_) {
+      std::cout << warning_summary << error_summary;
+    }
+  }
+  if (summary_out_) {
+    summary_out_->flush();
+  }
+  if (detail_out_) {
+    detail_out_->flush();
+  }
+  if (accuracy_out_ != &std::cerr) {
+    WriteAccuracyFooterLocked();
+    accuracy_out_->flush();
+  }
+  summary_out_ = summary;
+  detail_out_ = detail;
+  accuracy_out_ = accuracy;
+  if (accuracy_out_ != &std::cerr) {
+    WriteAccuracyHeaderLocked();
+  }
+  copy_detail_to_stdout_ = copy_detail_to_stdout;
+  copy_summary_to_stdout_ = copy_summary_to_stdout;
+  log_origin_ = log_origin;
+  log_error_count_ = 0;
+  log_warning_count_ = 0;
+}
+
+void AsyncLog::StartNewTrace(std::ostream* trace_out,
+                             PerfClock::time_point origin) {
+  std::unique_lock<std::mutex> lock(trace_mutex_);
+  if (trace_out) {
+    tracer_ = std::make_unique<ChromeTracer>(trace_out, origin);
+  } else {
+    tracer_.reset();
+  }
+}
+
+void AsyncLog::StopTrace() {
+  std::unique_lock<std::mutex> lock(trace_mutex_);
+  tracer_.reset();
+}
+
+void AsyncLog::LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx,
+                           const LogBinaryAsHexString& response,
+                           int64_t n_tokens = 0) {
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  if (!accuracy_out_) {
+    return;
+  }
+  *accuracy_out_ << (accuracy_needs_comma_ ? ",\n{ " : "\n{ ");
+  if (!use_tokens_) {
+    LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
+            response);
+  } else if (!needs_first_token_) {
+    LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
+            response, "token_count", n_tokens);
+  } else {
+    const size_t i = seq_id - latencies_first_sample_sequence_id_;
+    LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
+            response, "token_data", token_records_[i], "token_count", n_tokens);
+  }
+
+  *accuracy_out_ << " }";
+  accuracy_needs_comma_ = true;
+}
+
+void AsyncLog::CacheToken(uint64_t seq_id,
+                          const LogBinaryAsHexString& response) {
+  std::unique_lock<std::mutex> lock(token_record_mutex_);
+  const size_t i = seq_id - latencies_first_sample_sequence_id_;
+  if (token_records_.size() <= i) {
+    token_records_.resize(i + 1);
+  }
+  token_records_[i] = response;
+}
+
+void AsyncLog::Flush() {
+  {
+    std::unique_lock<std::mutex> lock(log_mutex_);
+    if (summary_out_) {
+      summary_out_->flush();
+    }
+    if (detail_out_) {
+      detail_out_->flush();
+    }
+    if (accuracy_out_) {
+      accuracy_out_->flush();
+    }
+  }
+
+  {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->Flush();
+    }
+  }
+}
+
+void AsyncLog::WriteAccuracyHeaderLocked() {
+  *accuracy_out_ << "[";
+  accuracy_needs_comma_ = false;
+}
+
+void AsyncLog::WriteAccuracyFooterLocked() { *accuracy_out_ << "\n]\n"; }
+
+void AsyncLog::RestartLatencyRecording(uint64_t first_sample_sequence_id,
+                                       size_t latencies_to_reserve) {
+  std::unique_lock<std::mutex> lock(latencies_mutex_);
+  assert(latencies_.empty());
+  assert(latencies_recorded_ == latencies_expected_);
+  latencies_recorded_ = 0;
+  latencies_expected_ = 0;
+  max_latency_ = 0;
+  max_completion_timstamp_ = PerfClock::now();
+  latencies_first_sample_sequence_id_ = first_sample_sequence_id;
+  latencies_.reserve(latencies_to_reserve);
+  token_latencies_.reserve(latencies_to_reserve);
+  tokens_per_sample_.reserve(latencies_to_reserve);
+  time_per_output_token_.reserve(latencies_to_reserve);
+}
+
+void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id,
+                                      PerfClock::time_point completion_time,
+                                      QuerySampleLatency latency,
+                                      int64_t n_tokens = 0) {
+  std::unique_lock<std::mutex> lock(latencies_mutex_);
+
+  max_latency_ = std::max(max_latency_, latency);
+
+  max_completion_timstamp_ =
+      std::max(max_completion_timstamp_, completion_time);
+
+  if (sample_sequence_id < latencies_first_sample_sequence_id_) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Received completion for an old sample."
+       << " Min expected id: " << latencies_first_sample_sequence_id_
+       << " Actual id: " << sample_sequence_id;
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
+#else
+    GlobalLogger().LogErrorSync(
+        "Received completion for an old sample.", "Min expected id",
+        latencies_first_sample_sequence_id_, "Actual id", sample_sequence_id);
+#endif
+    return;
+  }
+
+  const size_t i = sample_sequence_id - latencies_first_sample_sequence_id_;
+
+  if (latencies_.size() <= i) {
+    // TODO: Reserve in advance.
+    latencies_.resize(i + 1, kInvalidLatency);
+  } else if (latencies_[i] != kInvalidLatency) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                          "Attempted to complete a sample twice.");
+#else
+    GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
+#endif
+
+    // Return without recording the latency again to avoid potentially
+    // ending the test before the SUT is actually done, which could result
+    // in a segfault.
+    // If the SUT recorded the wrong sample, the test will hang and see
+    // the error above.
+    return;
+  }
+
+  if (use_tokens_) {
+    if (needs_first_token_ && (token_latencies_.size() <= i)) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to record a sample latency before it's "
+                            "first token latency");
+    } else if (needs_first_token_ && (token_latencies_[i] == kInvalidLatency)) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to record a sample latency before it's "
+                            "first token latency");
+    }
+
+    if (tokens_per_sample_.size() <= i) {
+      // TODO: Reserve in advance.
+      tokens_per_sample_.resize(i + 1, nTokenInvalid);
+    } else if (tokens_per_sample_[i] != nTokenInvalid) {
+      // Call LogErrorSync here since this kind of error could result in a
+      // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to complete a sample twice.");
+#else
+      GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
+#endif
+
+      // Return without recording the latency again to avoid potentially
+      // ending the test before the SUT is actually done, which could result
+      // in a segfault.
+      // If the SUT recorded the wrong sample, the test will hang and see
+      // the error above.
+      return;
+    }
+    if (n_tokens == 0) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "n_tokens argument missing or attempted to record "
+                            "0 as number of tokens");
+    } else if (n_tokens < 0) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to record a negative number of tokens");
+      n_tokens = 0;
+    } else if (n_tokens == 1) {
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Number of tokens need to be greater than 1");
+      n_tokens = 0;
+    }
+    if (time_per_output_token_.size() <= i) {
+      time_per_output_token_.resize(i + 1, kInvalidLatency);
+    } else if (time_per_output_token_[i] != kInvalidLatency) {
+      // Call LogErrorSync here since this kind of error could result in a
+      // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                            "Attempted to complete a sample twice.");
+#else
+      GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
+#endif
+
+      // Return without recording the latency again to avoid potentially
+      // ending the test before the SUT is actually done, which could result
+      // in a segfault.
+      // If the SUT recorded the wrong sample, the test will hang and see
+      // the error above.
+      return;
+    }
+    tokens_per_sample_[i] = n_tokens;
+    time_per_output_token_[i] =
+        (latency - token_latencies_[i]) / (n_tokens - 1);
+  }
+  latencies_[i] = latency;
+  latencies_recorded_++;
+  if (AllLatenciesRecorded()) {
+    all_latencies_recorded_.notify_all();
+  }
+}
+
+void AsyncLog::RecordTokenCompletion(uint64_t sample_sequence_id,
+                                     PerfClock::time_point completion_time,
+                                     QuerySampleLatency latency) {
+  std::unique_lock<std::mutex> lock(token_latencies_mutex_);
+  // std::unique_lock<std::mutex> lock(latencies_mutex_);
+  // max_latency_ = std::max(max_latency_, latency);
+
+  // max_completion_timstamp_ =
+  //     std::max(max_completion_timstamp_, completion_time);
+
+  if (sample_sequence_id < latencies_first_sample_sequence_id_) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Received completion for an old sample."
+       << " Min expected id: " << latencies_first_sample_sequence_id_
+       << " Actual id: " << sample_sequence_id;
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
+#else
+    GlobalLogger().LogErrorSync(
+        "Received completion for an old sample.", "Min expected id",
+        latencies_first_sample_sequence_id_, "Actual id", sample_sequence_id);
+#endif
+    return;
+  }
+
+  const size_t i = sample_sequence_id - latencies_first_sample_sequence_id_;
+
+  if (latencies_.size() > i) {
+    if (latencies_[i] != kInvalidLatency) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR_SYNC(
+          GlobalLogger(), "error_runtime",
+          "Attempted to record token latency after sample was completed");
+#else
+      GlobalLogger().LogErrorSync(
+          "Attempted to record token latency after sample was completed");
+#endif
+
+      // Return without recording the latency again to avoid potentially
+      // ending the test before the SUT is actually done, which could result
+      // in a segfault.
+      // If the SUT recorded the wrong sample, the test will hang and see
+      // the error above.
+      return;
+    }
+  }
+  if (token_latencies_.size() <= i) {
+    // TODO: Reserve in advance.
+    token_latencies_.resize(i + 1, kInvalidLatency);
+  } else if (token_latencies_[i] != kInvalidLatency) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
+                          "Attempted to complete a sample twice.");
+#else
+    GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
+#endif
+
+    // Return without recording the latency again to avoid potentially
+    // ending the test before the SUT is actually done, which could result
+    // in a segfault.
+    // If the SUT recorded the wrong sample, the test will hang and see
+    // the error above.
+    return;
+  }
+  token_latencies_[i] = latency;
+}
+
+std::vector<QuerySampleLatency> AsyncLog::GetLatenciesBlocking(
+    size_t expected_count) {
+  std::vector<QuerySampleLatency> latencies;
+  {
+    std::unique_lock<std::mutex> lock(latencies_mutex_);
+    latencies_expected_ = expected_count;
+    all_latencies_recorded_.wait(lock, [&] { return AllLatenciesRecorded(); });
+    latencies.swap(latencies_);
+  }
+
+  if (latencies.size() != expected_count) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Received SequenceId that was too large."
+       << " expected_size: " << expected_count
+       << " actual_size: " << latencies.size();
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
+#else
+    GlobalLogger().LogErrorSync("Received SequenceId that was too large.",
+                                "expected_size", expected_count, "actual_size",
+                                latencies.size());
+#endif
+  }
+
+  size_t invalid_latency_count = 0;
+  for (auto l : latencies) {
+    if (l == kInvalidLatency) {
+      invalid_latency_count++;
+    }
+  }
+  if (invalid_latency_count != 0) {
+    // Call LogErrorSync here since this kind of error could result in a
+    // segfault in the near future.
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream ss;
+    ss << "Encountered incomplete samples at the end of a series of queries."
+       << " count: " << invalid_latency_count;
+    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
+#else
+    GlobalLogger().LogErrorSync(
+        "Encountered incomplete samples at the end of a series of queries.",
+        "count", invalid_latency_count);
+#endif
+  }
+
+  return latencies;
+}
+
+std::vector<QuerySampleLatency> AsyncLog::GetTokenLatencies(
+    size_t expected_count) {
+  std::vector<QuerySampleLatency> token_latencies;
+  token_latencies.swap(token_latencies_);
+  return token_latencies;
+}
+
+std::vector<QuerySampleLatency> AsyncLog::GetTimePerOutputToken(
+    size_t expected_count) {
+  std::vector<QuerySampleLatency> tpot_latencies;
+  tpot_latencies.swap(time_per_output_token_);
+  return tpot_latencies;
+}
+
+std::vector<int64_t> AsyncLog::GetTokensPerSample(size_t expected_count) {
+  std::vector<int64_t> tokens_per_sample;
+  tokens_per_sample.swap(tokens_per_sample_);
+  return tokens_per_sample;
+}
+
+PerfClock::time_point AsyncLog::GetMaxCompletionTime() {
+  return max_completion_timstamp_;
+}
+
+QuerySampleLatency AsyncLog::GetMaxLatencySoFar() {
+  std::unique_lock<std::mutex> lock(latencies_mutex_);
+  return max_latency_;
+}
+
+void AsyncLog::SetUseTokens(bool use_tokens) { use_tokens_ = use_tokens; }
+
+void AsyncLog::SetNeedsFirstToken(bool needs_first_token) {
+  needs_first_token_ = needs_first_token;
+}
+
+/// \brief Records a single thread using thread-local storage and submits
+/// entries to the central Logger.
+///
+/// \details This setup allows for each log entry to be added:
+///   * With forward-progress guarantees. (i.e.: no locking or blocking
+///       operations even if other threads have stalled.)
+///   * Without expensive syscalls or I/O operations, which are deferred to
+///       the central Logger.
+class TlsLogger {
+ public:
+  TlsLogger(std::function<void()> forced_detatch);
+  ~TlsLogger();
+  void ForcedDetatchFromThread() { forced_detatch_(); }
+
+  void Log(AsyncLogEntry&& entry);
+  void SwapBuffers();
+
+  std::vector<AsyncLogEntry>* StartReadingEntries();
+  void FinishReadingEntries();
+  bool ReadBufferHasBeenConsumed();
+  size_t MaxEntryVectorSize() { return max_entry_size_; }
+
+  uint64_t Pid() const { return pid_; }
+  uint64_t Tid() const { return tid_; }
+
+  void RequestSwapBuffersSlotRetried() {
+    swap_buffers_slot_retry_count_.fetch_add(1, std::memory_order_relaxed);
+  }
+
+  size_t ReportLogCasFailCount() {
+    size_t c = log_cas_fail_count_.load(std::memory_order_relaxed);
+    log_cas_fail_count_.fetch_sub(c, std::memory_order_relaxed);
+    return c;
+  }
+
+  size_t ReportSwapBuffersSlotRetryCount() {
+    size_t c = swap_buffers_slot_retry_count_.load(std::memory_order_relaxed);
+    swap_buffers_slot_retry_count_.fetch_sub(c, std::memory_order_relaxed);
+    return c;
+  }
+
+  void TraceCounters();
+
+ private:
+  using EntryVector = std::vector<AsyncLogEntry>;
+  enum class EntryState { Unlocked, ReadLock, WriteLock };
+
+  // Accessed by producer only.
+  size_t i_read_ = 0;
+
+  // Accessed by producer and consumer atomically.
+  EntryVector entries_[2];
+  std::atomic<EntryState> entry_states_[2]{{EntryState::ReadLock},
+                                           {EntryState::Unlocked}};
+  std::atomic<size_t> i_write_{1};
+
+  std::atomic<size_t> log_cas_fail_count_{0};
+  std::atomic<size_t> swap_buffers_slot_retry_count_{0};
+
+  // Accessed by consumer only.
+  size_t unread_swaps_ = 0;
+  size_t i_write_prev_ = 0;
+  uint64_t pid_;
+  uint64_t tid_;
+  size_t max_entry_size_ = kTlsLogReservedEntryCount;
+
+  std::function<void()> forced_detatch_;
+};
+
+Logger::Logger(std::chrono::duration<double> poll_period,
+               size_t max_threads_to_log)
+    : poll_period_(poll_period),
+      max_threads_to_log_(max_threads_to_log),
+      thread_swap_request_slots_(max_threads_to_log * 2) {
+  const size_t kSlotCount = max_threads_to_log * 2;
+  for (size_t i = 0; i < kSlotCount; i++) {
+    std::atomic_init(&thread_swap_request_slots_[i],
+                     SwapRequestSlotIsWritableValue(i));
+  }
+}
+
+Logger::~Logger() {
+  // TlsLoggers might outlive this Logger when loaded as a python module.
+  // Forcefully make all currently registered TlsLoggers orphans.
+  std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
+  TlsLogger* tls_logger_prev = nullptr;
+  (void)tls_logger_prev;  // Avoid unused error in release builds.
+  while (!tls_loggers_registerd_.empty()) {
+    TlsLogger* tls_logger = *tls_loggers_registerd_.begin();
+    // Otherwise, this is an infinite loop.
+    assert(tls_logger != tls_logger_prev);
+    tls_loggers_registerd_mutex_.unlock();
+    tls_logger->ForcedDetatchFromThread();
+    tls_loggers_registerd_mutex_.lock();
+    tls_logger_prev = tls_logger;
+  }
+}
+
+void Logger::RequestSwapBuffers(TlsLogger* tls_logger) {
+  auto tls_logger_as_uint = reinterpret_cast<uintptr_t>(tls_logger);
+  assert(SwapRequestSlotIsReadable(tls_logger_as_uint));
+  size_t id, slot;
+  uintptr_t slot_is_writeable_value;
+  // The compare_exchange below should almost always succeed.
+  // The compare_exchange may fail if a recycled slot is still actively used
+  // by another thread, so we retry with subsequent slots here if needed.
+  // Since the slot count is 2x the expected number of threads to log,
+  // the CAS should only fail at most 50% of the time when all logging threads
+  // happen to be descheduled between the fetch_add and CAS below, which is
+  // very unlikely.
+  id = swap_request_id_.fetch_add(1, std::memory_order_relaxed);
+  slot = id % thread_swap_request_slots_.size();
+  slot_is_writeable_value = SwapRequestSlotIsWritableValue(id);
+  while (!thread_swap_request_slots_[slot].compare_exchange_strong(
+      slot_is_writeable_value, tls_logger_as_uint, std::memory_order_release)) {
+    id = swap_request_id_.fetch_add(1, std::memory_order_relaxed);
+    slot = id % thread_swap_request_slots_.size();
+    slot_is_writeable_value = SwapRequestSlotIsWritableValue(id);
+    tls_logger->RequestSwapBuffersSlotRetried();
+  }
+}
+
+void Logger::RegisterTlsLogger(TlsLogger* tls_logger) {
+  std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
+  if (tls_loggers_registerd_.size() >= max_threads_to_log_) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_ERROR_SYNC((*this), "error_runtime",
+                          "Warning: More TLS loggers registerd than can be "
+                          "active simultaneously.");
+#else
+    LogErrorSync(
+        "Warning: More TLS loggers registerd than can "
+        "be active simultaneously.\n");
+#endif
+  }
+  tls_loggers_registerd_.insert(tls_logger);
+}
+
+// This moves ownership of the tls_logger data to Logger so the
+// exiting thread can exit immediately, even if all the logs of the
+// exiting thread haven't been processed.
+void Logger::UnRegisterTlsLogger(std::unique_ptr<TlsLogger> tls_logger) {
+  OrphanContainer::iterator orphan;
+  {
+    std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
+    tls_logger_orphans_.emplace_front(std::move(tls_logger));
+    orphan = tls_logger_orphans_.begin();
+  }
+
+  // Only remove the TlsLogger from the registry after adding to orphans so
+  // CollectTlsLoggerStats doesn't have any gaps in coverage.
+  {
+    std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
+    tls_loggers_registerd_.erase(orphan->get());
+  }
+
+  // This will flush the logs of |tls_logger| and mark it for destruction.
+  // Deferring destruction via orphans_to_destroy helps avoid use-after-frees
+  // when the IOThread calls FinishReadingEntries.
+  (*orphan)->Log([this, orphan](AsyncLog&) {
+    CollectTlsLoggerStats(orphan->get());
+    orphans_to_destroy_.push_back(orphan);
+  });
+}
+
+void Logger::CollectTlsLoggerStats(TlsLogger* tls_logger) {
+  tls_total_log_cas_fail_count_ += tls_logger->ReportLogCasFailCount();
+  tls_total_swap_buffers_slot_retry_count_ +=
+      tls_logger->ReportSwapBuffersSlotRetryCount();
+
+  size_t max_entry_vector_size = tls_logger->MaxEntryVectorSize();
+  if (max_entry_vector_size > kTlsLogReservedEntryCount) {
+#if USE_NEW_LOGGING_FORMAT
+    std::stringstream msg;
+    msg << "Logging allocation detected:" << " tid: " << tls_logger->Tid()
+        << " reserved_entries: " << kTlsLogReservedEntryCount
+        << " max_entries: " << max_entry_vector_size;
+    MLPERF_LOG_WARNING((*this), "warning_generic_message", msg.str());
+#else
+    async_logger_.FlagWarning();
+    async_logger_.LogDetail("Logging allocation detected: ", "tid",
+                            tls_logger->Tid(), "reserved_entries",
+                            kTlsLogReservedEntryCount, "max_entries",
+                            max_entry_vector_size);
+#endif
+  }
+}
+
+void Logger::StartIOThread() {
+  {
+    std::unique_lock<std::mutex> lock(io_thread_mutex_);
+    keep_io_thread_alive_ = true;
+  }
+  io_thread_ = std::thread(&Logger::IOThread, this);
+}
+
+void Logger::StopIOThread() {
+  {
+    std::unique_lock<std::mutex> lock(io_thread_mutex_);
+    keep_io_thread_alive_ = false;
+    io_thread_cv_.notify_all();
+  }
+  io_thread_.join();
+}
+
+void Logger::StartLogging(std::ostream* summary, std::ostream* detail,
+                          std::ostream* accuracy, bool copy_detail_to_stdout,
+                          bool copy_summary_to_stdout) {
+  async_logger_.SetLogFiles(summary, detail, accuracy, copy_detail_to_stdout,
+                            copy_summary_to_stdout, PerfClock::now());
+}
+
+void Logger::StopLogging() {
+  if (std::this_thread::get_id() == io_thread_.get_id()) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_ERROR_SYNC((*this), "error_runtime",
+                          "StopLogging() not supported from IO thread.");
+#else
+    LogErrorSync("StopLogging() not supported from IO thread.");
+#endif
+    return;
+  }
+
+  // Flush logs from this thread.
+  std::promise<void> io_thread_flushed_this_thread;
+  Log([&](AsyncLog&) { io_thread_flushed_this_thread.set_value(); });
+  io_thread_flushed_this_thread.get_future().wait();
+  async_logger_.SetLogFiles(&std::cerr, &std::cerr, &std::cerr, false, false,
+                            PerfClock::now());
+}
+
+void Logger::StartNewTrace(std::ostream* trace_out,
+                           PerfClock::time_point origin) {
+  async_logger_.StartNewTrace(trace_out, origin);
+}
+
+void Logger::StopTracing() {
+  // Flush traces from this thread.
+  std::promise<void> io_thread_flushed_this_thread;
+  Log([&](AsyncLog&) { io_thread_flushed_this_thread.set_value(); });
+  io_thread_flushed_this_thread.get_future().wait();
+  async_logger_.StopTrace();
+}
+
+void Logger::LogContentionAndAllocations() {
+  LogDetail([&](AsyncDetail& detail) {
+    {
+      std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
+      for (auto tls_logger : tls_loggers_registerd_) {
+        CollectTlsLoggerStats(tls_logger);
+      }
+    }
+
+    {
+      std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
+      for (auto& orphan : tls_logger_orphans_) {
+        CollectTlsLoggerStats(orphan.get());
+      }
+    }
+
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "logger_swap_request_slots_retry_count",
+               swap_request_slots_retry_count_);
+    MLPERF_LOG(detail, "logger_swap_request_slots_retry_retry_count",
+               swap_request_slots_retry_retry_count_);
+    MLPERF_LOG(detail, "logger_swap_request_slots_retry_reencounter_count",
+               swap_request_slots_retry_reencounter_count_);
+    MLPERF_LOG(detail, "logger_start_reading_entries_retry_count",
+               start_reading_entries_retry_count_);
+    MLPERF_LOG(detail, "logger_tls_total_log_cas_fail_count",
+               tls_total_log_cas_fail_count_);
+    MLPERF_LOG(detail, "logger_tls_total_swap_buffers_slot_retry_count",
+               tls_total_swap_buffers_slot_retry_count_);
+#else
+    detail("Log Contention Counters:");
+    detail(std::to_string(swap_request_slots_retry_count_) +
+           " : swap_request_slots_retry_count");
+    detail(std::to_string(swap_request_slots_retry_retry_count_) +
+           " : swap_request_slots_retry_retry_count");
+    detail(std::to_string(swap_request_slots_retry_reencounter_count_) +
+           " : swap_request_slots_retry_reencounter_count");
+    detail(std::to_string(start_reading_entries_retry_count_) +
+           " : start_reading_entries_retry_count");
+    detail(std::to_string(tls_total_log_cas_fail_count_) +
+           " : tls_total_log_cas_fail_count");
+    detail(std::to_string(tls_total_swap_buffers_slot_retry_count_) +
+           " : tls_total_swap_buffers_slot_retry_count");
+#endif
+
+    swap_request_slots_retry_count_ = 0;
+    swap_request_slots_retry_retry_count_ = 0;
+    swap_request_slots_retry_reencounter_count_ = 0;
+    start_reading_entries_retry_count_ = 0;
+    tls_total_log_cas_fail_count_ = 0;
+    tls_total_swap_buffers_slot_retry_count_ = 0;
+  });
+}
+
+void Logger::RestartLatencyRecording(uint64_t first_sample_sequence_id,
+                                     size_t latencies_to_reserve) {
+  async_logger_.RestartLatencyRecording(first_sample_sequence_id,
+                                        latencies_to_reserve);
+}
+
+std::vector<QuerySampleLatency> Logger::GetLatenciesBlocking(
+    size_t expected_count) {
+  return async_logger_.GetLatenciesBlocking(expected_count);
+}
+std::vector<QuerySampleLatency> Logger::GetTokenLatencies(
+    size_t expected_count) {
+  return async_logger_.GetTokenLatencies(expected_count);
+}
+std::vector<QuerySampleLatency> Logger::GetTimePerOutputToken(
+    size_t expected_count) {
+  return async_logger_.GetTimePerOutputToken(expected_count);
+}
+std::vector<QuerySampleLatency> Logger::GetTokensPerSample(
+    size_t expected_count) {
+  return async_logger_.GetTokensPerSample(expected_count);
+}
+
+PerfClock::time_point Logger::GetMaxCompletionTime() {
+  return async_logger_.GetMaxCompletionTime();
+}
+
+QuerySampleLatency Logger::GetMaxLatencySoFar() {
+  return async_logger_.GetMaxLatencySoFar();
+}
+
+void Logger::SetUseTokens(bool use_tokens) {
+  async_logger_.SetUseTokens(use_tokens);
+}
+
+void Logger::SetNeedsFirstToken(bool needs_first_token) {
+  async_logger_.SetNeedsFirstToken(needs_first_token);
+}
+
+TlsLogger* Logger::GetTlsLoggerThatRequestedSwap(size_t slot, size_t next_id) {
+  uintptr_t slot_value = thread_swap_request_slots_[slot].load();
+  if (SwapRequestSlotIsReadable(slot_value)) {
+    // TODO: Convert this block to a simple write once we are confidient
+    // that we don't need to check for success.
+    bool success = thread_swap_request_slots_[slot].compare_exchange_strong(
+        slot_value, SwapRequestSlotIsWritableValue(next_id));
+    if (!success) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_WARNING((*this), "warning_generic_message", "CAS failed.");
+#else
+      LogErrorSync("CAS failed.", "line", __LINE__);
+#endif
+      assert(success);
+    }
+    return reinterpret_cast<TlsLogger*>(slot_value);
+  }
+  return nullptr;
+}
+
+void Logger::GatherRetrySwapRequests(std::vector<TlsLogger*>* threads_to_swap) {
+  if (swap_request_slots_to_retry_.empty()) {
+    return;
+  }
+
+  std::vector<SlotRetry> retry_slots;
+  retry_slots.swap(swap_request_slots_to_retry_);
+  for (auto& slot_retry : retry_slots) {
+    TlsLogger* tls_logger =
+        GetTlsLoggerThatRequestedSwap(slot_retry.slot, slot_retry.next_id);
+    if (tls_logger) {
+      threads_to_swap->push_back(tls_logger);
+    } else {
+      swap_request_slots_to_retry_.push_back(slot_retry);
+      swap_request_slots_retry_retry_count_++;
+    }
+  }
+}
+
+void Logger::GatherNewSwapRequests(std::vector<TlsLogger*>* threads_to_swap) {
+  auto swap_request_end = swap_request_id_.load(std::memory_order_acquire);
+  for (; swap_request_id_read_ < swap_request_end; swap_request_id_read_++) {
+    size_t slot = swap_request_id_read_ % thread_swap_request_slots_.size();
+    size_t next_id = swap_request_id_read_ + thread_swap_request_slots_.size();
+    TlsLogger* tls_logger = GetTlsLoggerThatRequestedSwap(slot, next_id);
+    if (tls_logger) {
+      threads_to_swap->push_back(tls_logger);
+    } else {
+      swap_request_slots_retry_count_++;
+      // A thread is in the middle of its call to RequestSwapBuffers.
+      // Retry later once it's done.
+      auto it = std::find_if(swap_request_slots_to_retry_.begin(),
+                             swap_request_slots_to_retry_.end(),
+                             [=](SlotRetry& s) { return s.slot == slot; });
+      if (it == swap_request_slots_to_retry_.end()) {
+        // This is the first time we are retrying the slot.
+        swap_request_slots_to_retry_.push_back({slot, next_id});
+      } else {
+        // Whoa. We've been retrying this slot since the last time it was
+        // encountered. Just update the next_id.
+        it->next_id = next_id;
+        swap_request_slots_retry_reencounter_count_++;
+      }
+    }
+  }
+}
+
+void Logger::IOThread() {
+  while (keep_io_thread_alive_) {
+    auto tracer1 =
+        MakeScopedTracer([](AsyncTrace& trace) { trace("IOThreadLoop"); });
+    {
+      auto tracer2 = MakeScopedTracer([](AsyncTrace& trace) { trace("Wait"); });
+      std::unique_lock<std::mutex> lock(io_thread_mutex_);
+      io_thread_cv_.wait_for(lock, poll_period_,
+                             [&] { return !keep_io_thread_alive_; });
+    }
+
+    {
+      auto tracer3 =
+          MakeScopedTracer([](AsyncTrace& trace) { trace("Gather"); });
+      std::vector<TlsLogger*> threads_to_swap;
+      threads_to_swap.swap(threads_to_swap_deferred_);
+      GatherRetrySwapRequests(&threads_to_swap);
+      GatherNewSwapRequests(&threads_to_swap);
+      for (TlsLogger* thread : threads_to_swap) {
+        if (thread->ReadBufferHasBeenConsumed()) {
+          thread->SwapBuffers();
+          // After swapping a thread, it's ready to be read.
+          threads_to_read_.push_back(thread);
+        } else {
+          // Don't swap buffers again until we've finish reading the
+          // previous swap.
+          threads_to_swap_deferred_.push_back(thread);
+        }
+      }
+    }
+
+    {
+      auto tracer4 =
+          MakeScopedTracer([](AsyncTrace& trace) { trace("Process"); });
+      // Read from the threads we are confident have activity.
+      for (std::vector<TlsLogger*>::iterator thread = threads_to_read_.begin();
+           thread != threads_to_read_.end(); thread++) {
+        auto tracer5 =
+            MakeScopedTracer([tid = (*thread)->Tid()](AsyncTrace& trace) {
+              trace("Thread", "tid", tid);
+            });
+        std::vector<AsyncLogEntry>* entries = (*thread)->StartReadingEntries();
+        if (!entries) {
+          start_reading_entries_retry_count_++;
+          continue;
+        }
+
+        async_logger_.SetCurrentPidTid((*thread)->Pid(), (*thread)->Tid());
+        for (auto& entry : *entries) {
+          // Execute the entry to perform the serialization and I/O.
+          entry(async_logger_);
+        }
+        (*thread)->FinishReadingEntries();
+        // Mark for removal by the call to RemoveValue below.
+        *thread = nullptr;
+      }
+
+      // Only remove threads where reading succeeded so we retry the failed
+      // threads the next time around.
+      RemoveValue(&threads_to_read_, nullptr);
+    }
+
+    // Explicitly flush every time we wake up. The goal being minimization
+    // of large implicit flushes which could affect tail latency measurements,
+    // especially at percentiles closer to 100%.
+    /// \todo Determine if explicitly flushing logs every wake up is better
+    /// than relying on implicit flushing.
+    {
+      auto tracer6 =
+          MakeScopedTracer([](AsyncTrace& trace) { trace("FlushAll"); });
+      async_logger_.Flush();
+    }
+
+    if (!orphans_to_destroy_.empty()) {
+      auto tracer7 = MakeScopedTracer(
+          [](AsyncTrace& trace) { trace("Abandoning Orphans"); });
+      std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
+      for (auto orphan : orphans_to_destroy_) {
+        tls_logger_orphans_.erase(orphan);
+      }
+      orphans_to_destroy_.clear();
+    }
+  }
+}
+
+TlsLogger::TlsLogger(std::function<void()> forced_detatch)
+    : pid_(MLPERF_GET_PID()),
+      tid_(MLPERF_GET_TID()),
+      forced_detatch_(std::move(forced_detatch)) {
+  for (auto& entry : entries_) {
+    entry.reserve(kTlsLogReservedEntryCount);
+  }
+}
+
+TlsLogger::~TlsLogger() {}
+
+// Log always makes forward progress since it can unconditionally obtain a
+// "lock" on at least one of the buffers for writing.
+// Notificiation is also lock free.
+void TlsLogger::Log(AsyncLogEntry&& entry) {
+  size_t cas_fail_count = 0;
+  auto unlocked = EntryState::Unlocked;
+  size_t i_write = i_write_.load(std::memory_order_relaxed);
+  while (!entry_states_[i_write].compare_exchange_strong(
+      unlocked, EntryState::WriteLock, std::memory_order_acquire,
+      std::memory_order_relaxed)) {
+    unlocked = EntryState::Unlocked;
+    i_write ^= 1;
+    // We may need to try 3 times, since there could be a race with a
+    // previous SwapBuffers request and we use memory_order_relaxed when
+    // loading i_write_ above.
+    cas_fail_count++;
+    if (cas_fail_count >= 3) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
+                         "CAS failed.");
+#else
+      GlobalLogger().LogErrorSync("CAS failed.", "times", cas_fail_count,
+                                  "line", __LINE__);
+#endif
+    }
+    log_cas_fail_count_.fetch_add(1, std::memory_order_relaxed);
+  }
+  entries_[i_write].emplace_back(std::forward<AsyncLogEntry>(entry));
+
+  // TODO: Convert this block to a simple write once we are confidient
+  // that we don't need to check for success.
+  auto write_lock = EntryState::WriteLock;
+  bool success = entry_states_[i_write].compare_exchange_strong(
+      write_lock, EntryState::Unlocked, std::memory_order_release);
+  if (!success) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
+                       "CAS failed.");
+#else
+    GlobalLogger().LogErrorSync("CAS failed.", "line", __LINE__);
+#endif
+    assert(success);
+  }
+
+  bool write_buffer_swapped = i_write_prev_ != i_write;
+  if (write_buffer_swapped) {
+    GlobalLogger().RequestSwapBuffers(this);
+    i_write_prev_ = i_write;
+  }
+}
+
+void TlsLogger::SwapBuffers() {
+  // TODO: Convert this block to a simple write once we are confidient
+  // that we don't need to check for success.
+  auto read_lock = EntryState::ReadLock;
+  bool success = entry_states_[i_read_].compare_exchange_strong(
+      read_lock, EntryState::Unlocked, std::memory_order_release);
+  if (!success) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
+                       "CAS failed.");
+#else
+    GlobalLogger().LogErrorSync("CAS failed.", "line", __LINE__);
+#endif
+    assert(success);
+  }
+
+  i_write_.store(i_read_, std::memory_order_relaxed);
+  i_read_ ^= 1;
+  unread_swaps_++;
+}
+
+// Returns nullptr if read lock fails.
+std::vector<AsyncLogEntry>* TlsLogger::StartReadingEntries() {
+  auto unlocked = EntryState::Unlocked;
+  if (entry_states_[i_read_].compare_exchange_strong(
+          unlocked, EntryState::ReadLock, std::memory_order_acquire,
+          std::memory_order_relaxed)) {
+    return &entries_[i_read_];
+  }
+  return nullptr;
+}
+
+void TlsLogger::FinishReadingEntries() {
+  // Detect first logging allocation and track max allocated size.
+  size_t new_size = entries_[i_read_].size();
+  if (new_size > max_entry_size_) {
+    if (max_entry_size_ == kTlsLogReservedEntryCount) {
+      Log([ts = PerfClock::now()](AsyncLog& log) {
+        log.TraceAsyncInstant("FirstAllocation", 0, ts);
+      });
+    }
+    max_entry_size_ = new_size;
+  }
+
+  entries_[i_read_].clear();
+  unread_swaps_--;
+}
+
+bool TlsLogger::ReadBufferHasBeenConsumed() { return unread_swaps_ == 0; }
+
+void TlsLogger::TraceCounters() {
+  auto tracer = MakeScopedTracer(
+      [lcfc = log_cas_fail_count_.load(std::memory_order_relaxed),
+       sbsrc = swap_buffers_slot_retry_count_.load(std::memory_order_relaxed)](
+          AsyncTrace& trace) {
+        trace("TlsLogger:ContentionCounters", "log_cas_fail_count", lcfc,
+              "swap_buffers_slot_retry_count", sbsrc);
+      });
+}
+
+Logger& GlobalLogger() {
+  static Logger g_logger(kLogPollPeriod, kMaxThreadsToLog);
+  return g_logger;
+}
+
+/// \brief Moves ownership of the TlsLogger to Logger on thread exit
+/// so no round-trip synchronization with the IO thread is required.
+struct TlsLoggerWrapper {
+  TlsLoggerWrapper(std::function<void()> forced_detatch)
+      : tls_logger(std::make_unique<TlsLogger>(std::move(forced_detatch))) {
+    GlobalLogger().RegisterTlsLogger(tls_logger.get());
+  }
+  ~TlsLoggerWrapper() {
+    tls_logger->TraceCounters();
+    GlobalLogger().UnRegisterTlsLogger(std::move(tls_logger));
+  }
+  std::unique_ptr<TlsLogger> tls_logger;
+};
+
+TlsLoggerWrapper* InitializeMyTlsLoggerWrapper() {
+  thread_local std::unique_ptr<TlsLoggerWrapper> tls_logger_wrapper;
+  // forced_detatch lets the global Logger forcefully detatch TlsLoggers
+  // from the thread in the Logger's destructor, which may run before
+  // thread-local variables are destroyed when the loadgen is used as a python
+  // module and dynamically unloaded.
+  // Note: We capture a pointer to the tls_logger_wrapper since variables of
+  // the thread-local storage class aren't actually captured. C++ spec says
+  // only variables of the automatic storage class are captured.
+  /// \todo There is a race where the same TlsLoggerWrapper might be
+  /// destroyed both naturally and via forced_detatch. Destruction of
+  /// the TlsLoggerWrapper should be locked.
+  auto forced_detatch = [tls_logger_wrapper = &tls_logger_wrapper]() {
+    tls_logger_wrapper->reset();
+  };
+  tls_logger_wrapper = std::make_unique<TlsLoggerWrapper>(forced_detatch);
+  return tls_logger_wrapper.get();
+}
+
+TlsLogger* InitializeMyTlsLogger() {
+  thread_local TlsLoggerWrapper* wrapper = InitializeMyTlsLoggerWrapper();
+  return wrapper->tls_logger.get();
+}
+
+void Log(AsyncLogEntry&& entry) {
+  thread_local TlsLogger* const tls_logger = InitializeMyTlsLogger();
+  tls_logger->Log(std::forward<AsyncLogEntry>(entry));
+}
+
+}  // namespace logging
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h
new file mode 100644
index 000000000..8f1a398e9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h
@@ -0,0 +1,816 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Internal logging implementation details.
+
+#ifndef MLPERF_LOADGEN_LOGGING_H_
+#define MLPERF_LOADGEN_LOGGING_H_
+
+#define USE_NEW_LOGGING_FORMAT 1
+#define MLPERF_LOG(logger, key, value) \
+  logger.Log((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_ERROR(logger, key, value) \
+  logger.LogError((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_ERROR_SYNC(logger, key, value) \
+  logger.LogErrorSync((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_WARNING(logger, key, value) \
+  logger.LogWarning((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_INTERVAL_START(logger, key, value) \
+  logger.LogIntervalStart((key), (value), __FILE__, __LINE__)
+#define MLPERF_LOG_INTERVAL_END(logger, key, value) \
+  logger.LogIntervalEnd((key), (value), __FILE__, __LINE__)
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <map>
+#include <mutex>
+#include <set>
+#include <string>
+#include <thread>
+#include <unordered_set>
+#include <vector>
+
+#include "query_sample.h"
+
+namespace mlperf {
+
+/// \brief Wait-free logging utilities that defer stringification
+/// and syscalls to a worker thread.
+namespace logging {
+
+class AsyncLog;
+class Logger;
+class TlsLogger;
+struct TlsLoggerWrapper;
+
+/// \todo Verify lambas are not allocating when bounded to a std::function.
+using AsyncLogEntry = std::function<void(AsyncLog&)>;
+using PerfClock = std::chrono::high_resolution_clock;
+
+/// \brief Logs the raw bytes as a hexadecimal ascii string.
+struct LogBinaryAsHexString {
+  std::vector<uint8_t>* data;
+};
+
+/// \brief By default, print out the value directly.
+template <typename T>
+const T& ArgValueTransform(const T& value) {
+  return value;
+}
+
+/// \brief Print out True/False.
+const std::string& ArgValueTransform(const bool& value);
+/// \brief Print out binary day as hex string.
+const std::string ArgValueTransform(const LogBinaryAsHexString& value);
+#if USE_NEW_LOGGING_FORMAT
+/// \brief Print out a string in JSON format (with quotes).
+const std::string ArgValueTransform(const std::string& value);
+const std::string ArgValueTransform(const char* value);
+/// \brief Prints a list of int in JSON format.
+const std::string ArgValueTransform(const std::vector<size_t>& value);
+/// \brief Prints a dict in JSON format.
+const std::string ArgValueTransform(
+    const std::map<std::string, std::string>& value);
+#endif
+
+/// \brief Helper to print out values without quotes when value is a string.
+template <typename T>
+const T& ArgValueTransformWithoutQuote(const T& value) {
+  return ArgValueTransform<T>(value);
+}
+inline const std::string ArgValueTransformWithoutQuote(
+    const LogBinaryAsHexString& value) {
+  return ArgValueTransform(value);
+}
+/// \brief Helper to print out a string without the quotes.
+inline const std::string ArgValueTransformWithoutQuote(
+    const std::string& value) {
+  return value;
+}
+
+/// \brief Outputs a trace that can be uploaded to chrome://tracing for
+/// visualization.
+/// \details Trace event format definition:
+/// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit?usp=sharing
+class ChromeTracer {
+ public:
+  ChromeTracer(std::ostream* trace_out, PerfClock::time_point origin);
+  ~ChromeTracer();
+
+  template <typename... Args>
+  void AddCompleteEvent(const std::string& name, uint64_t pid, uint64_t tid,
+                        PerfClock::time_point start, PerfClock::time_point end,
+                        const Args... args) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"ph\":\"X\","
+          << "\"pid\":" << pid << "," << "\"tid\":" << tid << ","
+          << "\"ts\":" << Micros(start - origin_).count() << ","
+          << "\"dur\":" << Micros(end - start).count() << "," << "\"args\":{";
+    AddArgs(args...);
+    *out_ << "}},\n";
+  }
+
+  template <typename... Args>
+  void AddAsyncBeginEvent(const std::string& name, uint64_t pid, uint64_t id,
+                          PerfClock::time_point time, const Args... args) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
+          << "\"ph\":\"b\"," << "\"pid\":" << pid << "," << "\"id\":" << id
+          << "," << "\"ts\":" << Micros(time - origin_).count() << ","
+          << "\"args\":{";
+    AddArgs(args...);
+    *out_ << "}},\n";
+  }
+
+  template <typename... Args>
+  void AddAsyncInstantEvent(const std::string& name, uint64_t pid, uint64_t id,
+                            PerfClock::time_point time, const Args... args) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
+          << "\"ph\":\"n\"," << "\"pid\":" << pid << "," << "\"id\":" << id
+          << "," << "\"ts\":" << Micros(time - origin_).count() << ","
+          << "\"args\":{";
+    AddArgs(args...);
+    *out_ << "}},\n";
+  }
+
+  template <typename... Args>
+  void AddAsyncEndEvent(const std::string& name, uint64_t pid, uint64_t id,
+                        PerfClock::time_point time) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
+          << "\"ph\":\"e\", " << "\"pid\":" << pid << "," << "\"id\":" << id
+          << "," << "\"ts\":" << Micros(time - origin_).count() << "},\n";
+  }
+
+  template <typename... Args>
+  void AddCounterEvent(const std::string& name, uint64_t pid,
+                       PerfClock::time_point time, const Args... args) {
+    *out_ << "{\"name\":\"" << name << "\"," << "\"ph\": \"C\","
+          << "\"pid\":" << pid << ","
+          << "\"ts\":" << Micros(time - origin_).count() << ","
+          << "\"args\":{ ";
+    AddArgs(args...);
+    *out_ << "}},\n";
+  }
+
+  void Flush() { out_->flush(); }
+
+ private:
+  using Micros = std::chrono::duration<double, std::micro>;
+
+  void WriteTraceEventHeader();
+  void WriteTraceEventFooter();
+
+  void AddArgs() {}
+
+  template <typename T>
+  void AddArgs(const std::string& arg_name, const T& arg_value) {
+    *out_ << "\"" << arg_name << "\":" << ArgValueTransform(arg_value);
+  }
+
+  template <typename T, typename... Args>
+  void AddArgs(const std::string& arg_name, const T& arg_value,
+               const Args... args) {
+    *out_ << "\"" << arg_name << "\":" << ArgValueTransform(arg_value) << ",";
+    AddArgs(args...);
+  }
+
+  std::ostream* out_;
+  PerfClock::time_point origin_;
+};
+
+/// \brief The proxy all logging lambdas ultimately use to write any log type.
+/// \details Passed as an argument to the log lambda on the
+/// recording thread to serialize the data captured by the lambda and
+/// forward it to the output stream.
+/// \todo Make summary_out_, detail_out_, accuracy_out_, and trace_out_
+/// instances of a new LogOutput interface that the client may override.
+class AsyncLog {
+ public:
+  void SetLogFiles(std::ostream* summary, std::ostream* detail,
+                   std::ostream* accuracy, bool copy_detail_to_stdout,
+                   bool copy_summary_to_stdout,
+                   PerfClock::time_point log_origin);
+  void StartNewTrace(std::ostream* trace_out, PerfClock::time_point origin);
+  void StopTrace();
+  void Flush();
+
+  void SetCurrentPidTid(uint64_t pid, uint64_t tid);
+
+  void LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx,
+                   const LogBinaryAsHexString& response, int64_t n_tokens);
+  void CacheToken(uint64_t seq_id, const LogBinaryAsHexString& response);
+
+  template <typename... Args>
+  void LogSummary(const std::string& message, const Args... args);
+
+  void SetLogDetailTime(PerfClock::time_point time) { log_detail_time_ = time; }
+
+  void FlagError() {
+    std::unique_lock<std::mutex> lock(log_mutex_);
+    log_error_count_++;
+    error_flagged_ = true;
+  }
+
+  void FlagWarning() {
+    std::unique_lock<std::mutex> lock(log_mutex_);
+    log_warning_count_++;
+    warning_flagged_ = true;
+  }
+
+#if USE_NEW_LOGGING_FORMAT
+  template <typename T>
+  void LogDetail(const std::string& key, const T& value,
+                 const std::string file_name, const unsigned int line_no);
+#else
+  template <typename... Args>
+  void LogDetail(const std::string& message, const Args... args);
+#endif
+
+  template <typename... Args>
+  void Trace(const std::string& trace_name, PerfClock::time_point start,
+             PerfClock::time_point end, const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddCompleteEvent(trace_name, current_pid_, current_tid_, start,
+                                end, args...);
+    }
+  }
+
+  template <typename... Args>
+  void TraceAsyncInstant(const std::string& trace_name, uint64_t id,
+                         PerfClock::time_point instant_time,
+                         const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddAsyncInstantEvent(trace_name, current_pid_, id, instant_time,
+                                    args...);
+    }
+  }
+
+  void SetScopedTraceTimes(PerfClock::time_point start,
+                           PerfClock::time_point end) {
+    scoped_start_ = start;
+    scoped_end_ = end;
+  }
+
+  template <typename... Args>
+  void ScopedTrace(const std::string& trace_name, const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddCompleteEvent(trace_name, current_pid_, current_tid_,
+                                scoped_start_, scoped_end_, args...);
+    }
+  }
+
+  template <typename... Args>
+  void TraceSample(const std::string& trace_name, uint64_t id,
+                   PerfClock::time_point start, PerfClock::time_point end,
+                   const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddAsyncBeginEvent(trace_name, current_pid_, id, start, args...);
+      tracer_->AddAsyncEndEvent(trace_name, current_pid_, id, end);
+    }
+  }
+
+  template <typename... Args>
+  void TraceCounterEvent(const std::string& trace_name,
+                         PerfClock::time_point time, const Args... args) {
+    std::unique_lock<std::mutex> lock(trace_mutex_);
+    if (tracer_) {
+      tracer_->AddCounterEvent(trace_name, current_pid_, time, args...);
+    }
+  }
+
+  void RestartLatencyRecording(uint64_t first_sample_sequence_id,
+                               size_t latencies_to_reserve);
+  void RecordSampleCompletion(uint64_t sample_sequence_id,
+                              PerfClock::time_point completion_time,
+                              QuerySampleLatency latency, int64_t n_tokens);
+  void RecordTokenCompletion(uint64_t sample_sequence_id,
+                             PerfClock::time_point completion_time,
+                             QuerySampleLatency latency);
+  std::vector<QuerySampleLatency> GetLatenciesBlocking(size_t expected_count);
+  std::vector<QuerySampleLatency> GetTokenLatencies(size_t expected_count);
+  std::vector<QuerySampleLatency> GetTimePerOutputToken(size_t expected_count);
+  std::vector<int64_t> GetTokensPerSample(size_t expected_count);
+  PerfClock::time_point GetMaxCompletionTime();
+  QuerySampleLatency GetMaxLatencySoFar();
+  void SetUseTokens(bool use_tokens);
+  void SetNeedsFirstToken(bool needs_first_token);
+  size_t GetErrorCount() { return log_error_count_; };
+
+ private:
+  void WriteAccuracyHeaderLocked();
+  void WriteAccuracyFooterLocked();
+
+  void LogArgs(std::ostream*) {}
+
+  template <typename T>
+  void LogArgs(std::ostream* out, const T& value_only) {
+    *out << ArgValueTransformWithoutQuote(value_only);
+  }
+
+  template <typename T>
+  void LogArgs(std::ostream* out, const std::string& arg_name,
+               const T& arg_value) {
+    *out << "\"" << arg_name
+         << "\" : " << ArgValueTransformWithoutQuote(arg_value);
+  }
+
+  template <typename T, typename... Args>
+  void LogArgs(std::ostream* out, const std::string& arg_name,
+               const T& arg_value, const Args... args) {
+    *out << "\"" << arg_name
+         << "\" : " << ArgValueTransformWithoutQuote(arg_value) << ", ";
+    LogArgs(out, args...);
+  }
+
+  std::mutex log_mutex_;
+  std::ostream* summary_out_ = &std::cerr;
+  std::ostream* detail_out_ = &std::cerr;
+  std::ostream* accuracy_out_ = &std::cerr;
+  // TODO: Instead of these bools, use a class that forwards to two streams.
+  bool copy_detail_to_stdout_ = false;
+  bool copy_summary_to_stdout_ = false;
+  bool accuracy_needs_comma_ = false;
+  PerfClock::time_point log_origin_;
+  size_t log_error_count_ = 0;
+  bool error_flagged_ = false;
+  size_t log_warning_count_ = 0;
+  bool warning_flagged_ = false;
+  bool use_tokens_ = false;
+  bool needs_first_token_ = false;
+
+  std::mutex trace_mutex_;
+  std::unique_ptr<ChromeTracer> tracer_;
+
+  uint64_t current_pid_;
+  uint64_t current_tid_;
+  PerfClock::time_point log_detail_time_;
+  PerfClock::time_point scoped_start_;
+  PerfClock::time_point scoped_end_;
+
+  std::mutex latencies_mutex_;
+  std::mutex token_latencies_mutex_;
+  std::mutex token_record_mutex_;
+  std::condition_variable all_latencies_recorded_;
+  uint64_t latencies_first_sample_sequence_id_ = 0;
+  std::vector<QuerySampleLatency> latencies_;
+  std::vector<QuerySampleLatency> token_latencies_;
+  std::vector<QuerySampleLatency> time_per_output_token_;
+  std::vector<LogBinaryAsHexString> token_records_;
+  std::vector<int64_t> tokens_per_sample_;
+  QuerySampleLatency max_latency_ = 0;
+  PerfClock::time_point max_completion_timstamp_;
+  size_t latencies_recorded_ = 0;
+  size_t latencies_expected_ = 0;
+  // Must be called with latencies_mutex_ held.
+  bool AllLatenciesRecorded() {
+    return latencies_recorded_ == latencies_expected_;
+  }
+};
+
+/// \brief The central logger that logs all threads belonging to a run.
+class Logger {
+ public:
+  Logger(std::chrono::duration<double> poll_period, size_t max_threads_to_log);
+  ~Logger();
+
+  void StartIOThread();
+  void StopIOThread();
+
+  void StartLogging(std::ostream* summary, std::ostream* detail,
+                    std::ostream* accuracy, bool copy_detail_to_stdout,
+                    bool copy_summary_to_stdout);
+  void StopLogging();
+
+  void StartNewTrace(std::ostream* trace_out, PerfClock::time_point origin);
+  void StopTracing();
+
+  void LogContentionAndAllocations();
+
+  void RestartLatencyRecording(uint64_t first_sample_sequence_id,
+                               size_t latencies_to_reserve);
+  std::vector<QuerySampleLatency> GetLatenciesBlocking(size_t expected_count);
+  std::vector<QuerySampleLatency> GetTokenLatencies(size_t expected_count);
+  std::vector<QuerySampleLatency> GetTimePerOutputToken(size_t expected_count);
+  std::vector<int64_t> GetTokensPerSample(size_t expected_count);
+  PerfClock::time_point GetMaxCompletionTime();
+  QuerySampleLatency GetMaxLatencySoFar();
+  void SetUseTokens(bool use_tokens);
+  void SetNeedsFirstToken(bool needs_first_token);
+
+ private:
+  friend AsyncLog;
+  friend TlsLogger;
+  friend TlsLoggerWrapper;
+
+  void RegisterTlsLogger(TlsLogger* tls_logger);
+  void UnRegisterTlsLogger(std::unique_ptr<TlsLogger> tls_logger);
+  void RequestSwapBuffers(TlsLogger* tls_logger);
+  void CollectTlsLoggerStats(TlsLogger* tls_logger);
+
+  TlsLogger* GetTlsLoggerThatRequestedSwap(size_t slot, size_t next_id);
+  void GatherRetrySwapRequests(std::vector<TlsLogger*>* threads_to_swap);
+  void GatherNewSwapRequests(std::vector<TlsLogger*>* threads_to_swap);
+
+  /// \brief The main logging thread function that handles the serialization
+  /// and I/O to the stream or file.
+  ///
+  /// \todo Provide client hook to set logging thead affinity and priority.
+  void IOThread();
+
+// Slow synchronous error logging for internals that may prevent
+// async logging from working.
+#if USE_NEW_LOGGING_FORMAT
+  template <typename T>
+  void LogErrorSync(const std::string& key, const T& value,
+                    const std::string file_name, const unsigned int line_no) {
+    /// \todo Acquire mutex once for FlagError + LogDetail to avoid
+    ///       races. Better yet, switch to a non-stateful error API.
+    //       This is better than nothing though.
+    async_logger_.FlagError();
+    async_logger_.LogDetail(key, value, file_name, line_no);
+  }
+  template <typename T>
+  void LogWarning(const std::string& key, const T& value,
+                  const std::string file_name, const unsigned int line_no) {
+    async_logger_.FlagWarning();
+    async_logger_.LogDetail(key, value, file_name, line_no);
+  }
+#else
+  template <typename... Args>
+  void LogErrorSync(const std::string& message, Args&&... args) {
+    /// \todo Acquire mutex once for FlagError + LogDetail to avoid
+    ///       races. Better yet, switch to a non-stateful error API.
+    //       This is better than nothing though.
+    async_logger_.FlagError();
+    async_logger_.LogDetail(message, std::forward<Args>(args)...);
+  }
+#endif
+
+  // Accessed by IOThead only.
+  const std::chrono::duration<double> poll_period_;
+  AsyncLog async_logger_;
+
+  const size_t max_threads_to_log_;
+  std::thread io_thread_;
+
+  // Accessed by producers and IOThead during thread registration and
+  // destruction. Protected by io_thread_mutex_.
+  std::mutex io_thread_mutex_;
+  std::condition_variable io_thread_cv_;
+  bool keep_io_thread_alive_ = false;
+
+  std::mutex tls_loggers_registerd_mutex_;
+  std::unordered_set<TlsLogger*> tls_loggers_registerd_;
+
+  // Temporarily stores TlsLogger data for threads that have exited until
+  // all their log entries have been processed.
+  // Accessed by IOThread and producers as their threads exit.
+  std::mutex tls_logger_orphans_mutex_;
+  using OrphanContainer = std::list<std::unique_ptr<TlsLogger>>;
+  OrphanContainer tls_logger_orphans_;
+
+  // Accessed by producers and IOThead atomically.
+  std::atomic<size_t> swap_request_id_{0};
+  std::vector<std::atomic<uintptr_t>> thread_swap_request_slots_;
+
+  // Accessed by IOThead only.
+  size_t swap_request_id_read_{0};
+  struct SlotRetry {
+    size_t slot;
+    uintptr_t next_id;
+  };
+  std::vector<SlotRetry> swap_request_slots_to_retry_;
+  std::vector<TlsLogger*> threads_to_swap_deferred_;
+  std::vector<TlsLogger*> threads_to_read_;
+  std::vector<OrphanContainer::iterator> orphans_to_destroy_;
+
+  // Counts for retries related to the lock-free scheme.
+  // Abnormally high counts could be an indicator of contention.
+  // Access on IOThread only.
+  size_t swap_request_slots_retry_count_ = 0;
+  size_t swap_request_slots_retry_retry_count_ = 0;
+  size_t swap_request_slots_retry_reencounter_count_ = 0;
+  size_t start_reading_entries_retry_count_ = 0;
+  size_t tls_total_log_cas_fail_count_ = 0;
+  size_t tls_total_swap_buffers_slot_retry_count_ = 0;
+};
+
+Logger& GlobalLogger();
+
+/// \brief The generic way to add a log entry.
+/// \details Supports all types of logs, which is useful for complex
+/// lambdas that may wish to log in multiple places or log something other
+/// than a simple summary, detail, or trace entry.
+void Log(AsyncLogEntry&& entry);
+
+/// \brief The convenience proxy a LogSummary lambda uses to write to the
+/// summary log.
+class AsyncSummary {
+ public:
+  explicit AsyncSummary(AsyncLog& async_log) : async_log_(async_log) {}
+  AsyncLog& async_log() { return async_log_; }
+
+  template <typename... Args>
+  AsyncLog& operator()(Args&&... args) {
+    async_log_.LogSummary(std::forward<Args>(args)...);
+    return async_log_;
+  }
+
+ private:
+  AsyncLog& async_log_;
+};
+
+/// \brief A helper to simplify adding a summary log entry.
+template <typename LambdaT>
+void LogSummary(LambdaT&& lambda) {
+  Log([lambda = std::forward<LambdaT>(lambda)](AsyncLog& log) mutable {
+    AsyncSummary async_summary(log);
+    lambda(async_summary);
+  });
+}
+
+/// \brief The convenience proxy a LogDetail lambda uses to write to the detail
+/// log.
+class AsyncDetail {
+ public:
+  explicit AsyncDetail(AsyncLog& async_log) : async_log_(async_log) {}
+  AsyncLog& async_log() { return async_log_; }
+
+#if USE_NEW_LOGGING_FORMAT
+  template <typename T>
+  AsyncLog& Log(const std::string& key, const T& value,
+                const std::string file_name, const unsigned int line_no) {
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+
+  template <typename T>
+  AsyncLog& LogError(const std::string& key, const T& value,
+                     const std::string file_name, const unsigned int line_no) {
+    async_log_.FlagError();
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+
+  template <typename T>
+  AsyncLog& LogWarning(const std::string& key, const T& value,
+                       const std::string file_name,
+                       const unsigned int line_no) {
+    async_log_.FlagWarning();
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+
+  template <typename T>
+  AsyncLog& LogIntervalStart(const std::string& key, const T& value,
+                             const std::string file_name,
+                             const unsigned int line_no) {
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+
+  template <typename T>
+  AsyncLog& LogIntervalEnd(const std::string& key, const T& value,
+                           const std::string file_name,
+                           const unsigned int line_no) {
+    async_log_.LogDetail(key, value, file_name, line_no);
+    return async_log_;
+  }
+#else
+  template <typename... Args>
+  AsyncLog& operator()(Args&&... args) {
+    async_log_.LogDetail(std::forward<Args>(args)...);
+    return async_log_;
+  }
+
+  template <typename... Args>
+  AsyncLog& Error(Args&&... args) {
+    async_log_.FlagError();
+    async_log_.LogDetail(std::forward<Args>(args)...);
+    return async_log_;
+  }
+
+  template <typename... Args>
+  AsyncLog& Warning(Args&&... args) {
+    async_log_.FlagWarning();
+    async_log_.LogDetail(std::forward<Args>(args)...);
+    return async_log_;
+  }
+#endif
+
+ private:
+  AsyncLog& async_log_;
+};
+
+/// \brief A helper to simplify adding a detail log entry.
+template <typename LambdaT>
+void LogDetail(LambdaT&& lambda) {
+  Log([lambda = std::forward<LambdaT>(lambda),
+       timestamp = PerfClock::now()](AsyncLog& log) mutable {
+    log.SetLogDetailTime(timestamp);
+    AsyncDetail async_detail(log);
+    lambda(async_detail);
+  });
+}
+
+/// \brief The convenience proxy a ScopedTracer lambda uses to write to the
+/// detail log.
+class AsyncTrace {
+ public:
+  explicit AsyncTrace(AsyncLog& async_log) : async_log_(async_log) {}
+  AsyncLog& async_log() { return async_log_; }
+
+  template <typename... Args>
+  AsyncLog& operator()(Args&&... args) {
+    async_log_.ScopedTrace(std::forward<Args>(args)...);
+    return async_log_;
+  }
+
+ private:
+  AsyncLog& async_log_;
+};
+
+/// \brief ScopedTracer is an RAII object that traces the start and end
+/// of its lifetime.
+template <typename LambdaT>
+class ScopedTracer {
+ public:
+  ScopedTracer(LambdaT&& lambda)
+      : start_(PerfClock::now()), lambda_(std::forward<LambdaT>(lambda)) {}
+
+  ~ScopedTracer() {
+    Log([start = start_, lambda = std::move(lambda_),
+         end = PerfClock::now()](AsyncLog& log) {
+      log.SetScopedTraceTimes(start, end);
+      AsyncTrace async_trace(log);
+      lambda(async_trace);
+    });
+  }
+
+ private:
+  PerfClock::time_point start_;
+  LambdaT lambda_;
+};
+
+/// \brief Helper that creates a ScopeTracer with automatic type deduction.
+/// \details Helps with automatic template type deduction, which has been
+/// supported for functions for a long time.
+/// C++17 will support deduction for classes, which will neutralize the utility
+/// of a helper function like this.
+/// \todo Determine which traces to keep for submission purposes.
+template <typename LambdaT>
+auto MakeScopedTracer(LambdaT&& lambda) -> ScopedTracer<LambdaT> {
+  return ScopedTracer<LambdaT>(std::forward<LambdaT>(lambda));
+}
+
+template <typename... Args>
+void AsyncLog::LogSummary(const std::string& message, const Args... args) {
+  auto tracer = MakeScopedTracer([message](AsyncTrace& trace) {
+    std::string sanitized_message = message;
+    std::replace(sanitized_message.begin(), sanitized_message.end(), '"', '\'');
+    std::replace(sanitized_message.begin(), sanitized_message.end(), '\n', ';');
+    trace("LogSummary", "message", "\"" + sanitized_message + "\"");
+  });
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  *summary_out_ << message;
+  LogArgs(summary_out_, args...);
+  *summary_out_ << "\n";
+
+  if (copy_summary_to_stdout_) {
+    std::cout << message;
+    LogArgs(&std::cout, args...);
+    std::cout << "\n";
+  }
+}
+
+#if USE_NEW_LOGGING_FORMAT
+template <typename T>
+void AsyncLog::LogDetail(const std::string& key, const T& value,
+                         const std::string file_name,
+                         const unsigned int line_no) {
+  auto tracer = MakeScopedTracer([key](AsyncTrace& trace) {
+    std::string sanitized_key = key;
+    std::replace(sanitized_key.begin(), sanitized_key.end(), '"', '\'');
+    std::replace(sanitized_key.begin(), sanitized_key.end(), '\n', ';');
+    trace("LogDetail", "key", "\"" + sanitized_key + "\"");
+  });
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  std::vector<std::ostream*> detail_streams{detail_out_, &std::cout};
+  if (!copy_detail_to_stdout_) {
+    detail_streams.pop_back();
+  }
+  auto time_ns = (log_detail_time_ - log_origin_).count();
+  for (auto os : detail_streams) {
+    *os << ":::MLLOG {" << "\"key\": " << ArgValueTransform(key) << ", "
+        << "\"value\": " << ArgValueTransform(value) << ", "
+        << "\"time_ms\": " << ArgValueTransform(time_ns / 1000000ULL) << "."
+        << std::setfill('0') << std::setw(6)
+        << ArgValueTransform(time_ns % 1000000ULL) << ", "
+        << "\"namespace\": \"mlperf::logging\", "
+        << "\"event_type\": \"POINT_IN_TIME\", " << "\"metadata\": {"
+        << "\"is_error\": " << ArgValueTransform(error_flagged_) << ", "
+        << "\"is_warning\": " << ArgValueTransform(warning_flagged_) << ", "
+        << "\"file\": \"" << file_name << "\", "
+        << "\"line_no\": " << ArgValueTransform(line_no) << ", "
+        << "\"pid\": " << ArgValueTransform(current_pid_) << ", "
+        << "\"tid\": " << ArgValueTransform(current_tid_) << "}}\n";
+    if (error_flagged_) {
+      os->flush();
+    }
+  }
+  error_flagged_ = false;
+  warning_flagged_ = false;
+}
+#else
+template <typename... Args>
+void AsyncLog::LogDetail(const std::string& message, const Args... args) {
+  auto tracer = MakeScopedTracer([message](AsyncTrace& trace) {
+    std::string sanitized_message = message;
+    std::replace(sanitized_message.begin(), sanitized_message.end(), '"', '\'');
+    std::replace(sanitized_message.begin(), sanitized_message.end(), '\n', ';');
+    trace("LogDetail", "message", "\"" + sanitized_message + "\"");
+  });
+  std::unique_lock<std::mutex> lock(log_mutex_);
+  std::vector<std::ostream*> detail_streams{detail_out_, &std::cout};
+  if (!copy_detail_to_stdout_) {
+    detail_streams.pop_back();
+  }
+  for (auto os : detail_streams) {
+    *os << "\"pid\": " << current_pid_ << ", " << "\"tid\": " << current_tid_
+        << ", " << "\"ts\": " << (log_detail_time_ - log_origin_).count()
+        << "ns : ";
+    if (error_flagged_) {
+      *os << "ERROR : ";
+    } else if (warning_flagged_) {
+      *os << "WARNING : ";
+    }
+    *os << message;
+    LogArgs(os, args...);
+    *os << "\n";
+    if (error_flagged_) {
+      os->flush();
+    }
+  }
+  error_flagged_ = false;
+  warning_flagged_ = false;
+}
+#endif
+
+}  // namespace logging
+
+// Export some things out of the logging namespace to simplify call sites.
+
+const auto GlobalLogger = logging::GlobalLogger;
+const auto Log = logging::Log;
+
+using PerfClock = logging::PerfClock;
+
+using LogBinaryAsHexString = logging::LogBinaryAsHexString;
+
+using AsyncLog = logging::AsyncLog;
+
+using AsyncSummary = logging::AsyncSummary;
+template <typename LambdaT>
+void LogSummary(LambdaT&& lambda) {
+  logging::LogSummary(std::forward<LambdaT>(lambda));
+}
+
+using AsyncDetail = logging::AsyncDetail;
+template <typename LambdaT>
+void LogDetail(LambdaT&& lambda) {
+  logging::LogDetail(std::forward<LambdaT>(lambda));
+}
+
+using AsyncTrace = logging::AsyncTrace;
+
+template <typename LambdaT>
+using ScopedTracer = logging::ScopedTracer<LambdaT>;
+
+template <typename LambdaT>
+auto MakeScopedTracer(LambdaT&& lambda) -> ScopedTracer<LambdaT> {
+  return ScopedTracer<LambdaT>(std::forward<LambdaT>(lambda));
+}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_LOGGING_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf
new file mode 100644
index 000000000..1b825514b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf
@@ -0,0 +1,164 @@
+# The format of this config file is 'key = value'.
+# The key has the format 'model.scenario.key'. Value is mostly int64_t.
+# Model maybe '*' as wildcard. In that case the value applies to all models.
+# All times are in milli seconds
+
+# Set performance_sample_count for each model.
+# User can optionally set this to higher values in user.conf.
+resnet50.*.performance_sample_count_override = 1024
+ssd-mobilenet.*.performance_sample_count_override = 256
+retinanet.*.performance_sample_count_override = 64
+bert.*.performance_sample_count_override = 10833
+dlrm.*.performance_sample_count_override = 204800
+dlrm-v2.*.performance_sample_count_override = 204800
+rnnt.*.performance_sample_count_override = 2513
+gptj.*.performance_sample_count_override = 13368
+mixtral-8x7b.*.performance_sample_count_override = 15000
+llama2-70b.*.performance_sample_count_override = 24576
+llama2-70b-interactive.*.performance_sample_count_override = 24576
+llama3_1-405b.*.performance_sample_count_override = 8313
+llama3_1-405b-interactive.*.performance_sample_count_override = 8313
+llama3_1-8b.*.performance_sample_count_override = 13368
+llama3_1-8b-edge.*.performance_sample_count_override = 5000
+llama3_1-8b-interactive.*.performance_sample_count_override = 13368
+stable-diffusion-xl.*.performance_sample_count_override = 5000
+rgat.*.performance_sample_count_override = 788379
+pointpainting.*.performance_sample_count_override = 1024
+deepseek-r1.*.performance_sample_count_override = 4388
+whisper.*.performance_sample_count_override = 1633
+# set to 0 to let entire sample set to be performance sample
+3d-unet.*.performance_sample_count_override = 0
+
+# Set seeds.
+*.*.qsl_rng_seed = 1780908523862526354
+*.*.sample_index_rng_seed = 14771362308971278857
+*.*.schedule_rng_seed = 18209322760996052031
+
+# Set seeds for TEST_05 (not needed from v5.0 onwards)
+*.*.test05_qsl_rng_seed = 7975553102935885558
+*.*.test05_sample_index_rng_seed = 11403566307062068064
+*.*.test05_schedule_rng_seed = 15816800565822761601
+
+*.SingleStream.target_latency_percentile = 90
+pointpainting.SingleStream.target_latency_percentile = 99.9
+*.SingleStream.min_duration = 600000
+
+*.MultiStream.target_latency_percentile = 99
+*.MultiStream.samples_per_query = 8
+*.MultiStream.min_duration = 600000
+*.MultiStream.min_query_count = 662
+retinanet.MultiStream.target_latency = 528
+
+# 3D-UNet uses equal issue mode because it has non-uniform inputs
+3d-unet.*.sample_concatenate_permutation = 1
+
+# R-GAT uses equal issue mode because it may have non-uniform inputs
+rgat.*.sample_concatenate_permutation = 1
+
+# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
+gptj.*.sample_concatenate_permutation = 1
+llama2-70b.*.sample_concatenate_permutation = 1
+llama2-70b-interactive.*.sample_concatenate_permutation = 1
+mixtral-8x7b.*.sample_concatenate_permutation = 1
+llama3_1-405b.*.sample_concatenate_permutation = 1
+llama3_1-405b-interactive.*.sample_concatenate_permutation = 1
+llama3_1-8b.*.sample_concatenate_permutation = 1
+llama3_1-8b-edge.*.sample_concatenate_permutation = 1
+llama3_1-8b-interactive.*.sample_concatenate_permutation = 1
+deepseek-r1.*.sample_concatenate_permutation = 1
+whisper.*.sample_concatenate_permutation = 1
+
+*.Server.target_latency = 10
+*.Server.target_latency_percentile = 99
+*.Server.target_duration = 0
+*.Server.min_duration = 600000
+resnet50.Server.target_latency = 15
+retinanet.Server.target_latency = 100
+bert.Server.target_latency = 130
+dlrm.Server.target_latency = 60
+dlrm-v2.Server.target_latency = 60
+rnnt.Server.target_latency = 1000
+gptj.Server.target_latency = 20000
+stable-diffusion-xl.Server.target_latency = 20000
+# Benchmarks that measure token latencies
+llama2-70b.*.use_token_latencies = 1
+llama2-70b-interactive.*.use_token_latencies = 1
+mixtral-8x7b.*.use_token_latencies = 1
+llama3_1-405b.*.use_token_latencies = 1
+llama3_1-405b-interactive.*.use_token_latencies = 1
+llama3_1-8b.*.use_token_latencies = 1
+llama3_1-8b-edge.*.use_token_latencies = 1
+llama3_1-8b-interactive.*.use_token_latencies = 1
+deepseek-r1.*.use_token_latencies = 1
+whisper.*.use_token_latencies = 1
+
+# gptj benchmark infers token latencies
+gptj.*.infer_token_latencies = 1
+gptj.*.token_latency_scaling_factor = 69
+# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0
+llama2-70b.Server.target_latency = 0
+llama2-70b.Server.ttft_latency = 2000
+llama2-70b.Server.tpot_latency = 200
+
+# Target Latencies for interactive setting
+llama2-70b-interactive.Server.target_latency = 0
+llama2-70b-interactive.Server.ttft_latency = 450
+llama2-70b-interactive.Server.tpot_latency = 40
+
+mixtral-8x7b.Server.target_latency = 0
+mixtral-8x7b.Server.ttft_latency = 2000
+mixtral-8x7b.Server.tpot_latency = 200
+
+llama3_1-405b.Server.target_latency = 0
+llama3_1-405b.Server.ttft_latency = 6000
+llama3_1-405b.Server.tpot_latency = 175
+
+# Target Latencies for interactive setting
+llama3_1-405b-interactive.Server.target_latency = 0
+llama3_1-405b-interactive.Server.ttft_latency = 4500
+llama3_1-405b-interactive.Server.tpot_latency = 80
+
+
+llama3_1-8b.Server.target_latency = 0
+llama3_1-8b.Server.ttft_latency = 2000
+llama3_1-8b.Server.tpot_latency = 100
+
+# Target Latencies for interactive setting
+llama3_1-8b-interactive.Server.target_latency = 0
+llama3_1-8b-interactive.Server.ttft_latency = 500
+llama3_1-8b-interactive.Server.tpot_latency = 30
+
+deepseek-r1.Server.target_latency = 0
+deepseek-r1.Server.ttft_latency = 2000
+deepseek-r1.Server.tpot_latency = 80
+
+*.Offline.target_latency_percentile = 90
+*.Offline.min_duration = 600000
+
+# In Offline scenario, we always have one query. But LoadGen maps this to
+# min_sample_count internally in Offline scenario. If the dataset size is larger
+# than 24576 we limit the min_query_count to 24576 and otherwise we use
+# the dataset size as the limit
+
+resnet50.Offline.min_query_count = 24576
+retinanet.Offline.min_query_count = 24576
+dlrm-v2.Offline.min_query_count = 24576
+bert.Offline.min_query_count = 10833
+gptj.Offline.min_query_count = 13368
+rnnt.Offline.min_query_count = 2513
+3d-unet.Offline.min_query_count = 43
+stable-diffusion-xl.Offline.min_query_count = 5000
+llama2-70b.Offline.min_query_count = 24576
+llama3_1-405b.Offline.min_query_count = 8313
+llama3_1-8b.Offline.min_query_count = 13368
+llama3_1-8b-edge.Offline.min_query_count = 5000
+mixtral-8x7b.Offline.min_query_count = 15000
+rgat.Offline.min_query_count = 788379
+deepseek-r1.Offline.min_query_count = 4388
+whisper.Offline.min_query_count = 1633
+
+# These fields should be defined and overridden by user.conf.
+*.SingleStream.target_latency = 10
+*.MultiStream.target_latency = 80
+*.Server.target_qps = 1.0
+*.Offline.target_qps = 1.0
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h
new file mode 100644
index 000000000..7859e0139
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h
@@ -0,0 +1,167 @@
+const char* mlperf_conf =
+"# The format of this config file is 'key = value'.\n"
+"# The key has the format 'model.scenario.key'. Value is mostly int64_t.\n"
+"# Model maybe '*' as wildcard. In that case the value applies to all models.\n"
+"# All times are in milli seconds\n"
+"\n"
+"# Set performance_sample_count for each model.\n"
+"# User can optionally set this to higher values in user.conf.\n"
+"resnet50.*.performance_sample_count_override = 1024\n"
+"ssd-mobilenet.*.performance_sample_count_override = 256\n"
+"retinanet.*.performance_sample_count_override = 64\n"
+"bert.*.performance_sample_count_override = 10833\n"
+"dlrm.*.performance_sample_count_override = 204800\n"
+"dlrm-v2.*.performance_sample_count_override = 204800\n"
+"rnnt.*.performance_sample_count_override = 2513\n"
+"gptj.*.performance_sample_count_override = 13368\n"
+"mixtral-8x7b.*.performance_sample_count_override = 15000\n"
+"llama2-70b.*.performance_sample_count_override = 24576\n"
+"llama2-70b-interactive.*.performance_sample_count_override = 24576\n"
+"llama3_1-405b.*.performance_sample_count_override = 8313\n"
+"llama3_1-405b-interactive.*.performance_sample_count_override = 8313\n"
+"llama3_1-8b.*.performance_sample_count_override = 13368\n"
+"llama3_1-8b-edge.*.performance_sample_count_override = 5000\n"
+"llama3_1-8b-interactive.*.performance_sample_count_override = 13368\n"
+"stable-diffusion-xl.*.performance_sample_count_override = 5000\n"
+"rgat.*.performance_sample_count_override = 788379\n"
+"pointpainting.*.performance_sample_count_override = 1024\n"
+"deepseek-r1.*.performance_sample_count_override = 4388\n"
+"whisper.*.performance_sample_count_override = 1633\n"
+"# set to 0 to let entire sample set to be performance sample\n"
+"3d-unet.*.performance_sample_count_override = 0\n"
+"\n"
+"# Set seeds.\n"
+"*.*.qsl_rng_seed = 1780908523862526354\n"
+"*.*.sample_index_rng_seed = 14771362308971278857\n"
+"*.*.schedule_rng_seed = 18209322760996052031\n"
+"\n"
+"# Set seeds for TEST_05 (not needed from v5.0 onwards)\n"
+"*.*.test05_qsl_rng_seed = 7975553102935885558\n"
+"*.*.test05_sample_index_rng_seed = 11403566307062068064\n"
+"*.*.test05_schedule_rng_seed = 15816800565822761601\n"
+"\n"
+"*.SingleStream.target_latency_percentile = 90\n"
+"pointpainting.SingleStream.target_latency_percentile = 99.9\n"
+"*.SingleStream.min_duration = 600000\n"
+"\n"
+"*.MultiStream.target_latency_percentile = 99\n"
+"*.MultiStream.samples_per_query = 8\n"
+"*.MultiStream.min_duration = 600000\n"
+"*.MultiStream.min_query_count = 662\n"
+"retinanet.MultiStream.target_latency = 528\n"
+"\n"
+"# 3D-UNet uses equal issue mode because it has non-uniform inputs\n"
+"3d-unet.*.sample_concatenate_permutation = 1\n"
+"\n"
+"# R-GAT uses equal issue mode because it may have non-uniform inputs\n"
+"rgat.*.sample_concatenate_permutation = 1\n"
+"\n"
+"# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario\n"
+"gptj.*.sample_concatenate_permutation = 1\n"
+"llama2-70b.*.sample_concatenate_permutation = 1\n"
+"llama2-70b-interactive.*.sample_concatenate_permutation = 1\n"
+"mixtral-8x7b.*.sample_concatenate_permutation = 1\n"
+"llama3_1-405b.*.sample_concatenate_permutation = 1\n"
+"llama3_1-405b-interactive.*.sample_concatenate_permutation = 1\n"
+"llama3_1-8b.*.sample_concatenate_permutation = 1\n"
+"llama3_1-8b-edge.*.sample_concatenate_permutation = 1\n"
+"llama3_1-8b-interactive.*.sample_concatenate_permutation = 1\n"
+"deepseek-r1.*.sample_concatenate_permutation = 1\n"
+"whisper.*.sample_concatenate_permutation = 1\n"
+"\n"
+"*.Server.target_latency = 10\n"
+"*.Server.target_latency_percentile = 99\n"
+"*.Server.target_duration = 0\n"
+"*.Server.min_duration = 600000\n"
+"resnet50.Server.target_latency = 15\n"
+"retinanet.Server.target_latency = 100\n"
+"bert.Server.target_latency = 130\n"
+"dlrm.Server.target_latency = 60\n"
+"dlrm-v2.Server.target_latency = 60\n"
+"rnnt.Server.target_latency = 1000\n"
+"gptj.Server.target_latency = 20000\n"
+"stable-diffusion-xl.Server.target_latency = 20000\n"
+"# Benchmarks that measure token latencies\n"
+"llama2-70b.*.use_token_latencies = 1\n"
+"llama2-70b-interactive.*.use_token_latencies = 1\n"
+"mixtral-8x7b.*.use_token_latencies = 1\n"
+"llama3_1-405b.*.use_token_latencies = 1\n"
+"llama3_1-405b-interactive.*.use_token_latencies = 1\n"
+"llama3_1-8b.*.use_token_latencies = 1\n"
+"llama3_1-8b-edge.*.use_token_latencies = 1\n"
+"llama3_1-8b-interactive.*.use_token_latencies = 1\n"
+"deepseek-r1.*.use_token_latencies = 1\n"
+"whisper.*.use_token_latencies = 1\n"
+"\n"
+"# gptj benchmark infers token latencies\n"
+"gptj.*.infer_token_latencies = 1\n"
+"gptj.*.token_latency_scaling_factor = 69\n"
+"# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0\n"
+"llama2-70b.Server.target_latency = 0\n"
+"llama2-70b.Server.ttft_latency = 2000\n"
+"llama2-70b.Server.tpot_latency = 200\n"
+"\n"
+"# Target Latencies for interactive setting\n"
+"llama2-70b-interactive.Server.target_latency = 0\n"
+"llama2-70b-interactive.Server.ttft_latency = 450\n"
+"llama2-70b-interactive.Server.tpot_latency = 40\n"
+"\n"
+"mixtral-8x7b.Server.target_latency = 0\n"
+"mixtral-8x7b.Server.ttft_latency = 2000\n"
+"mixtral-8x7b.Server.tpot_latency = 200\n"
+"\n"
+"llama3_1-405b.Server.target_latency = 0\n"
+"llama3_1-405b.Server.ttft_latency = 6000\n"
+"llama3_1-405b.Server.tpot_latency = 175\n"
+"\n"
+"# Target Latencies for interactive setting\n"
+"llama3_1-405b-interactive.Server.target_latency = 0\n"
+"llama3_1-405b-interactive.Server.ttft_latency = 4500\n"
+"llama3_1-405b-interactive.Server.tpot_latency = 80\n"
+"\n"
+"\n"
+"llama3_1-8b.Server.target_latency = 0\n"
+"llama3_1-8b.Server.ttft_latency = 2000\n"
+"llama3_1-8b.Server.tpot_latency = 100\n"
+"\n"
+"# Target Latencies for interactive setting\n"
+"llama3_1-8b-interactive.Server.target_latency = 0\n"
+"llama3_1-8b-interactive.Server.ttft_latency = 500\n"
+"llama3_1-8b-interactive.Server.tpot_latency = 30\n"
+"\n"
+"deepseek-r1.Server.target_latency = 0\n"
+"deepseek-r1.Server.ttft_latency = 2000\n"
+"deepseek-r1.Server.tpot_latency = 80\n"
+"\n"
+"*.Offline.target_latency_percentile = 90\n"
+"*.Offline.min_duration = 600000\n"
+"\n"
+"# In Offline scenario, we always have one query. But LoadGen maps this to\n"
+"# min_sample_count internally in Offline scenario. If the dataset size is larger\n"
+"# than 24576 we limit the min_query_count to 24576 and otherwise we use\n"
+"# the dataset size as the limit\n"
+"\n"
+"resnet50.Offline.min_query_count = 24576\n"
+"retinanet.Offline.min_query_count = 24576\n"
+"dlrm-v2.Offline.min_query_count = 24576\n"
+"bert.Offline.min_query_count = 10833\n"
+"gptj.Offline.min_query_count = 13368\n"
+"rnnt.Offline.min_query_count = 2513\n"
+"3d-unet.Offline.min_query_count = 43\n"
+"stable-diffusion-xl.Offline.min_query_count = 5000\n"
+"llama2-70b.Offline.min_query_count = 24576\n"
+"llama3_1-405b.Offline.min_query_count = 8313\n"
+"llama3_1-8b.Offline.min_query_count = 13368\n"
+"llama3_1-8b-edge.Offline.min_query_count = 5000\n"
+"mixtral-8x7b.Offline.min_query_count = 15000\n"
+"rgat.Offline.min_query_count = 788379\n"
+"deepseek-r1.Offline.min_query_count = 4388\n"
+"whisper.Offline.min_query_count = 1633\n"
+"\n"
+"# These fields should be defined and overridden by user.conf.\n"
+"*.SingleStream.target_latency = 10\n"
+"*.MultiStream.target_latency = 80\n"
+"*.Server.target_qps = 1.0\n"
+"*.Offline.target_qps = 1.0\n"
+"\n"
+"";
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml
new file mode 100755
index 000000000..6f0ae06f0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml
@@ -0,0 +1,7 @@
+[build-system]
+requires = ["setuptools>=42", "wheel", "pybind11==2.11.1"]
+build-backend = "setuptools.build_meta:__legacy__"
+
+[tool.cibuildwheel]
+environment = "CFLAGS='-std=c++14'"
+build = "cp3{7,8,9,10,11,12,13}-*"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h
new file mode 100644
index 000000000..6c594efe0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h
@@ -0,0 +1,42 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines the QueryDispatchLibrary interface.
+
+#ifndef MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
+#define MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
+
+#include <string>
+
+#include "system_under_test.h"
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \brief The interface a client implements for the LoadGen over the network to
+/// test. The API inherits the System_under_test.h API When working in LON mode
+/// the QueryDispatchLibrary class is used and natively Upcasted to the
+/// QueryDispatchLibrary class.
+
+class QueryDispatchLibrary : public SystemUnderTest {
+ public:
+  virtual ~QueryDispatchLibrary() = default;
+};
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h
new file mode 100644
index 000000000..e740be99e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h
@@ -0,0 +1,91 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines the structs involved in issuing a query and responding to
+/// a query.
+/// \details These are broken out into their own files since they are exposed
+/// as part of the C API and we want to avoid C clients including C++ code.
+
+#ifndef MLPERF_LOADGEN_QUERY_SAMPLE_H_
+#define MLPERF_LOADGEN_QUERY_SAMPLE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <iostream>
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \brief Represents a unique identifier for a sample of an issued query.
+/// \details As currently implemented, the id is a pointer to an internal
+/// loadgen struct whose value will never be zero/null.
+typedef uintptr_t ResponseId;
+constexpr ResponseId kResponseIdReserved = 0;
+
+/// \brief An index into the QuerySampleLibrary corresponding to a
+/// single sample.
+typedef size_t QuerySampleIndex;
+
+/// \brief Represents the smallest unit of input inference can run on.
+/// A query consists of one or more samples.
+struct QuerySample {
+  ResponseId id;
+  QuerySampleIndex index;
+};
+
+/// \brief Represents a single response to QuerySample
+struct QuerySampleResponse {
+  ResponseId id;
+  uintptr_t data;
+  size_t size;  ///< Size in bytes.
+  int64_t n_tokens;
+
+ public:
+  QuerySampleResponse(ResponseId id, uintptr_t data, size_t size,
+                      int64_t n_tokens)
+      : id(id),
+        data(data),
+        size(size),
+        n_tokens(n_tokens){
+            // std::cout << "Initialized with 4 arguments, n_tokens: " <<
+            // n_tokens <<"\n";
+        };
+  QuerySampleResponse(ResponseId id, uintptr_t data, size_t size)
+      : id(id),
+        data(data),
+        size(size),
+        n_tokens(0){
+            // std::cout << "Initialized with 3 arguments, n_tokens: " <<
+            // n_tokens <<"\n";
+        };
+  QuerySampleResponse()
+      : id(0),
+        data(0),
+        size(0),
+        n_tokens(0){
+            // std::cout << "Initialized with 0 arguments, n_tokens: " <<
+            // n_tokens <<"\n";
+        };
+};
+
+/// \brief A latency in nanoseconds, as recorded by the loadgen.
+typedef int64_t QuerySampleLatency;
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_QUERY_SAMPLE_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h
new file mode 100644
index 000000000..7258068cb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h
@@ -0,0 +1,75 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines the QuerySampleLibrary interface.
+
+#ifndef MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
+#define MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "query_sample.h"
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \brief The interface a client implements to coordinate with the loadgen
+/// which samples should be loaded.
+class QuerySampleLibrary {
+ public:
+  virtual ~QuerySampleLibrary() {}
+
+  /// \brief A human readable name for the model.
+  virtual const std::string& Name() = 0;
+
+  /// \brief Total number of samples in library.
+  virtual size_t TotalSampleCount() = 0;
+
+  /// \brief The number of samples that are guaranteed to fit in RAM.
+  virtual size_t PerformanceSampleCount() = 0;
+
+  /// \brief Loads the requested query samples into memory.
+  /// \details Paired with calls to UnloadSamplesFromRam.
+  /// In the MultiStream scenarios:
+  ///   * Samples will appear more than once.
+  ///   * SystemUnderTest::IssueQuery will only be called with a set of samples
+  ///     that are neighbors in the vector of samples here, which helps
+  ///     SUTs that need the queries to be contiguous.
+  /// In all other scenarios:
+  ///   * A previously loaded sample will not be loaded again.
+  virtual void LoadSamplesToRam(
+      const std::vector<QuerySampleIndex>& samples) = 0;
+
+  /// \brief Unloads the requested query samples from memory.
+  /// \details In the MultiStream scenarios:
+  ///   * Samples may be unloaded the same number of times they were loaded;
+  ///     however, if the implementation de-dups loaded samples rather than
+  ///     loading samples into contiguous memory, it may unload a sample the
+  ///     first time they see it unloaded without a refcounting scheme, ignoring
+  ///     subsequent unloads. A refcounting scheme would also work, but is not
+  ///     a requirement.
+  /// In all other scenarios:
+  ///   * A previously unloaded sample will not be unloaded again.
+  virtual void UnloadSamplesFromRam(
+      const std::vector<QuerySampleIndex>& samples) = 0;
+};
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt
new file mode 100644
index 000000000..e47c59fd7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt
@@ -0,0 +1 @@
+pybind11
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc
new file mode 100644
index 000000000..f7c61af43
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc
@@ -0,0 +1,856 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "results.h"
+
+#include "early_stopping.h"
+#include "utils.h"
+
+namespace mlperf {
+namespace loadgen {
+
+void PerformanceSummary::ProcessLatencies() {
+  if (pr.sample_latencies.empty()) {
+    return;
+  }
+
+  sample_count = pr.sample_latencies.size();
+
+  QuerySampleLatency accumulated_sample_latency = 0;
+  for (auto latency : pr.sample_latencies) {
+    accumulated_sample_latency += latency;
+  }
+  sample_latency_mean = accumulated_sample_latency / sample_count;
+
+  std::sort(pr.sample_latencies.begin(), pr.sample_latencies.end());
+
+  target_latency_percentile.sample_latency =
+      pr.sample_latencies[sample_count * target_latency_percentile.percentile];
+  sample_latency_min = pr.sample_latencies.front();
+  sample_latency_max = pr.sample_latencies.back();
+  for (auto& lp : latency_percentiles) {
+    assert(lp.percentile >= 0.0);
+    assert(lp.percentile < 1.0);
+    lp.sample_latency = pr.sample_latencies[sample_count * lp.percentile];
+  }
+
+  query_count = pr.queries_issued;
+
+  // Count the number of overlatency queries. Only for Server scenario. Since in
+  // this scenario the number of samples per query is 1, sample_latencies are
+  // used.
+  if (settings.scenario == TestScenario::Server) {
+    QuerySampleLatency max_latency = settings.target_latency.count() + 1;
+    overlatency_query_count =
+        pr.sample_latencies.end() -
+        std::lower_bound(pr.sample_latencies.begin(), pr.sample_latencies.end(),
+                         max_latency);
+  }
+
+  if (settings.use_token_latencies) {
+    ProcessTokenLatencies();
+  }
+
+  // MultiStream only after this point.
+  if (settings.scenario != TestScenario::MultiStream) {
+    return;
+  }
+
+  // Calculate per-query stats.
+  size_t query_count = pr.queries_issued;
+  assert(pr.query_latencies.size() == query_count);
+  std::sort(pr.query_latencies.begin(), pr.query_latencies.end());
+  QuerySampleLatency accumulated_query_latency = 0;
+  for (auto latency : pr.query_latencies) {
+    accumulated_query_latency += latency;
+  }
+  query_latency_mean = accumulated_query_latency / query_count;
+  query_latency_min = pr.query_latencies.front();
+  query_latency_max = pr.query_latencies.back();
+  target_latency_percentile.query_latency =
+      pr.query_latencies[query_count * target_latency_percentile.percentile];
+  for (auto& lp : latency_percentiles) {
+    lp.query_latency = pr.query_latencies[query_count * lp.percentile];
+  }
+}
+
+void PerformanceSummary::ProcessTokenLatencies() {
+  constexpr auto nTokenInvalid = std::numeric_limits<int64_t>::min();
+  token_count = 0;
+  for (auto n_tokens : pr.token_results.tokens_per_sample) {
+    if (n_tokens != nTokenInvalid) token_count += n_tokens;
+  }
+  if (pr.token_results.first_token_latencies.empty()) {
+    return;
+  }
+  QuerySampleLatency accumulated_first_token_latency = 0;
+  for (auto latency : pr.token_results.first_token_latencies) {
+    accumulated_first_token_latency += latency;
+  }
+  first_token_latency_mean = accumulated_first_token_latency / sample_count;
+  QuerySampleLatency accumulated_tpot = 0;
+  for (auto latency : pr.token_results.time_per_output_token_arr) {
+    accumulated_tpot += latency;
+  }
+  time_per_output_token_mean = accumulated_tpot / sample_count;
+  std::sort(pr.token_results.first_token_latencies.begin(),
+            pr.token_results.first_token_latencies.end());
+  std::sort(pr.token_results.time_per_output_token_arr.begin(),
+            pr.token_results.time_per_output_token_arr.end());
+
+  token_target_latency_percentile.sample_latency =
+      pr.token_results
+          .first_token_latencies[sample_count *
+                                 token_target_latency_percentile.percentile];
+  first_token_latency_min = pr.token_results.first_token_latencies.front();
+  first_token_latency_max = pr.token_results.first_token_latencies.back();
+  for (auto& lp : token_latency_percentiles) {
+    assert(lp.percentile >= 0.0);
+    assert(lp.percentile < 1.0);
+    lp.sample_latency =
+        pr.token_results.first_token_latencies[sample_count * lp.percentile];
+  }
+
+  target_tpot_percentile.sample_latency =
+      pr.token_results
+          .time_per_output_token_arr[sample_count *
+                                     target_tpot_percentile.percentile];
+  time_per_output_token_min =
+      pr.token_results.time_per_output_token_arr.front();
+  time_per_output_token_max = pr.token_results.time_per_output_token_arr.back();
+  for (auto& lp : tpot_percentiles) {
+    assert(lp.percentile >= 0.0);
+    assert(lp.percentile < 1.0);
+    lp.sample_latency =
+        pr.token_results
+            .time_per_output_token_arr[sample_count * lp.percentile];
+  }
+
+  if (settings.scenario == TestScenario::Server) {
+    // TODO: Maybe another target latency needs to be added?
+    QuerySampleLatency max_latency = settings.target_latency.count() + 1;
+    overlatency_first_token_count =
+        pr.token_results.first_token_latencies.end() -
+        std::lower_bound(pr.token_results.first_token_latencies.begin(),
+                         pr.token_results.first_token_latencies.end(),
+                         max_latency);
+  }
+}
+
+bool PerformanceSummary::EarlyStopping(
+    std::string* recommendation, int64_t queries_issued,
+    std::vector<QuerySampleLatency>* sample_latencies,
+    std::vector<QuerySampleLatency>* query_latencies,
+    std::chrono::nanoseconds target_latency) {
+  recommendation->clear();
+
+  MinPassingQueriesFinder find_min_passing;
+  double confidence = 0.99;
+  double tolerance = 0.0;
+
+  ProcessLatencies();
+  switch (settings.scenario) {
+    case TestScenario::SingleStream: {
+      // TODO: Grab multistream percentile from settings, instead of hardcoding.
+      double multi_stream_percentile = 0.99;
+      int64_t t = 1;
+      int64_t h_min = find_min_passing(1, target_latency_percentile.percentile,
+                                       tolerance, confidence);
+      int64_t h = h_min;
+      if (queries_issued < h_min + 1) {
+        *recommendation =
+            " * Only processed " + std::to_string(queries_issued) +
+            " queries.\n * Need to process at least " +
+            std::to_string(h_min + 1) + " queries for early stopping.";
+        return false;
+      } else {
+        for (int64_t i = 2; i < queries_issued + 1; ++i) {
+          h = find_min_passing(i, target_latency_percentile.percentile,
+                               tolerance, confidence);
+          if (queries_issued < h + i) {
+            t = i - 1;
+            break;
+          }
+        }
+      }
+      QuerySampleLatency percentile_estimate =
+          (*sample_latencies)[queries_issued - t];
+      *recommendation =
+          " * Processed at least " + std::to_string(h_min + 1) + " queries (" +
+          std::to_string(queries_issued) + ").\n" + " * Would discard " +
+          std::to_string(t - 1) + " highest latency queries.\n" +
+          " * Early stopping " +
+          DoubleToString(target_latency_percentile.percentile * 100, 1) +
+          "th percentile estimate: " + std::to_string(percentile_estimate);
+      early_stopping_latency_ss = percentile_estimate;
+
+      // Early stopping estimate for 99%ile (used for infering multi-stream from
+      // single-stream)
+      t = 1;
+      h_min =
+          find_min_passing(1, multi_stream_percentile, tolerance, confidence);
+      h = h_min;
+      if (queries_issued < h_min + 1) {
+        *recommendation +=
+            "\n * Not enough queries processed for " +
+            DoubleToString(multi_stream_percentile * 100, 1) +
+            "th percentile\n" +
+            " early stopping estimate (would need to process at\n least " +
+            std::to_string(h_min + 1) + " total queries).";
+      } else {
+        for (int64_t i = 2; i < queries_issued + 1; ++i) {
+          h = find_min_passing(i, multi_stream_percentile, tolerance,
+                               confidence);
+          if (queries_issued < h + i) {
+            t = i - 1;
+            break;
+          }
+        }
+        percentile_estimate = (*sample_latencies)[queries_issued - t];
+        *recommendation +=
+            "\n * Early stopping " +
+            DoubleToString(multi_stream_percentile * 100, 1) +
+            "th percentile estimate: " + std::to_string(percentile_estimate);
+        early_stopping_latency_ms = percentile_estimate;
+      }
+      break;
+    }
+    case TestScenario::Server: {
+      int64_t t =
+          std::count_if((*sample_latencies).begin(), (*sample_latencies).end(),
+                        [=](auto const& latency) {
+                          return latency > target_latency.count();
+                        });
+      int64_t h = find_min_passing(t, target_latency_percentile.percentile,
+                                   tolerance, confidence);
+      if (queries_issued >= h + t) {
+        *recommendation = " * Run successful.";
+      } else {
+        *recommendation = " * Run unsuccessful.\n * Processed " +
+                          std::to_string(queries_issued) + " queries.\n" +
+                          " * Would need to run at least " +
+                          std::to_string(h + t - queries_issued) +
+                          " more queries,\n with the run being successful if "
+                          "every additional\n query were under latency.";
+        return false;
+      }
+      break;
+    }
+    case TestScenario::MultiStream: {
+      int64_t t = 1;
+      int64_t h_min = find_min_passing(1, target_latency_percentile.percentile,
+                                       tolerance, confidence);
+      int64_t h = h_min;
+      if (queries_issued < h_min + 1) {
+        *recommendation =
+            " * Only processed " + std::to_string(queries_issued) +
+            " queries.\n * Need to process at least " +
+            std::to_string(h_min + 1) + " queries for early stopping.";
+        return false;
+      } else {
+        for (int64_t i = 2; i < queries_issued + 1; ++i) {
+          h = find_min_passing(i, target_latency_percentile.percentile,
+                               tolerance, confidence);
+          if (queries_issued < h + i) {
+            t = i - 1;
+            break;
+          }
+        }
+      }
+      QuerySampleLatency percentile_estimate =
+          (*query_latencies)[queries_issued - t];
+      *recommendation =
+          " * Processed at least " + std::to_string(h_min + 1) + " queries (" +
+          std::to_string(queries_issued) + ").\n" + " * Would discard " +
+          std::to_string(t - 1) + " highest latency queries.\n" +
+          " * Early stopping " +
+          DoubleToString(target_latency_percentile.percentile * 100, 1) +
+          "th percentile estimate: " + std::to_string(percentile_estimate);
+      early_stopping_latency_ms = percentile_estimate;
+      break;
+    }
+    case TestScenario::Offline:
+      break;
+  }
+  return true;
+}
+
+bool PerformanceSummary::MinDurationMet(std::string* recommendation) {
+  recommendation->clear();
+  const double min_duration = DurationToSeconds(settings.min_duration);
+  bool min_duration_met = false;
+  switch (settings.scenario) {
+    case TestScenario::Offline:
+      min_duration_met = pr.max_latency >= min_duration;
+      break;
+    case TestScenario::Server:
+      min_duration_met = pr.final_query_scheduled_time >= min_duration;
+      break;
+    case TestScenario::SingleStream:
+    case TestScenario::MultiStream:
+      min_duration_met = pr.final_query_issued_time >= min_duration;
+      break;
+  }
+  if (min_duration_met) {
+    return true;
+  }
+
+  switch (settings.scenario) {
+    case TestScenario::SingleStream:
+    case TestScenario::MultiStream:
+      *recommendation =
+          "Decrease the expected latency so the loadgen pre-generates more "
+          "queries.";
+      break;
+    case TestScenario::Server:
+      *recommendation =
+          "Increase the target QPS so the loadgen pre-generates more queries.";
+      break;
+    case TestScenario::Offline:
+      *recommendation =
+          "Increase expected QPS so the loadgen pre-generates a larger "
+          "(coalesced) query.";
+      break;
+  }
+  return false;
+}
+
+bool PerformanceSummary::MinQueriesMet() {
+  return pr.queries_issued >= settings.min_query_count;
+}
+
+bool PerformanceSummary::MinSamplesMet() {
+  return sample_count >= settings.min_sample_count;
+}
+
+bool PerformanceSummary::HasPerfConstraints() {
+  return settings.scenario == TestScenario::Server;
+}
+
+bool PerformanceSummary::PerfConstraintsMet(std::string* recommendation) {
+  recommendation->clear();
+  bool perf_constraints_met = true;
+  switch (settings.scenario) {
+    case TestScenario::SingleStream:
+    case TestScenario::MultiStream:
+      break;
+    case TestScenario::Server:
+      ProcessLatencies();
+      if (!settings.use_token_latencies) {
+        if (target_latency_percentile.sample_latency >
+            settings.target_latency.count()) {
+          *recommendation = "Reduce target QPS to improve latency.";
+          perf_constraints_met = false;
+        }
+      } else {
+        if (token_target_latency_percentile.sample_latency >
+            settings.server_ttft_latency) {
+          *recommendation =
+              "TTFT constrain not met: Reduce target QPS to improve latency.";
+          perf_constraints_met = false;
+        }
+
+        if (target_tpot_percentile.sample_latency >
+            settings.server_tpot_latency) {
+          if (recommendation->empty()) {
+            *recommendation =
+                "TPOT constrain not met: Reduce target QPS to improve latency.";
+          } else {
+            recommendation->append(
+                "\n * TPOT constrain not met: Reduce target QPS to improve "
+                "latency.");
+          }
+          perf_constraints_met = false;
+        }
+      }
+      break;
+    case TestScenario::Offline:
+      break;
+  }
+  return perf_constraints_met;
+}
+
+void PerformanceSummary::LogSummary(AsyncSummary& summary) {
+  ProcessLatencies();
+
+  summary(
+      "================================================\n"
+      "MLPerf Results Summary\n"
+      "================================================");
+  summary("SUT name : ", sut_name);
+  summary("Scenario : ", ToString(settings.scenario));
+  summary("Mode     : ", ToString(settings.mode));
+
+  switch (settings.scenario) {
+    case TestScenario::SingleStream: {
+      summary(DoubleToString(target_latency_percentile.percentile * 100, 1) +
+                  "th percentile latency (ns) : ",
+              target_latency_percentile.sample_latency);
+      break;
+    }
+    case TestScenario::MultiStream: {
+      summary(DoubleToString(target_latency_percentile.percentile * 100, 1) +
+                  "th percentile latency (ns) : ",
+              target_latency_percentile.query_latency);
+      break;
+    }
+    case TestScenario::Server: {
+      // Subtract 1 from sample count since the start of the final sample
+      // represents the open end of the time range: i.e. [begin, end).
+      // This makes sense since:
+      // a) QPS doesn't apply if there's only one sample; it's pure latency.
+      // b) If you have precisely 1k QPS, there will be a sample exactly on
+      //    the 1 second time point; but that would be the 1001th sample in
+      //    the stream. Given the first 1001 queries, the QPS is
+      //    1000 queries / 1 second.
+      // TODO: make a more permanent solution
+      double qps_as_completed =
+          (sample_count - 1) / pr.final_query_all_samples_done_time;
+      summary("Completed samples per second    : ",
+              DoubleToString(qps_as_completed));
+      break;
+    }
+    case TestScenario::Offline: {
+      double samples_per_second = sample_count / pr.max_latency;
+      summary("Samples per second: ", samples_per_second);
+      break;
+    }
+  }
+
+  if (settings.use_token_latencies) {
+    switch (settings.scenario) {
+      case TestScenario::SingleStream: {
+        summary(DoubleToString(token_target_latency_percentile.percentile * 100,
+                               1) +
+                    "th first token percentile latency (ns) : ",
+                token_target_latency_percentile.sample_latency);
+        break;
+      }
+      case TestScenario::MultiStream: {
+        summary(DoubleToString(token_target_latency_percentile.percentile * 100,
+                               1) +
+                    "th first token percentile latency (ns) : ",
+                token_target_latency_percentile.sample_latency);
+        break;
+      }
+      case TestScenario::Offline: {
+        double tokens_per_second = token_count / pr.max_latency;
+        summary("Tokens per second: ", tokens_per_second);
+        break;
+      }
+      case TestScenario::Server:
+        double tps_as_completed =
+            token_count / pr.final_query_all_samples_done_time;
+        summary("Completed tokens per second: ",
+                DoubleToString(tps_as_completed));
+        break;
+    }
+  }
+
+  if (settings.infer_token_latencies) {
+    switch (settings.scenario) {
+      case TestScenario::SingleStream: {
+        break;
+      }
+      case TestScenario::MultiStream: {
+        break;
+      }
+      case TestScenario::Offline: {
+        double tokens_per_second = settings.token_latency_scaling_factor *
+                                   sample_count / pr.max_latency;
+        summary("Tokens per second (inferred): ", tokens_per_second);
+        break;
+      }
+      case TestScenario::Server:
+        double tps_as_completed = settings.token_latency_scaling_factor *
+                                  (sample_count - 1) /
+                                  pr.final_query_all_samples_done_time;
+        summary("Completed tokens per second (inferred): ",
+                DoubleToString(tps_as_completed));
+        break;
+    }
+  }
+
+  std::string min_duration_recommendation;
+  std::string perf_constraints_recommendation;
+  std::string early_stopping_recommendation;
+  std::string early_stopping_ttft_recommendation;
+  std::string early_stopping_tpot_recommendation;
+
+  bool min_duration_met = MinDurationMet(&min_duration_recommendation);
+  bool min_queries_met = MinQueriesMet() && MinSamplesMet();
+  bool early_stopping_met = true;
+  if (!settings.use_token_latencies) {
+    early_stopping_met = EarlyStopping(
+        &early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies,
+        &pr.query_latencies, settings.target_latency);
+  } else {
+    early_stopping_met =
+        EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued,
+                      &pr.token_results.time_per_output_token_arr,
+                      &pr.query_latencies,
+                      std::chrono::nanoseconds(settings.server_tpot_latency)) &&
+        EarlyStopping(&early_stopping_ttft_recommendation, pr.queries_issued,
+                      &pr.token_results.first_token_latencies,
+                      &pr.query_latencies,
+                      std::chrono::nanoseconds(settings.server_ttft_latency));
+  }
+  bool perf_constraints_met =
+      PerfConstraintsMet(&perf_constraints_recommendation);
+  bool all_constraints_met = min_duration_met && min_queries_met &&
+                             perf_constraints_met && early_stopping_met;
+  summary("Result is : ", all_constraints_met ? "VALID" : "INVALID");
+  if (HasPerfConstraints()) {
+    summary("  Performance constraints satisfied : ",
+            perf_constraints_met ? "Yes" : "NO");
+  }
+  summary("  Min duration satisfied : ", min_duration_met ? "Yes" : "NO");
+  summary("  Min queries satisfied : ", min_queries_met ? "Yes" : "NO");
+  summary("  Early stopping satisfied: ", early_stopping_met ? "Yes" : "NO");
+
+  if (!all_constraints_met) {
+    summary("Recommendations:");
+    if (!perf_constraints_met) {
+      summary(" * " + perf_constraints_recommendation);
+    }
+    if (!min_duration_met) {
+      summary(" * " + min_duration_recommendation);
+    }
+    if (!min_queries_met) {
+      summary(
+          " * The test exited early, before enough queries were issued.\n"
+          "   See the detailed log for why this may have occurred.");
+    }
+  }
+  // Early stopping results
+  if (settings.scenario == TestScenario::SingleStream ||
+      settings.scenario == TestScenario::Server ||
+      settings.scenario == TestScenario::MultiStream) {
+    if (!settings.use_token_latencies) {
+      summary("Early Stopping Result:");
+      summary(early_stopping_recommendation);
+    } else {
+      summary("TTFT Early Stopping Result:");
+      summary(early_stopping_ttft_recommendation);
+      summary("TPOT Early Stopping Result:");
+      summary(early_stopping_tpot_recommendation);
+    }
+  }
+
+  summary(
+      "\n"
+      "================================================\n"
+      "Additional Stats\n"
+      "================================================");
+
+  if (settings.scenario == TestScenario::SingleStream) {
+    double qps_w_lg = (sample_count - 1) / pr.final_query_issued_time;
+    double qps_wo_lg = 1 / QuerySampleLatencyToSeconds(sample_latency_mean);
+    summary("QPS w/ loadgen overhead         : " + DoubleToString(qps_w_lg));
+    summary("QPS w/o loadgen overhead        : " + DoubleToString(qps_wo_lg));
+    summary("");
+  } else if (settings.scenario == TestScenario::Server) {
+    // Scheduled samples per second as an additional stat
+    double qps_as_scheduled =
+        (sample_count - 1) / pr.final_query_scheduled_time;
+    summary("Scheduled samples per second : ",
+            DoubleToString(qps_as_scheduled));
+  } else if (settings.scenario == TestScenario::MultiStream) {
+    summary("Per-query latency:  ");
+    summary("Min latency (ns)                : ", query_latency_min);
+    summary("Max latency (ns)                : ", query_latency_max);
+    summary("Mean latency (ns)               : ", query_latency_mean);
+    for (auto& lp : latency_percentiles) {
+      summary(
+          DoubleToString(lp.percentile * 100) + " percentile latency (ns)   : ",
+          lp.query_latency);
+    }
+  }
+
+  if (settings.scenario != TestScenario::MultiStream) {
+    summary("Min latency (ns)                : ", sample_latency_min);
+    summary("Max latency (ns)                : ", sample_latency_max);
+    summary("Mean latency (ns)               : ", sample_latency_mean);
+    for (auto& lp : latency_percentiles) {
+      summary(
+          DoubleToString(lp.percentile * 100) + " percentile latency (ns)   : ",
+          lp.sample_latency);
+    }
+  }
+  if (settings.use_token_latencies) {
+    summary("");
+    if (settings.scenario == TestScenario::SingleStream) {
+      double tps_w_lg = token_count / pr.final_query_issued_time;
+      double tps_wo_lg =
+          ((double)token_count) /
+          (QuerySampleLatencyToSeconds(sample_latency_mean) * sample_count);
+      summary("TPS w/ loadgen overhead         : " + DoubleToString(tps_w_lg));
+      summary("TPS w/o loadgen overhead        : " + DoubleToString(tps_wo_lg));
+
+    } else if (settings.scenario == TestScenario::Server) {
+      double tps_as_completed =
+          token_count / pr.final_query_all_samples_done_time;
+      summary("Completed tokens per second                 : ",
+              DoubleToString(tps_as_completed));
+    }
+
+    if (settings.scenario != TestScenario::Offline) {
+      summary("Min First Token latency (ns)                : ",
+              first_token_latency_min);
+      summary("Max First Token latency (ns)                : ",
+              first_token_latency_max);
+      summary("Mean First Token latency (ns)               : ",
+              first_token_latency_mean);
+      for (auto& lp : token_latency_percentiles) {
+        summary(DoubleToString(lp.percentile * 100) +
+                    " percentile first token latency (ns)   : ",
+                lp.sample_latency);
+      }
+      summary("");
+      summary("Min Time per Output Token (ns)                : ",
+              time_per_output_token_min);
+      summary("Max Time per Output Token (ns)                : ",
+              time_per_output_token_max);
+      summary("Mean Time per Output Token (ns)               : ",
+              time_per_output_token_mean);
+      for (auto& lp : tpot_percentiles) {
+        summary(DoubleToString(lp.percentile * 100) +
+                    " percentile time to output token (ns)   : ",
+                lp.sample_latency);
+      }
+    }
+  }
+
+  summary(
+      "\n"
+      "================================================\n"
+      "Test Parameters Used\n"
+      "================================================");
+  settings.LogSummary(summary);
+}
+
+void PerformanceSummary::LogDetail(AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+  ProcessLatencies();
+
+  // General validity checking
+  std::string min_duration_recommendation;
+  std::string perf_constraints_recommendation;
+  std::string early_stopping_recommendation;
+  std::string early_stopping_ttft_recommendation;
+  std::string early_stopping_tpot_recommendation;
+  bool min_duration_met = MinDurationMet(&min_duration_recommendation);
+  bool min_queries_met = MinQueriesMet() && MinSamplesMet();
+  bool perf_constraints_met =
+      PerfConstraintsMet(&perf_constraints_recommendation);
+  bool early_stopping_met = true;
+  if (!settings.use_token_latencies) {
+    early_stopping_met = EarlyStopping(
+        &early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies,
+        &pr.query_latencies, settings.target_latency);
+  } else {
+    early_stopping_met =
+        EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued,
+                      &pr.token_results.time_per_output_token_arr,
+                      &pr.query_latencies,
+                      std::chrono::nanoseconds(settings.server_tpot_latency)) &&
+        EarlyStopping(&early_stopping_ttft_recommendation, pr.queries_issued,
+                      &pr.token_results.first_token_latencies,
+                      &pr.query_latencies,
+                      std::chrono::nanoseconds(settings.server_ttft_latency));
+  }
+  bool all_constraints_met = min_duration_met && min_queries_met &&
+                             perf_constraints_met && early_stopping_met;
+
+  MLPERF_LOG(detail, "result_validity",
+             all_constraints_met ? "VALID" : "INVALID");
+  if (HasPerfConstraints()) {
+    MLPERF_LOG(detail, "result_perf_constraints_met", perf_constraints_met);
+  }
+  MLPERF_LOG(detail, "result_min_duration_met", min_duration_met);
+  MLPERF_LOG(detail, "result_min_queries_met", min_queries_met);
+  MLPERF_LOG(detail, "early_stopping_met", early_stopping_met);
+  if (!all_constraints_met) {
+    std::string recommendation;
+    if (!perf_constraints_met) {
+      recommendation += perf_constraints_recommendation + " ";
+    }
+    if (!min_duration_met) {
+      recommendation += min_duration_recommendation + " ";
+    }
+    if (!min_queries_met) {
+      recommendation +=
+          "The test exited early, before enough queries were issued.";
+    }
+    std::replace(recommendation.begin(), recommendation.end(), '\n', ' ');
+    MLPERF_LOG(detail, "result_invalid_reason", recommendation);
+  }
+  std::replace(early_stopping_recommendation.begin(),
+               early_stopping_recommendation.end(), '\n', ' ');
+  if (!settings.use_token_latencies) {
+    MLPERF_LOG(detail, "early_stopping_result", early_stopping_recommendation);
+  } else {
+    std::replace(early_stopping_ttft_recommendation.begin(),
+                 early_stopping_ttft_recommendation.end(), '\n', ' ');
+    std::replace(early_stopping_tpot_recommendation.begin(),
+                 early_stopping_tpot_recommendation.end(), '\n', ' ');
+    MLPERF_LOG(detail, "early_stopping_ttft_result",
+               early_stopping_ttft_recommendation);
+    MLPERF_LOG(detail, "early_stopping_tpot_result",
+               early_stopping_tpot_recommendation);
+  }
+  // Report number of queries
+  MLPERF_LOG(detail, "result_query_count", query_count);
+  if (settings.scenario == TestScenario::Server) {
+    MLPERF_LOG(detail, "result_overlatency_query_count",
+               overlatency_query_count);
+  }
+
+  auto reportPerQueryLatencies = [&]() {
+    MLPERF_LOG(detail, "result_min_query_latency_ns", query_latency_min);
+    MLPERF_LOG(detail, "result_max_query_latency_ns", query_latency_max);
+    MLPERF_LOG(detail, "result_mean_query_latency_ns", query_latency_mean);
+    for (auto& lp : latency_percentiles) {
+      std::string percentile = DoubleToString(lp.percentile * 100);
+      MLPERF_LOG(detail,
+                 "result_" + percentile + "_percentile_per_query_latency_ns",
+                 lp.query_latency);
+    }
+  };
+
+  // Per-scenario performance results.
+  switch (settings.scenario) {
+    case TestScenario::SingleStream: {
+      double qps_w_lg = (sample_count - 1) / pr.final_query_issued_time;
+      double qps_wo_lg = 1 / QuerySampleLatencyToSeconds(sample_latency_mean);
+      MLPERF_LOG(detail, "result_qps_with_loadgen_overhead", qps_w_lg);
+      MLPERF_LOG(detail, "result_qps_without_loadgen_overhead", qps_wo_lg);
+      MLPERF_LOG(detail, "early_stopping_latency_ss",
+                 early_stopping_latency_ss);
+      MLPERF_LOG(detail, "early_stopping_latency_ms",
+                 early_stopping_latency_ms);
+      break;
+    }
+    case TestScenario::MultiStream: {
+      reportPerQueryLatencies();
+      MLPERF_LOG(detail, "early_stopping_latency_ms",
+                 early_stopping_latency_ms);
+      break;
+    }
+    case TestScenario::Server: {
+      // Subtract 1 from sample count since the start of the final sample
+      // represents the open end of the time range: i.e. [begin, end).
+      // This makes sense since:
+      // a) QPS doesn't apply if there's only one sample; it's pure latency.
+      // b) If you have precisely 1k QPS, there will be a sample exactly on
+      //    the 1 second time point; but that would be the 1001th sample in
+      //    the stream. Given the first 1001 queries, the QPS is
+      //    1000 queries / 1 second.
+      double qps_as_scheduled =
+          (sample_count - 1) / pr.final_query_scheduled_time;
+      MLPERF_LOG(detail, "result_scheduled_samples_per_sec", qps_as_scheduled);
+      double qps_as_completed =
+          (sample_count - 1) / pr.final_query_all_samples_done_time;
+      MLPERF_LOG(detail, "result_completed_samples_per_sec", qps_as_completed);
+      break;
+    }
+    case TestScenario::Offline: {
+      double samples_per_second = sample_count / pr.max_latency;
+      MLPERF_LOG(detail, "result_samples_per_second", samples_per_second);
+      break;
+    }
+  }
+
+  // Detailed latencies
+  MLPERF_LOG(detail, "result_min_latency_ns", sample_latency_min);
+  MLPERF_LOG(detail, "result_max_latency_ns", sample_latency_max);
+  MLPERF_LOG(detail, "result_mean_latency_ns", sample_latency_mean);
+  for (auto& lp : latency_percentiles) {
+    MLPERF_LOG(detail,
+               "result_" + DoubleToString(lp.percentile * 100) +
+                   "_percentile_latency_ns",
+               lp.sample_latency);
+  }
+  // Detailed first token latencies
+  if (settings.use_token_latencies) {
+    if (settings.scenario != TestScenario::Offline) {
+      MLPERF_LOG(detail, "result_first_token_min_latency_ns",
+                 first_token_latency_min);
+      MLPERF_LOG(detail, "result_first_token_max_latency_ns",
+                 first_token_latency_max);
+      MLPERF_LOG(detail, "result_first_token_mean_latency_ns",
+                 first_token_latency_mean);
+      for (auto& lp : token_latency_percentiles) {
+        MLPERF_LOG(detail,
+                   "result_first_token_" + DoubleToString(lp.percentile * 100) +
+                       "_percentile_latency_ns",
+                   lp.sample_latency);
+      }
+      double tps_w_lg = ((double)token_count) / pr.final_query_issued_time;
+      double tps_wo_lg =
+          ((double)token_count) / (sample_latency_mean * sample_count);
+      MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead",
+                 tps_w_lg);
+      MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg);
+      for (auto& lp : tpot_percentiles) {
+        MLPERF_LOG(detail,
+                   "result_time_per_output_token_" +
+                       DoubleToString(lp.percentile * 100) + "_percentile_ns",
+                   lp.sample_latency);
+      }
+      MLPERF_LOG(detail, "result_time_to_output_token_min",
+                 time_per_output_token_min);
+      MLPERF_LOG(detail, "result_time_to_output_token_max",
+                 time_per_output_token_max);
+      MLPERF_LOG(detail, "result_time_to_output_token_mean",
+                 time_per_output_token_mean);
+      double tps_as_completed =
+          token_count / pr.final_query_all_samples_done_time;
+      MLPERF_LOG(detail, "result_completed_tokens_per_second",
+                 tps_as_completed);
+    } else {
+      double tokens_per_second = token_count / pr.max_latency;
+      MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second);
+    }
+  }
+
+  if (settings.infer_token_latencies) {
+    switch (settings.scenario) {
+      case TestScenario::Server: {
+        double completed_tokens_per_second =
+            (sample_count - 1) * settings.token_latency_scaling_factor /
+            pr.final_query_all_samples_done_time;
+        MLPERF_LOG(detail, "result_inferred_completed_tokens_per_second",
+                   completed_tokens_per_second);
+        break;
+      }
+      case TestScenario::Offline: {
+        double tokens_per_second = sample_count *
+                                   settings.token_latency_scaling_factor /
+                                   pr.max_latency;
+        MLPERF_LOG(detail, "result_inferred_tokens_per_second",
+                   tokens_per_second);
+        break;
+      }
+      case TestScenario::SingleStream: {
+        break;
+      }
+      case TestScenario::MultiStream: {
+        break;
+      }
+    }
+  }
+  MLPERF_LOG(detail, "num_errors", detail.async_log().GetErrorCount());
+#endif
+}
+}  // namespace loadgen
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h
new file mode 100644
index 000000000..6befea2c0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h
@@ -0,0 +1,128 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines PerformanceResult and PerformanceSummary.
+
+#ifndef MLPERF_LOADGEN_RESULTS_H_
+#define MLPERF_LOADGEN_RESULTS_H_
+
+#include <string>
+#include <vector>
+
+#include "query_sample.h"
+#include "test_settings_internal.h"
+
+namespace mlperf {
+namespace loadgen {
+
+/// \brief Contains the performance results for benchmarks that have
+/// token based metrics
+struct TokenPerformanceResults {
+  std::vector<QuerySampleLatency> first_token_latencies;
+  std::vector<QuerySampleLatency> time_per_output_token_arr;
+  std::vector<int64_t> tokens_per_sample;
+};
+
+/// \brief Provides performance results that are independent of scenario
+/// and other context.
+struct PerformanceResult {
+  std::vector<QuerySampleLatency> sample_latencies;
+  std::vector<QuerySampleLatency> query_latencies;
+  size_t queries_issued;
+  double max_latency;
+  double final_query_scheduled_time;         // seconds from start.
+  double final_query_issued_time;            // seconds from start.
+  double final_query_all_samples_done_time;  // seconds from start.
+  TokenPerformanceResults token_results;
+};
+
+/// \brief Wraps PerformanceResult with relevant context to change how
+/// it's interpreted and reported.
+struct PerformanceSummary {
+  std::string sut_name;
+  TestSettingsInternal settings;
+  PerformanceResult pr;
+
+  // Set by ProcessLatencies.
+  size_t sample_count = 0;
+  size_t query_count = 0;
+  size_t overlatency_query_count = 0;
+  QuerySampleLatency sample_latency_min = 0;
+  QuerySampleLatency sample_latency_max = 0;
+  QuerySampleLatency sample_latency_mean = 0;
+  QuerySampleLatency query_latency_min = 0;
+  QuerySampleLatency query_latency_max = 0;
+  QuerySampleLatency query_latency_mean = 0;
+
+  /// \brief The latency at a given percentile.
+  struct PercentileEntry {
+    const double percentile;
+    QuerySampleLatency sample_latency = 0;
+    QuerySampleLatency query_latency = 0;  // MultiStream only.
+  };
+
+  // Latency target percentile
+  PercentileEntry target_latency_percentile{settings.target_latency_percentile};
+  PercentileEntry latency_percentiles[6] = {{.50}, {.90}, {.95},
+                                            {.97}, {.99}, {.999}};
+
+  // Early stopping percentile estimates for SingleStream and MultiStream
+  QuerySampleLatency early_stopping_latency_ss = 0;
+  QuerySampleLatency early_stopping_latency_ms = 0;
+
+  // Set by ProcessTokenLatencies
+  size_t token_count = 0;
+  size_t overlatency_first_token_count = 0;
+  QuerySampleLatency first_token_latency_min = 0;
+  QuerySampleLatency first_token_latency_max = 0;
+  QuerySampleLatency first_token_latency_mean = 0;
+  QuerySampleLatency time_per_output_token_min = 0;
+  QuerySampleLatency time_per_output_token_max = 0;
+  QuerySampleLatency time_per_output_token_mean = 0;
+
+  // Latency token target percentile
+  PercentileEntry token_target_latency_percentile{
+      settings.target_latency_percentile};
+  PercentileEntry token_latency_percentiles[6] = {{.50}, {.90}, {.95},
+                                                  {.97}, {.99}, {.999}};
+  PercentileEntry target_tpot_percentile{settings.target_latency_percentile};
+  PercentileEntry tpot_percentiles[6] = {{.50}, {.90}, {.95},
+                                         {.97}, {.99}, {.999}};
+
+#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
+  // MSVC complains if there is no explicit constructor.
+  // (target_latency_percentile above depends on construction with settings)
+  PerformanceSummary(const std::string& sut_name_arg,
+                     const TestSettingsInternal& settings_arg,
+                     const PerformanceResult& pr_arg)
+      : sut_name(sut_name_arg), settings(settings_arg), pr(pr_arg){};
+#endif
+  void ProcessLatencies();
+  void ProcessTokenLatencies();
+
+  bool MinDurationMet(std::string* recommendation);
+  bool EarlyStopping(std::string* recommendation, int64_t queries_issued,
+                     std::vector<QuerySampleLatency>* sample_latencies,
+                     std::vector<QuerySampleLatency>* query_latencies,
+                     std::chrono::nanoseconds target_latency);
+  bool MinQueriesMet();
+  bool MinSamplesMet();
+  bool HasPerfConstraints();
+  bool PerfConstraintsMet(std::string* recommendation);
+  void LogSummary(AsyncSummary& summary);
+  void LogDetail(AsyncDetail& detail);
+};
+}  // namespace loadgen
+}  // namespace mlperf
+
+#endif
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py
new file mode 100644
index 000000000..6254eea17
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py
@@ -0,0 +1,136 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# \file
+#  \brief MLPerf Inference LoadGen python module setup.
+#  \details Creates a module that python can import.
+#  All source files are compiled by python"s C++ toolchain  without depending
+#  on a loadgen lib.
+#
+#  This setup.py can be used stand-alone, without the use of an external
+#  build system. This will polute your source tree with output files
+#  and binaries. Use one of the gn build targets instead if you want
+#  to avoid poluting the source tree.
+
+from setuptools import Extension, setup
+from pathlib import Path
+from pybind11 import get_include
+from pybind11.setup_helpers import Pybind11Extension, build_ext
+from version_generator import generate_loadgen_version_definitions
+import subprocess
+
+generated_version_source_filename = "generated/version_generated.cc"
+generate_loadgen_version_definitions(generated_version_source_filename, ".")
+
+public_headers = [
+    "loadgen.h",
+    "query_sample.h",
+    "query_sample_library.h",
+    "system_under_test.h",
+    "test_settings.h",
+    "issue_query_controller.h",
+    "early_stopping.h",
+    "query_dispatch_library.h"
+]
+
+lib_headers = [
+    "logging.h",
+    "test_settings_internal.h",
+    "trace_generator.h",
+    "utils.h",
+    "version.h",
+    "results.h",
+    "bindings/c_api.h",
+    "version_generator.py",
+    "mlperf_conf.h"
+]
+
+lib_sources = [
+    "early_stopping.cc",
+    "issue_query_controller.cc",
+    "loadgen.cc",
+    "logging.cc",
+    "test_settings_internal.cc",
+    "utils.cc",
+    "version.cc",
+    "results.cc",
+]
+
+lib_bindings = [
+    "bindings/c_api.cc",
+    "bindings/python_api.cc",
+]
+
+this_directory = Path(__file__).parent
+mlperf_loadgen_headers = public_headers + lib_headers
+mlperf_loadgen_sources_no_gen = lib_sources + lib_bindings
+mlperf_loadgen_sources = mlperf_loadgen_sources_no_gen + [
+    generated_version_source_filename
+]
+mlperf_long_description = (
+    this_directory /
+    "README.md").read_text(
+        encoding="utf-8")
+
+with open("VERSION.txt", "r") as f:
+    version = f.read()
+version_split = version.split(".")
+
+if len(version_split) < 2:
+    print("Version is incomplete. Needs a format like 4.1.1 in VERSION file")
+
+
+try:
+    with open("mlperf.conf", 'r') as file:
+        conf_contents = file.read()
+
+    # Escape backslashes and double quotes
+    conf_contents = conf_contents.replace('\\', '\\\\').replace('"', '\\"')
+
+    # Convert newlines
+    conf_contents = conf_contents.replace('\n', '\\n"\n"')
+
+    formatted_content = f'const char* mlperf_conf =\n"{conf_contents}";\n'
+
+    with open("mlperf_conf.h", 'w') as header_file:
+        header_file.write(formatted_content)
+
+except IOError as e:
+    raise RuntimeError(f"Failed to generate header file: {e}")
+
+mlperf_loadgen_module = Pybind11Extension(
+    "mlperf_loadgen",
+    define_macros=[
+        ("MAJOR_VERSION",
+         version_split[0]),
+        ("MINOR_VERSION",
+         version_split[1])
+    ],
+    include_dirs=[".", get_include()],
+    sources=mlperf_loadgen_sources,
+    depends=mlperf_loadgen_headers,
+)
+
+setup(name="mlcommons_loadgen",
+      version=version,
+      description="MLPerf Inference LoadGen python bindings",
+      url="https://mlcommons.org/",
+      cmdclass={"build_ext": build_ext},
+      ext_modules=[mlperf_loadgen_module],
+      packages=['mlcommons_loadgen'],
+      package_dir={'mlcommons_loadgen': '.'},
+      include_package_data=True,
+      long_description=mlperf_long_description,
+      long_description_content_type='text/markdown')
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h
new file mode 100644
index 000000000..843453962
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h
@@ -0,0 +1,67 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Defines the SystemUnderTest interface.
+
+#ifndef MLPERF_LOADGEN_SYSTEM_UNDER_TEST_H
+#define MLPERF_LOADGEN_SYSTEM_UNDER_TEST_H
+
+#include <string>
+#include <vector>
+
+#include "query_sample.h"
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \brief The interface a client implements for the loadgen to test.
+/// \todo Add hook for an untimed warm up period for the SUT.
+/// \todo Add hook for an untimed warm up period for the loadgen logic.
+/// \todo Support power hooks for cool-down period before runing performance
+/// traffic.
+/// \todo Support power hooks for correlating test timeline with power
+/// measurment timeline.
+class SystemUnderTest {
+ public:
+  virtual ~SystemUnderTest() {}
+
+  /// \brief A human-readable string for logging purposes.
+  virtual const std::string& Name() = 0;
+
+  /// \brief Lets the loadgen issue N samples to the SUT.
+  /// \details The SUT may either a) return immediately and signal completion
+  /// at a later time on another thread or b) it may block and signal
+  /// completion on the current stack. The load generator will handle both
+  /// cases properly.
+  /// Note: The data for neighboring samples may or may not be contiguous
+  /// depending on the scenario.
+  virtual void IssueQuery(const std::vector<QuerySample>& samples) = 0;
+
+  /// \brief Called immediately after the last call to IssueQuery
+  /// in a series is made.
+  /// \details This doesn't necessarily signify the end of the
+  /// test since there may be multiple series involved during a test; for
+  /// example in accuracy mode.
+  /// Clients can use this to flush any deferred queries immediately, rather
+  /// than waiting for some timeout.
+  /// This is especially useful in the server scenario.
+  virtual void FlushQueries() = 0;
+};
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_SYSTEM_UNDER_TEST_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h
new file mode 100644
index 000000000..584d073bb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h
@@ -0,0 +1,329 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Provides ways for a client to change the behavior and
+/// constraints of the load generator.
+/// \details Note: The MLPerf specification takes precedent over any of the
+/// comments in this file if there are inconsistencies in regards to how the
+/// loadgen *should* work.
+/// The comments in this file are indicative of the loadgen implementation.
+
+#ifndef MLPERF_LOADGEN_TEST_SETTINGS_H
+#define MLPERF_LOADGEN_TEST_SETTINGS_H
+
+#include <cstdint>
+#include <string>
+
+namespace mlperf {
+
+/// \addtogroup LoadgenAPI
+/// @{
+
+/// \addtogroup LoadgenAPITestSettings Test Settings
+/// \brief This page contains a description of all the scenarios, modes,
+/// and log settings as implemented by the LoadGen.
+/// @{
+
+///
+/// \enum TestScenario
+/// * **SingleStream**
+///  + Issues queries containing a single sample.
+///  + The next query is only issued once the previous one has completed.
+///  + Internal LoadGen latency between queries is not included in the
+///    latency results.
+///  + **Final performance result is:** a percentile of the latency.
+/// * **MultiStream**
+///  + Issues queries containing N samples.
+///   - N is specified by \link
+///   mlperf::TestSettings::multi_stream_samples_per_query
+///   multi_stream_samples_per_query \endlink.
+///  + The next query is only issued once the previous one has completed.
+///  + The samples of each query are guaranteed to be contiguous with respect
+///    to the order they were loaded in the QuerySampleLibrary.
+///  + Latency is tracked and reported on a per-query and per-sample basis.
+///  + The latency of a query is the maximum latency of its samples, including
+///    any cross-thread communication within the loadgen.
+///  + Internal LoadGen latency between queries is not included in the
+///    latency results.
+///  + **Final performance result is:** a percentile of the query latency.
+/// * **Server**
+///  + Sends queries with a single sample.
+///  + Queries have a random poisson (non-uniform) arrival rate that, when
+///    averaged, hits the target QPS.
+///  + There is no limit on the number of outstanding queries, as long as
+///    the latency constraints are met.
+///  + **Final performance result is:** PASS if the a percentile of the latency
+///    is under a given threshold. FAIL otherwise.
+///   - Threshold is specified by \link
+///   mlperf::TestSettings::server_target_latency_ns server_target_latency_ns
+///   \endlink.
+/// * **Offline**
+///  + Sends all N samples to the SUT inside of a single query.
+///  + The samples of the query are guaranteed to be contiguous with respect
+///    to the order they were loaded in the QuerySampleLibrary.
+///  + **Final performance result is:** samples per second.
+///
+enum class TestScenario {
+  SingleStream,
+  MultiStream,
+  Server,
+  Offline,
+};
+
+///
+/// \enum TestMode
+/// * **SubmissionRun**
+///  + Runs accuracy mode followed by performance mode.
+///  + TODO: Implement further requirements as decided by MLPerf.
+/// * **AccuracyOnly**
+///  + Runs each sample from the QSL through the SUT a least once.
+///  + Outputs responses to an accuracy json that can be parsed by a model +
+///    sample library specific script.
+/// * **PerformanceOnly**
+///  + Runs the performance traffic for the given scenario, as described in
+///    the comments for TestScenario.
+/// * **FindPeakPerformance**
+///  + Determines the maximumum QPS for the Server scenario.
+///  + Not applicable for SingleStream, MultiStream or Offline scenarios.
+///
+enum class TestMode {
+  SubmissionRun,
+  AccuracyOnly,
+  PerformanceOnly,
+  FindPeakPerformance,
+};
+
+///
+/// \brief Top-level struct specifing the modes and parameters of the test.
+///
+struct TestSettings {
+  TestScenario scenario = TestScenario::SingleStream;
+  TestMode mode = TestMode::PerformanceOnly;
+
+  // ==================================
+  /// \name SingleStream-specific
+  /**@{*/
+  /// \brief A hint used by the loadgen to pre-generate enough samples to
+  ///        meet the minimum test duration.
+  double single_stream_expected_latency_ns = 1000000;
+  /// \brief The latency percentile reported as the final result.
+  double single_stream_target_latency_percentile = 0.90;
+  /**@}*/
+
+  // ==================================
+  /// \name MultiStream-specific
+  /**@{*/
+  /// \brief A hint used by the loadgen to pre-generate enough samples to
+  ///        meet the minimum test duration.
+  /// \brief MultiStream latency is for query (not sample) latency
+  double multi_stream_expected_latency_ns = 8000000;
+  /// \brief The latency percentile for MultiStream mode.
+  double multi_stream_target_latency_percentile = 0.99;
+  /// \brief The number of samples in each query.
+  /// \details How many samples are bundled in a query
+  uint64_t multi_stream_samples_per_query = 8;
+  /**@}*/
+
+  // ==================================
+  /// \name Server-specific
+  /**@{*/
+  /// \brief The average QPS of the poisson distribution.
+  /// \details note: This field is used as a FindPeakPerformance's lower bound.
+  /// When you run FindPeakPerformanceMode, you should make sure that this value
+  /// satisfies performance constraints.
+  double server_target_qps = 1;
+  /// \brief The latency constraint for the Server scenario.
+  uint64_t server_target_latency_ns = 100000000;
+  /// \brief The latency percentile for server mode. This value is combined with
+  /// server_target_latency_ns to determine if a run is valid.
+  /// \details 99% is the default value, which is correct for image models. GNMT
+  /// should be set to 0.97 (97%) in v0.5.(As always, check the policy page for
+  /// updated values for the benchmark you are running.)
+  double server_target_latency_percentile = 0.99;
+  /// \brief If this flag is set to true, LoadGen will combine samples from
+  /// multiple queries into a single query if their scheduled issue times have
+  /// passed.
+  bool server_coalesce_queries = false;
+  /// \brief The decimal places of QPS precision used to terminate
+  /// FindPeakPerformance mode.
+  int server_find_peak_qps_decimals_of_precision = 1;
+  /// \brief A step size (as a fraction of the QPS) used to widen the lower and
+  /// upper bounds to find the initial boundaries of binary search.
+  double server_find_peak_qps_boundary_step_size = 1;
+  /// \brief The maximum number of outstanding queries to allow before earlying
+  /// out from a performance run. Useful for performance tuning and speeding up
+  /// the FindPeakPerformance mode.
+  uint64_t server_max_async_queries = 0;  ///< 0: Infinity.
+  /// \brief The number of issue query threads that will be registered and used
+  /// to call SUT's IssueQuery(). If this is 0, the same thread calling
+  /// StartTest() will be used to call IssueQuery(). See also
+  /// mlperf::RegisterIssueQueryThread().
+  uint64_t server_num_issue_query_threads = 0;
+  /**@}*/
+
+  // ==================================
+  /// \name Offline-specific
+  /**@{*/
+  /// \brief Specifies the QPS the SUT expects to hit for the offline load.
+  /// The loadgen generates 10% more queries than it thinks it needs to meet
+  /// the minimum test duration.
+  double offline_expected_qps = 1;
+  /// \brief Affects the order in which the samples of the dataset are chosen.
+  /// If false it concatenates a single permutation of the dataset (or part
+  /// of it depending on QSL->PerformanceSampleCount()) several times up to the
+  /// number of samples requested.
+  /// If true it concatenates a multiple permutation of the dataset (or a
+  /// part of it depending on QSL->PerformanceSampleCount()) several times
+  /// up to the number of samples requested.
+  bool sample_concatenate_permutation = false;
+  /**@}*/
+
+  // ==================================
+  /// \name Test duration
+  /// The test runs until **both** min duration and min query count have been
+  /// met. However, it will exit before that point if **either** max duration or
+  /// max query count have been reached.
+  /**@{*/
+  uint64_t min_duration_ms = 10000;
+  uint64_t max_duration_ms = 0;  ///< 0: Infinity.
+  uint64_t min_query_count = 100;
+  uint64_t max_query_count = 0;  ///< 0: Infinity.
+  /**@}*/
+
+  // ==================================
+  /// \name Random number generation
+  /// There are 4 separate seeds, so each dimension can be changed
+  /// independently.
+  /**@{*/
+  /// \brief Affects which subset of samples from the QSL are chosen for
+  /// the performance sample set and accuracy sample sets.
+  uint64_t qsl_rng_seed = 0;
+  /// \brief Affects the order in which samples from the performance set will
+  /// be included in queries.
+  uint64_t sample_index_rng_seed = 0;
+  /// \brief Affects the poisson arrival process of the Server scenario.
+  /// \details Different seeds will appear to "jitter" the queries
+  /// differently in time, but should not affect the average issued QPS.
+  uint64_t schedule_rng_seed = 0;
+  /// \brief Affects which samples have their query returns logged to the
+  /// accuracy log in performance mode.
+  uint64_t accuracy_log_rng_seed = 0;
+
+  /// \brief Probability of the query response of a sample being logged to the
+  /// accuracy log in performance mode
+  double accuracy_log_probability = 0.0;
+
+  /// \brief Target number of samples that will have their results printed to
+  /// accuracy log in performance mode for compliance testing
+  uint64_t accuracy_log_sampling_target = 0;
+
+  /// \brief Variables for running test05 from native config. A boolean that
+  /// determines whether or not to run test05 and three random seed to run the
+  /// test
+  bool test05 = false;
+  uint64_t test05_qsl_rng_seed = 0;
+  uint64_t test05_sample_index_rng_seed = 0;
+  uint64_t test05_schedule_rng_seed = 0;
+
+  /// \brief Load mlperf parameter config from file.
+  int FromConfig(const std::string &path, const std::string &model,
+                 const std::string &scenario, int conf_type = 1);
+  /**@}*/
+
+  // ==================================
+  /// \name Performance Sample modifiers
+  /// \details These settings can be used to Audit Performance mode runs.
+  /// In order to detect sample caching by SUT, performance of runs when only
+  /// unique queries (with non-repeated samples) are issued can be compared with
+  /// that when the same query is repeatedly issued.
+  /**@{*/
+  /// \brief Prints measurement interval start and stop timestamps to std::cout
+  /// for the purpose of comparison against an external timer
+  bool print_timestamps = false;
+  /// \brief Allows issuing only unique queries in Performance mode of any
+  /// scenario \details This can be used to send non-repeat & hence unique
+  /// samples to SUT
+  bool performance_issue_unique = false;
+  /// \brief If true, the same query is chosen repeatedley for Inference.
+  /// In offline scenario, the query is filled with the same sample.
+  bool performance_issue_same = false;
+  /// \brief Offset to control which sample is repeated in
+  /// performance_issue_same mode.
+  /// Value should be within [0, performance_sample_count)
+  uint64_t performance_issue_same_index = 0;
+  /// \brief Overrides QSL->PerformanceSampleCount() when non-zero
+  uint64_t performance_sample_count_override = 0;
+  /// \brief Measure token latencies
+  bool use_token_latencies = false;
+  /// Token latency parameters
+  uint64_t server_ttft_latency = 100000000;
+  uint64_t server_tpot_latency = 100000000;
+  /// \brief Infer token latencies
+  bool infer_token_latencies = false;
+  uint64_t token_latency_scaling_factor;
+  /**@}*/
+};
+
+///
+/// \enum LoggingMode
+/// Specifies how and when logging should be sampled and stringified at
+/// runtime.
+/// * **AsyncPoll**
+///  + Logs are serialized and output on an IOThread that polls for new logs at
+///  a fixed interval. This is the only mode currently implemented.
+/// * **EndOfTestOnly**
+///  + TODO: Logs are serialzied and output only at the end of the test.
+/// * **Synchronous**
+///  + TODO: Logs are serialized and output inline.
+enum class LoggingMode {
+  AsyncPoll,
+  EndOfTestOnly,
+  Synchronous,
+};
+
+///
+/// \brief Specifies where log outputs should go.
+///
+/// By default, the loadgen outputs its log files to outdir and
+/// modifies the filenames of its logs with a prefix and suffix.
+/// Filenames will take the form:
+/// "<outdir>/<datetime><prefix>summary<suffix>.txt"
+///
+/// Affordances for outputing logs to stdout are also provided.
+///
+struct LogOutputSettings {
+  std::string outdir = ".";
+  std::string prefix = "mlperf_log_";
+  std::string suffix = "";
+  bool prefix_with_datetime = false;
+  bool copy_detail_to_stdout = false;
+  bool copy_summary_to_stdout = false;
+};
+
+///
+/// \brief Top-level log settings.
+///
+struct LogSettings {
+  LogOutputSettings log_output;
+  LoggingMode log_mode = LoggingMode::AsyncPoll;
+  uint64_t log_mode_async_poll_interval_ms = 1000;  ///< TODO: Implement this.
+  bool enable_trace = true;
+};
+
+/// @}
+
+/// @}
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_TEST_SETTINGS_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc
new file mode 100644
index 000000000..3f2cd8847
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc
@@ -0,0 +1,800 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "test_settings_internal.h"
+
+#include <fstream>
+#include <map>
+#include <sstream>
+#include <string>
+
+#include "logging.h"
+#include "mlperf_conf.h"
+#include "utils.h"
+
+namespace mlperf {
+namespace loadgen {
+
+TestSettingsInternal::TestSettingsInternal(
+    const TestSettings &requested_settings, size_t qsl_performance_sample_count)
+    : requested(requested_settings),
+      scenario(requested.scenario),
+      mode(requested.mode),
+      samples_per_query(1),
+      target_qps(1),
+      max_async_queries(0),
+      target_duration(std::chrono::milliseconds(requested.min_duration_ms)),
+      min_duration(std::chrono::milliseconds(requested.min_duration_ms)),
+      max_duration(std::chrono::milliseconds(requested.max_duration_ms)),
+      min_query_count(requested.min_query_count),
+      max_query_count(requested.max_query_count),
+      min_sample_count(0),
+      qsl_rng_seed(requested.qsl_rng_seed),
+      sample_index_rng_seed(requested.sample_index_rng_seed),
+      schedule_rng_seed(requested.schedule_rng_seed),
+      accuracy_log_rng_seed(requested.accuracy_log_rng_seed),
+      accuracy_log_probability(requested.accuracy_log_probability),
+      accuracy_log_sampling_target(requested.accuracy_log_sampling_target),
+      print_timestamps(requested.print_timestamps),
+      performance_issue_unique(requested.performance_issue_unique),
+      performance_issue_same(requested.performance_issue_same),
+      performance_issue_same_index(requested.performance_issue_same_index),
+      performance_sample_count(0),
+      sample_concatenate_permutation(false),
+      use_token_latencies(requested.use_token_latencies),
+      server_ttft_latency(requested.server_ttft_latency),
+      server_tpot_latency(requested.server_tpot_latency),
+      infer_token_latencies(requested.infer_token_latencies),
+      token_latency_scaling_factor(requested.token_latency_scaling_factor) {
+  // Target QPS, target latency, and max_async_queries.
+  switch (requested.scenario) {
+    case TestScenario::SingleStream:
+      target_qps = static_cast<double>(std::nano::den) /
+                   requested.single_stream_expected_latency_ns;
+      max_async_queries = 1;
+      target_latency_percentile =
+          requested.single_stream_target_latency_percentile;
+      break;
+    case TestScenario::MultiStream:
+      target_qps = static_cast<double>(std::nano::den) /
+                   requested.multi_stream_expected_latency_ns;
+      max_async_queries = 1;
+      target_latency_percentile =
+          requested.multi_stream_target_latency_percentile;
+      break;
+    case TestScenario::Server:
+      if (requested.server_target_qps >= 0.0) {
+        target_qps = requested.server_target_qps;
+      } else {
+        LogDetail([server_target_qps = requested.server_target_qps,
+                   target_qps = target_qps](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "Invalid value for server_target_qps requested."
+             << " requested: " << server_target_qps << " using: " << target_qps;
+          MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
+#else
+          detail.Error("Invalid value for server_target_qps requested.",
+                       "requested", server_target_qps, "using", target_qps);
+#endif
+        });
+      }
+      target_latency =
+          std::chrono::nanoseconds(requested.server_target_latency_ns);
+      target_latency_percentile = requested.server_target_latency_percentile;
+      max_async_queries = requested.server_max_async_queries;
+      break;
+    case TestScenario::Offline:
+      // target_latency_percentile is not used in Offline, but set it to
+      // 0.99 anyway to avoid garbage value.
+      target_latency_percentile = 0.99;
+      if (requested.offline_expected_qps >= 0.0) {
+        target_qps = requested.offline_expected_qps;
+      } else {
+        LogDetail([offline_expected_qps = requested.offline_expected_qps,
+                   target_qps = target_qps](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "Invalid value for offline_expected_qps requested."
+             << " requested: " << offline_expected_qps
+             << " using: " << target_qps;
+          MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
+#else
+          detail.Error("Invalid value for offline_expected_qps requested.",
+                       "requested", offline_expected_qps, "using", target_qps);
+#endif
+        });
+      }
+      max_async_queries = 1;
+      break;
+  }
+
+  // Performance Sample Count: TestSettings override QSL ->
+  // PerformanceSampleCount
+  performance_sample_count = (requested.performance_sample_count_override == 0)
+                                 ? qsl_performance_sample_count
+                                 : requested.performance_sample_count_override;
+
+  // Sample by concatentating several permutations of the dataset
+  // sample_concatenate_permutation
+  sample_concatenate_permutation =
+      (requested.sample_concatenate_permutation == 0)
+          ? false
+          : requested.sample_concatenate_permutation;
+
+  // Samples per query.
+  if (requested.scenario == TestScenario::MultiStream) {
+    samples_per_query = requested.multi_stream_samples_per_query;
+  }
+
+  // In the offline scenario, coalesce all queries into a single query.
+  if (requested.scenario == TestScenario::Offline) {
+    // TODO: Should the spec require a max duration for large query counts?
+    // kSlack is used to make sure we generate enough samples for the SUT
+    // to take longer than than the minimum test duration required by the
+    // MLPerf spec.
+    constexpr double kSlack = 1.1;
+    uint64_t target_sample_count =
+        kSlack * DurationToSeconds(target_duration) * target_qps;
+    samples_per_query =
+        (requested.performance_issue_unique)
+            ? performance_sample_count
+            : std::max<uint64_t>(min_query_count, target_sample_count);
+    min_query_count = 1;
+    target_duration = std::chrono::milliseconds(0);
+  }
+
+  // FIXME: Only do this for 3D-UNet SingleStream, for v2.0
+  // TODO: consolidate after v2.0
+  // make min_queries to be multiple of performance_sample_count
+  // performance_sample_count == 0 makes it to be equal to loaded_samples.size()
+  if (sample_concatenate_permutation &&
+      requested.scenario == TestScenario::SingleStream) {
+    // set slack larger for 3D-UNet KiTS19 distribution, i.e. 50% latency << 90%
+    // latency
+    constexpr double kSlack = 2.0;
+    uint64_t expected_queries =
+        kSlack * DurationToSeconds(target_duration) * target_qps;
+    min_query_count =
+        min_query_count > expected_queries ? min_query_count : expected_queries;
+    min_query_count += qsl_performance_sample_count -
+                       (min_query_count % qsl_performance_sample_count);
+  }
+
+  min_sample_count = min_query_count * samples_per_query;
+
+  // Validate TestSettings
+  if (requested.performance_issue_same &&
+      (requested.performance_issue_same_index >= performance_sample_count)) {
+    LogDetail([performance_issue_same_index =
+                   requested.performance_issue_same_index,
+               performance_sample_count =
+                   performance_sample_count](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "Sample Idx to be repeated in performance_issue_same mode"
+         << " cannot be greater than loaded performance_sample_count."
+         << " performance_issue_same_index: " << performance_issue_same_index
+         << " performance_sample_count: " << performance_sample_count;
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
+#else
+      detail.Error(
+          "Sample Idx to be repeated in performance_issue_same mode"
+          " cannot be greater than loaded performance_sample_count.",
+          "performance_issue_same_index", performance_issue_same_index,
+          "performance_sample_count", performance_sample_count);
+#endif
+    });
+  }
+
+  if (requested.performance_issue_unique && requested.performance_issue_same) {
+    LogDetail([performance_issue_unique = requested.performance_issue_unique,
+               performance_issue_same =
+                   requested.performance_issue_same](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "Performance_issue_unique and performance_issue_same, both"
+         << " cannot be true at the same time."
+         << " performance_issue_unique: " << performance_issue_unique
+         << " performance_issue_same: " << performance_issue_same;
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
+#else
+      detail.Error(
+          "Performance_issue_unique and performance_issue_same, both"
+          " cannot be true at the same time.",
+          "performance_issue_unique", performance_issue_unique,
+          "performance_issue_same", performance_issue_same);
+#endif
+    });
+  }
+}
+
+std::string ToString(TestScenario scenario) {
+  switch (scenario) {
+#if USE_NEW_LOGGING_FORMAT
+    case TestScenario::SingleStream:
+      return "SingleStream";
+    case TestScenario::MultiStream:
+      return "MultiStream";
+#else
+    case TestScenario::SingleStream:
+      return "Single Stream";
+    case TestScenario::MultiStream:
+      return "Multi Stream";
+#endif
+    case TestScenario::Server:
+      return "Server";
+    case TestScenario::Offline:
+      return "Offline";
+  }
+  assert(false);
+  return "InvalidScenario";
+}
+
+std::string ToString(TestMode mode) {
+  switch (mode) {
+#if USE_NEW_LOGGING_FORMAT
+    case TestMode::SubmissionRun:
+      return "SubmissionRun";
+    case TestMode::AccuracyOnly:
+      return "AccuracyOnly";
+    case TestMode::PerformanceOnly:
+      return "PerformanceOnly";
+    case TestMode::FindPeakPerformance:
+      return "FindPeakPerformance";
+#else
+    case TestMode::SubmissionRun:
+      return "Submission";
+    case TestMode::AccuracyOnly:
+      return "Accuracy";
+    case TestMode::PerformanceOnly:
+      return "Performance";
+    case TestMode::FindPeakPerformance:
+      return "Find Peak Performance";
+#endif
+  }
+  assert(false);
+  return "InvalidMode";
+}
+
+void LogRequestedTestSettings(const TestSettings &s) {
+  LogDetail([s](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "requested_scenario", ToString(s.scenario));
+    MLPERF_LOG(detail, "requested_test_mode", ToString(s.mode));
+
+    // Scenario-specific
+    switch (s.scenario) {
+      case TestScenario::SingleStream:
+        MLPERF_LOG(detail, "requested_single_stream_expected_latency_ns",
+                   s.single_stream_expected_latency_ns);
+        MLPERF_LOG(detail, "requested_single_stream_target_latency_percentile",
+                   s.single_stream_target_latency_percentile);
+        break;
+      case TestScenario::MultiStream:
+        MLPERF_LOG(detail, "requested_multi_stream_expected_latency_ns",
+                   s.multi_stream_expected_latency_ns);
+        MLPERF_LOG(detail, "requested_multi_stream_target_latency_percentile",
+                   s.multi_stream_target_latency_percentile);
+        MLPERF_LOG(detail, "requested_multi_stream_samples_per_query",
+                   s.multi_stream_samples_per_query);
+        break;
+      case TestScenario::Server:
+        MLPERF_LOG(detail, "requested_server_target_qps", s.server_target_qps);
+        MLPERF_LOG(detail, "requested_server_target_latency_ns",
+                   s.server_target_latency_ns);
+        MLPERF_LOG(detail, "requested_server_target_latency_percentile",
+                   s.server_target_latency_percentile);
+        MLPERF_LOG(detail, "requested_server_coalesce_queries",
+                   s.server_coalesce_queries);
+        MLPERF_LOG(detail,
+                   "requested_server_find_peak_qps_decimals_of_precision",
+                   s.server_find_peak_qps_decimals_of_precision);
+        MLPERF_LOG(detail, "requested_server_find_peak_qps_boundary_step_size",
+                   s.server_find_peak_qps_boundary_step_size);
+        MLPERF_LOG(detail, "requested_server_max_async_queries",
+                   s.server_max_async_queries);
+        MLPERF_LOG(detail, "requested_server_num_issue_query_threads",
+                   s.server_num_issue_query_threads);
+        break;
+      case TestScenario::Offline:
+        MLPERF_LOG(detail, "requested_offline_expected_qps",
+                   s.offline_expected_qps);
+        break;
+    }
+
+    // Overrides
+    MLPERF_LOG(detail, "requested_min_duration_ms", s.min_duration_ms);
+    MLPERF_LOG(detail, "requested_max_duration_ms", s.max_duration_ms);
+    MLPERF_LOG(detail, "requested_min_query_count", s.min_query_count);
+    MLPERF_LOG(detail, "requested_max_query_count", s.max_query_count);
+    MLPERF_LOG(detail, "requested_qsl_rng_seed", s.qsl_rng_seed);
+    MLPERF_LOG(detail, "requested_sample_index_rng_seed",
+               s.sample_index_rng_seed);
+    MLPERF_LOG(detail, "requested_schedule_rng_seed", s.schedule_rng_seed);
+    MLPERF_LOG(detail, "requested_accuracy_log_rng_seed",
+               s.accuracy_log_rng_seed);
+    MLPERF_LOG(detail, "requested_accuracy_log_probability",
+               s.accuracy_log_probability);
+    MLPERF_LOG(detail, "requested_accuracy_log_sampling_target",
+               s.accuracy_log_sampling_target);
+    MLPERF_LOG(detail, "requested_print_timestamps", s.print_timestamps);
+    MLPERF_LOG(detail, "requested_performance_issue_unique",
+               s.performance_issue_unique);
+    MLPERF_LOG(detail, "requested_performance_issue_same",
+               s.performance_issue_same);
+    MLPERF_LOG(detail, "requested_performance_issue_same_index",
+               s.performance_issue_same_index);
+    MLPERF_LOG(detail, "requested_performance_sample_count_override",
+               s.performance_sample_count_override);
+    MLPERF_LOG(detail, "requested_sample_concatenate_permutation",
+               s.sample_concatenate_permutation);
+    // Token latencies specific values
+    if (s.use_token_latencies) {
+      MLPERF_LOG(detail, "requested_use_token_latencies",
+                 s.use_token_latencies);
+      if (s.scenario != TestScenario::Offline) {
+        MLPERF_LOG(detail, "requested_server_ttft_latency",
+                   s.server_ttft_latency);
+        MLPERF_LOG(detail, "requested_server_tpot_latency",
+                   s.server_tpot_latency);
+      }
+    }
+#else
+    detail("");
+    detail("Requested Settings:");
+    detail("Scenario : " + ToString(s.scenario));
+    detail("Test mode : " + ToString(s.mode));
+
+    // Scenario-specific
+    switch (s.scenario) {
+      case TestScenario::SingleStream:
+        detail("single_stream_expected_latency_ns : ",
+               s.single_stream_expected_latency_ns);
+        detail("single_stream_target_latency_percentile : ",
+               s.single_stream_target_latency_percentile);
+        break;
+      case TestScenario::MultiStream:
+        detail("multi_stream_expected_latency_ns : ",
+               s.multi_stream_expected_latency_ns);
+        detail("multi_stream_target_latency_percentile : ",
+               s.multi_stream_target_latency_percentile);
+        detail("multi_stream_samples_per_query : ",
+               s.multi_stream_samples_per_query);
+        break;
+      case TestScenario::Server:
+        detail("server_target_qps : ", s.server_target_qps);
+        detail("server_target_latency_ns : ", s.server_target_latency_ns);
+        detail("server_target_latency_percentile : ",
+               s.server_target_latency_percentile);
+        detail("server_coalesce_queries : ", s.server_coalesce_queries);
+        detail("server_find_peak_qps_decimals_of_precision : ",
+               s.server_find_peak_qps_decimals_of_precision);
+        detail("server_find_peak_qps_boundary_step_size : ",
+               s.server_find_peak_qps_boundary_step_size);
+        detail("server_max_async_queries : ", s.server_max_async_queries);
+        detail("server_num_issue_query_threads : ",
+               s.server_num_issue_query_threads);
+        break;
+      case TestScenario::Offline:
+        detail("offline_expected_qps : ", s.offline_expected_qps);
+        break;
+    }
+
+    // Overrides
+    detail("min_duration_ms : ", s.min_duration_ms);
+    detail("max_duration_ms : ", s.max_duration_ms);
+    detail("min_query_count : ", s.min_query_count);
+    detail("max_query_count : ", s.max_query_count);
+    detail("qsl_rng_seed : ", s.qsl_rng_seed);
+    detail("sample_index_rng_seed : ", s.sample_index_rng_seed);
+    detail("schedule_rng_seed : ", s.schedule_rng_seed);
+    detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
+    detail("accuracy_log_probability : ", s.accuracy_log_probability);
+    detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
+    detail("print_timestamps : ", s.print_timestamps);
+    detail("performance_issue_unique : ", s.performance_issue_unique);
+    detail("performance_issue_same : ", s.performance_issue_same);
+    detail("performance_issue_same_index : ", s.performance_issue_same_index);
+    detail("performance_sample_count_override : ",
+           s.performance_sample_count_override);
+    detail("");
+#endif
+  });
+}
+
+void TestSettingsInternal::LogEffectiveSettings() const {
+  LogDetail([s = *this](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "effective_scenario", ToString(s.scenario));
+    MLPERF_LOG(detail, "effective_test_mode", ToString(s.mode));
+
+    MLPERF_LOG(detail, "effective_samples_per_query", s.samples_per_query);
+    MLPERF_LOG(detail, "effective_target_qps", s.target_qps);
+    MLPERF_LOG(detail, "effective_target_latency_ns", s.target_latency.count());
+    MLPERF_LOG(detail, "effective_target_latency_percentile",
+               s.target_latency_percentile);
+    MLPERF_LOG(detail, "effective_max_async_queries", s.max_async_queries);
+    MLPERF_LOG(detail, "effective_target_duration_ms",
+               s.target_duration.count());
+    MLPERF_LOG(detail, "effective_min_duration_ms", s.min_duration.count());
+    MLPERF_LOG(detail, "effective_max_duration_ms", s.max_duration.count());
+    MLPERF_LOG(detail, "effective_min_query_count", s.min_query_count);
+    MLPERF_LOG(detail, "effective_max_query_count", s.max_query_count);
+    MLPERF_LOG(detail, "effective_min_sample_count", s.min_sample_count);
+    MLPERF_LOG(detail, "effective_qsl_rng_seed", s.qsl_rng_seed);
+    MLPERF_LOG(detail, "effective_sample_index_rng_seed",
+               s.sample_index_rng_seed);
+    MLPERF_LOG(detail, "effective_schedule_rng_seed", s.schedule_rng_seed);
+    MLPERF_LOG(detail, "effective_accuracy_log_rng_seed",
+               s.accuracy_log_rng_seed);
+    MLPERF_LOG(detail, "effective_accuracy_log_probability",
+               s.accuracy_log_probability);
+    MLPERF_LOG(detail, "effective_accuracy_log_sampling_target",
+               s.accuracy_log_sampling_target);
+    MLPERF_LOG(detail, "effective_print_timestamps", s.print_timestamps);
+    MLPERF_LOG(detail, "effective_performance_issue_unique",
+               s.performance_issue_unique);
+    MLPERF_LOG(detail, "effective_performance_issue_same",
+               s.performance_issue_same);
+    MLPERF_LOG(detail, "effective_performance_issue_same_index",
+               s.performance_issue_same_index);
+    MLPERF_LOG(detail, "effective_performance_sample_count",
+               s.performance_sample_count);
+    MLPERF_LOG(detail, "effective_sample_concatenate_permutation",
+               s.sample_concatenate_permutation);
+#else
+    detail("");
+    detail("Effective Settings:");
+
+    detail("Scenario : " + ToString(s.scenario));
+    detail("Test mode : " + ToString(s.mode));
+
+    detail("samples_per_query : ", s.samples_per_query);
+    detail("target_qps : ", s.target_qps);
+    detail("target_latency (ns): ", s.target_latency.count());
+    detail("target_latency_percentile : ", s.target_latency_percentile);
+    detail("max_async_queries : ", s.max_async_queries);
+    detail("target_duration (ms): ", s.target_duration.count());
+    detail("min_duration (ms): ", s.min_duration.count());
+    detail("max_duration (ms): ", s.max_duration.count());
+    detail("min_query_count : ", s.min_query_count);
+    detail("max_query_count : ", s.max_query_count);
+    detail("min_sample_count : ", s.min_sample_count);
+    detail("qsl_rng_seed : ", s.qsl_rng_seed);
+    detail("sample_index_rng_seed : ", s.sample_index_rng_seed);
+    detail("schedule_rng_seed : ", s.schedule_rng_seed);
+    detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
+    detail("accuracy_log_probability : ", s.accuracy_log_probability);
+    detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
+    detail("print_timestamps : ", s.print_timestamps);
+    detail("performance_issue_unique : ", s.performance_issue_unique);
+    detail("performance_issue_same : ", s.performance_issue_same);
+    detail("performance_issue_same_index : ", s.performance_issue_same_index);
+    detail("performance_sample_count : ", s.performance_sample_count);
+#endif
+  });
+}
+
+void TestSettingsInternal::LogAllSettings() const {
+  LogRequestedTestSettings(requested);
+  LogEffectiveSettings();
+}
+
+void TestSettingsInternal::LogSummary(AsyncSummary &summary) const {
+  summary("samples_per_query : ", samples_per_query);
+  summary("target_qps : ", target_qps);
+  if (!use_token_latencies) {
+    summary("target_latency (ns): ", target_latency.count());
+  } else {
+    summary("ttft_latency (ns): ", server_ttft_latency);
+    summary("tpot_latency (ns): ", server_tpot_latency);
+  }
+  summary("max_async_queries : ", max_async_queries);
+  summary("min_duration (ms): ", min_duration.count());
+  summary("max_duration (ms): ", max_duration.count());
+  summary("min_query_count : ", min_query_count);
+  summary("max_query_count : ", max_query_count);
+  summary("qsl_rng_seed : ", qsl_rng_seed);
+  summary("sample_index_rng_seed : ", sample_index_rng_seed);
+  summary("schedule_rng_seed : ", schedule_rng_seed);
+  summary("accuracy_log_rng_seed : ", accuracy_log_rng_seed);
+  summary("accuracy_log_probability : ", accuracy_log_probability);
+  summary("accuracy_log_sampling_target : ", accuracy_log_sampling_target);
+  summary("print_timestamps : ", print_timestamps);
+  summary("performance_issue_unique : ", performance_issue_unique);
+  summary("performance_issue_same : ", performance_issue_same);
+  summary("performance_issue_same_index : ", performance_issue_same_index);
+  summary("performance_sample_count : ", performance_sample_count);
+  if (sample_concatenate_permutation) {
+    summary(
+        "WARNING: sample_concatenate_permutation was set to true. \n"
+        "Generated samples per query might be different as the one in the "
+        "setting.\n"
+        "Check the generated_samples_per_query line in the detailed log for "
+        "the real\n"
+        "samples_per_query value");
+  }
+}
+
+}  // namespace loadgen
+
+int TestSettings::FromConfig(const std::string &path, const std::string &model,
+                             const std::string &scenario, int conf_type) {
+  std::map<std::string, std::string> kv;
+  static int configCount = 0;
+
+  if (conf_type == 1) {
+    if (configCount == 0) {
+      // Only allow userConf as the single configFile and loadgen loads the
+      // mlperfConf automatically for perf and accuracy runs
+      FromConfig("", model, scenario, 0);
+    }
+
+    else {
+      LogDetail([](AsyncDetail &detail) {
+        std::stringstream ss;
+        ss << "Multiple conf files are used. This is not valid for official "
+              "submission.";
+        MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+      });
+    }
+    configCount++;
+  }
+
+  // lookup key/value pairs from config
+  auto lookupkv = [&](const std::string &model, const std::string &scenario,
+                      const std::string &key, uint64_t *val_l, double *val_d,
+                      double multiplier = 1.0) {
+    std::map<std::string, std::string>::iterator it;
+    std::string found;
+    // lookup exact key first
+    it = kv.find(model + "." + scenario + "." + key);
+    if (it != kv.end()) {
+      found = it->second;
+    } else {
+      // lookup key with model wildcard
+      it = kv.find("*." + scenario + "." + key);
+      if (it != kv.end()) {
+        found = it->second;
+      } else {
+        it = kv.find(model + ".*." + key);
+        if (it != kv.end()) {
+          found = it->second;
+        } else {
+          it = kv.find("*.*." + key);
+          if (it != kv.end()) {
+            found = it->second;
+          } else {
+            return false;
+          }
+        }
+      }
+    }
+    // if we get here, found will be set
+    if (val_l) {
+      *val_l = strtoull(found.c_str(), nullptr, 0) *
+               static_cast<uint64_t>(multiplier);
+    }
+    if (val_d) *val_d = strtod(found.c_str(), nullptr) * multiplier;
+    return true;
+  };
+
+  int line_nr = 0;
+  int errors = 0;
+  // Declare the input stream before the if-else block
+  std::unique_ptr<std::istream> fss;
+  std::string line;
+
+  if (conf_type != 0) {
+    // dirt simple config parser
+    fss = std::make_unique<std::ifstream>(path);
+    if (!static_cast<std::ifstream *>(fss.get())->is_open()) {
+      LogDetail([p = path](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+        std::stringstream ss;
+        ss << "can't open file " << p;
+        MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+#else
+        detail.Error("can't open file ", p);
+#endif
+      });
+      return -ENOENT;
+    }
+  } else {
+    // Convert unsigned char array to std::string
+    std::string config_str(mlperf_conf);
+    fss = std::make_unique<std::istringstream>(config_str);
+  }
+  while (std::getline(*fss, line)) {
+    line_nr++;
+    std::istringstream iss(line);
+    std::string s, k;
+    int looking_for = 0;  // 0=key, 1=equal, 2=value
+    while (iss >> s) {
+      if (s == "#" && looking_for != 2) {
+        // done with this line
+        break;
+      }
+      if (looking_for == 2) {
+        // got key and value
+        const char *start = s.c_str();
+        char *stop;
+        (void)strtoul(start, &stop, 0);
+        if (start + s.size() == stop) {
+          kv[k] = s;
+          continue;
+        }
+        (void)strtod(start, &stop);
+        if (start + s.size() == stop) {
+          kv[k] = s;
+          continue;
+        }
+        errors++;
+        LogDetail([l = line_nr](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "value needs to be integer or double, line=" << l;
+          MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+#else
+          detail.Error("value needs to be integer or double, line=", l);
+#endif
+        });
+        break;
+      }
+      if (looking_for == 1 && s != "=") {
+        errors++;
+        LogDetail([l = line_nr](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "expected 'key=value', line=" << l;
+          MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+#else
+          detail.Error("expected 'key=value', line=", l);
+#endif
+        });
+        break;
+      }
+      if (looking_for == 0) k = s;
+      looking_for++;
+    }
+  }
+  if (errors != 0) return -EINVAL;
+
+  uint64_t val;
+
+  // keys that apply to all scenarios
+  if (lookupkv(model, scenario, "mode", &val, nullptr)) {
+    switch (val) {
+      case 0:
+        mode = TestMode::SubmissionRun;
+        break;
+      case 1:
+        mode = TestMode::AccuracyOnly;
+        break;
+      case 2:
+        mode = TestMode::PerformanceOnly;
+        break;
+      case 3:
+        mode = TestMode::FindPeakPerformance;
+        break;
+      default:
+        LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+          std::stringstream ss;
+          ss << "Invalid value passed to Mode key in config.";
+          MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
+#else
+          detail.Error("Invalid value passed to Mode key in config.");
+#endif
+        });
+        break;
+    }
+  }
+
+  if (conf_type == 0) {
+    lookupkv(model, scenario, "qsl_rng_seed", &qsl_rng_seed, nullptr);
+    lookupkv(model, scenario, "sample_index_rng_seed", &sample_index_rng_seed,
+             nullptr);
+    lookupkv(model, scenario, "schedule_rng_seed", &schedule_rng_seed, nullptr);
+    lookupkv(model, scenario, "accuracy_log_probability", nullptr,
+             &accuracy_log_probability, 0.01);
+    if (lookupkv(model, scenario, "test05", &val, nullptr))
+      test05 = (val == 1) ? true : false;
+    lookupkv(model, scenario, "test05_qsl_rng_seed", &test05_qsl_rng_seed,
+             nullptr);
+    lookupkv(model, scenario, "test05_sample_index_rng_seed",
+             &test05_sample_index_rng_seed, nullptr);
+    lookupkv(model, scenario, "test05_schedule_rng_seed",
+             &test05_schedule_rng_seed, nullptr);
+  }
+
+  // keys that can be overriden in user.conf but will make the results eligible
+  // only for open submissions
+
+  // keys to measure token metrics
+  if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)) {
+    use_token_latencies = (val == 1) ? true : false;
+  }
+  if (use_token_latencies) {
+    lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr,
+             1000 * 1000);
+    lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr,
+             1000 * 1000);
+  }
+
+  // keys to infer token metrics
+  if (lookupkv(model, scenario, "infer_token_latencies", &val, nullptr)) {
+    infer_token_latencies = (val == 1) ? true : false;
+  }
+  if (infer_token_latencies) {
+    lookupkv(model, scenario, "token_latency_scaling_factor",
+             &token_latency_scaling_factor, nullptr, 1);
+  }
+  // keys that apply to SingleStream
+  lookupkv(model, "SingleStream", "target_latency_percentile", nullptr,
+           &single_stream_target_latency_percentile, 0.01);
+
+  // keys that apply to MultiStream
+  lookupkv(model, "MultiStream", "target_latency_percentile", nullptr,
+           &multi_stream_target_latency_percentile, 0.01);
+  lookupkv(model, "MultiStream", "samples_per_query",
+           &multi_stream_samples_per_query, nullptr, 1);
+
+  // keys that apply to Server
+  lookupkv(model, "Server", "target_latency_percentile", nullptr,
+           &server_target_latency_percentile, 0.01);
+  lookupkv(model, "Server", "target_latency", &server_target_latency_ns,
+           nullptr, 1000 * 1000);
+
+  // keys that can be overriden in user.conf (the provided values still need to
+  // pass the submission checker rules)
+  if (lookupkv(model, scenario, "performance_issue_unique", &val, nullptr))
+    performance_issue_unique = (val == 0) ? false : true;
+  if (lookupkv(model, scenario, "performance_issue_same", &val, nullptr))
+    performance_issue_same = (val == 0) ? false : true;
+  lookupkv(model, scenario, "performance_issue_same_index",
+           &performance_issue_same_index, nullptr);
+
+  if (lookupkv(model, scenario, "sample_concatenate_permutation", &val,
+               nullptr))
+    sample_concatenate_permutation = (val == 1) ? true : false;
+  if (lookupkv(model, "Server", "coalesce_queries", &val, nullptr))
+    server_coalesce_queries = (val == 0) ? false : true;
+  if (lookupkv(model, "Server", "max_async_queries", &val, nullptr))
+    server_max_async_queries = int(val);
+
+  lookupkv(model, scenario, "min_duration", &min_duration_ms, nullptr);
+  lookupkv(model, scenario, "max_duration", &max_duration_ms, nullptr);
+  lookupkv(model, scenario, "min_query_count", &min_query_count, nullptr);
+  lookupkv(model, scenario, "max_query_count", &max_query_count, nullptr);
+  lookupkv(model, scenario, "performance_sample_count_override",
+           &performance_sample_count_override, nullptr);
+  lookupkv(model, "SingleStream", "target_latency", nullptr,
+           &single_stream_expected_latency_ns, 1000 * 1000);
+  lookupkv(model, "MultiStream", "target_latency", nullptr,
+           &multi_stream_expected_latency_ns, 1000 * 1000);
+  lookupkv(model, "Server", "target_qps", nullptr, &server_target_qps);
+  lookupkv(model, "Offline", "target_qps", 0, &offline_expected_qps);
+
+  if (lookupkv(model, scenario, "print_timestamps", &val, nullptr))
+    print_timestamps = (val == 0) ? false : true;
+
+  // keys that are used in audit.conf
+  lookupkv(model, scenario, "accuracy_log_rng_seed", &accuracy_log_rng_seed,
+           nullptr);
+  lookupkv(model, scenario, "accuracy_log_sampling_target",
+           &accuracy_log_sampling_target, nullptr);
+  return 0;
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h
new file mode 100644
index 000000000..ab2773bd1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h
@@ -0,0 +1,182 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief The internal representation of user-provided settings.
+
+#ifndef MLPERF_LOADGEN_TEST_SETTINGS_INTERNAL_H
+#define MLPERF_LOADGEN_TEST_SETTINGS_INTERNAL_H
+
+#include <chrono>
+#include <cmath>
+#include <string>
+
+#include "logging.h"
+#include "test_settings.h"
+
+namespace mlperf {
+
+namespace logging {
+class AsyncSummary;
+}
+
+namespace loadgen {
+
+using AsyncSummary = logging::AsyncSummary;
+
+std::string ToString(TestScenario scenario);
+std::string ToString(TestMode mode);
+
+/// \brief takes the user-friendly TestSettings and normalizes it
+/// for consumption by the loadgen.
+/// \details It does things like remove scenario-specific naming and introduce
+/// the concept of target_duration used to pre-generate queries.
+struct TestSettingsInternal {
+  explicit TestSettingsInternal(const TestSettings &requested_settings,
+                                size_t qsl_performance_sample_count);
+  void LogEffectiveSettings() const;
+  void LogAllSettings() const;
+  void LogSummary(AsyncSummary &summary) const;
+
+  const TestSettings requested;
+  const TestScenario scenario;  // Copied here for convenience.
+  const TestMode mode;          // Copied here for convenience.
+
+  uint64_t samples_per_query;
+  double target_qps;
+  std::chrono::nanoseconds target_latency{0};
+  double target_latency_percentile;  // Single, multistream, and server modes.
+  uint64_t max_async_queries;
+
+  // Target duration is used to generate queries of a minimum duration before
+  // the test run.
+  std::chrono::milliseconds target_duration{0};
+
+  // Min duration/query_count/sample_count are used to validate the test
+  // duration at the end of the run.
+  std::chrono::milliseconds min_duration{0};
+  std::chrono::milliseconds max_duration{0};
+  uint64_t min_query_count;
+  uint64_t max_query_count;
+  uint64_t min_sample_count;  // Offline only.
+
+  uint64_t qsl_rng_seed;
+  uint64_t sample_index_rng_seed;
+  uint64_t schedule_rng_seed;
+  uint64_t accuracy_log_rng_seed;
+  double accuracy_log_probability;
+  uint64_t accuracy_log_sampling_target;
+  bool print_timestamps;
+  bool performance_issue_unique;
+  bool performance_issue_same;
+  uint64_t performance_issue_same_index;
+  uint64_t performance_sample_count;
+
+  bool sample_concatenate_permutation;
+  bool use_token_latencies = false;
+  int64_t server_ttft_latency;
+  int64_t server_tpot_latency;
+
+  bool infer_token_latencies = false;
+  int64_t token_latency_scaling_factor;
+};
+
+/// \brief A namespace of collections of FindPeakPerformance helper functions,
+/// mainly about binary search.
+namespace find_peak_performance {
+
+constexpr char const *kNotSupportedMsg =
+    "Finding peak performance is only supported in Server scenarios.";
+
+template <TestScenario scenario>
+TestSettingsInternal MidOfBoundaries(
+    const TestSettingsInternal &lower_bound_settings,
+    const TestSettingsInternal &upper_bound_settings) {
+  TestSettingsInternal mid_settings = lower_bound_settings;
+  if (scenario == TestScenario::Server) {
+    assert(lower_bound_settings.target_qps < upper_bound_settings.target_qps);
+    mid_settings.target_qps =
+        lower_bound_settings.target_qps +
+        (upper_bound_settings.target_qps - lower_bound_settings.target_qps) / 2;
+  } else {
+    LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
+#else
+      detail(kNotSupportedMsg);
+#endif
+    });
+  }
+  return mid_settings;
+}
+
+template <TestScenario scenario>
+bool IsFinished(const TestSettingsInternal &lower_bound_settings,
+                const TestSettingsInternal &upper_bound_settings) {
+  if (scenario == TestScenario::Server) {
+    uint8_t precision = lower_bound_settings.requested
+                            .server_find_peak_qps_decimals_of_precision;
+    double l =
+        std::floor(lower_bound_settings.target_qps * std::pow(10, precision));
+    double u =
+        std::floor(upper_bound_settings.target_qps * std::pow(10, precision));
+    return l + 1 >= u;
+  } else {
+    LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
+#else
+      detail(kNotSupportedMsg);
+#endif
+    });
+    return true;
+  }
+}
+
+template <TestScenario scenario>
+std::string ToStringPerformanceField(const TestSettingsInternal &settings) {
+  if (scenario == TestScenario::Server) {
+    return std::to_string(settings.target_qps);
+  } else {
+    LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
+#else
+      detail(kNotSupportedMsg);
+#endif
+    });
+    return ToString(settings.scenario);
+  }
+}
+
+template <TestScenario scenario>
+void WidenPerformanceField(TestSettingsInternal *settings) {
+  if (scenario == TestScenario::Server) {
+    settings->target_qps =
+        settings->target_qps *
+        (1 + settings->requested.server_find_peak_qps_boundary_step_size);
+  } else {
+    LogDetail([](AsyncDetail &detail) {
+#if USE_NEW_LOGGING_FORMAT
+      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
+#else
+      detail(kNotSupportedMsg);
+#endif
+    });
+  }
+}
+
+}  // namespace find_peak_performance
+}  // namespace loadgen
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_TEST_SETTINGS_INTERNAL_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn
new file mode 100644
index 000000000..d73bf831a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn
@@ -0,0 +1,25 @@
+static_library("mlperf_loadgen_tests_loadgen_test_main") {
+  sources = [ "loadgen_test.h", "loadgen_test_main.cc" ]
+  configs += [ "//build/config/compiler:exceptions" ]
+}
+
+executable("mlperf_loadgen_perftests") {
+  sources = [ "perftests_null_sut.cc" ]
+  deps = [ "..:mlperf_loadgen" ]
+}
+
+executable("mlperf_loadgen_tests_basic") {
+  sources = [ "basic.cc" ]
+  deps = [ "..:mlperf_loadgen",
+           ":mlperf_loadgen_tests_loadgen_test_main"  ]
+  configs += [ "//build/config/compiler:exceptions" ]
+}
+
+source_set("mlperf_loadgen_perftests_py") {
+  sources = [ "perftests_null_sut.py" ]
+  deps = [ "../..:loadgen_pymodule_wheel_lib" ]
+}
+
+source_set("docs") {
+  sources = [ "README.md" ]
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md
new file mode 100644
index 000000000..41056b457
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md
@@ -0,0 +1,42 @@
+# Building and Running the Tests {#ReadmeTests}
+
+The unit and performance tests are only supported via gn/ninja at the moment.
+
+See the [top-level build readme](@ref ReadmeBuild) for details but, from a clean checkout, you must first run:
+
+    make bootstrap_gn_ninja
+    third_party/gn/gn gen out/Release --args="is_debug=false"
+
+This will build the gn and ninja build tools and create a release project.
+
+## Unit Tests
+
+To build:
+
+    third_party/ninja/ninja -C out/Release mlperf_loadgen_tests_basic
+
+To run all tests:
+
+    out/Release/mlperf_loadgen_tests_basic .
+
+To run specific tests:
+
+    out/Release/mlperf_loadgen_tests_basic <regex>
+    e.g.:
+    out/Release/mlperf_loadgen_tests_basic SingleStream
+
+## Performance Tests
+
+To build:
+
+    third_party/ninja/ninja -C out/Release mlperf_loadgen_perftests
+
+To run all tests:
+
+    out/Release/mlperf_loadgen_perftests .
+
+To run specific tests:
+
+    out/Release/mlperf_loadgen_perftests <regex>
+    e.g.:
+    out/Release/mlperf_loadgen_tests_basic ServerPool
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc
new file mode 100644
index 000000000..97c6a0bb1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc
@@ -0,0 +1,314 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Basic functionality unit tests.
+
+#include <algorithm>
+#include <deque>
+#include <future>
+#include <iostream>
+#include <queue>
+#include <unordered_set>
+#include <vector>
+
+#include "../loadgen.h"
+#include "../query_sample_library.h"
+#include "../system_under_test.h"
+#include "../test_settings.h"
+#include "loadgen_test.h"
+
+/// \brief Correctness unit tests.
+namespace unit_tests {
+
+/// \defgroup LoadgenTestsBasic Test Coverage: Basic
+
+/// \brief Implements the client interfaces of the loadgen and
+/// has some basic sanity checks that are enabled for all tests.
+/// \details It also forwards calls to overrideable *Ext methods and implements
+/// the TestProxy concept.
+struct SystemUnderTestBasic : public mlperf::QuerySampleLibrary,
+                              public mlperf::SystemUnderTest {
+  const std::string& Name() const override { return name_; }
+
+  size_t TotalSampleCount() override { return total_sample_count_; }
+  size_t PerformanceSampleCount() override { return performance_sample_count_; }
+
+  void LoadSamplesToRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    for (auto s : samples) {
+      samples_load_count_.at(s)++;
+      loaded_samples_.push_back(s);
+    }
+    LoadSamplesToRamExt(samples);
+  }
+  virtual void LoadSamplesToRamExt(
+      const std::vector<mlperf::QuerySampleIndex>& samples) {}
+
+  void UnloadSamplesFromRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    for (auto s : samples) {
+      FAIL_IF(loaded_samples_.front() != s) &&
+          FAIL_EXP(loaded_samples_.front()) && FAIL_EXP(s);
+      loaded_samples_.pop_front();
+      size_t prev_load_count = samples_load_count_.at(s)--;
+      FAIL_IF(prev_load_count == 0) && FAIL_EXP(prev_load_count);
+    }
+    UnloadSamplesFromRamExt(samples);
+  }
+  virtual void UnloadSamplesFromRamExt(
+      const std::vector<mlperf::QuerySampleIndex>& samples) {}
+
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    std::vector<mlperf::QuerySampleResponse> responses;
+    query_sizes_.push_back(samples.size());
+    samples_between_flushes_.back() += samples.size();
+    responses.reserve(samples.size());
+    for (auto s : samples) {
+      FAIL_IF(samples_load_count_.at(s.index) == 0) &&
+          FAIL_MSG("Issued unloaded sample:") && FAIL_EXP(s.index);
+      samples_issue_count_.at(s.index)++;
+      issued_samples_.push_back(s.index);
+      responses.push_back({s.id, 0, 0});
+    }
+    mlperf::QuerySamplesComplete(responses.data(), responses.size());
+    IssueQueryExt(samples);
+  }
+  virtual void IssueQueryExt(const std::vector<mlperf::QuerySample>& samples) {}
+
+  void FlushQueries() override {
+    samples_between_flushes_.push_back(0);
+    FlushQueriesExt();
+  }
+  virtual void FlushQueriesExt() {}
+
+  virtual void RunTest() {
+    samples_load_count_.resize(total_sample_count_, 0);
+    samples_issue_count_.resize(total_sample_count_, 0);
+    samples_between_flushes_.resize(1, 0);
+    mlperf::StartTest(this, this, test_settings_, log_settings_);
+  }
+
+  virtual void EndTest() {}
+
+ protected:
+  mlperf::TestSettings test_settings_;
+  mlperf::LogSettings log_settings_;
+
+  std::string name_{"BasicSUT"};
+  size_t total_sample_count_;
+  size_t performance_sample_count_;
+  std::vector<mlperf::QuerySampleIndex> issued_samples_;
+  std::deque<mlperf::QuerySampleIndex> loaded_samples_;
+  std::vector<size_t> samples_load_count_;
+  std::vector<size_t> samples_issue_count_;
+
+  std::vector<size_t> query_sizes_;
+  std::vector<size_t> samples_between_flushes_;
+};
+
+/// \brief Provides common test set up logic.
+struct SystemUnderTestAccuracy : public SystemUnderTestBasic {
+  virtual void SetUpTest(size_t samples_per_query,
+                         size_t samples_per_query_remainder,
+                         size_t accuracy_remainder,
+                         mlperf::TestScenario scenario) {
+    performance_sample_count_ =
+        samples_per_query * 16 + samples_per_query_remainder;
+    total_sample_count_ = performance_sample_count_ * 32 + accuracy_remainder;
+
+    log_settings_.log_output.prefix_with_datetime = false;
+
+    test_settings_.scenario = scenario;
+    test_settings_.mode = mlperf::TestMode::AccuracyOnly;
+    test_settings_.multi_stream_samples_per_query = samples_per_query;
+
+    double qps = 1e3;
+    test_settings_.server_target_qps = qps;
+  }
+};
+
+/// \brief Verifies all samples from the QSL are included at least once
+/// in accuracy mode.
+/// \ingroup LoadgenTestsBasic
+struct TestAccuracyIncludesAllSamples : public SystemUnderTestAccuracy {
+  void EndTest() override {
+    std::sort(issued_samples_.begin(), issued_samples_.end());
+
+    FAIL_IF(issued_samples_.size() < total_sample_count_) &&
+        FAIL_EXP(issued_samples_.size()) && FAIL_EXP(total_sample_count_);
+    FAIL_IF(issued_samples_.front() != 0) && FAIL_EXP(issued_samples_.front());
+    FAIL_IF(issued_samples_.back() != total_sample_count_ - 1) &&
+        FAIL_EXP(issued_samples_.back()) && FAIL_EXP(total_sample_count_);
+
+    mlperf::QuerySampleIndex prev = -1;
+    size_t discontinuities = 0;
+    size_t dupes = 0;
+    for (auto s : issued_samples_) {
+      if (s == prev) {
+        dupes++;
+      } else if (s - prev > 1) {
+        discontinuities++;
+      }
+      prev = s;
+    }
+
+    FAIL_IF(discontinuities != 0) && FAIL_EXP(discontinuities);
+    FAIL_IF(dupes != 0) && FAIL_EXP(dupes);
+  }
+};
+
+REGISTER_TEST_ALL_SCENARIOS(AccuracyIncludesAllSamples,
+                            TestProxy<TestAccuracyIncludesAllSamples>(), 4, 0,
+                            0);
+
+/// \brief Verifies samples from the QSL aren't included too many times.
+/// \details This is a regression test for:
+/// https://github.com/mlperf/inference/pull/386
+/// The root cause was using different values for samples_per_query while
+/// generating queries for the GNMT dataset.
+/// \ingroup LoadgenTestsBasic
+struct TestAccuracyDupesAreLimitted : public SystemUnderTestAccuracy {
+  void SetUpTest(bool, mlperf::TestScenario scenario) {
+    SystemUnderTestAccuracy::SetUpTest(4, 0, 0, scenario);
+    total_sample_count_ = 3003;
+    performance_sample_count_ = 1001;
+  }
+
+  void EndTest() override {
+    std::sort(issued_samples_.begin(), issued_samples_.end());
+
+    FAIL_IF(issued_samples_.size() < total_sample_count_) &&
+        FAIL_EXP(issued_samples_.size()) && FAIL_EXP(total_sample_count_);
+    FAIL_IF(issued_samples_.front() != 0) && FAIL_EXP(issued_samples_.front());
+    FAIL_IF(issued_samples_.back() != total_sample_count_ - 1) &&
+        FAIL_EXP(issued_samples_.back()) && FAIL_EXP(total_sample_count_);
+
+    std::vector<size_t> issue_counts(total_sample_count_, 0);
+    for (auto s : issued_samples_) {
+      issue_counts.at(s)++;
+    }
+
+    const size_t max_count = 1;
+    for (size_t i = 0; i < issue_counts.size(); i++) {
+      FAIL_IF(issue_counts[i] > max_count) && FAIL_EXP(i) &&
+          FAIL_EXP(max_count) && FAIL_EXP(issue_counts[i]);
+    }
+  }
+};
+
+REGISTER_TEST_ALL_SCENARIOS(TestAccuracyDupesAreLimitted,
+                            TestProxy<TestAccuracyDupesAreLimitted>(), true);
+
+/// \brief Verifies offline + accuracy doesn't hang if the last set
+/// in the accuracy series is smaller than others.
+/// \ingroup LoadgenTestsBasic
+struct TestOfflineRemainderAccuracySet : public SystemUnderTestAccuracy {
+  void SetUpTest() {
+    SystemUnderTestAccuracy::SetUpTest(4, 0, 7, mlperf::TestScenario::Offline);
+  }
+
+  void EndTest() override {
+    auto& flush_samples = samples_between_flushes_;
+
+    FAIL_IF(flush_samples.size() < 3) && FAIL_EXP(flush_samples.size()) &&
+        BAD_TEST_MSG("Test should generate multiple query sets.") && ABORT_TEST;
+
+    // The last counter will be 0, since a test ends with a call to
+    // FlushQuery.
+    FAIL_IF(flush_samples.back() != 0) && FAIL_EXP(flush_samples.back()) &&
+        FAIL_MSG(
+            "Detected stray calls to IssueQuery after the last call to "
+            "FlushQuery.");
+    flush_samples.pop_back();
+
+    // Verify the test ran with a smaller last accuracy set.
+    size_t first_size = flush_samples.front();
+    size_t last_size = flush_samples.back();
+    FAIL_IF(first_size <= last_size) && FAIL_EXP(first_size) &&
+        FAIL_EXP(last_size) && BAD_TEST_MSG();
+
+    flush_samples.pop_back();  // Don't check the last set for equality.
+    for (size_t query_size : flush_samples) {
+      FAIL_IF(query_size != first_size) && FAIL_EXP(query_size) &&
+          FAIL_EXP(first_size);
+    }
+  }
+};
+
+REGISTER_TEST(Offline_RemainderAccuracySets,
+              TestProxy<TestOfflineRemainderAccuracySet>());
+
+/// \brief Verifies all queries only contain samples that are contiguous,
+/// even if the set size is not a multiple of samples_per_query.
+/// \ingroup LoadgenTestsBasic
+struct TestMultiStreamContiguousRemainderQuery
+    : public SystemUnderTestAccuracy {
+  void SetUpTest(mlperf::TestScenario scenario) {
+    SystemUnderTestAccuracy::SetUpTest(4, 1, 0, scenario);
+    first_qsl_offsets_.resize(total_sample_count_, kBadQslOffset);
+
+    auto spq = test_settings_.multi_stream_samples_per_query;
+    FAIL_IF(performance_sample_count_ % spq == 0) &&
+        FAIL_EXP(performance_sample_count_) && FAIL_EXP(spq) &&
+        BAD_TEST_MSG("There is no remainder.");
+  }
+
+  void LoadSamplesToRamExt(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    FAIL_IF(loaded_samples_.size() != samples.size()) &&
+        FAIL_MSG("Contiguous sample order is likely ambiguous.");
+    for (size_t i = 0; i < samples.size(); i++) {
+      auto& offset = first_qsl_offsets_.at(samples.at(i));
+      // Samples may be loaded into multiple slots for padding purposes,
+      // so make sure to only index the first time a sample appears in a
+      // loaded set.
+      if (offset == kBadQslOffset) {
+        offset = i;
+      }
+    }
+  }
+
+  void UnloadSamplesFromRamExt(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    FAIL_IF(!loaded_samples_.empty()) &&
+        FAIL_MSG("Contiguous sample order is likely ambiguous.");
+    for (size_t i = 0; i < samples.size(); i++) {
+      first_qsl_offsets_.at(samples.at(i)) = kBadQslOffset;
+    }
+  }
+
+  void IssueQueryExt(const std::vector<mlperf::QuerySample>& samples) override {
+    size_t expected_offset = first_qsl_offsets_[samples[0].index];
+    for (auto s : samples) {
+      FAIL_IF(loaded_samples_[expected_offset] != s.index) &&
+          FAIL_MSG("Samples are not contiguous.");
+      expected_offset++;
+    }
+  }
+
+  void FlushQueriesExt() override {}
+
+  void EndTest() override {}
+
+ private:
+  static const size_t kBadQslOffset;
+  std::vector<size_t> first_qsl_offsets_;
+};
+
+constexpr size_t TestMultiStreamContiguousRemainderQuery::kBadQslOffset =
+    std::numeric_limits<size_t>::max();
+
+REGISTER_TEST(MultiStream_RemainderQueryContiguous,
+              TestProxy<TestMultiStreamContiguousRemainderQuery>(),
+              mlperf::TestScenario::MultiStream);
+}  // namespace unit_tests
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h
new file mode 100644
index 000000000..777029b99
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h
@@ -0,0 +1,198 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief A minimal test framework.
+
+#ifndef MLPERF_LOADGEN_TESTS_LOADGEN_TEST_H_
+#define MLPERF_LOADGEN_TESTS_LOADGEN_TEST_H_
+
+#include <algorithm>
+#include <exception>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <vector>
+
+#define REGISTER_TEST(name, ...) \
+  static Test::StaticRegistrant test##name(#name, __VA_ARGS__);
+
+#define REGISTER_TEST_SCENARIO(name, scenario, test, ...) \
+  static Test::StaticRegistrant t##name##scenario(        \
+      #name "_" #scenario, test, __VA_ARGS__, mlperf::TestScenario::scenario)
+
+#define REGISTER_TEST_ALL_SCENARIOS(name, test, ...)             \
+  REGISTER_TEST_SCENARIO(name, SingleStream, test, __VA_ARGS__); \
+  REGISTER_TEST_SCENARIO(name, MultiStream, test, __VA_ARGS__);  \
+  REGISTER_TEST_SCENARIO(name, Server, test, __VA_ARGS__);       \
+  REGISTER_TEST_SCENARIO(name, Offline, test, __VA_ARGS__);
+
+#define FAIL_IF(exp)                                              \
+  [&]() {                                                         \
+    const bool v = exp;                                           \
+    if (v) {                                                      \
+      std::cerr << "\n   ERROR: (" << __FILE__ << "@" << __LINE__ \
+                << ") : " #exp;                                   \
+      Test::AddFailure();                                         \
+    }                                                             \
+    return v;                                                     \
+  }()
+
+#define FAIL_MSG(...)                                                      \
+  [&]() {                                                                  \
+    std::cerr << "\n    Info: (" << __FILE__ << "@" << __LINE__ << ") : "; \
+    Test::Log(__VA_ARGS__);                                                \
+    return true;                                                           \
+  }()
+
+#define FAIL_EXP(exp)                                                      \
+  [&]() {                                                                  \
+    std::cerr << "\n    Info: (" << __FILE__ << "@" << __LINE__ << ") : "; \
+    std::cerr << #exp << " is " << (exp);                                  \
+    return true;                                                           \
+  }()
+
+#define BAD_TEST_MSG(...)                                        \
+  [&]() {                                                        \
+    FAIL_MSG("The test isn't testing what it claims to test. "); \
+    Test::Log(__VA_ARGS__);                                      \
+    return true;                                                 \
+  }()
+
+#define ABORT_TEST                                     \
+  [&]() {                                              \
+    FAIL_MSG("ABORTING");                              \
+    throw std::logic_error("ABORT_TEST encountered."); \
+    return false;                                      \
+  }();
+
+/// \brief Testing utilities.
+namespace testing {
+
+/// \brief Wraps a test class as a functor for easy registration.
+/// Forwards registration args to a SetUpTest method.
+/// \details Calls SetUpTest, RunTest, and EndTest.
+template <typename TestT>
+struct TestProxy {
+  template <typename... Args>
+  void operator()(Args&&... args) {
+    TestT test;
+    test.SetUpTest(std::forward<Args>(args)...);
+    test.RunTest();
+    test.EndTest();
+  }
+};
+
+/// \brief A collection of methods for registering and running tests.
+class Test {
+  /// \brief Maps registered test names to a callback.
+  using TestMap = std::multimap<const char*, std::function<void()>>;
+
+  /// \brief The registered tests.
+  /// \details Wraps a static local to avoid undefined initialization order
+  /// and guarantee it is initialized before the first test registers itself.
+  static TestMap& tests() {
+    static TestMap tests_;
+    return tests_;
+  }
+
+  /// \brief The number of errors the current test has encountered.
+  static size_t& test_fails() {
+    static size_t test_fails_ = 0;
+    return test_fails_;
+  }
+
+ public:
+  /// \brief Registers a test before main() starts during static initialization.
+  struct StaticRegistrant {
+    template <typename... Args>
+    StaticRegistrant(Args&&... args) {
+      Test::Register(std::forward<Args>(args)...);
+    }
+  };
+
+  /// \brief Registers a test at runtime.
+  template <typename TestF, typename... Args>
+  static void Register(const char* name, TestF test, Args&&... args) {
+    std::function<void()> test_closure =
+        std::bind(test, std::forward<Args>(args)...);
+    tests().insert({std::move(name), std::move(test_closure)});
+  }
+
+  /// \brief Runs all currently registered tests that match the given filter.
+  static int Run(std::function<bool(const char*)> filter) {
+    // Determine which tests are enabled.
+    std::vector<TestMap::value_type*> enabled_tests;
+    for (auto& test : tests()) {
+      if (filter(test.first)) {
+        enabled_tests.push_back(&test);
+      }
+    }
+    const size_t enabled = enabled_tests.size();
+    std::cout << enabled << " of " << tests().size() << " tests enabled.\n";
+
+    // Run the tests.
+    std::vector<const char*> failures;
+    for (size_t i = 0; i < enabled; i++) {
+      const char* name = enabled_tests[i]->first;
+      std::cout << "[" << (i + 1) << "/" << enabled << "] : " << name << " : ";
+      std::cout.flush();
+      test_fails() = 0;
+      try {
+        enabled_tests[i]->second();  // Run the test.
+      } catch (std::exception& e) {
+        constexpr bool TestThrewException = true;
+        FAIL_IF(TestThrewException) && FAIL_EXP(e.what());
+      }
+      if (test_fails() > 0) {
+        failures.push_back(name);
+        std::cerr << "\n FAILED: " << name << "\n";
+      } else {
+        std::cout << "SUCCESS\n";
+      }
+    }
+
+    // Summarize.
+    if (enabled_tests.empty()) {
+      std::cerr << "Check your test filter.\n";
+    } else if (failures.empty()) {
+      std::cout << "All " << enabled << " tests passed! \\o/\n";
+    } else {
+      std::cout << failures.size() << " of " << enabled << " tests failed:\n";
+      for (auto failed_test_name : failures) {
+        std::cout << "  " << failed_test_name << "\n";
+      }
+    }
+    return failures.size();
+  }
+
+  /// \brief Used by test macros to flag test failure.
+  static void AddFailure() { test_fails()++; }
+
+  /// \brief Base case for the variadic version of Log.
+  static void Log() {}
+
+  /// \brief Used by test macros to log an arbitrary list of args.
+  template <typename T, typename... Args>
+  static void Log(T&& v, Args&&... args) {
+    std::cerr << v;
+    Log(std::forward<Args>(args)...);
+  }
+};
+
+}  // namespace testing
+
+// The testing namespace exists for documentation purposes.
+// Export the testing namespace for all files that define tests.
+using namespace testing;
+
+#endif  // MLPERF_LOADGEN_TESTS_LOADGEN_TEST_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc
new file mode 100644
index 000000000..3dc5afa80
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc
@@ -0,0 +1,33 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief A main entry point a test binary can use if it just wants to execute
+/// Test::Run on all statically registered tests.
+
+#include <regex>
+
+#include "loadgen_test.h"
+
+int main(int argc, char* argv[]) {
+  if (argc <= 1) {
+    std::cerr << "Usage: " << argv[0] << " <include_regex> <exclude_regex>\n";
+    return -1;
+  }
+  std::regex include_regex(argc >= 2 ? argv[1] : ".*");
+  std::regex exclude_regex(argc >= 3 ? std::regex(argv[2]) : std::regex());
+  auto test_filter = [&](const char* test_name) {
+    return (std::regex_search(test_name, include_regex) &&
+            !std::regex_search(test_name, exclude_regex));
+  };
+  return Test::Run(test_filter);
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc
new file mode 100644
index 000000000..56d562c3e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc
@@ -0,0 +1,230 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Performance tests using a null backend.
+
+#include <future>
+
+#include "../loadgen.h"
+#include "../query_sample_library.h"
+#include "../system_under_test.h"
+#include "../test_settings.h"
+
+/// \brief Performance unit tests.
+namespace perf_tests {
+
+/// \defgroup LoadgenTestsPerformance Test Coverage: Performance
+
+/// \brief A simple SUT implemenatation that immediately completes
+/// issued queries sychronously ASAP.
+class SystemUnderTestNull : public mlperf::SystemUnderTest {
+ public:
+  SystemUnderTestNull() = default;
+  ~SystemUnderTestNull() override = default;
+  const std::string& Name() override { return name_; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    std::vector<mlperf::QuerySampleResponse> responses;
+    responses.reserve(samples.size());
+    for (auto s : samples) {
+      responses.push_back({s.id, 0, 0});
+    }
+    mlperf::QuerySamplesComplete(responses.data(), responses.size());
+  }
+
+  void FlushQueries() override {}
+
+ private:
+  std::string name_{"NullSUT"};
+};
+
+/// \brief A stub implementation of QuerySampleLibrary.
+class QuerySampleLibraryNull : public mlperf::QuerySampleLibrary {
+ public:
+  QuerySampleLibraryNull() = default;
+  ~QuerySampleLibraryNull() = default;
+  const std::string& Name() const override { return name_; }
+
+  size_t TotalSampleCount() override { return 1024 * 1024; }
+
+  size_t PerformanceSampleCount() override { return 1024; }
+
+  void LoadSamplesToRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    return;
+  }
+
+  void UnloadSamplesFromRam(
+      const std::vector<mlperf::QuerySampleIndex>& samples) override {
+    return;
+  }
+
+ private:
+  std::string name_{"NullQSL"};
+};
+
+/// \brief Runs single stream traffic.
+/// \ingroup LoadgenTestsPerformance
+void TestSingleStream() {
+  SystemUnderTestNull null_sut;
+  QuerySampleLibraryNull null_qsl;
+
+  mlperf::LogSettings log_settings;
+  log_settings.log_output.prefix_with_datetime = true;
+
+  mlperf::TestSettings ts;
+
+  mlperf::StartTest(&null_sut, &null_qsl, ts, log_settings);
+}
+
+/// \brief A SUT implementation that completes queries asynchronously using
+/// std::async.
+class SystemUnderTestNullStdAsync : public mlperf::SystemUnderTest {
+ public:
+  SystemUnderTestNullStdAsync() { futures_.reserve(1000000); }
+  ~SystemUnderTestNullStdAsync() override = default;
+  const std::string& Name() const override { return name_; }
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    futures_.emplace_back(std::async(std::launch::async, [samples] {
+      std::vector<mlperf::QuerySampleResponse> responses;
+      responses.reserve(samples.size());
+      for (auto s : samples) {
+        responses.push_back({s.id, 0, 0});
+      }
+      mlperf::QuerySamplesComplete(responses.data(), responses.size());
+    }));
+  }
+
+  void FlushQueries() override {}
+
+ private:
+  std::string name_{"NullStdAsync"};
+  std::vector<std::future<void>> futures_;
+};
+
+/// \brief Tests server traffic using SystemUnderTestNullStdAsync.
+/// \ingroup LoadgenTestsPerformance
+void TestServerStdAsync() {
+  SystemUnderTestNullStdAsync null_std_async_sut;
+  QuerySampleLibraryNull null_qsl;
+
+  mlperf::LogSettings log_settings;
+  log_settings.log_output.prefix_with_datetime = true;
+  log_settings.log_output.copy_summary_to_stdout = true;
+
+  mlperf::TestSettings ts;
+  ts.scenario = mlperf::TestScenario::Server;
+  ts.server_target_qps = 2000000;
+  ts.min_duration_ms = 100;
+
+  mlperf::StartTest(&null_std_async_sut, &null_qsl, ts, log_settings);
+}
+
+/// \brief A SUT implementation that completes queries asynchronously using
+/// an explicitly managed thread pool.
+class SystemUnderTestNullPool : public mlperf::SystemUnderTest {
+ public:
+  SystemUnderTestNullPool() {
+    samples_.reserve(kReserveSampleSize);
+    next_poll_time_ = std::chrono::high_resolution_clock::now() + poll_period_;
+    for (size_t i = 0; i < thread_count_; i++) {
+      threads_.emplace_back(&SystemUnderTestNullPool::WorkerThread, this);
+    }
+  }
+
+  ~SystemUnderTestNullPool() override {
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      keep_workers_alive_ = false;
+    }
+    cv_.notify_all();
+    for (auto& thread : threads_) {
+      thread.join();
+    }
+  }
+
+  const std::string& Name() const override { return name_; }
+
+  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
+    std::unique_lock<std::mutex> lock(mutex_);
+    samples_.insert(samples_.end(), samples.begin(), samples.end());
+  }
+
+  void FlushQueries() override {}
+
+ private:
+  void WorkerThread() {
+    std::vector<mlperf::QuerySample> my_samples;
+    my_samples.reserve(kReserveSampleSize);
+    std::unique_lock<std::mutex> lock(mutex_);
+    while (keep_workers_alive_) {
+      next_poll_time_ += poll_period_;
+      auto my_wakeup_time = next_poll_time_;
+      cv_.wait_until(lock, my_wakeup_time,
+                     [&]() { return !keep_workers_alive_; });
+      my_samples.swap(samples_);
+      lock.unlock();
+
+      std::vector<mlperf::QuerySampleResponse> responses;
+      responses.reserve(my_samples.size());
+      for (auto s : my_samples) {
+        responses.push_back({s.id, 0, 0});
+      }
+      mlperf::QuerySamplesComplete(responses.data(), responses.size());
+
+      lock.lock();
+      my_samples.clear();
+    }
+  }
+
+  static constexpr size_t kReserveSampleSize = 1024 * 1024;
+  const std::string name_{"NullPool"};
+  const size_t thread_count_ = 4;
+  const std::chrono::milliseconds poll_period_{1};
+  std::chrono::high_resolution_clock::time_point next_poll_time_;
+
+  std::mutex mutex_;
+  std::condition_variable cv_;
+  bool keep_workers_alive_ = true;
+  std::vector<std::thread> threads_;
+
+  std::vector<mlperf::QuerySample> samples_;
+};
+
+/// \brief Tests server traffic using SystemUnderTestNullPool.
+/// \ingroup LoadgenTestsPerformance
+void TestServerPool() {
+  SystemUnderTestNullPool null_pool;
+  QuerySampleLibraryNull null_qsl;
+
+  mlperf::LogSettings log_settings;
+  log_settings.log_output.prefix_with_datetime = true;
+  log_settings.log_output.copy_summary_to_stdout = true;
+
+  mlperf::TestSettings ts;
+  ts.scenario = mlperf::TestScenario::Server;
+  ts.server_target_qps = 2000000;
+  ts.min_duration_ms = 100;
+
+  mlperf::StartTest(&null_pool, &null_qsl, ts, log_settings);
+}
+
+/// @}
+
+}  // namespace perf_tests
+
+int main(int argc, char* argv[]) {
+  perf_tests::TestSingleStream();
+  perf_tests::TestServerStdAsync();
+  perf_tests::TestServerPool();
+  return 0;
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py
new file mode 100644
index 000000000..115372e18
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py
@@ -0,0 +1,61 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+"""Python version of perftests_null_sut.cc.
+"""
+
+from __future__ import print_function
+from absl import app
+import mlperf_loadgen
+
+
+def load_samples_to_ram(query_samples):
+    del query_samples
+    return
+
+
+def unload_samples_from_ram(query_samples):
+    del query_samples
+    return
+
+
+def issue_query(query_samples):
+    responses = []
+    for s in query_samples:
+        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
+    mlperf_loadgen.QuerySamplesComplete(responses)
+
+
+def flush_queries():
+    pass
+
+
+def main(argv):
+    del argv
+    settings = mlperf_loadgen.TestSettings()
+    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
+    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
+
+    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
+    qsl = mlperf_loadgen.ConstructQSL(
+        1024 * 1024, 1024, load_samples_to_ram, unload_samples_from_ram
+    )
+    mlperf_loadgen.StartTest(sut, qsl, settings)
+    mlperf_loadgen.DestroyQSL(qsl)
+    mlperf_loadgen.DestroySUT(sut)
+
+
+if __name__ == "__main__":
+    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb
new file mode 100644
index 000000000..ab834d17a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb
@@ -0,0 +1,441 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tool to extract usefull information from mlperf trace"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Figure size 3600x2400 with 0 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "%matplotlib inline\n",
+    "# Ignore warnings\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "import json\n",
+    "import os\n",
+    "import seaborn as sns\n",
+    "from operator import itemgetter\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "figsize=(10, 5)\n",
+    "font=10\n",
+    "\n",
+    "plt.figure(dpi=600)\n",
+    "plt.rc('xtick', labelsize=font) \n",
+    "plt.rc('font', size=font)\n",
+    "sns.set(font_scale=1.4, style=\"whitegrid\");"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def trace_to_df(fname):\n",
+    "    with open(fname, \"r\") as f:\n",
+    "        j = json.load(f)\n",
+    "    if type(j) == dict:\n",
+    "        j = j['traceEvents']\n",
+    "        \n",
+    "    result = []\n",
+    "    for item in j:\n",
+    "        name = item['name']\n",
+    "        if name not in [\"Latency\", \"Sample\", \"QuerySamplesComplete\", \"IssueQuery\"]:\n",
+    "            continue\n",
+    "\n",
+    "        args = item.get('args')\n",
+    "        d = {\"ts\": item['ts'], \"name\": name, \"dur\": item.get(\"dur\")}\n",
+    "\n",
+    "        if name == \"Latency\":\n",
+    "            d[\"issue_delay\"] = args[\"issue_delay\"]\n",
+    "            d[\"issue_to_done\"] = args[\"issue_to_done\"] / 1e3\n",
+    "            result.append(d)\n",
+    "        elif name == \"Sample\":\n",
+    "            if args:\n",
+    "                d[\"issue_start_ns\"] = args[\"issue_start_ns\"]\n",
+    "                d[\"complete_ns\"] = args[\"complete_ns\"]\n",
+    "                d[\"issue_to_done\"] = (args[\"complete_ns\"] - args[\"issue_start_ns\"]) / 1e3\n",
+    "                result.append(d)\n",
+    "        elif name == \"QuerySamplesComplete\":\n",
+    "            result.append(d)\n",
+    "        elif name == \"IssueQuery\":\n",
+    "            result.append(d)\n",
+    "\n",
+    "    df = pd.DataFrame(result)\n",
+    "    df = df.sort_values(by=[\"ts\"])\n",
+    "    return df\n",
+    "\n",
+    "BINS = 10"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ts</th>\n",
+       "      <th>dur</th>\n",
+       "      <th>issue_delay</th>\n",
+       "      <th>issue_to_done</th>\n",
+       "      <th>issue_start_ns</th>\n",
+       "      <th>complete_ns</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>2.000000e+04</td>\n",
+       "      <td>10000.000000</td>\n",
+       "      <td>5.000000e+03</td>\n",
+       "      <td>10000.000000</td>\n",
+       "      <td>5.000000e+03</td>\n",
+       "      <td>5.000000e+03</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>4.894584e+07</td>\n",
+       "      <td>17.731682</td>\n",
+       "      <td>7.001508e+04</td>\n",
+       "      <td>6112.554491</td>\n",
+       "      <td>7.001508e+04</td>\n",
+       "      <td>6.182570e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>2.839099e+07</td>\n",
+       "      <td>25.578639</td>\n",
+       "      <td>9.666462e+04</td>\n",
+       "      <td>2254.077235</td>\n",
+       "      <td>9.666462e+04</td>\n",
+       "      <td>2.263719e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>4.102560e+03</td>\n",
+       "      <td>1.152000</td>\n",
+       "      <td>8.810000e+02</td>\n",
+       "      <td>2754.967000</td>\n",
+       "      <td>8.810000e+02</td>\n",
+       "      <td>2.780383e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>2.463025e+07</td>\n",
+       "      <td>3.974750</td>\n",
+       "      <td>5.806250e+04</td>\n",
+       "      <td>4100.473000</td>\n",
+       "      <td>5.806250e+04</td>\n",
+       "      <td>4.166623e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>4.881766e+07</td>\n",
+       "      <td>7.364000</td>\n",
+       "      <td>6.159800e+04</td>\n",
+       "      <td>6089.880000</td>\n",
+       "      <td>6.159800e+04</td>\n",
+       "      <td>6.155939e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>7.373552e+07</td>\n",
+       "      <td>27.441000</td>\n",
+       "      <td>6.835175e+04</td>\n",
+       "      <td>7337.257000</td>\n",
+       "      <td>6.835175e+04</td>\n",
+       "      <td>7.408272e+06</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>9.832065e+07</td>\n",
+       "      <td>508.552000</td>\n",
+       "      <td>6.522433e+06</td>\n",
+       "      <td>22234.101000</td>\n",
+       "      <td>6.522433e+06</td>\n",
+       "      <td>2.414005e+07</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 ts           dur   issue_delay  issue_to_done  \\\n",
+       "count  2.000000e+04  10000.000000  5.000000e+03   10000.000000   \n",
+       "mean   4.894584e+07     17.731682  7.001508e+04    6112.554491   \n",
+       "std    2.839099e+07     25.578639  9.666462e+04    2254.077235   \n",
+       "min    4.102560e+03      1.152000  8.810000e+02    2754.967000   \n",
+       "25%    2.463025e+07      3.974750  5.806250e+04    4100.473000   \n",
+       "50%    4.881766e+07      7.364000  6.159800e+04    6089.880000   \n",
+       "75%    7.373552e+07     27.441000  6.835175e+04    7337.257000   \n",
+       "max    9.832065e+07    508.552000  6.522433e+06   22234.101000   \n",
+       "\n",
+       "       issue_start_ns   complete_ns  \n",
+       "count    5.000000e+03  5.000000e+03  \n",
+       "mean     7.001508e+04  6.182570e+06  \n",
+       "std      9.666462e+04  2.263719e+06  \n",
+       "min      8.810000e+02  2.780383e+06  \n",
+       "25%      5.806250e+04  4.166623e+06  \n",
+       "50%      6.159800e+04  6.155939e+06  \n",
+       "75%      6.835175e+04  7.408272e+06  \n",
+       "max      6.522433e+06  2.414005e+07  "
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = trace_to_df('/tmp/mlperf_log_trace.json')\n",
+    "df.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoIAAAFKCAYAAACJoz5RAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAstklEQVR4nO3deZxcVZ338Q8ECIQASVgNyjIIP8AoPkRGUfRBkSXgOiIMjAsoKIroIAgzKItsowODKCIoI4IzqICODgjigiwqKtCAEiA/MLI8EEFIAoJgB0ieP84tUhSVdHelu6vS9/N+vfp1U/eeunWqTi/fnHvOuSssWrQISZIk1c+K3a6AJEmSusMgKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1tVK3KyBJdRIRmwB3A/tn5nndrc3z9ULdIuJdwDeAjTLz0W7UYbAiYhpwC/DKzJzZ5epIHTEISmNIROxH+SO6fWb+psvVGZSIWAc4Eng7sBHwJHAD8MXMvLybdRurIuKjwJM9GETHAScAZ/V6CATIzJkRcTlwPPAP3a6P1AkvDUvqmogISo/KIcCVwMeAfwPWAy6LiM91r3Zj2keB/drsvxdYDfivUa3NYm8BtgK+2qXX78TZwDsj4qXdrojUCXsEJXVFRKwMfBeYDLwhM69vOnYacAFwZET0ZebFo1y31TPzr6P5mp2qetFWysz+ZT1XZi4C/rbsterYB4AbM/OPXazDUP0UmE8J1p/pblWkoTMISmNcRKwPnATsSulpexS4CTg8M2+rymwLnAhsB6wBPARcC3woM5+KiB2Bq4A3ZubVTefehDZjyiJii+p8OwGrA3cAJ2Xmd5uq9i5gGnBMcwgEyMxnI+LDVZ0/C1xcnXfY69F0OX0n4B3A3sB61XPvBA7LzNNaPtNXAL8DPpqZZ7EEETEJOB14J7AI+F/gC23KXV297x1b9p8H7JiZm7S8z38FngA+AWwKvBm4OiIOq15rS2AicBflEvt/Np3zHmDj6t+Ne4zem5mbLOVz3IbSU7sDMI5y6f7ozPxFU5n9KJ/jjpTL/O8FJgA/oXwfPbykz6l6/qrAbsB/tDm2CPhsZh7Xsv8e4OrM3K96vBLwL8D7gJcAT1Wfwecz83+anjeY708iYi1KuHsXsCHwCHAN8KnMfAAgM5+u2u+dGAS1HPLSsDT2fRfYEzifckmwEUS2AIiIdSm9GpsB/065PHseJaStPtQXi4itgN8CL6/OdxgwF7g4It7TVPSt1fab7c6TmY9RgtNWEbHZCNaj4QxgW0poPiYz7wJ+DbQr+x5gAXDhUl5/har+76X0bn4GmEpph2X1XuBwSvA6FPhTtf9QYCZlzNqnKIH+nIg4qOm5/wzcD8yqzvPeat+S3sdWwC+A/wOcAhxXvY+fRcQb2jzldGAbSoA/i9LOXx7Ee5oOrALcOIiyS3Is5b1fA3y8+vcs4O8bBQb7fRERq1fnORT4OSV0f4USolsvA/dRvk8nL0Pdpa6wR1Aaw6oeqR0oPRinNh1qHnv3WmAKsGtmNv8RPrbDl/0iMAd4VWY+Ve07MyJ+AnwuIi6oLkFuDTyWmfcu5Vy/q7ZbA7NHqB4NT1B6355p2vdN4KyI2DozbweIiBWBfYDLMnPeUl7/bcAbgCMz89+r554F/GyI76OdjYHNM/NPLfu3yMwnmx6fUb3fwylj2cjMH0TEicAjmfnfg3itk4BVgelVOCYivkEJWKcBr2opPxfYufHZVp/XxyNirSrcL8mW1XZZLgu/Bbg8Mw9cSpnBfl98ihJo92oZmnBSFfKb/RFYgTK+8bplqL806uwRlMa2pyg9VztGxJQllGn8cX5LNW6vY9VrvBm4CFg9ItZpfAFXUC6vbVEVXwN4fIBTNo6vMYL1aDinJQRC6fHrp/SaNewIvJiBJ1TsDiyk9IoB5ZI3cOZQ3ssS/KBNCKQRAiNi5YiYUr3fq4DNqsucQ1KNP9wVuLQRAqvXeYTSazy9GnrQ7OstAfsXlMvJGw/wcmtX2/lDrWeTx4CXVZd+X2CI3xd7Are1G5/a8v6a67zOMtRd6gqDoDSGVRMIjqSMvXooIn4ZEUdFxEuail1DuXx8LDA3Ii6NiAOrS2ND9VJKz8hxwMMtX42xX+tV28cZOOA1jv95BOvR8IIex8ycD1wC7NvUC/QeYB5w2QB12Bh4MDNbw+6dg3sLS9W2dzQi3h4RN1L+AzCX8n5Prg4POQgC61LG+WWbY3dU201a9t/X8rgRkgZ72bS1t20ojqG8z4yI2yLitIho7rEcyvfFZpTL7EOpc2tAlHqeQVAa4zLzdGBzyqWux4CjgTuqiRdk5qLMfDfwasr4rnWArwG3RkTjj+KS/sCNa3nc+J3yBWDnJXw1/rjeDqwVERstpfqvqLaNy4UjUY+Gp2jvm5T1Dd9QTWh4F3BRZi5YSr2HarDvq+EFdY2IHYDvU9ZhPAjYg/I+G2NCR+v3/bNL2D9QwHuk2g5lnN3zPp/MvJYS4N4P3EyZNHJ9RBxRFenk+2IwGnV+ZKmlpB7kGEGpBjLzbkrIOz0iXkxZu+/TwNVNZa4HrgeOiYgZwOXAgZQxYo1enUktp2693NcIbM9k5kBj4S4F9qX8sT6x9WBErEmZfXpT03IiI1GPgVxB6ZF8L7A+sCaDW2fvXmDniFijpVew3WXL+cDftdk/0OXUZntSln7ZJTOfWwImIt7Ypuxge64epgTLaHOsMabvniHUcWkaPYybUkJcs/m0tHlErAK8qPUkVS/uN4FvRsRqlO/jz0bEfzC074vZlAlTg7Ep5TOdNcjyUs+wR1AawyJiQvXH8DmZeT8l2EyqykxuM/j9pmo7qdreS+npaZ0l+tGWc/+ZMibtwIjYsE191m16+D3gNuBfWi7fNcamnUXpaTmp6dBI1GOpqnGDF1CC1geBP2TmYCYEXE75HfuRptddETi4TdnZwJbN9aqWbHndYOtJ+VwW0fR7vZrF+oE2Zf/KIHreqjGNVwBvbZ65XY21ez9lzb+HhlDHpemjBNnWySdQPp/WNv8QLT2CEbF28+NqMsgsymSX1Yb4ffFdynjDd7cp1/rzMh2YVYVQablij6A0tm0B/DwiLqaErn7KJIatKDNJofxBPzgivk/5g7sasD8lWHwXylIu1TkOqdZ0m02Zodk6zg5K8PkV8PuIOKcqux7l0vPWVEtvVOuvvYuyNMcvI+JcShiYTOkp/D/Aic3rv41EPQbpm5RlRHahjC8bjEur1/+3an2+2yjrFLabtHMu8EngxxHx9aqeB1XPWXMIr/dJ4KcR8V/V6xwIPAhs0FL2RuCjEXEsZcziE5l56RLO+xnK+/5lRJxJCWsHUv6TsOcg6zagzFwQEVdQLs8e1XL4P4GzI+J7lKWOtqFMYmm9FHtHRFxLWefwkarcAcAPM/OJqsxgvy9OoQwD+HZE7EL53pwEzKCMRbwGnlsY/f+yfN0NRXqOPYLS2Pb/KL1Zr6f0rJ1CmRn5wcxsDI6/hnJJeC/K0hpHUcLDmzLzt03nOoSyLt5BlEu591FC5PNkZlJ6dS6hXPY9k9JjtxJlfGJr2W0o68ztTFmn7RRKCHx/Zj6v/EjVYyCZeQvw++rhYJZcITMXUpaQuQD4J8rn/6cl1PWOqo5rUZZkeRvlUvRNrWWX8npXV+eeQhkG8AHK2ohfalP8eBYHx29V5ZZ03jsoSxDdTJl49FnK98ebqzF5w+lc4FURsWnL/nOAz1N6Bf+Dcil2Z0rPZrPTKTO6j6R8T+1GWSppn0aBwX5fVHeWeUN1fDfK5/gxyhqMz82gpsxCnkKZRS0td1ZYtMhJTpJ6R0S8nLLkyL2UW88tbe25URMRNwALMnMol2s1BNWl85mU5WqO7HZ9BiMiLgEWZuY7ul0XqRP2CErqKZl5K2WSSADfryYFdFVEvJLSizQcdwXRElS9qEcDH6kWQ+9pETGNMtTCW8tpuWWPoCQtQfWHfjplfOCLgE1b7twhScs1ewQlacn2pNzPdzXgHw2BksYaewQlSZJqyuVjOtDX1zce2I4yA3BJq+hLkiT1gnGU4S03TJ8+vb/5gEGwM9tRZjVKkiQtL14P/LJ5h0GwM38C2GKLLVhlleGd0Dhz5kymTRvsXY00GmyT3mJ79Bbbo/fYJr2lF9pjwYIF3HnnnVDll2YGwc48C7DKKqswfvz4YT/5SJxTy8Y26S22R2+xPXqPbdJbeqg9XjCczVnDkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNeWdRXrY439dwJP9z3S7GsNiwviVWGP14b0dnyRJWjYGwR72ZP8zXHnDfd2uxrDYabuNDIKSJPUYLw1LkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSpplbqdgUaImIiMAvYENguM29sOvY+4ChgE2A2cHxmXtjy/JWB44H3A5OAG4BPZOYtLeU2AL4I7AYsAn4I/HNmPjIS70uSJKlX9VKP4HG0CaYRsSdwPvB9YAbwM+DbETGjpegXgIOBY4G3AwuAKyNiatO5VgKuAF4OvA84AHgtcElErDDM70eSJKmn9USPYERMAw4CPgl8teXwCcDFmfmv1eOrImIr4LPAj6rnb1g9/+OZeU617zfA3cA/A0dUz30XsA0wLTNvq8rNAX5FCZmXj8T7kyRJ6kW90iN4JvBl4M7mnRGxKbAl8J2W8t8CtouIdavHuwDjgOcuF2fm45TLvrs3PW934NZGCKzKXQfc21JOkiRpzOt6EIyI9wIvBU5sc3irant7y/5GkIumcg9l5tw25baIiBWbyrWeq1Fuy6HUW5IkaXnX1SAYEWsBpwBHZOYTbYpMrraPtuyfX22nNJVrLdMotzIwcRDlprTZL0mSNGZ1e4zgicBdmXlBl+vRkZkzZ47Iefv6+gBYbc11mTNnzoi8xmibO3cC99/9cLer0bFGm6g32B69xfboPbZJb+nl9uhaEIyIl1EmeOwcEZOq3Y2eu4kRsQaLe/4mAQ82Pb3RUziv2s6vyrSaDDwNPDGIcvPa7F+qadOmMX78+KE+ban6+vqYPn06AA/Ne5KpU58c1vN3y9prr8P6m2/U7Wp0pLlN1H22R2+xPXqPbdJbeqE9+vv7l9h51c1Lw5tTguhVlIA2H7i0OnYV8AvgjurxVi3P3braZrW9A1gvIlov724N3JmZC5vKtZ6rUW5WB+9BkiRpudXNIPhL4I0tX4dWxw4CDsjMuykBbe+W5+4D3JCZjWuNPwEWAns1ClQLVL+V5y8Jcznw8mr5mUa511AWqnbpGEmSVCtduzRc3cnj6uZ9EY1JwPQ13VnkGODCiJgN/JSyWPQuwB5N53ogIs4GPh8Rz1CWgzkcWAE4veklvgf8HvhuRPwr5f2fAvyaak1CSZKkuuj68jEDycyLgf2BPYEfA7sC+2Zma3A7FDiLMgHlEmA14M2ZOafpXM9Qbi03E/hv4BvAb4C3ZeaiEX4rkiRJPaXbs4afJzOvpvTite4/n3KbuaU992ngX6qvpZV7kBdeapYkSaqdnu8RlCRJ0sgwCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmhhwEI2LXiFhhJCojSZKk0dNJj+CPgPsj4pSI2Ga4KyRJkqTR0UkQfAfwK+Bg4KaI+H1EHB4RU4e1ZpIkSRpRQw6CmXlJZu4FrA8cCDwMfA64NyJ+EhHviYgJw1xPSZIkDbOOJ4tk5uOZeW5m7gRsDBwFrAecDzwUEd+MiJ2GqZ6SJEkaZsM1a3gcsDIwHlgBeAp4M/DTiLg5IqYN0+tIkiRpmKzU6RMjYi1gL+A9wOuAZ4DLgH+ptguBtwFfAL4BbLeslZUkSdLwGXIQjIh3UMLf7sCqwA3AJ4BvZ+a8luI/iIh1gK+0Oc8/AJ8EtgQmAg8A3wdOyMzHmsrNAE4Ctq7KnJ6ZZ7Q53+GUCSwbALcBR2bmlS1l1gBOAfas6n4VcEhm3jOkD0GSJGkM6OTS8P8Arwa+CGydma/OzDPbhMCG3wMXtNk/BbgW+BCwW3W+DwAXNwpExPbAJcDNwAxKz+LpEXFQ84mqEHgycCawB3AXcFmb5W2+TemlPATYG5gKXOnkFkmSVEedXBreBbgyMxcNpnBmXg9c32b/f7bsujoi/gZ8NSKmZuYc4Bjgpsz8YFXmqojYCDg2Ir6WmQsjYjzwGUpP4akAEXENcCvwacrlayLi1ZSQuEdmXl7tuxWYDexHm15LSZKksayT5WN+NtgQ2IFHqu0qVcB7E3BhS5lvUS7/bls9fi2wFvCdpjo+C1wEzGi6C8ruwGPAFU3l7qOsibj78L4NSZKk3tfJLea+EBF3LeX4nRFxyhDONy4iVo2I6ZQewEuqMXubAasAt7c85bZqu2W13ara3tGm3ERgw6ZyszJzYZtyWyJJklQznVwa3oMX9tI1uxB4N/CpQZ5vLqVHD0pv3b7VvydX20dbys+vtlOayvVn5lNLKXd/Va71XI1yU9rsH9DMmTM7edqA+vr6AFhtzXWZM2fOiLzGaJs7dwL33/1wt6vRsUabqDfYHr3F9ug9tklv6eX26CQIvgS4ZynH763KDNaOwARgGmWs36URsXMH9Rp106ZNY/z48cN6zr6+PqZPnw7AQ/OeZOrUJ4f1/N2y9trrsP7mG3W7Gh1pbhN1n+3RW2yP3mOb9JZeaI/+/v4ldl51EgT/Amy6lON/R1lQelAy85bqn9dFRB9wI/BOFl8SntTylEZPYWOW8nxgfESsmpl/G6BcuyQyuamMJElSbXSyfMzPgQ9Xs3efJyI2AT5clenELZSFqF9Kmc27gMVjABu2rrazqm1jbGC7co9T1h5slIumySPN5WYhSZJUM50EwWMoPYkzI+KLEfGh6utLlDUDVwSO7rA+21fP/2Nm9lMC5V4tZfYBHgRuqh5fR5kNvHejQESMq553RdMM58spvYu7NpV7CbBDdUySJKlWhnxpODPviojXURZvPqTl8DWUO3XkQOeJiB8DV1Jm7f4NeCVlgsnvgR9UxY4Hro2IcyiLUr8OOBA4uDH7NzP7I+JE4OSIeJgSEA+gzDpuTDwhM38bEZcBX4+IwyiXuI8H7gPOG9qnIEmStPzr6F7DmXkbsGN1+7i/q3bPzsy5QzjN9ZRb1TXGG94DnA2clpkLqtf5dUS8nXLXkPcBc4BDM/PslvqcGhEAHwfWp4TLPTLzdy2vuQ9wKmXx6PGUW8y9OzPHxowMSZKkIegoCDZk5iMsXgR6qM89mkFcQq7uAjLgpdvqriKnDlDmccoYxg8PspqSJEljVkdBsBqDtyulN3Ay0DoBY1FmnrCMdZMkSdIIGnIQjIhXAd8DXswLA2DDIsAgKEmS1MM66RH8CrAa8A7gF5n56HBWSJIkSaOjkyD4CuDTmXnpcFdGkiRJo6eTdQTvZ8mXhCVJkrSc6CQIfg44MCLWHO7KSJIkafR0cml4CvBX4A8R8V3g/wHPtpRZlJmnLGvlJEmSNHI6CYKfa/r3QUsoswgwCEqSJPWwToLgpgMXkSRJUq/r5F7D945ERSRJkjS6Or7FXERsDuwIrAdckJn3RMQqwAbAg437BUuSJKk3dXJnkRWBs4EPUpaRWQT8GrgHWAW4FTge+I9hq6UkSZKGXSfLxxwFfAA4GtiepjUFM/MJyu3n/mFYaidJkqQR00kQ3B84NzNPBv7Q5vitwObLVCtJkiSNuE6C4IuB65dy/Clgjc6qI0mSpNHSSRB8ENh4KcenA84sliRJ6nGdBMHvAR+pZg03LAKIiBnA+4CLhqFukiRJGkGdBMHjgPuAm4ELKCHwqIj4DfBD4HfAvw1XBSVJkjQyhhwEM/MvwGuBk4H1gb8BOwATKSHxDZn51DDWUZIkSSOgowWlM/NvlCB48vBWR5IkSaOlk0vDkiRJGgM6ubPIuYMotigzP9hBfSRJkjRKOrk0/CaqWcJNxgEvqrYPA39dxnpJkiRphA05CGbmJu32R8TKwIeBfwZ2XqZaSZIkacQN2xjBzHw6M78M/AT48nCdV5IkSSNjJCaL/A54wwicV5IkScNoJILgzsCTI3BeSZIkDaNOZg0fs4RDkyg9gdsCn1uGOkmSJGkUdDJr+Lgl7J8PzAYOAs7ptEKSJEkaHZ3MGnYRakmSpDHAUCdJklRTnYwR3KiTF8rM+zp5niRJkkZGJ2ME7+GFdxYZjHEdPEeSJEkjpJMgeADwceAlwLeAO6v9AewD3Ad8CVg4HBWUJEnSyOgkCL4IGA+8NDPnNx+IiGOBXwEbZOa/DUP9JEmSNEI6mSxyEPC11hAIkJlzKUvHfGRZKyZJkqSR1UkQXBuYuJTjq1dlJEmS1MM6CYK/AT4REdNbD0TEq4BPAL9d1opJkiRpZHUyRvBjwNXA9RFxA3BXtX9zYDtgHnDIsNROkiRJI2bIPYKZeTvwcsrM4EnAntXXJOCLwMsz87bhq6IkSZJGQic9gmTmQ8Ch1ZckSZKWQx0FwYaI2BxYD5iZmY8N8bnvBv4JmA5MAWYDZwFfzcyFTeVmACcBWwMPAKdn5hltznc4cDCwAXAbcGRmXtlSZg3gFEoP5qrAVcAhmXnPUOouSZI0FnR0r+GI2Dci7gNmAddSwhwRsU5E3BkRew3iNIcB/cCngLcAP6Bcbv580+tsD1wC3AzMAL4BnB4RB7XU53DgZOBMYA/KuMXLImKbltf8NvA2yhjGvYGpwJURMWHQb16SJGmM6ORew+8C/hv4KXA6cGrjWGY+EhF3AO8DLhrgVG/NzIebHl8VEROBj0XEZzKzHzgGuCkzP9hUZiPg2Ij4WmYujIjxwGcoPYWnVnW8BrgV+DSwV7Xv1ZSQuEdmXl7tu5XSE7kf8JWhfhaSJEnLs056BD8N/CwzdwXOb3P8t0BrT9wLtITAhpspl2ynVAHvTcCFLWW+Rbn8u231+LXAWsB3ms79LCWIzoiIFarduwOPAVc0lbuPcieU3QeqryRJ0ljTSRDcCvj+Uo7/GVi3s+rwesryM38GNgNWAW5vKdOYkbxlU30A7mhTbiKwYVO5Wc3jD5vKbYkkSVLNdDJZ5K8s/c4imwGPDPWk1WLU+wOfzcxnI2JydejRlqKNW9tNqbaTgf7MfGop5e6vyrWeq1FuSpv9A5o5c2YnTxtQX18fAKutuS5z5swZkdcYbXPnTuD+u9t1Ai8fGm2i3mB79Bbbo/fYJr2ll9ujkyD4c2C/iPhi64GImAocCPzvUE4YERsA3wOup2mySK+bNm0a48ePH9Zz9vX1MX16uWnLQ/OeZOrUJ4f1/N2y9trrsP7mG3W7Gh1pbhN1n+3RW2yP3mOb9JZeaI/+/v4ldl51OkbwRcCNwEeBRcDuEfE5ygSNhcBnB3uyiFgL+BHwJPC2zHy6OtTo0ZvU8pRGT+G8pnLjI2LVQZRrPVej3Lw2+yVJksa0Tu4schfwOuBB4DhgBeCTwBHALcAO1SSMAVXh7RLKWoS7ZebcpsOzgQUsHgPYsHW1nVVtG2MD25V7nLL2YKNcNE0eaS43C0mSpJoZUhCMiHHV8i0PZeYuwDrAq4HtgfUzc6fMvHOQ51qJMrP3FcCMzLy3+Xi1fMzPqZZ/abIPJYTeVD2+jjIbeO/melbPuyIzF1W7L6f0CO7aVO4lwA7VMUmSpFoZ6hjBFSk9dUcCp2XmfOCGDl/7TOCtlJ7ECRHxmqZjt2fmX4DjgWsj4hzgAkpP5IHAwY3Zv5nZHxEnAidHxMOUgHgAZdLKvo0TZuZvI+Iy4OsRcRjQOP99wHkdvgdJkqTl1pB6BKvxe3Mo4wKXVaNn7t+BX7d8bVu93q+BtwPbAT+mBLxDM/PslnqdChwFfJwy3nBLysLRv2t5zX2AH1IWj76Y0rP45swcGzMyJEmShqCTWcPfoMwaPisz/9bpC2fmJoMsdzmDuHRbhcFTByjzOPDh6kuSJKnWOgmCdwLjgFkRcT7wR6B1DT8yc6BbzEmSJKmLOgmC/93076OXUGYRA99rWJIkSV00qCAYEV8Czs/MPuCN1e6JlJ7AZ0eobpIkSRpBg+0R/BjwG6AvM6+JiLUp9wPeOTOvGbHaSZIkacR0cmeRhtaFmSVJkrQcWZYgKEmSpOWYQVCSJKmmhjJr+O8i4u+rf69VbbeMiCfaFc7M65epZpIkSRpRQwmCn62+mp3RptwKlOVjxnVaKUmSJI28wQbB/Ue0FpIkSRp1gwqCmXn+SFdEkiRJo8vJIpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1dRK3XzxiHgpcDjwGmAaMCszp7UpNwM4CdgaeAA4PTPPaFPucOBgYAPgNuDIzLyypcwawCnAnsCqwFXAIZl5z/C9M0mSpN7X7R7BlwF7AH8Abm9XICK2By4BbgZmAN8ATo+Ig1rKHQ6cDJxZnfMu4LKI2KbllN8G3gYcAuwNTAWujIgJw/SeJEmSlgtd7REELs3M/wWIiPOAV7UpcwxwU2Z+sHp8VURsBBwbEV/LzIURMR74DKWn8NTqfNcAtwKfBvaq9r2aEhL3yMzLq323ArOB/YCvjMi7lCRJ6kFd7RHMzIVLO14FvDcBF7Yc+hbl8u+21ePXAmsB32k697PARcCMiFih2r078BhwRVO5+4BfVcckSZJqo9uXhgeyGbAKL7xsfFu13bLablVt72hTbiKwYVO5WW0C6G1N55IkSaqFbl8aHsjkavtoy/751XZKU7n+zHxqKeXur8q1nqtRbkqb/Us1c+bMoT5lUPr6+gBYbc11mTNnzoi8xmibO3cC99/9cLer0bFGm6g32B69xfboPbZJb+nl9uj1INjTpk2bxvjx44f1nH19fUyfPh2Ah+Y9ydSpTw7r+btl7bXXYf3NN+p2NTrS3CbqPtujt9gevcc26S290B79/f1L7Lzq9UvDjR69SS37Gz2F85rKjY+IVQdRrvVcjXLz2uyXJEkas3o9CM4GFrB4DGDD1tV2VrVtjA1sV+5xytqDjXLRNHmkudwsJEmSaqSng2Bm9gM/p1r+pck+wIPATdXj6yizgfduFIiIcdXzrsjMRdXuyyk9grs2lXsJsEN1TJIkqTa6fWeRCSxetmVjYM2I2LN6fENm3gscD1wbEecAFwCvAw4EDm7M/s3M/og4ETg5Ih6mBMQDKLOO9228Xmb+NiIuA74eEYcBf6nOfx9w3oi+WUmSpB7T7cki6wEXt+xrPN4fOC8zfx0Rb6fcNeR9wBzg0Mw8u/lJmXlqRAB8HFifsiTMHpn5u5bz7wOcSlk8ejzlFnPvzsyxMStDkiRpkLoaBKv7+7aO12tX7nIGcem2uqvIqQOUeRz4cPUlSZJUWz09RlCSJEkjxyAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaZW6nYFVA8LFy7ioXlPdrsaHVltzXWfV/cJ41dijdVX6WKNJEkaHgZBjYr+p5/lut/P6XY1OjJnzhymTl0cBHfabiODoCRpTPDSsCRJUk0ZBCVJkmqqdpeGI2Jz4AxgB+Ap4DvAkZm5fA5gkyRJ6lCtgmBETAKuAu4F9gTWA04D1gX+sXs1kyRJGn21CoLAh4HJwCsz8xGAiHgGuCAiTsjM27paO0mSpFFUtzGCuwNXNkJg5XtAPzCjO1WSJEnqjrr1CG4FnNu8IzP7I2I2sOUQzjMOYMGCBcNYtcX6+/sBeObpBay04sIReY3R9uwzTy+372XVlVd4Xt0XLOjngYdGpu1H07gVV+DZhYu6XY0hGz9xCg889Ojz9q22ykqsPmHl7lRIz/3OUu+wTXpLt9ujKa+Maz1WtyA4GXi0zf75wJQhnOdFAHfeeecwVOmFZs6c+dy/NxtKrXrYo3++Z7l9L5tNWZMyr6j48wN/7F5lBMCDT8zrdhXUpPl3lnqDbdJbeqg9XgTMbt5RtyA4XG4AXg/8CXi2y3WRJElamnGUEHhD64G6BcH5wKQ2+ycDswZ7kunTp/cDvxymOkmSJI202e121m2yyB2UcYLPiYjxwGYMIQhKkiSNBXULgpcDO0XE2k373gmMr45JkiTVxgqLFi1/swY7VS0oPRO4BziBxQtKX5mZLigtSZJqpVY9gpn5KPAm4Angf4AvABcCH+hitSRJkrqiVj2CkiRJWqxWPYKSJElazCAoSZJUUwZBSZKkmqrbgtI9KyI2B84AdqDcz+w7wJGZ+WRXKzbGRMRLgcOB1wDTgFmZOa1NuRnAScDWwAPA6Zl5RptyhwMHAxsAt1Ha7MqRewdjR0S8G/gnYDrlFo+zgbOAr2bmwqZytsUoiYh/AD5Juff6RMrn/X3ghMx8rKmcbdIFETGRsubthsB2mXlj07H3AUcBm1B+lo7PzAtbnr8ycDzwfsrNFW4APpGZt4xC9Zd7EbEf8I02h87MzI81lVuufj7sEewB1bI2VwFrAHsChwH7AOd2sVpj1cuAPYA/ALe3KxAR2wOXADcDMyg/+KdHxEEt5Q4HTgbOrM55F3BZRGwzYrUfWw4D+oFPAW8BfgB8Cfh8o4BtMeqmANcCHwJ2A75IWVXh4kYB26SrjqNNB05E7AmcTwntM4CfAd+uAkmzL1CCx7HA24EFwJURMXUE6zwW7QZs3/R1auPA8vjz4azhHhARRwLHABtn5iPVvn2BC4BpmXlbN+s3lkTEio3epog4D3hVa49gRPwImJKZr27a9zXgrcCGmbmwuiPNQ8DXMvOIqsw44FZgZmbuNSpvaDkWEetm5sMt+04DPgJMysx+26L7IuJDwFcpn/cc26Q7ImIa8BtKj+1XaeoRjIg7gFubP9eI+Anl5+jvq8cbAvcCH8/Mr1T71gDuBs5ttJOWrKlHcN3G3+o2ZZa7nw97BHvD7pRFrZu/sb5H6S1p/R+dlkHzJcd2qh/QN1HWl2z2LUr3/bbV49cCa1Eu4TfO/SxwETAjIlYYrjqPVa0hsHIzsCowxbboGY3fS6vYJl11JvBl4M7mnRGxKeVS/ndayn8L2C4i1q0e7wKMo6ntMvNx4IeUv0FaRsvrz4dBsDdsRctlyszsp4zz2LIrNaqvzYBVeOFl40avbKM9GvesvqNNuYmUMTwautcD84A/Y1t0TUSMi4hVI2I65WrFJZl5D7ZJV0TEe4GXAie2Odz4rJfUJtFU7qHMnNum3BYRYR4YvJkR8WxE3B0Rx0ZE43L9cvnzYcP3hsnAo232z6eM2dHomVxtH23ZP7/aTmkq15+ZTw1QToMUEa8C9ge+UP3v2LbonrmUSWs3An8C9q322yajLCLWAk4BjsjMJ9oUGUqbtJZplFuZEkC0dH+ijK/cjzJO8PvA0cB/VseXy58PZw1L6rqI2IAyHOJ6miaLqGt2BCZQZtZ/Brg0Inbuao3q60Tgrsy8oNsVqbvM/DHw46ZdP42Ix4DjIuKELlVrmdkj2BvmU6byt5pMuUym0dP4H9mklv2N/+nNayo3PiJWHaCcBlD1ePwIeBJ4W2Y+XR2yLbokM2/JzOsy82vAO4E3VlvbZBRFxMuAg4CjI2JStcJEo+duYjXZYyht0lqmUe5poF1vowZ2UbXdluX058Mg2BvuYPGYAeC5QaebUdaM0uiZTVlSYauW/VtX20Z7NMZ2tCv3OGXtKA2g+kV4CbAesFvL+CXbojfcAiykjFGzTUbX5pQrd1dRwsN84NLq2FXAL1j6Zw2Q1fYOYL2IaL3suDVw50AT6TQoy+XPh0GwN1wO7BQRazfteycwvjqmUVJN0vk50Dp9fx/gQeCm6vF1wGPA3o0C1fT/vYArMtN1mQZQDbC+CHgFMCMz720+blv0jO0pfyv+aJuMul9SemObvw6tjh0EHJCZd1MCxt4tz90HuKFpdv5PKIG+eYmZiZRlTfw707l/BBYBfcvrz4djBHvDV4FDgP+txhmsB5wGXJiZbRc9VmciYgKLl0rYGFizWowVyi/Neykr718bEedQ1nJ8HXAgcHDjf83VGncnAidHxMOUH/ADKL24+6LBOJPyR+gIYEJEvKbp2O2Z+Rdsi1EVET8GrqTMXvwb8ErKgt+/pyz4DbbJqKmWFLu6eV9EYxIwfU13FjkGuDAiZgM/pSwWvQtloeLGuR6IiLOBz0fEM5Q1BQ8HVgBOH7l3MXZUPx8/B2ZSQvUM4KPA1zPzj1Wx5e7nwwWle0REbEG5q8LrWXyLuSO8xdzwiohNKAuotrN/Zp5Xldudsur7VsAcykzWL7U53+GUEL8+5Y/nEd5Ca3Ai4h5KGG/njZl5dVXOthgl1X9E3w5sWu26hzKJ57QqmDfK2SZdEhE7Ui4Lt95i7v288BZz32l57srACZRZr2ux+BZzN49G3Zd3EXE6Jfy9mNKRdhfVnUOqlQ4a5Zarnw+DoCRJUk05RlCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNXU/we2dyrPLzUr8gAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAApsAAAFKCAYAAABSGJRzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA3OklEQVR4nO3debgcVZn48W8EuRDZQtiMIy4sr2DUGQIjICiKigFFHFnEUQSF0ZHFBRQXREDEhYyiAu4KjiCL/lQQxAUQHRDBwAAB8oIIZDAjsoRFYS6Y5PfHqSZFc29yb9+u23f5fp4nT6er3q46dbq679unzjk1ZenSpUiSJElNeEqvCyBJkqSJy2RTkiRJjTHZlCRJUmNMNiVJktQYk01JkiQ1xmRTkiRJjTHZlMagiDg6IsbUvGQRsTQivtLrckgjERFTIuLaiPhEr8syFBFxTkSc3etySCOxcq8LIE0Ww0ge92+0IGNEREwFPgj8KjN/1ePiNCIidgQuAfbJzDN7XJwhiYinAe8H9gI2Bh4DrgW+BpyemWPqR1AH9gE2Ab7Q64IM0aeA30fEizLz2l4XRuqEyaY0et7a9vzfgG2At7ctvxz4LvDp0ShUD00FPl79/1c9LIcqEbEBcBGwOfA94EvAasC/AP8JzI6It2bmkt6VcsQ+AHw/M+/pdUGGIjOvjojfA4fz5O8QaVww2ZRGSWZ+t/48Il4J/HP78pq/N18q6QlOoySab8jMc2vLvxARJ1ASnv8GThjNQkXEKsCSzBzRZyIi/gn4R+DIbpRrFJ0FHBsRB2Xmg70ujDRcJpvSGBQRRwMfz8wptWW3A/MpLZ5zgOcDtwKHZubFEbE7cCywGXAjcGBmzm3b7mbAccBOwNOAm4BPZub3h1G2vSktks8FEjgiMy9si1mritkD2BC4E/g2cHxmLo6IZwO3VeEfj4hWC+dpwH8A1wF7ZOYPqu1Fdex/yMxNa/v5T+Clmfms2rKtgWOAlwCrAHOBj2XmJW1lfDrwCeC1wDTgj8AXM/PLtZgdKZfB3ww8B3g3sC5wGfDOzPzDUOutts3VgaOBNwIzgAeBG4CjMvPXVcwmwPHADsA6wL3Ab4GDM/N/a/W3f2ae2rb9pcAxmXn0MI91G2Bn4FttiWbLh4HXAx+KiJMy85GGyrEjpc7fAmxKafl/BvBPEXEF8I3MPLRtX9OB/wU+n5lHDFD2lt2BxcDFba8/mrbPW7V8P8p5+5zMvL1atiXlM7Q1sAZwF/Br4N8y85EqZgpwMOXqxaaU9/g8ymflnrZ9vIpSt1sBU4CbgS9n5jdqYb+gfOZ3Bs5ZzvFJY5IDhKTx5bmUy5vnAx8C1gbOjYg3A18EzgCOquLOiYiVWi+MiM2B3wEvAD4LHEZJYs6JiLcMcf8vAb4MnA18FFgVOC8itq/tZzVKsrAfpTvAwZQ/7kcDX63C7gb+vfr/DymXB99arZ8HLAJeWtvvS4ElwCZVwtKyA+UPfWvfLwN+Q0nQjgWOAPqAn1dJTCtufeAK4DXAKcB7qv2eEhEDtXp9kHIpeQ6lD902wOmD1tLyfRk4hHLc7wY+Q6mPF1VleyrwM2B74OQq5hRgA0pyOizDONbXVY/fGWg7VaviGZS63a7BcrR8hJKQf4HyPi4EfgTsHRHtDSV7A08drOw12wE3tpLCDo5hPUritzHlM3QwcCowk/LjreXLwOcon7f3UPq77gFcEhGr1rb3Vsp7vUG1vQ8CVwK7tu36RuARyudPGnds2ZTGl00pLXm/AYiImyh/rL4FbJ6Zt1XL76ckbi8Hflm99guUP9hb1f7YnhwRPwc+HRFDGfwxE9guM39b7edU4BZKa2sr4Xwf8Dxgy8ycXy37WkTcBhwXESdkZkbE9yl/lK8boIvBZTwx2dwB+CmwY7X8rIh4JvAs4JPVa6ZUx/xfwKtax1KNoL+G0lLYSpKOoyShL8jMu6tlX4mIrwMfqVru7q/tf1XgRZn5aLXNRZRLyzMzc94K6qzda4GvZ+b7B1m/BeXHwp5tLc7HDXM/9dcN5Vi3qNYtbxBKa90WlL6dTZSjZQ3KOf231oKI+A5lgM+rgQtqsW8BrsnMG1ZQhudRWro7tR0l2d45M39fW95qmScitgPeCbwtM79TW34h5YfQvpTPw5rAScDVwA71BLg6lx+XmX+PiP9h2XskjSu2bErjy82tRLPyu+rxV61Es235cwEiYh3glZQWyadFxLqtf8CFlMuUmw1h/79vJZoAmXkvpbXrJRExrVq8FyXhu6dtP62kd8ch7Oc3wAury/FQEsyLKS1jrSR0h1oslJbBqMozvbbfNSmtUS+OiKnVH/I9KK3DS9vK+HPKgJgXt5XnO61Es22fzx3CsbR7oCrLMwZZ3+qTt3M1MrxjwzzWNarHh5azyda6NZYTM9JytHynnmhWfkG5XP74QJmIeC6wLWUA04pMp7Sad+qB6vG1VQv0QPYC/gpc2Hac8ymX3F9exb2acm5+ur2ldZAffYsoXTikcceWTWl8WVB/kpkPlO6M/E9bXOuPYisB3ITSH+zo6t9A1qf0wVyeWwZYdnP1+CzKH8TNKInf3QPEtvazIr+h/BjePiKuq7b9a2B1YM8qZgfgL7XW01ay/M3lbHc60E+pl7fz5JkABivjgrbnrYRlGsP3AUrf1AURcQ0l2f/PzEyAzLwtIj5HmX7oLVUr73nAd6vkfjjWY+jHWk8k7x8ktpVk/qXBcrTc2h5Q9ff9LnBQRKyRmQ9RWjUXU7qXDMWUFYcM6lLg+5SWzPdHxKXAucAZtcR4M8p5etcg22gd58bV41BbxqcA433aKU1SJpvS+LJ4mMtbf1hbVzE+zxMvP9YN93LwYJ5CaYX81CDr/ziEbfye0kftpZR+qQ9RLoWvARxdtdTuQGlBre8XSl/WwS6V3l1tD0py8q1B4tovx66ofocsM8+JiN9QBtu8GjgU+GBE7JeZZ1Qxh0XEt4Ddqpj/AI6MiJdl5o0MknTU++hWWnUylGO9kTKA5oXU+sG2eWH12HoPmyhHy2D9Kr9DSdj/hZK0/yvwi8z88yDxdfcw8A+EwZK4JxxH1eK4Z0T8M6U7xKso/TE/HBHbZOZfKMd6L/CmQbbZacvqNJYNqpPGFZNNaXJoJQd/z8xfLjdy+TYdYFmrRfGO6vFWYI0h7GfQVprMfKwaefxSYC3g8qpV6wrKlFCvp/Rf+3rtZa2WsIeWt++IuJuSvK48wrroWJUYfRX4akSsTekecAylC0Ar5gZKAvapiHghJYF+H3AgyxKWtds2/ay258M51vMog3L2ZYBks0og38yy0dc0VI7lysx5EXE18Naqz/JmlLobipsoswq0WwQQEWu39RttP45WGa6kDOQ5KiJmU37AHUjpP3wrJQm9IjP/upyytM7XmZRL7IOqBkQ9k8F/KEpjmn02pUmganG5BDhwoL6C1SjbodgqIratvW46JQG5PDNbicdZwNYRscsA+1kjIvqqpw9Xj4Ndiv4NMIvyh/vX1XE8Qmn1PILSqlhPiuYCf6Bc3nxSn8LWMWbmYsql0N0j4kWDxTUhIlaq9UOlKs/9lBartauYNQcYbX0TpaVv7eo1D1Ja6V7aFvfutm0P+Vgz8wpK/8n9I6J9NDSURGoz4DOt+S6bKMcQnUbp+/hBShL7wyG+7jJgi2rGhLpW4vf4cVT9Zd/WVs5p7YN3KAN8YFnCfRblb+tR7Tuv3v/W+f5zSv/cD7WXZ4B9bEEZpHb5wIcljW22bEqTx79T/theV40AvpXSf+zFlD9mmwxhG/OAn0TElyh/5P+Ncmn7w7WYEyjT6Pw4Ik6jJIGrUVpw9qRMvXR7NU/jDcCbIuJmyqXH2zKzNbjpNyybxqmeVP6akmw+SG3kdGYuiYh3UPpA3lhdhr6TMl3QyyjJaWtwxocoA5V+W9XFDZSk9x+BN1D+sDdhDeBPEfGDquwPUqazeQ1lZDLAKyizBHyf0od2CmVqnzUoiUzLNyiJyjcoCfhLGXiQ13COdV9KF4hzI+IMynuwKuWS9csog3Y+37b9JsqxIt+jTEP1RuDUYUxl9GNKK+grKIOVWn5O6Zf7zSiT1y+m9C29G9ioFvc2Sn/RH1I+P6tRbi/bSqbJzF9HxMnAB6oW6Z9R+glvQhkkdVRV5gcj4j2UbgW/r+r7Xsr8uc+g1HnLqyg/Nn42xOOUxhRbNqVJohqAshVlQMO+LJvDcWXgY0PczGXVa/amTCXUD+zemoy82s8jlKTiM5TE40TK5dnNKRN61/vWvQO4ndIn8Xssm3sTyiTmfwf+j3LJsqU1Evyy9tsmVuXYhnJZ+t2UBO7twH1VeVpxf6Ek2d+g9FM8iXKJekPK/KNNeZhS7y+g1PmJlPfk8Gr/UJLQnwK7UBKqT1ASzt3bpkI6ljIYag/KHI0rAbPbdzicY83Mu6rYY4B/okyX9QVKovnxzHxCS19T5ViRauqkn1ZPhzIKvfW6ayktkXu1LX+MkvDeSqnvQ6tyntS2iUsp5+JelHr5COV8fkXtRxKZeTDl3F6H0iL8aUrf27OpTShfTYT/Wsr5+RFK/W1L6dJQtxfww8x8AGkcmrJ0qYPbJEkDq7pdXE5pnNg2M+/scZEAiIhzKD8sntX+o2MFr9uH0tf3WR2M7h91Ue5Y9HtgVmZe0+vySJ2wZVOSNKjM/BPlMv9UytyRa/e2RI/fjWg3ypRRQ040K2dSWjDf2+1yNeTDwPdNNDWe2bIpSRoXIuI5lD6ub6dcbt50rLS0ShqcA4QkSePFy4BvU25isJ+JpjQ+2LIpSZKkxtiy2YG5c+f2AVtT7tE72J1FJEmSxoKVgKcDV82aNat/tHdustmZrVk2/YokSdJ40H6b31FhstmZ/wXYbLPNWGWVVRrbybx585g5c2Zj2x8vrIfCeiish8J6KKyHwnoorIeivR4effRRbr75Zqjyl9FmstmZxQCrrLIKfX19K4odkaa3P15YD4X1UFgPhfVQWA+F9VBYD8Ug9dCTrn/OsylJkqTGmGxKkiSpMSabkiRJaozJpiRJkhpjsilJkqTGmGxKkiSpMSabkiRJaozJpiRJkhpjsilJkqTGeAehMWz1tdblrvse7nUxumJq38qs8bTmbu0pSZLGJpPNMWzx0ilcdNWCXhejK3baeiOTTUmSJiEvo0uSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGrNyr3YcEXsC/wrMAtYBbgW+DHw1M5dUMacCbxvg5Xtm5vfbtnc4cBCwIXADcERmXtQWswZwArAHsCpwCXBIZt7etQOTJEnS43rZsnkY0A98AHgt8CPgi8Bn2uL+CGzb9u/iekCVaB4PnAzsCtwCnB8RL2rb1veA3YBDgL2BGcBFETG1WwclSZKkZXrWsgm8LjPvrj2/JCJWBw6OiCMzs79a/khmXjHYRiKiDzgSODEz51TLLgWuBz4K7FUtezElEd01My+oll1PaVHdDzilmwcnSZKkHrZstiWaLddQLm+vM4xNbQesBZxZ2/Zi4GxgdkRMqRbvAjwAXFiLWwBcVq2TJElSl/WyZXMgOwD3AX+pLds4Iu4HngbMAz6dmWfV1m9ePd7Utq0bgNWBZwB3VnHzW/1B2+J27krpJUmS9ARjJtmMiK2A/YFjqpZJKC2dV1ESwrWAA4AzI2K1zDy1ipkG9GfmI22bXFQ9rkNJNqcB9w+w60UMryX1cfPmzevkZUO22prrsXDhwkb3MVruvXcqd942UGP20MydO7eLpRm/rIfCeiish8J6KKyHwnooxlI9jIlkMyI2BH4AXEltgFBmfqEt9McRcTFwDHDqqBVwEDNnzqSvr6+x7d94ywJmzJjR2PZH0/Tp67LBpht19Nq5c+cya9asLpdo/LEeCuuhsB4K66GwHgrroWivh/7+/sYbyJan5/NsRsRawE+Bh4HdMvOxFbzkHGCjiFiver4I6IuIVdviplWP99Xi1h5ge9NqMZIkSeqiniabVYJ4LrA+8JrMvLeDzbT6am7etnwL4CHgT7W4qA0YqsfN72C/kiRJWoGeJZsRsTJlxPgLgdmZeccQXjOFMpXRHbXR7JdTRpnvXYtbqYq7MDOXVosvoLRs7lyLeyawfbVOkiRJXdbLPpsnA68DPghMjYhtautupFzePo0yEfsfKIniAcCOwFtbgZnZHxHHAcdHxN3A1VXcxsCba3G/i4jzgW9GxGHAg8CxwALGQP9PSZKkiaiXyWarhfGzA6x7OXAdpcXySMpl9scoieRumXlePTgz50QEwKHABpTR67tm5rVt290HmEOZwL2PcrvKPTPz4W4ckCRJkp6oZ8lmZj57CGGvH8b25lASyeXFPAS8s/onSZKkhvV8NLokSZImLpNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNWblXu04IvYE/hWYBawD3Ap8GfhqZi6pxc0GPglsAfwJODEzvzTA9g4HDgI2BG4AjsjMi9pi1gBOAPYAVgUuAQ7JzNu7fXySJEnqbcvmYUA/8AHgtcCPgC8Cn2kFRMS2wLnANcBs4NvAiRHxrvqGqkTzeOBkYFfgFuD8iHhR2z6/B+wGHALsDcwALoqIqV0+NkmSJNHDlk3gdZl5d+35JRGxOnBwRByZmf3AUcDVmfmOWsxGwMcj4muZuSQi+oAjKS2ecwAi4lLgeuCjwF7VshdTEtFdM/OCatn1lBbV/YBTGj5eSZKkSadnLZttiWbLNZTL2+tUSeQrgLPaYs6gXCrfsnq+HbAWcGZt24uBs4HZETGlWrwL8ABwYS1uAXBZtU6SJEldNtYGCO0A3Af8BdgYWAW4sS3mhurxedXj5tXjTQPErQ48oxY3v94ftBb3PCRJktR1vbyM/gQRsRWwP3BMZi6OiGnVqvvbQhdVj+tUj9OA/sx8ZDlxd1Zx7dtqxa0zwPIVmjdvXicvG7LV1lyPhQsXNrqP0XLvvVO587aBGrOHZu7cuV0szfhlPRTWQ2E9FNZDYT0U1kMxluphTCSbEbEh8APgSmoDhMa6mTNn0tfX19j2b7xlATNmzGhs+6Np+vR12WDTjTp67dy5c5k1a1aXSzT+WA+F9VBYD4X1UFgPhfVQtNdDf39/4w1ky9Pzy+gRsRbwU+BhYLfMfKxa1WqZXLvtJa0Wz/tqcX0RseoQ4tq31Yq7b4DlkiRJGqGeJptVgngusD7wmsy8t7b6VuBRlvXJbNmiepxfPbb6ag4U9xBlbs5WXNQGDNXj5iNJkqSu61myGRErU0aMvxCYnZl31NdXUx9dTDV1Uc0+wJ+Bq6vnl1NGme9d2/ZK1esuzMyl1eILKC2bO9finglsX62TJElSl/Wyz+bJwOuADwJTI2Kb2robM/NB4Fjg1xHxdeB04CXAgcBBrVHlmdkfEccBx0fE3ZQk9ADKaPY3tzaYmb+LiPOBb0bEYUBr+wuAUxs9UkmSpEmql8lmq4XxswOseznwq8z8bUS8nnJ3oH2BhcD7MvMr9eDMnBMRAIcCG1CmM9o1M69t2+4+wBzKBO59lNtV7pmZD3fnkCRJklTXs2QzM589xLgLGMJl7uruQXNWEPMQ8M7qnyRJkhrW89HokiRJmrhMNiVJktQYk01JkiQ1xmRTkiRJjTHZlCRJUmNMNiVJktQYk01JkiQ1ZtjJZkTsPMD9xSVJkqQn6aRl86fAnRFxQkS8qNsFkiRJ0sTRSbK5O3AZcBBwdURcFxGHR8SMrpZMkiRJ496wk83MPDcz96Lcg/xA4G7g08AdEfHziHhLREztcjklSZI0DnU8QCgzH8rMb2XmTsCzgI8A6wOnAXdFxHciYqculVOSJEnjULdGo68EPBXoA6YAjwCvBH4REddExMwu7UeSJEnjyMqdvjAi1gL2At4CvAT4O3A+8KHqcQmwG/B54NvA1iMtrCRJksaXYSebEbE7JcHcBVgVuAp4D/C9zLyvLfxHEbEucMoIyylJkqRxqJOWzf8H/An4AnBaZs5fQfx1wOkd7EeSJEnjXCfJ5quBizJz6VCCM/NK4MoO9iNJkqRxbtjJZmb+somCSJIkaeLp5HaVn4+IW5az/uaIOGFkxZIkSdJE0MnUR7sCZy1n/VnA6zorjiRJkiaSTpLNZwK3L2f9HVWMJEmSJrlOks0HgecsZ/1zKZO6S5IkaZLrJNm8GHhnRGzUviIing28s4qRJEnSJNfJ1EdHAbOBeRHxbeCGavlMYD9gMfCxrpROkiRJ41onUx/dEhEvAU4GDmlbfSlwSGZmNwonSZKk8a2je6Nn5g3AjtWtKJ9bLb41M+/tWskkSZI07nWUbLZk5j3APV0qiyRJkiaYjpLNiFgJ2JnSqjkNmNIWsjQzPzHCskmSJGmcG3ayGRFbAT8A/oEnJ5ktSwGTTUmSpEmuk5bNU4DVgN2B32Tm/d0skCRJkiaOTpLNFwIfzczzul0YSZIkTSydTOp+J4NfPpckSZIe10my+WngwIhYs9uFkSRJ0sTSyWX0dYC/AX+IiO8D/0O5a1Dd0sw8YaSFkyRJ0vjWSbL56dr/3zVIzFLAZFOSJGmS6yTZfE63dh4RmwCHA9tQ7q0+PzNntsWcCrxtgJfvmZnfb4s9HDgI2JByz/YjMvOitpg1KInwHsCqwCWUW2ze3oVDkiRJUk0n90a/o4v7fz6wK/A7Sv/RwfqQ/hH417ZlN9efVInm8cBHgKuBA4HzI+LFmXltLfR7wJaU+7o/CBwLXBQRL8jMh0d2OJIkSarr+HaVEbEpsCOwPnB6Zt4eEatQWhX/nJmPDmEz52Xmj6vtnQpsNUjcI5l5xXLK0gccCZyYmXOqZZcC1wMfBfaqlr2YktzumpkXVMuuB24F9qPMISpJkqQuGfZo9Ih4SkR8DZgPfJXSMvjcavUqlATvkKFsKzOXDHf/g9gOWAs4s7btxcDZwOyIaE3VtAvwAHBhLW4BcFm1TpIkSV3UydRHHwHeDnwM2JbanJuZ+VfKrSz/pSulW2bjiLg/Ih6LiGsiYu+29ZtXjze1Lb8BWB14Ri1u/gBJ7g3A87paYkmSJHV0GX1/4FuZeXxETB9g/fXAa0dWrCe4BriKkhCuBRwAnBkRq2XmqVXMNKA/Mx9pe+2i6nEdymT004D7B9jHoipmWObNmzfclwzLamuux8KFCxvdx2i5996p3Hnb3R2/fu7cuV0szfhlPRTWQ2E9FNZDYT0U1kMxluqhk2TzH4Arl7P+EWCNzorzZJn5hbZFP46Ii4FjgFO7tZ9OzJw5k76+vsa2f+MtC5gxY0Zj2x9N06evywabbtTRa+fOncusWbO6XKLxx3oorIfCeiish8J6KKyHor0e+vv7G28gW55OLqP/GXjWctbPAro5Yn0g5wAbRcR61fNFQF9ErNoWN616vK8Wt/YA25tWi5EkSVKXdJJs/gD492o0estSgIiYDexLGZgzmlp9NTdvW74F8BDwp1pc1AYM1ePmN1c8SZKkyamTZPNoYAGlL+XplETzIxFxBfAT4FrgU90qYLsqUdwLuCMzW50AL6eMMt+7FrdSFXdhZi6tFl9AadncuRb3TGD7ap0kSZK6qJNJ3R+MiO2A9wN7Av9HSdZupSSiJ2Tm/w1lWxExlWVTDj0LWDMi9qieX1U9nkaZiP0PlETxAMr8nm+tlak/Io4Djo+IuymTuh8AbAy8uRb3u4g4H/hmRBzGskndF9Dj/p+SJEkTUUeTulfJ5PHVv5FYn9L/sq71fH/gXEqL5ZFV7GOURHK3zDyvrUxzIgLgUGADyuj1XdvuHgSwDzCHMoF7H+V2lXt69yBJkqTu6/gOQt1Q3Y+8vf9ku9cPY3tzKInk8mIeAt5Z/ZMkSVKDhp1sRsS3hhC2NDPf0UF5JEmSNIF00rL5CqrR5zUrAU+vHu8G/jbCckmSJGkC6GSA0LMHWh4RT6Vcmn4v8KoRlUqSJEkTQidTHw0oMx/LzJOAnwMndWu7kiRJGr+6lmzWXAu8tIHtSpIkaZxpItl8FeA0QpIkSepoNPpRg6xam9KiuSXw6RGUSZIkSRNEJ6PRjx5k+SLKXYTeBXy90wJJkiRp4uhkNHoTl94lSZI0AZk4SpIkqTGd9NncqJMdZeaCTl4nSZKk8auTPpu38+Q7CA3FSh28RpIkSeNYJ8nmAcChwDOBM4Cbq+UB7AMsAL4ILOlGASVJkjR+dZJsPh3oAzbJzEX1FRHxceAyYMPM/FQXyidJkqRxrJMBQu8CvtaeaAJk5r2UaY/+faQFkyRJ0vjXSbI5HVh9OeufVsVIkiRpkusk2bwCeE9EzGpfERFbAe8BfjfSgkmSJGn866TP5sHAr4ArI+Iq4JZq+abA1sB9wCFdKZ0mjCVLlnLXfQ939NrV1lyv49c2YWrfyqzxtFV6XQxJksaFTu4gdGNEvAD4EDAb2KNadQfwBeCzmfnn7hVRE0H/Y4u5/LqFHb124cKFzJgxdpLNnbbeyGRTkqQh6qRlk8y8C3hf9U+SJEkaUEfJZktEbAqsD8zLzAe6UyRJkiRNFB3dGz0i3hwRC4D5wK+BWdXydSPi5ojYq4tllCRJ0jg17GQzIt4IfBe4CfgAMKW1LjPvqZbv260CSpIkafzqpGXzo8AvM3Nn4LQB1v8OeNGISiVJkqQJoZNkc3Pgh8tZ/xdgvc6KI0mSpImkk2Tzbyz/DkIbA/d0VhxJkiRNJJ0kmxcD+0XEkyYajIgZwIHAz0ZaMEmSJI1/nfbZfDrwe+DdwFJgl4j4NHA9sAQ4pmsllCRJ0rg17GQzM28BXgL8GTiaMhr9/cAHgf8Gts/MBd0roiRJksarYU3qHhErAc8A7srMV0fENGATStL6x8y8u4EySpIkaZwa7h2EngLcChwBfC4zFwFXdb1UkiRJmhCGdRk9Mx8DFlL6aUqSJEnL1ckAoW9TRqOv2u3CSJIkaWIZ7mV0gJuBlYD5EXEa8EfgkfagzDx7hGWTJEnSONdJsvnd2v8/NkjMUmCFyWZEbAIcDmwDzATmZ+bMAeJmA58EtgD+BJyYmV8aIO5w4CBgQ+AG4IjMvKgtZg3gBGAPYFXgEuCQzLx9ReWVJEnS8AzpMnpEfDEiZlVPX179ex3wytrz+r9XDHH/zwd2Bf4A3DjIvrcFzgWuAWZTLuOfGBHvaos7HDgeOLna5i3A+RHRfp/27wG7AYcAewMzgIsiYuoQyyxJkqQhGmrL5sHAFcDczLw0IqZT7oH+qsy8dAT7Py8zfwwQEacCWw0QcxRwdWa+o3p+SURsBHw8Ir6WmUsiog84ktLiOafa3qWUSeY/CuxVLXsxJRHdNTMvqJZdTxlhvx9wygiORZIkSW06GSDUMmWkO8/MJctbXyWRrwDOalt1BuVS+ZbV8+2AtYAza9teTLmUPzsiWmXdBXgAuLAWtwC4rFonSZKkLhpJsjkaNgZW4cmX2G+oHp9XPW5ePd40QNzqlInoW3HzB0hyb6htS5IkSV3SyQCh0TStery/bfmi6nGdWlx/ZraPiq/H3VnFtW+rFbfOAMuXa968ecN9ybCstuZ6LFy4sNF9jJb+WGdExzKW6uHee6dy5229uVnW3Llze7LfscZ6KKyHwnoorIfCeijGUj0MJ9l8bkT8c/X/tarH50XEXwcKzswrR1SycWDmzJn09fU1tv0bb1nAjBkzGtv+aOrrW7XjY1m4cOGYqofp09dlg003GvX9zp07l1mzZq04cIKzHgrrobAeCuuhsB6K9nro7+9vvIFseYaTbB5T/at70vRDlL6cSylzcY5Uq2Vy7bblrRbP+2pxfRGxamb+3wriBsoSptViJEmS1CVDTTb3b7QUg7sVeJTS1/LC2vItqsf51WOrr+bmlCmS6nEPUebmbMW9KiKmZObStrj5SJIkqauGlGxm5mlNF2SQ/fZHxMWUqYs+X1u1D/Bn4Orq+eWUUeZ7UyWbEbFS9boLa4nlBZSplHamSl4j4pnA9sB7Gj0YSZKkSainA4SqidRbUw49C1gzIvaonl+VmXcAxwK/joivA6cDLwEOBA5qjSqvktLjgOMj4m5KEnoAZTT7m1v7y8zfRcT5wDcj4jDgwWr7C4BTGz1YSZKkSajXo9HXB85pW9Z6vj9wamb+NiJeT7k70L7AQuB9mfmV+osyc05EABwKbECZzmjXzLy2bfv7AHMoE7j3UW5XuWdmPty1o5IkSRLQ42Szuh/5CieHr+72c8EQ4uZQEsnlxTwEvLP6J0mSpAaN9UndJUmSNI6ZbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqzMq9LsCKRMR+wLcHWHVyZh5ci5sNfBLYAvgTcGJmfmmA7R0OHARsCNwAHJGZFzVQdE1QS5Ys5a77Hh71/a625npd3e/UvpVZ42mrdG17kiQNZMwnmzWvAR6oPf9z6z8RsS1wLvAd4DDgJcCJEfFYZn6lFnc4cDzwEeBq4EDg/Ih4cWZe2/whaCLof2wxl1+3cNT3u3DhQmbM6F6yudPWG5lsSpIaN56SzbmZec8g644Crs7Md1TPL4mIjYCPR8TXMnNJRPQBR1JaPOcARMSlwPXAR4G9Gi6/JEnSpDPu+2xWSeQrgLPaVp1BuVS+ZfV8O2At4MxWQGYuBs4GZkfElOZLK0mSNLmMp5bNeRGxHrAAOBX4ZGb+HdgYWAW4sS3+hurxecDvgc2r5zcNELc68Azgzu4XW5IkafIaD8nm/wIfB64EFgOzgY8BzwH2A6ZVcfe3vW5R9bhO9TgN6M/MR5YTN6xkc968ecMJH7bV1lyPhQtHv29gE/pjnREdy1iqh5Eey0h0c7/33juVO2+7u2vbG01z587tdRHGBOuhsB4K66GwHoqxVA9jPtnMzJ8BP6st+kVEPAAcHRGf6FGxAJg5cyZ9fX2Nbf/GWxYwY8aMxrY/mvr6Vu34WMrAmLFTDyM5lpHodj1Mn74uG2y6Ude2N1rmzp3LrFmzel2MnrMeCuuhsB4K66For4f+/v7GG8iWZ7z22Ty7etySZS2Ta7fFtFo876seFwF9EbHqCuIkSZLUJeM12ay7FXiUZX0yW7aoHudXj62+mgPFPUSZm1OSJEldNF6TzTcBSynTIfUDF/PkqYv2oczFeXX1/HLKPJ17twIiYqXqdRdm5tKmCy1JkjTZjPk+mxHxM0oyOQ9YQhkg9G7gm5n5xyrsWODXEfF14HTKpO4HAgdl5hKAzOyPiOOA4yPibkoSegBlNPubR/GQJEmSJo0xn2xSLn+/HfgHSnlvAY4ATmwFZOZvI+L1lLsD7QssBN5Xv3tQFTcnIgAOBTagTHu0q3cPkiRJasaYTzYz873Ae4cQdwFwwRDi5gBzRlwwSZIkrdB47bMpSZKkccBkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUmJV7XQBJvbFkyVLuuu/hXhdj2FZbc70nlXtq38qs8bRVelQiSdLymGxKk1T/Y4u5/LqFvS7GsC1cuJAZM56YbO609UYmm5I0RnkZXZIkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGO+NLmncW7JkKXfd9/CKA8eBqX0re593SROKyaakca//scVcft3CXhejK3baeiOTTUkTipfRJUmS1BhbNiVpDBlql4DV1lxvTHcdsDuApBaTTUkaQ4baJWDhwoXMmDF2k027A0hqmXTJZkRsCnwJ2B54BDgTOCIzx+63tiRJ0jg1qZLNiFgbuAS4A9gDWB/4HLAe8KbelUySJGlimlTJJvBOYBrwj5l5D0BE/B04PSI+kZk39LR0kjRBjNZ0VKPRd9X+p9LITLZkcxfgolaiWfkB8C1gNmCyKUldMFrTUY1G31X7n0ojM9mSzc0pieXjMrM/Im4FnjeM7awE8Oijj3axaE+2ZMliVn7Kkkb3MVoW//2xjo9l1adOGVP1MJJjGYlu10OvjmOkBqqH8XosAxnqsYy1z0W70XpPRqMe/v7Yo/T3r9ToPrqhv7+/10UYE6yHol4PtXylJyfylKVLl/Zivz0REY8BH8vMT7ct/y/gL5n5L0PZzty5c7cHftNAESVJkpqyw6xZs/5rtHc62Vo2u+UqYAfgf4HFPS6LJEnS8qwEPJ2Sv4y6yZZsLgLWHmD5NGD+UDcya9asfmDUfxlIkiR16NZe7Xiy3a7yJkq/zcdFRB+wMcNINiVJkjQ0ky3ZvADYKSKm15a9Aeir1kmSJKmLJtsAobWBecDtwCdYNqn7RZnppO6SJEldNqlaNjPzfuAVwF+B/wd8HjgLeHsPiyVJkjRhTaqWTUmSJI2uSdWyKUmSpNFlsilJkqTGmGxKkiSpMZNtUvcxLyI2Bb4EbA88ApwJHJGZD/e0YCsQEXsC/wrMAtahTB77ZeCrmbmkFjcb+CSwBfAn4MTM/NIA2zscOAjYELiBUgcXtcWsAZwA7AGsClwCHJKZt7fF9aROI2J1yvytzwC2zszf19btC3wEeDalro7NzLPaXv9U4FjgbZSbEVwFvCcz/7stbkPgC8BrgKXAT4D3ZuY9bXH/TJl9YRZwH/CNar+N3QUrIt4KvJfyfj8MXA3s0yrbZDgfImJ3ynu9OfA34DLgQ5l5S1vchDknImIT4HBgG2AmMD8zZw4QN2bf/6GWbST1EBErAYcBu1b7WRm4Hjim/fgmcj0MED8LuBJ4JDNXb1vXk8/AUD6fKzKMz8WqwIeAtwL/ANwDXJCZB7bFjZvzwZbNMaSamukSYA3KiXEYsA/wrR4Wa6gOA/qBDwCvBX4EfBH4TCsgIrYFzgWuAWYD3wZOjIh31TdUfYCOB06mfAnfApwfES9q2+f3gN2AQ4C9gRnARRExtbatteldnR7NAD/oImIP4DTgh5R6+CXwvepDXPd5yhfJx4HXA49Sjm9GbVsrAxcCLwD2BQ4AtgPOjYgptbjnVvu5j/L+HE95rz7ZheMcUER8lPKD4/9RjvMdlC/Evmr9hD8fImInyvHPB/6lKtvzgF9GxJq1uIl2Tjyf8l79AbhxoICx/P4PtWxDsKJ6WI2SwPw3sD/wJsof8F9ExGvbyjSR66G+z6dQvjfuHiRk1D8Dw/h8rshQPhdPofz93Lcqz6uBD1Jm0anHjavzwZbNseWdlFtn/mOt5efvwOkR8YnMvKGnpVu+12Vm/cvhkqpl7+CIODIz+4GjgKsz8x21mI2Aj0fE1zJzSXVHpyMpv5bmAETEpZRf+x8F9qqWvZjyAds1My+oll1P+cW5H3BKtY+e1GlEzATeBbwf+Grb6k8A52Tmh6vnl0TE5sAxwE+r1z+jev2hmfn1atkVwG2UlsIPVq99I/AiYGbrWCJiIaX1bDbLblbwAeB+YM/qvbgoItYCjoqIz2bmfd07eoiIoCTbb8jMn9RW/aj2/8lwPuwD3AG8LTOXVvu7A/gd8BKq95uJd06cl5k/rvZ9KrDVADFj+f1fYdm6VA+PAM/JzEWtBRHxc2Azyh/8n1TLJno91B0IrEVJdA6tr+jhZ2CFn88u1sP+wLbAFpn5p9ry02v1MO7OB1s2x5ZdKBPM15v5f0BpMRzuL6hR1ZZotlxDabZfp/pwvIIyr2ndGZRLAFtWz7ejfNGcWdv2YuBsYHbtV+kuwAOUX6+tuAWUL5NdatvvVZ2eDJwE3FxfGBHPobRsndkWfwawdUSsVz1/NbAStfrKzIcof3zaj+/6epKUmZdTEpz2uB9VX6j1fbbel27bH7ijLdF83CQ6H54KPNRKNCv3V49TYGKeEytKQsby+z+Msq3QiuohMxfXE81q2VJKS+eM2uIJXQ8tEbEupbXuPZQWy3aj/hkYxudzhYZYDwdSEts/LSdm3J0PJptjy+a0Na1XH4JbKSf7eLMD5fLEXyj3n1+FJ186aH0ZtI6vde/6mwaIW53S/7EVN3+AD+8NPLGuRr1Oo/RT3AQ4boDVreMbrB6iFndXZt47QNxm1aWWVtxAl2Mer4eIeBqwUXtc1WfnYZqph22A6yLiyIj4c0Q8FhFXRsTLqvWT5Xw4Fdg8Ig6JiLUj4tnAHMrxtPpWTZZzom4sv/9DLVsjqvdxO554zJOlHj4D/FdmXjjI+l58Bob6+RyxKP1RtwRuj4jTIuKvEfG3iPhR1YLYMu7OB5PNsWUay1o96hZRBt2MGxGxFaV16/PVL65p1ar720Jbv+pbxzcN6M/MR4YQ176tVly9rka1TqtLMCcAH8zMvw4QMpx6aI9pxT2V8oWyorjWttYeZJ/tcd20IfAqyjlwKPA64EHgwirhmhTnQ2ZeQumr+clqH7cBzwFeVWtNmSznRN1Yfv+HWramHEJJYP6jtmzC10PVH3Af4H3LCevFZ2A062E65TiOoHyHvpHS1/1FwAVR+qK2yjSuzgeTTXVdlFGAP6CMJvzMCsInmuOAWzLz9BVGTmxPoXzxvzEzz65aKnajJJwf6GnJRlFEbAd8B/gm5RLUnsASykCF1XpZNo09Vcv/Z4E5mfmbXpdntEQZlX8K8LnM/GOvy9NDrZzsr8DumfmzzDyT8r3xfOANPSvZCJlsji2LWPZrq24a5XL0mFe17P2Uchlit8x8rFrV+vWzdttLWr+W7qvF9UWZ+mFFce3basXV62rU6jQink/pvP6x6pLp2iz7pb16lCkohlMP7TGtuMdYNjJxKMd3/yD7bI/rpkXAvVmbjiTLFBpXUKb8mPDnQ+WLwCWZ+b7MvCQzv0/psP9PlGlNWmVigHJNtHOibiy//0MtW1dFxAuBH1MG0R3Rtnqi18OBwNOBU2rfnatCGSld+2HWi8/AaNbD/ZRpmi6rt1pmmTbvQcp3Z6tM4+p8MNkcW25iWV8M4PFOuRtTpk4Z06oT/1xgfeA1bf1qbqV0+N687WVbVI+t42v1QRko7iHKtCCtuKh1hK7H1etqNOt0U8oMD5dQPpiLgPOqdZcAv2H5xweQ1eNNwPoR0X5pYgvg5lofnCcdXy1uPkBm/g1Y0B4XEc8CptLMubW8Ud2rMjnOh9b+/7u+IDPvpMybt3GtTLSXi4l3TtSN5fd/qGXrmojYGPgZZR7at7YNKIOJXw/PAzagHEfru/MI4GnV/z9VK/dofwaG+vkcseoH+e2DrF5KlYCvoExj8nww2RxbLgB2iojptWVvoIyMu2Dgl4wNVV+Ss4EXArMz8476+qp/2sVUUzLU7AP8mfIlC3A5ZfTc3rVtr1S97sLal/AFlF9YO9finkmZkLZeV6NZp/8FvLztX6v/0buAAzLzNsqHcu+21+4DXFUb1f9zyuXWx+srylRSr+PJx/eCahqOVtw2lImH2+N2j4hV2vbZz7KBKt30E2B6RDw+QrHqkL8tMHeSnA9QRr/Oqi+o/pitS/VHZRKdE48by+//MMrWFVW3o59X2949MwcahT3R6+EknvzdeRrwf9X/T6riRv0zMIzPZ7f8BNi+3s0mysTzawFzq0Xj7nyYsnRp+w8o9Up16WAe5Y/QJygthJ+jTEnwpt6VbMUi4qvAv1HmOWvva3RjZj5YdQD/NWWE7umUeQaPBQ7KzK/UttWarPbDlBP4AEpH6Rdn5rW1uJ9QLkceRrnEcCylSf8F1S/EntdpROxIadV8/A5CUe62dBbl1/ovKBMTv4cyF9pPa689iXKp9TBK0nI4ZV62F2TmwipmZeD3lE7lH6a0rJ4A3AW8JJfN7fhcSgvbxZS7REQV96XM/FADx/0U4LfAepR53x6qjmNrylxuf5gM50NEHEyp75Mol0inU+bHWw94fqv1f6KdE1EmjG5NrXIQpYXk/dXzqzLzjrH8/g+1bCOtB8pMHb+tlr+F8h49LjOvmAz10N44Ub3maODwfPIdhEb9MzDUz2c36qFKBq+lvMefpySLx1Peyy1bXdPG2/lgy+YYkpn3UwYR/JVy15HPU07wt/ewWEPV+uX0WcqXZ/3flgCZ+VvKh3RryiWjA4D3tZ+kWSap/QhlFPNPKZdYdq1/gCr7UH4FngKcQ/l19cqs3V5rLNZpZp5DGaW9B6UedgbePMCX1vsod9I4jtI9YTXK8S2sbevvlNuxzQO+S7mjwxWU/rJLa3F/BF5JSXLOpyQ8/0FJBLuuupy1K+WLqfX+AOyYmX+oYibD+XAyZcLkHSh98U6k3D3k5fVuJhPwnFif8h6cA+wIPLP2/OXV/sfs+z/UsnWhHjagjDRenXJ+tH93TpZ6GI5R/wwM4/O5IkP5XPxP9f8p1fKTKA04r8xlYyDG3flgy6YkSZIaY8umJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqzP8HVvNfDeBqWn8AAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df1 = df[df[\"name\"].isin([\"IssueQuery\"])]\n",
+    "df1['delta'] = df1['ts'].diff()\n",
+    "ax = df1['dur'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('IssueQuery duration (usec)');\n",
+    "plt.show()\n",
+    "ax = df1['delta'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('Time between IssueQuery (usec)');\n",
+    "\n",
+    "# df1['delta'].describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# for SingleStream\n",
+    "if False:\n",
+    "    df1 = df[df[\"name\"].isin([\"QuerySamplesComplete\"])]\n",
+    "    ax = df1['dur'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "    ax.set_title('Inference time (usec)');\n",
+    "    plt.show()\n",
+    "    ax = df1['dur'].plot(figsize=figsize)\n",
+    "    ax.set(ylim=(0, 100))\n",
+    "    ax.set_title('Individual inference time (usec)');"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoMAAAFtCAYAAAB8yGDhAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAABA20lEQVR4nO3deZwcZbX/8U8WMkmAkIWwBAkgwgGM4DXyExQF5SKyCFxlEQQFRUAQRETZd81FiQsoKKAsV5TVK8tlEWVfJQ4IJJAjW4hhIIQkLJIwZJnfH+fppNL0bD29Tdf3/Xrl1emq6uqnz9TUnH7qeU4N6OjoQERERETyaWC9GyAiIiIi9aNkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERybHC9GyAi+WFmw4GfArsDawHnuvvRdW1UAzKz9YEXgIPc/bL6tmZFZvZd4AhgY3dfXO/2dMXMdgGuBjZw9zn1bo9Io1LPoIj0mJkdaGYdZrZVmbs4BjgEuBg4APhdxRrXD5nZ4WZ2YL3b0VNmtipwAnBOoyeCAO5+M/Ac0WYR6YR6BkWklrYDHnf3U+vdkAZxOPAacFnR8heBYcCiWjeoG18j2vU/9W5IL1wInGNmp7v7m/VujEgjUjIoIrW0BjCvUjszswHAUHdfWKl9NgJ37wDeqXc7SvgacIu7v13vhvTCdcB5wN7Ab+rcFpGGpGRQRPrEzC4DvgRsCJwP/CewELgcOM7dl5jZdsBdmdcU7oO5gbvPMLMW4Hhgf2A80Vt2DXCSuy8oet2FaV8nA0Zcdr7MzFYDTgP2JMYjzgIuBSa5+5L0+vWJsXgnAHPTe74PeAI43N2nFH22jYEzgO2BEcC/gJuz4xzNbG3gLGBXYBTwPHCeu/+qm7jNANYriseL7r5+qTGDZnZ6+nybAScBnyd6Di9Kz8cR8f8MEf/J7n5O0Xv2KM6dtHcDYPP0Htnl72lrZl0HcIa7n56erwKcDnwxtfdNYBpwqrvfm3ndlkTcPwEMAVqBU9z9rqL9r532twswFngZ+AtwjLu/BeDur5rZE8B/oWRQpCSNGRSRShgI3EYkWMcC9wDfJRI1gKeJMYKzgOnp/wcAc1Lv3p+A7wM3A0cSCcrhwPVpfdaniITkj8BRwHQzG0YkiAcCVwDfAu4kEoULS7R3n/R+FxJJ5frA/5rZSoUNzOyDwCPAjsAl6b2uJZKwwjZrAA8DnwMuAL4NTAUuMLOTu4nZ0SXicXQ3rwG4kvgifzzwEJHYfpdIgl4BjgOeAX5sZp/JtLW3cS728fT49x60sTO/Su/7p/S+PwLmAFtk2rktcB8wGjgzfZ4W4Pb0paKw3VrEz+crxLFwJHG5/f8BY4retxXYugefUSSX1DMoIpWwEnCtu5+Znv/azB4Fvg78yt1nA1eY2fHAa+5+ReGFZrYfkUx92t3vySz/O5HY7QDcnnmvTYCPuPs/MtuemFk+PS2+yMxeAH5gZue4u2f2sS6wkbvPT6934AYi8fu/tM35xDnyQ+7+Qua9Tsrs5wdEovKhzGzVX5vZxcCJZvZLd3+9VMDc/Xoz+0FxPHqg1d2/ntpyETAD+DHRc/bDtPxKoI24rHtnet2+9C7OxTZJj8/3oq3FdgUudvdjSq1MydqFwP3ADulyOWb2a+AxYBLLk9Kzid7Fj7v73zK7Ob1E0vc80Wu7NhEXEclQMigilXJx0fP7iN6u7uwN/BOYZmarZ5bfA3QAn2bFJOXBbCKY2cf9wGtF+/grkbBtB2STwT8WEsFMWwHeD2BmY4FtgfOziSAsG89XSFz2JHq5Oore93bgYOBjwJ87++BlWnapM12C/ztxqfu3meWvpwT3/ZnX9TbOxcYAS4E3+tD2N4CPmdk67v5SifVbEJf+fwyMMbPsur8AR6byRO8Ql31vLUoEgeU/o4zCz3p1lAyKvIeSQRGphEXu/nLRsvlEb0x3NiYSgM7qwK1R9Py5TvaxRS/2MTP7xN3np8Sj0N5CEjW1k/1BjFEbRfS+fa2H71sJM4uev0HE/5USy9fMPO9tnEsZkP4VJ1s99T1iLOlMM3uMGFrwu0yv7cbp8belXpyMAd4lxnB29fPJKvQUlttukaamZFBEKmFpH147EHiKGG9XSnFPTqmZwwOJy6H/3ck+ii9tLulku96MKSuMub6SGFNYyrRe7K+nSrW9s/hnP09v41zstbS/1Vje0wadJFhmNqh4mbtfa2b3EUXHP0uMw/y+mR3o7n9geUyPJ8b5lTIntaE3Ckn+a718nUguKBkUkXp7DpgI3FHi8l5v9rGqu/+1gm0CmNDFNnOAt4DBfXjfWvZU9TXOT6fHDVgxGSz8f2TR9uuV2knqwbwQuNDMRhITcM4A/sDyuL/VVUzN7F1iJnJXP5+sQpuLe09FBM0mFpH6u5q4nPnN4hVm1pLuetGTfWxpZjuX2MeqqaRKj7n7a8RYugNTSZXs/gakbZYQNez2MLMtiveRxh125216dim9Evoa5wfS40ezC1Mh59eIWd5Zhxe9x6BU/if72teJsjQj06JW4FngmFLtKcTU3ZcSYzV3MrOPldiuuId3IvBwH75siDQ19QyKSL1dQUzEOD+VFbmfuBxpxKSHvYC7u9nHOUTJlxvM7HIiqRhG9BztBXyImHXbG0emtrSa2YXEpebxRE3FjdI2xxOTUx5KM4inEcndh4kJDkO7eY+/A4eb2WnE5I5/u/tNvWxnT/Upzu4+08z+Qcw6vqho9W+A483sN8Rn+hTLx/8VrAq8ZGZ/BB4nevY+Qcxw/mV6j6Vm9nViLOFTZnYJUX5nHDGhZwAx0QWipM4OwN3p5/MUkex+gYj9DFhW/mdz4Nfdh0gkn5QMikhdpQTgC0SNva8S48kWEsnXBURB6O72sTDVoDuBSGwOIC7hPkMUhO715UF3fzLdg/ks4FAiufwXcFNmm1dTz9QpwB5Er9s84pLqd3vwNmcSZW6OISZEvJjdfyVVIs7E2MizzWzloruQnElMqNmTiP+twE7Aq5ltFhDlenYAdiPKEb1A1KU8N9POe1PcTyF6F0cQP78prDiT+uUU+7OIsjkjiXGPt7Pi2MAvEhNOru7B5xPJpQEdHeo1FxGR7qVLt88Tdwzp8g4rjSL1Zt6dvWuMiKxIYwZFRKRH0i3ezga+Z2YNf2XJzHYBPkAUqxaRTqhnUERERCTH1DMoIiIikmNKBkVERERyrOHHfDSi1tbWFmBL4GU6v5OBiIiISCMYBKwNTJk4cWJ78Uolg+XZkuU3thcRERHpDz5J1BhdgZLB8rwMsPHGGzNkyJA+7Wjq1KlMmNDTOyo1L8VBMQDFABQDUAxAMQDFACoXg3fffZd//vOfkPKXYkoGy7MEYMiQIbS09OouVyVVYh/NQHFQDEAxAMUAFANQDEAxgIrHoOTQNk0gEREREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmm29E1sLfefpcF7Yvr3YyKGN4ymFVX7tt9nEVERKTylAw2sAXti7ljysx6N6Mitt9yvJJBERGRBqTLxCIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHJMyaCIiIhIjikZFBEREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4Nruebm9kHgGOBrYAJwHR3n1C0zWXAV0u8fC93v65o22OBI4C1gGnAce5+R9E2qwLnAHsCQ4G7gCPdfUYFPpKIiIhIv1LvnsEPArsAzwJPdbHd88DWRf/uzG6QEsFJwPlpn88AN5vZFkX7uhLYDTgS2AcYB9xhZsP7+mFERERE+pu69gwCN7n7DbCsB/CjnWy30N0f7mwnZtYCnAz83N0np2X3AE8CJwF7p2UfIxLFXdz9lrTsSeA54EDggr5/JBEREZH+o649g+6+tEK7+jiwGnBVZt9LgGuAncxsQFq8M/AGcFtmu5nAA2mdiIiISK7Uu2ewpzY0s9eBlYGpwNnufnVm/abp8emi100DVgHWAWal7aaXSEKnATtWutEiIiIija7eYwZ74jFikskexKSPWcBVZnZgZptRQLu7Lyx67fz0ODqz3esl3mN+ZhsRERGR3Gj4nkF3P7do0Q1mdidwBnBZ7Vu03NSpUyuyn9bW1pLLh40YS1tbW0Xeo97mzh3OrBfmdLlNZ3HIE8VAMQDFABQDUAxAMYDaxKDhk8FOXAtcYGZj3X0O0bPXYmZD3f2dzHaj0uO89DgfGF9if6My2/TYhAkTaGlp6e3LVtDa2srEiRNLrps9bwHjxi3o0/4bxZgxq7PmRqVCH7qKQ14oBooBKAagGIBiAIoBVC4G7e3tXXZg9YfLxD1RGCu4adHyzYC3gJcy21lmQkl2u+nVa56IiIhIY+p3yWBK5PYGXky9ggAPErOE98lsNyhtd5u7d6TFtwAjyUwWMbN1gW3SOhEREZFcqfcdSIazvKTLesAIM9szPZ+SHi8nCkU/SyRyBwPbAQcU9uPu7Wb2A2CSmc0BHk3bbQjsl9nub2Z2M/BbM/su8CZwJjCTOo8/FBEREamHeo8ZXIMY/5dVeH4QcCPR43dy2nYRkejt5u43ZV/k7pPNDOAoYE2iXMwu7v540f73BSYTBaZbiNvR7eXuzTE4T0RERKQX6poMpvsBF4/fK7Z7L/Y3mUj0utrmLeDQ9E9EREQk1/rdmEERERERqRwlgyIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHJMyaCIiIhIjikZFBEREckxJYMiIiIiOaZkUERERCTH6npvYpH+6K2332VB++KK73fYiLHMnreg4vvtzPCWway68pCavZ+IiDQmJYMivbSgfTF3TJlZ8f22tbUxblztksHttxyvZFBERHSZWERERCTPlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTLOJpSaWLu3osmxKrcuq9MWixUvq3QQREZGKUTIoNdG+aAkPPtHW6fpal1Xpi49vPq7eTRAREakYXSYWERERyTElgyIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHKsrnUGzewDwLHAVsAEYLq7T8isHwR8F9gF2Ixo75PAGe5+R9G+ZgDrlXibse7+Wma7VYFzgD2BocBdwJHuPqNiH0xERESkn6h3z+AHiUTvWeCpEuuHAScC/wAOAr4EvAT8xcx2LbH9dcDWRf9eL9rmSmA34EhgH2AccIeZDe/bRxERERHpf+p9B5Kb3P0GADO7DPho0fqFwAbuPr+wwMxuBzYmegz/r2j72e7+cGdvZmYfI5LPXdz9lrTsSeA54EDggr58GBEREZH+pq49g+6+tJv1S7KJYFrWQfQUlnNPsJ2BN4DbMvubCTyQ1omIiIjkSr17BnvNzAYCHweeLrH6y2Z2MLAEuB84wd0fzazflBiXWJyETgN2rEZ7RURERBpZv0sGibF+BhxStPxG4G/ATGIiyQnAfWa2pbsXxiOO4r1jCAHmA6N725CpU6f29iUltba2llw+bMRY2traKvIe9dZuo7v9LP3ls/bks5SrljGYO3c4s16YU7P366nOfh/yRDFQDEAxAMUAahODfpUMmtm2wI+Bye5+X3adux+VeXqfmd0KTAeOB75SjfZMmDCBlpaWPu2jtbWViRMnllw3e94Cxo1b0Kf9N4qWlqGMG9f5lf22trYu1zeS7j5LuWodgzFjVmfNjcbX7P16oqvfh7xQDBQDUAxAMYDKxaC9vb3LDqx6zybuMTPbHLgBuB44rrvt3X0ucCeQjeJ8YGSJzUcB8/rcSBEREZF+pl8kg2a2IfBn4FHggDSJpBxPx+5sQNHyzYheRBEREZFcafhk0MzWAm4HXgH2cPd3e/i61YHtgSmZxbcQPYM7ZrZbF9gmrRMRERHJlXrfgWQ4y0u6rAeMMLM90/MpwKtEGZg1gGOAzcxs2esLNQXNbF9gV+BWoij1+sSl5Bbg7Mz2fzOzm4Hfmtl3gTeBM4lJJ5dV4zOKiIiINLJ6TyBZA7i2aFnh+UHA3cAW6fn1JV5fuNz7AlF38KfE+L83gHuAPd29+PLvvsBkosB0C3E7ur3cvTlmaoiIiIj0Ql2TwXQ/4OLxe8W6W1/oIfx0D9/zLeDQ9E9EREQk1xp+zKCIiIiIVI+SQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmmZFBEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiO9ToZNLMdzWxANRojIiIiIrVVTs/grcAsMzvHzLaodINEREREpHbKSQb3AB4AjgAeNbMnzOxYMxtX0ZaJiIiISNX1Ohl09xvdfW9gTeAbwBzgbOBFM7vdzPY3s+EVbqeIiIiIVEHZE0jc/S13v8TdtwfWA04E1gAuB2ab2f+Y2fYVaqeIiIiIVEGlZhMPAlYCWoABwELgP4G/mNljZjahQu8jIiIiIhU0uNwXmtlqwN7A/sAngMXAzcDx6XEpsBvwM+BSYMu+NlZEREREKqvXyaCZ7UEkgDsDQ4EpwLeBK919XtHm15vZ6sAFfWyniIiIiFRBOT2D/wu8BJwLXO7u07vZ/gng92W8j4iIiIhUWTnJ4GeBO9y9oycbu/sjwCNlvI+IiIiIVFmvk0F3/2s1GiIiIiIitVfO7eh+ZmbPdLH+n2Z2Tt+aJSIiIiK1UE5pmV2Aq7tYfzXw+fKaIyIiIiK1VM6YwXWBGV2sfzFt0y0z+wBwLLAVMAGY7u7vqUloZjsBPwQ2Iyav/Nzdf1Fiu2OJ2+StBUwDjnP3O4q2WRU4B9iTmA19F3Cku3f1mURERESaUjk9g28CG3Sx/v1E0eme+CDR0/gs8FSpDcxsa+BG4DFgJ6Jm4c/N7LCi7Y4FJgHnp30+A9xsZlsU7fJKov7hkcA+wDjgDt1CT0RERPKonGTwTuBQMxtfvMLM1gcOTdv0xE3uvq677wk82sk2pwKPuvvX3f0ud/8B8FvgNDMbmN63BTiZ6DGc7O53ErUQnwdOyrTvY0SieLC7X+nuNwP/BYwHDuxhm0VERESaRjnJ4KnE5eWpZnaumR2S/p1H1BQcCJzSkx25+9Ku1qck7zO8d4ziH4hLwR9Jzz8OrAZcldn3EuAaYCczG5AW7wy8AdyW2W4m8EBaJyIiIpIrvU4G3f0Z4vZzjxKXWn+d/n0LaAU+6e5eofZtCAzhvZeQp6XHTdLjpunx6RLbrQKsk9lueokkdFpmXyIiIiK5Uda9id19GrBdutXc+9Pi59x9bsVaFkalx9eLls9Pj6Mz27W7e/FYxex2s9J2xfsqbDe6xPIuTZ06tbcvKam1tbXk8mEjxtLW1laR96i3dhvd7WfpL5+1J5+lXLWMwdy5w5n1wpyavV9Pdfb7kCeKgWIAigEoBlCbGJSVDBa4+2vAaxVqS78zYcIEWlpa+rSP1tZWJk6cWHLd7HkLGDduQZ/23yhaWoYybty4Tte3tbV1ub6RdPdZylXrGIwZszprbvSeob911dXvQ14oBooBKAagGEDlYtDe3t5lB1ZZyaCZDQJ2JHoFRwEDijbpcPezytl3kULP3sii5YUew3mZ7VrMbKi7v9PNdqX++o3KbCMiIiKSG71OBs3so8Afgffx3iSwoAOoRDL4HPAuMdbvtszyzdLj9PRYGCu4KVGCJrvdW0RtwsJ2O5jZgKJ7K2+W2ZeIiIhIbpTTM3gBMAzYA7jP3V+vZIOy3L3dzO4E9gZ+llm1L/AKy8vRPEjMEt6HlAym3su9gdsyid8txGzoHUnJpZmtC2wDfLtan0NERESkUZWTDG4OnOTuN/X1zVOh50JJl/WAEWa2Z3o+xd1fBM4E7jWzi4HfEzOZvwEcUZgVnJLGHwCTzGwOkSQeTMxG3q/wfu7+NzO7GfitmX2XKKB9JjATuKyvn0dERESkvyknGZxF55eHe2sN4NqiZYXnBwGXuftDZrY7cXeRrwBtwHfc/dfZF7n7ZDMDOApYkygXs4u7P160/32ByUQPZwtxO7q93L05ZmqIiIiI9EI5yeDZwPfM7CJ3f7Mvb57uB9xtYunutxCXeLvbbjKR6HW1zVvEXVIO7VkrRURERJpXOcngaOBt4Fkzuw74F7CkaJsOdz+nr40TERERkeoqt2ew4LBOtukAlAyKiIiINLhyksENKt4KEREREamLXieDaYaviIiIiDSBsm9HZ2YbAdsRM4J/7+4zzGwIsBbwiru/W5kmioiIiEi1lHMHkoHAr4GvEzOBO4CHgBnAEOBJonbfTyrWShERERGpioFlvOZE4GvAKcDWZErDuPu/iVvVfaEirRMRERGRqionGTwIuMTdJwHPllj/JLBRn1olIiIiIjVRTjL4PuCRLtYvBFYtrzkiIiIiUkvlJIOvEPcR7sxEQDOORURERPqBcpLBPwLfTLOJCzoAzGwn4v7B11SgbSIiIiJSZeUkg6cDM4HHgN8TieCJZvYw8H/A48B/V6qBIiIiIlI9vU4G3f1N4OPAJGBN4B1gG2AVIlH8lLsvrGAbRURERKRKyio67e7vEMngpMo2R0RERERqqZzLxCIiIiLSJMq5A8klPdisw92/XkZ7RERERKSGyrlM/BnS7OGMQcDa6XEO8HYf2yUiIiIiNdDrZNDd1y+13MxWAg4FjgZ26FOrRERERKQmKjZm0N0XufsvgduBX1ZqvyIiIiJSPdWYQPI48Kkq7FdEREREKqwayeAOwIIq7FdEREREKqyc2cSndrJqJNEj+BHg7D60SURERERqpJzZxKd3snw+8BxwGHBxuQ0SERERkdopZzaxClWLiIiINAkldiIiIiI5Vs6YwfHlvJG7zyzndSIiIiJSPeWMGZzBe+9A0hODyngNZnY3sG0nq09w97PN7HTgtBLrv+fuk4v29xXgRGB9Yozjme5+dTltExEREenvykkGDwaOAtYF/gD8My03YF9gJnAesLQSDQQOB0YULTsgLb8ls2whcau8rBezT8xsT+ByYrbz7cAewJVm9qa731qh9oqIiIj0G+Ukg2sDLcAH3H1+doWZnQY8AKzl7v9dgfbh7k8VLzOz84An3f2JzOKl7v5wN7s7C7jW3U9Iz+8ys02BMwAlgyIiIpI75UwgOQy4qDgRBHD3uURZmW/2tWGdMbONgC2BK3r5ug2ATYCrilb9AdjSzMZWpoUiIiIi/Uc5PYNjgFW6WL9y2qZa9icuQf+haPkwM3sVGA08C/zC3c/PrN80PRb3NE5LjwbMqXBbRURERBpaOT2DDwPfNrOJxSvM7KPAt4G/9bVhXfgycI+7z8osexY4jhizuBvwEPDLNLGkYFR6fL1of4UeztEVb6mIiIhIgyunZ/BbwN3AI2Y2BXgmLS9cvp0HHFmR1hUxs62ADYFJ2eXuXnzJ+BYzAzjOzM5x97er0Z6pU6dWZD+tra0llw8bMZa2traKvEe9tdvobj9Lf/msPfks5aplDObOHc6sFxqvM7yz34c8UQwUA1AMQDGA2sSgnDuQPGVmHwKOB3YC9kyrXgTOBX7s7q9Urokr2B94B7iuB9teAxwIbAZMYXkP4Egg275Cj+G83jZmwoQJtLS09PZlK2htbWXixPd0sgIwe94Cxo1b0Kf9N4qWlqGMGzeu0/VtbW1drm8k3X2WctU6BmPGrM6aG5VVNrRquvp9yAvFQDEAxQAUA6hcDNrb27vswCqnZxB3nw18J/2rCTMbDOwD3OTub5axi6fT46bA9MzyzdKj96F5IiIiIv1Sn25HZ2YbmdknzGy1SjWoCzsCq9PzWcRfImoPTgNw9xeIJHCfou32Baa4e+NdLxMRERGpsrJ6Bs1sP6Jw8zpp0Q7AnWa2OvAgcLK7X1OZJi6zPzCXEvUAzayVKCbtwBAi4ftyakf2OuupwNVm9hzwF2B34LPALhVuq4iIiEi/0OueQTP7ItE79zTwPWBAYZ27v5aWf6VSDUzvuQoxS/gad19UYpNngaOB64mxgpsAX3P3H2Y3cvdrgYOIcY5/Jnob99PdR0RERCSvyukZPAn4q7vvaGZjgMlF6/9GhYtOu/u/ifqFna0vvvTb1b4uJ3oRRURERHKvnDGDmwJ/6mL9q4Du5iEiIiLSD5STDL5N13cg2RB4rbzmiIiIiEgtlZMM3gkcaGZDileY2TjgG8R4PBERERFpcOUkgycBawN/Bw4HOoCdzexs4EnivsFnVKyFIiIiIlI1vU4G3f0Z4BPEXTxOJ2YTHwN8H/gHsI27z6xcE0VERESkWno1m9jMBhG1BWe7+2fNbBTwASKpfF6Fm0VERET6l96WlhkIPAccB/zU3ecT9/0VERERkX6oV5eJU8HnNmKcoIiIiIj0c+VMILmUmE08tNKNEREREZHaKucOJP8EBgHTzexy4HlgYfFGVbg3sYiIiIhUWDnJ4BWZ/5/SyTYdxD2CRURERKSB9SgZNLPzgMvdvRX4dFq8CtEjuKRKbRMRERGRKutpz+C3gIeBVne/x8zGEPcg3sHd76la60RERESkqsqZQFIwoGKtEBEREZG66EsyKCIiIiL9nJJBERERkRzrzWzi95vZ/0v/Xy09bmJm/y61sbs/0qeWiYiIiEjV9SYZPCP9y/pFie0GEKVlBpXbKBERERGpjZ4mgwdVtRUiIiIiUhc9Sgbd/fJqN0REREREak8TSERERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmmZFBEREQkx5QMioiIiORYb4pO14WZHQhcWmLV+e7+rcx2OwE/BDYDXgJ+7u7vKYptZscCRwBrAdOA49z9jio0XURERKTh9aeewc8BW2f+TS6sMLOtgRuBx4CdiOTx52Z2WHYHKRGcBJwP7AI8A9xsZlvU4gOIiIiINJqG7xnMaHX31zpZdyrwqLt/PT2/y8zGA6eZ2UXuvtTMWoCTiR7DyQBmdg/wJHASsHeV2y8iIiLScPpTz2BJKcn7DHB10ao/EJeCP5KefxxYDbiqsIG7LwGuAXYyswHVb62IiIhIY+lPPYNTzWwsMBO4DPihuy8GNgSGAE8VbT8tPW4C/B3YND1/usR2qwDrALMq32wRERGRxtUfksGXgdOAR4AlxJjAU4ANgAOBUWm714teNz89jk6Po4B2d1/YxXZKBkVERCRXGj4ZdPc/A3/OLPqLmb0BnG5mZ9WpWQBMnTq1IvtpbW0tuXzYiLG0tbVV5D3qrd1Gd/tZ+stn7clnKVctYzB37nBmvTCnZu/XU539PuSJYqAYgGIAigHUJgYNnwx24hrgdGI8YOFy8MiibQo9hvPS43ygxcyGuvs7XWzXYxMmTKClpaW3L1tBa2srEydOLLlu9rwFjBu3oE/7bxQtLUMZN25cp+vb2tq6XN9Iuvss5ap1DMaMWZ01Nxpfs/fria5+H/JCMVAMQDEAxQAqF4P29vYuO7D6/QQS4DngXZaPCSzYLD1OT4+FsYKltnuLqE0oIiIikiv9NRn8EtBBlJtpB+7kvaVh9gVeAR5Nzx8E3gD2KWxgZoPS625z945qN1pERESk0TT8ZWIz+zOR7E0FlhITSA4Hfuvuz6fNzgTuNbOLgd8DnwC+ARzh7ksB3L3dzH4ATDKzOUSSeDAxG3m/Gn4kERERkYbR8MkgcXn3a8D7iPY+AxwH/Lywgbs/ZGa7E3cX+QrQBnzH3X+d3ZG7TzYzgKOANYnxhru4++PV/xgiIiIijafhk0F3Pxo4ugfb3QLc0oPtJpO5lZ2IiIhInvXXMYMiIiIiUgEN3zMoItWxdGkHs+c1VumiYSPGltWm4S2DWXXlIVVokYhI81MyKJJT7YuW8OATjVXoO2ot9j4Z3H7L8UoGRUTKpMvEIiIiIjmmZFBEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiOKRkUERERyTElgyIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHJMyaCIiIhIjikZFBEREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJscH1bkB3zGwv4MvARGA08BzwK+BCd1+atrkM+GqJl+/l7tcV7e9Y4AhgLWAacJy731G1DyAiIiLSwPpDz+B3gXbge8CuwPXAecCPirZ7Hti66N+d2Q1SIjgJOB/YBXgGuNnMtqhe80VEREQaV8P3DAKfd/c5med3mdkqwLfM7GR3b0/LF7r7w53txMxagJOBn7v75LTsHuBJ4CRg7+o0X0RERKRxNXzPYFEiWPAYMJS4bNxTHwdWA67K7HsJcA2wk5kN6Es7RURERPqj/tAzWMongXnAq5llG5rZ68DKwFTgbHe/OrN+0/T4dNG+pgGrAOsAs6rSWhEREZEG1e+SQTP7KHAQcEbq2YPoKZxCJHarAQcDV5nZMHe/LG0zCmh394VFu5yfHkfTy2Rw6tSpvf8AJbS2tpZcPmzEWNra2iryHvXWbqO7/Sz95bP25LOUq5YxqObn6Ity2jR37nBmvVDqIkL/1Nk5IU8UA8UAFAOoTQz6VTJoZmsBfwQeITOBxN3PLdr0BjO7EzgDuKxa7ZkwYQItLS192kdraysTJ04suW72vAWMG7egT/tvFC0tQxk3blyn69va2rpc30i6+yzlqnUMqvU5+qLcGIwZszprbjS+Ci2qva7OCXmhGCgGoBhA5WLQ3t7eZQdWw48ZLDCz1YBbgQXAbu6+qJuXXAuMN7Ox6fl8oMXMhhZtNyo9zqtYY0VERET6iX6RDKYE7kZgDeBz7j63jN0UxgpuWrR8M+At4KXyWygiIiLSPzV8Mmhmg4kZv5sDO7n7iz14zQCiVMyLmdnIDwJvAPtkthuUtrvN3Tsq3XYRERGRRtcfxgyeD3we+D4w3My2yqx7irjMezlwJfAsMJKYQLIdcEBhQ3dvN7MfAJPMbA7waNpuQ2C/qn8KERERkQbUH5LBHdPjj0us+zTwBNHjdzJxGXkRkejt5u43ZTd298lmBnAUsCYx+3gXd3+8Ok0XERERaWwNnwy6+/o92Gz3XuxvMjC57AaJiIiINJGGHzMoIiIiItWjZFBEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiOKRkUERERyTElgyIiIiI5pmRQREREJMeUDIqIiIjkWMPfm1hEpDtLl3Ywe96CejejIlZZbfV6N0FEckbJoIj0e+2LlvDgE231bkZFfPj9w+vdBBHJGV0mFhEREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmmZFBEREQkx5QMioiIiOSYkkERERGRHBtc7wbUmpltBPwC2AZYCFwFHOfuC+raMBEREZE6yFUyaGYjgbuAF4E9gTWAnwJjgS/Vr2UiImHo0GHMntf/v5sObxnMqisPqXczRKQHcpUMAocCo4APu/trAGa2GPi9mZ3l7tPq2joRyb1FSzq4Y8rMejejz7bfcrySQZF+Im9jBncG7igkgskfgXZgp/o0SURERKR+8tYzuClwSXaBu7eb2XPAJr3YzyCAd999tyKNam9vL7l88aJ3GTxwaUXeo96WLF7U5WcZutKAfvNZu/ss5ap1DKr1Ofqi3Bg04mcp19Ili5vis7z7bjsvzS7vHNmyymhemv16ZRvUB8OGDGbl4SvV/H07+9uQJ4pBZWKQyVcGlVo/oKOjo89v0l+Y2SLgFHc/u2j5/cCr7v6FnuyntbV1G+C+KjRRREREpFo+OXHixPuLF+atZ7BSpgCfBF4GltS5LSIiIiJdGQSsTeQv75G3ZHA+MLLE8lHA9J7uZOLEie3AezJrERERkQb1XGcr8jaB5Gli3OAyZtYCbEgvkkERERGRZpG3ZPAWYHszG5NZ9l9AS1onIiIikit5m0AyEpgKzADOYnnR6TvcXUWnRUREJHdy1TPo7q8DnwH+Dfwv8DPgauBrdWyWiIiISN3kqmdQRERERFaUq55BEREREVmRkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiOKRlsEGY2oOgxdz8bxeC9FIOgOCgGoBiAYgCKAVQ+BrkPaKNw945UFHukma3s7ksBzGxQfVtWO4oBmNkQM1vXzDY2s5GZGAyod9tqLfuZC3EoXt7sFAPFoFheY6DjoLoxUJ3BBmBmewN7AbsALwLPAH919/My2wzM/vCbjWIAZnYgsC+wA/Av4A3g/4AfuPuCtE1TxyDLzFYHNgG2BZyIyTPuPi+tH+DuTX0CUwwUAwAzGwdMBHYG/gnMAh5095fS+qY/L+g4qG4MlAzWmZltADwG3AHcBGwBbAZsTtwp5SR3v6Z+Law+xWBZDJ4AriHujmPAB4E9gAHAKe5+ftq26U96AGZ2K/EHcAEwHngZeJBIkK9y9/Y6Nq8mFAPFAMDM7gPeD8wF3kecF9uAG4BzM18Wm/bcoOOgujFQMlhnZnYpsDqwT+YXeh1gO2A/YGvgT0Qy0FavdlaTYgBmdgGwAfAFd1+Ylo0EPgR8FTgQuAc41N2frVMza8bMziJ6io8A/pEWHwHsDqwK3A9c4O5/r0sDa0AxUAwAzOwMYB/gQHd/2MyGAfsDOwH/AcwAfuTut9WvldWl46D6MVAyWEdpLNylwEjgi8BSoCMzTmwjIhHYB7jJ3Y+pU1OrRjFYNt7jXOIb3w7uviB72cfMxgK7AUcTl4gOKCTNzcjMhhKJ71/c/eSidesC3wb2JGJxuLs/22w9IoqBYgAxhhi4GZjq7t8pWjcW+BKRGA4krqDc3oQx0HFQgxgoGawzMzsJOAyY6O6vpmUrufui9P9BwPeAScBe7v7HujW2ShQDMLOvAj8Gdnb31rSsOAZfBi4DvunuF9arrbVgZtcBS9197/R8JWBJJkHeAbgCeA7YtTBmppkoBooBgJn9BtjQ3T+dnhfH4GPARcBQYDt3f7luja0SHQfVj4FmE9ff5cT4j/vN7FMA7r7IzAaa2RB3X+LuZwN/Bz5az4ZWkWIQYz6eA243s8/DshgMSknhEnf/H+DPwEfq2dAauRf4opkdYmaD3H2Ruy9NJ0Dc/S/A54jB1J+pZ0OrSDFQDADuBLY1s9PTFYPiGPwN+E9gFHF1pRnpOKhyDJQM1lH6xZ4FHAq8Bvwq/cKv4+5L3f3dwnbAC8AH6tjcqlAMgrvPJWYSPwhcZmaXmNmGKQks9A4OAF4F1qtjU2vlUuBq4vLH0WY2HpZ/SUjbzCAS6C3q0sLqUwwUA9z9D8CPiOEy55nZh9PywpfFgcSEgieATaw5S63k/jigyjFQMlhHhe5dd78XOAmYDnwFuMrMTjCzUWkixReJwcJX1a2xVaIYLOfuLwLfJ37pPwHcbWbnmtkmZvYfwEHE7OLf1K+V1WOZYuPu/hZwKnFimwSca2ZfMLO1fXkJjSHEwOnX69HealAMFIOsTGL3S+LqwR5EDI4xsw+kL4tLgRHAOkBbs4yV03FQ2xhozGCNmdmqwJZEnaABwAvufmla10LMGv08MUtsTaInaBFwu7t/ox5trjTFYNlM4c8S3fmrAE8CV7j7S2l84GeJGOxE9ATOBd4CbnT3o+vR5lpIcRkILEonP8zsIOC/gcHAQ0R9rZeBXYH13X2D+rS2OhQDxcBi4siawHBgji+vI7cH8YVxHWAe8eV5FnEuHenuG9elwVWS9+MAahcDJYM1ZmbXELNGVyH+wI8n/sj/DPiFuy80szWJy6FrpvW3AS96KjnS3ykGYGY3ABsTyXAb8GFgEPA/wPnuPt3MVgFGEyf+dYCHiT8MTVdPy0oXHb/H3X+S2eY44FPE8TAO+D3wB3d/uPYtrjzFQDGATovP3wqc4ctLbx1EnEM3B9YCrgSud/fH6tHmStNxUPsYKBmsITPbnZgs8WV3v9nM3kckBHsRl0afB45Il0yLX9sUU+UVAzCzXYlf2t3c/R4zWw0YC+wNHAu8DZzo7r8r8dqmiEGWdV10/G3gdHe/Im27CpFAd7j7v+vT4spTDBQD6Lb4/EAiIfx52naou79jmaoDzUDHQX1ioGSwhszsZ8Qlvz19xfsKDiUum55GFFo+AZgMDHT3JXVoatUoBmBmpxDf5nYpTJDJrFuX6P7fD/ipux9bhybWlPWs6PiNRNHxf1nMpGu2Y0IxUAywnhWfvw84zN29SWOg46AOMdAEkhrIDAKeSSQB66XlgwHc/R13vw84BLiEqLk3oZkOcMVgBf8CPk1841+Bu/+LmFl9KvBlM9upxm2rqTQ+chDQASybHenuL7n774lC2xcQE2q+A9Bsx4RioBjAsnPkYmIySHbiwOvp3HgCcW5YHZhkZsObMAY6DuoUAyWDNZC5rHcv8ct+RFq+2MwGpx8+7v48cDwxI+g7pfbVXykGK/gr8BRwnJmtBSsky7j728BPgFeAb1pzlooAlp3EnJgsNKowO9KW1856hugt/i1RTqHp6qgpBooBLDtHthJjpTdNy7IxmEMUnT8H+C/ggPq0tHp0HNQvBkoGa8jjzhJnAUeZ2S1pSvhid1+S+UHPA64FVre4B2VTyXsM0pi/WcAPiZlf15vZR4vHAaZLRFcTZQJG1L6lNaWi44oBKAag4vOg4wDqEAONGawyMxvs7ouLlu0PnA6sDfwUOMuXF1dehRg8vMDd96xxc2smxeBMYrZw08egMNi7aNnWxGf/f0TtwJ8Ar7j7m2a2OjGzeJG7717zBtdIuvyxNJ3wzgZWI74IXOzuL2W3I2ZMDnT3verT2spLvb4DUgy2IW5JOAK4jpzEAHQcZFkUE/4lcRnwBuCH7v5cZv0AoodwTXf/XF0aWSV5Pw7qeT5QMlhlFvfdvdbd/5lZthIxAHTf9K8D+BPwDjF77EPAVumSab9nUTtwbWCEuz+Rlg0gkqCvETdbX0pzx+BnxHHwYGbZAGAjoqD2YcAawN3EZfS1iJh90t1fqHmD68DMPg18i7g88hJwC/Brotbax4nLIgd5k9yb2sxGe9H9Q81sW+AootRQG00eg1JyeBwY8LxnZgSb2QeJIvO7ECW4/hf4FTCMiMvPgK+7+3W1b3Ft5PA4qOv5QMlgFZnZN4ALiZuMv2BFZUHMbCwxXXxbYE+i1t6TRNJwRz3aXGlmth1x+5ztiGEJc0k1sYjxMS1EvaztiaSoGWNwMPELbO7+XInjYDjRQ7odkRi/QVwq+l93n1KHJleVqeg4FneUuRfYvPjcYDGz/mvAzsQfgbVozhjoOIhbyz0ATPSoLZo9DoYQyeAONHHxeR0HjXE+UDJYRWY2B/g5cHYaEzeA6PIdD8zwVE08s/1q7v5G7VtaPWb2EnGj9b8RB/A2xIG9gPim+xuPGbSF7ZsxBnOAc4H/TsfBYOKm8psD09z9laLtm6puWDFT0XHM7AHiXtz7FA8fyGyzFrAhzRsDHQdxHMwBvpQ9DrLDiyxKy6xGFBVuuuLzOg4a43wwuBI7kfcys58Ss0Ev8eXTvo8Gvk5k9kMsagkdX/hhFpKg4p6j/srMDgPaiQLKhYTvGjM7mZgRdwrwSTM7uHA5uAlj8FPiNkG/zRwHpwH7E7/UQ83sWuBUd/e0fvF799QcLIqOf5bSRcdPAw4ws0LR8dlFr22KWdUWdxb4MHGZpz0tW5vo/RhJ9I5c6e4ziXNI9rXNEgMdB8uPg0+w/DjYmJgpvE4aE/Zzd3+WuNfsi5nXNksMdBw0yPlAPYNVkC7/zgYmufvJadmpRDJY6CV7H/Bl4lvhbh7TxZuKmR0AnAx8xN3fTgfusiLSaQLFJcT4h8+5+9P1a23lpUkgrwLnuvt30rIziLI61wH3EOMCv0l889/Z3f9ep+bWhKnoOGY2E/i9u5+Qnm9L9BxvDrwJLCTuMvAjd7/YmrOoro4Ds1nA7zLHwWeIqyXrE+PDWog7E51BTCJpuj/WOg4a53yg0jLVsQ3xLW4/MzsvjQv5NlFIeF93P4c4yA8nuv73rldDq+xZ4hf9MDMb5u4d6TJpoYjmQ0Q19QVEdf1mY8AjwDfM7EYz+xjwDaJH9Ah3vxL4BdFL2A4cXLeWVpmp6DgAZvZN4ovgSma2WVp8EfACMWZqDaK+5kzgRDPbuJlioOMgmNmXiXP/+y1mjQKcDzwO/D933wDYB7gKOIY4NpqGjoPQSOcDJYNV4O5/Iga93kp0gT9AFBm+pjAWzKOq/DXEfSi3slR0uclMAX4HfBfYM02UwFMRzfT/x4iBs582s6app2dmK7v7A8RxcDox1uMh4u4jNxR+od19kbv/jYjVZmY2rFkuf2S5io4X3E3MCNwfmJyGCKxE/I7c7+7vuvtVxHCStYk7TjQNHQfLPETcReKDwM/M7G6iJ/B4YgIdKRk6iojTV+rTzOrQcbDM3TTI+UDJYJW4+z3EN7pTiUKidxBdvsu+FaXZYs8TYzebJhksfMNJA6CPJu6leSnwSzPbxFJx6YwnieLKzfSt7ydm9ll3n050+R9CXOa4iRj/kz0OBgOzCi9sxstBBZ7zouPu/rS770oMDVgD+AIxZOAld+/I9Iy8QMy2X7lJvxzk9jhI46Gfd/dvAUcCM4irCNcTdUaXZpKh+cTVhSElzpv9Xp6PA2is84HGDFaQxRT5CcAniVmiN6fl44CR7v5Uujy6NC1fj0gSL3f3s+rV7kpKk0Z2JC77/jvzWQ8jLo8OIwqm3k3ccmdz4EdEr+nxdWhyxZnZocTYn+eJYQFT0vKRxO2FXig6DjYgblH3O3c/vT6tri3LWdHxzqTfi8Xu/pui5WOI5OAxdz+qHm2rNFMB/kJB6TW9qGSUme1DjIe7smj5KOI4cHc/pGYNrTGdD0I9zwdKBiskfZO7lKgF1AGMIS4Pf8lXrBo+KH3r2ZAYB7Gvu7+vHm2utPQt5m3gJHefXGL9eOKy6beIntAWooTAXe6+Xw2bWjUpBnOJX9ytidlfX/WiwtGZ42ATotfwi+6+Xq3bWwumouPAshP6Ku7+YtHyAakXYCWPW04NIeqOXkbUpmyKouOmAvyY2WPEbNnt3f3h7GSAEsdBC3EcXEpzHQc6H9B45wMlgxViZucRs57OIsYB/gfR43Wfu+9ftO0goqL8x4DD3P36mja2SszsQmAr4kT3Wlq2CZEgrwrcBTzh7q+b2W7AfKLA8j+9k9pK/U0mBtsSJSIuJi5xHFCiV2Qg0SO4CXCou99U4+ZWnanoOGa2BTFR6gBiqMhjwMleogB52v7bxBem6939ezVvcBWYCvBjZl8lErvZxFWRA9z9X6V6TNP2pxBfnq9z9+Nq2tgq0fmgcc8HSgYrwKI20hTgSE+3B0o9RMcAk4BPufuDtmJV8S2A8c2SAJjZB4DpxCXie9JA4G8RA6DXJWYMjyQGyx6W7S1tFma2EXGS/0IhwU9/BM8l/gic6O5vFF0iHgP8h7v/tU7NripT0XHM7ElgHvAoMUxie+AfwFe8qGCsma0LfI8oSv6VZhk/airAj5nNI+41+wJwBXAjUV/vPV+EzWw0kTSNAw5pouNA54MGPR9oAkllfJYoJfMMLOvmXUyMfZhB3FKITCL4PqKCfFMkgsmxxPE0OiWCo4kK8jcR3/Y3I7r/twJusKim3mwuAm4mZpEXXAdcTcwG+yJAJhEc5O5zmzgRzBYd/6W7X5PGu4wjLgGdAlxuZu8vvMYzRcfr0eZKs6gruRT4hketycOBnxDHwr5pm2WTx9IfwnOAo5ooAeisAP8DxNWCl8zs3OzkgCY8Dn5K9IL9xt2vJnp6diQmmg1L2yz7e5wmTZwNfLuJjgOdDxr4fKBksDL+TdxKZwZE0pd6fxYTycCyQa8W5VNuIsaJNYV0EruR+FxXm9mfiG++twI/cPdHgFfd/Vric38I+Ei92lsNaRzM08SJbtltotx9vrsfRCSEF5rZ18xsYPrC0Eyzp0t5m7h/5jyIE3pKgN9090OJOy+sDdxlZptmX9gMfwDT7/qOwP8Qk4lw96Xu/mvij9+X07JlY8bS8395zCLt99Ll36OJckovp2WnEn/4pxNDaS4lYvFo6l1fpkmOg3WJ0iknFYbPEJdGryBmkR4Iy78kFrj7QndfUMOmVpvOBw18PlAyWAEe9QJ3TZcACz/Awi/2bcAHzGyr9HwvorbUebVvaXWkA/oWYgzEgcAGRG/gLaQyKhlPEL0E69awiVXn7u3ufri7P5ldnvmWdw4x9uVE4IPNcHLrgbwXHV83/Vvsy+8zWzjn/gH4qJl9MLP9p8zsZmuu8hkqwB+fdRpRYqxw5ehNdz+MSIQnm9kh6UtiM/9N1vmggc8HzXzg1ZS7z0iPxX/kHwJeArYzszVYfmuhebVtYfW5+5tEkendiMvGj6Re0sLsqAHERJJBvDdJbEq+vLj0VOIP3WLgT2b2H3VtWG3ktug4gLtPI7703QHLkoDCl8T7ibqan0vrRhC9ZaOKxw31Zx4F+A8gxwX43f0bxK0mF6TnHZkk4GdED+nxwCbFvYNNppXoDc3z+eB8GvR8oGSwitIPu52YMfp54DhgkLufUd+WVU/6tjcTuMjdH02LC8fZCKLncEkaN5Mb6XLIs8SkonWJnpCmlOkdLxQdv5+cFR235UWDf5S+CKzwRdHdZwN/BnZNi3YnZlh+sbYtrR5bXjD3fuLuESeTvwL8QwDc/ZXs8kzyM5X4mS8E/s/ilpVNyaNu4NHAg+TsfFDg7pPcfWrqCW2o84FmE1dRpkfs88TU+QFEPbk/1bdltZe+BR5J/FE4xN1vrHOT6sbMjgGedPe/1Lst1ZKSoYGF3h8zO4T4pjucqJd1D01adLwgJUNLOhsSYGb/RfSUfAr4PXCTN0kpmYJMQli4LLYGMMbdn7YmL8BfkH4XlpY6DgoxMLPdiQlojwL7NdGY0c5uxPBN4CSW34Sgac8HRTF43N3/nJYvO/7T87qeD5QM1oDFnSceB55298/VuTl1YWYbE5cM72qWX/LeshI1pJpJ+qN3EnBxZrLASpmE8P1Ez/A3iS9GQ2m+ouOlYrCssHBmu4HEDeqnECU21nL3sbVubzV0dhwQiXH2j18zF+Dv0XFQ9JpvAke7u9WomVVlnd+IYW93fzn93L/K8qskzXo+KI7BvcSx/nJmu7qfD5QM1kg6Ga7s7q/Xuy31kn4xBntmtq00DzM7G/g+cZnnUuBcX15OqbjI8OeJcaNvAdO9eYqOdxWDUknhX4HPAHu5+x9r3d5q6CYGKxRYtuUF+LciCq9fX/sWV15vjoPMFaSViF7TV0rutJ+x0jdi+DFwr7/3RgyF88GbxK33muV80OObUaTt63Y+UDIoIn2WSojcS5SNmE9c7pkBTC4MCUjjw4Y065eBXsRgoC8vH7EJccvK0+vR5krrbQzS/ycQBfhvrk+rK6uc46DZWPc3YtjW3R8o/nLQTHoQg1I3o9iU6DU8tdbtHVzrNxSRpvQRYC1i9uyVwMHAHsAFZrYvMMmj7E67mQ0lak22AA800aXznsZgiUVdyg8Tl46aaYxcb2IwhEiUBhFlqJpFOTEYSnP9LpS8EYNF8e1DiMumD2TGkq5LJMcvdrbDfqi7GOwCPJhJBNcjekdPq0djNZtYRCphOjEQ/JpUKmQyMWHoSqLn5wYzOzuVTBhO1NXauYn++EHvYrAyMVh8jybrHepNDFZJy3fP8XFQiEGz/S709kYMN5KKbzeR3sbgemKoRF2OA10mFpGKMLMh7v5udtJIWv45opjsNsAbRN3NHYhB0k0xa7JAMVAMQDEAMLP13X1GifHC2xEzxz/h7g+b2deJ+xKv5U1Wf7c/xUDJoIhURVHpkBHAF4ib0m8DHOvuP61n+2pBMVAMQDHISkMkngEuAC4hyulc7E1cf7dYI8ZAyaCIVFWmhMhg4CpgC3ffqLvXNRPFQDEAxSAzc/oSwICHibqKa9e5aTXTqDHQBBIRqar0x28g8J9Ej8ie3byk6SgGigEoBhl/IsbIbU0T3XWnlxoqBuoZFJGaMLPxwP7uPqnebakXxUAxAMXAdCOGhouBkkERERGpKd2IobFioGRQREREJMdUZ1BEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiO/X/icxdMgxke1QAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnYAAAFKCAYAAACQBBKyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAACHR0lEQVR4nO2dd5wURdrHf7MLLDkHEUkCFiKY1hzOHFAx5/CennreneE8MZzhxHzmnD3jmdBTCZJEohIUlpyKuOSwwO4Cm0O/f1T3THdPdZye6ZnZ5/v56DLd1RW6u6qefup5noooigKCIAiCIAgi88kJuwIEQRAEQRBEMJBgRxAEQRAEkSWQYEcQBEEQBJElkGBHEARBEASRJZBgRxAEQRAEkSWQYEcQBEEQBJElkGBHEBkAY6wXY0xhjN2oO/YYY8xXvCLGWCFj7BMX6U5Vyz3VTzlJqM9UxtjUBMo5gzE2jzFWobarrd+8shnG2CeMscKw62GGMdaCMbaNMXZz2HVxA2NsDmPs+bDrQTQsSLAjiIBhjN2oCg3HhV0XIgZjrBWAbwHUA7gDwA0AykKtVIgwxgaoHwe9wq6LB/4OoAbAf8OuiEv+DeB2xth+YVeEaDg0CrsCBEH45ikAz/q8lkEIOJnG2QlcexiAdgCe5JyPDKg+mcwAAMMATAVQaDp3K9Lsw58x1hjA3QDe4ZxXh1wdt4wAsAfA7QD+FW5ViIYCCXYEkaFwzmsB1Pq8tirg6qSEBCf0zurf0iDqAoilQc551mn9OOc1YddBwgUAOgH4JuyKuIVzXs8Y+x+APzLGhnHOM/FjisgwSLAjiBSg2o9dDaAPgLcAnAmgAsCnAB7gnNfp0rYF8CqASwAoAEYCeEWS52MAhnHOI+rvHwEcCqAn51wxpf0ZQB/OeW/1dyGAqZzzG3VpDgDwJoCzIJYovwAwXlJu3LXq8akAwDk/Vf3dBMDDAM4D0BdAUwCLATzDOR8hv1P2SMroBWAdgAcB7ALwTwAHAFgE4G+c8zm6605Rs5nCGAOAT7U2MMaOBvA4gBMBNAFQAOBfnPMpurIfg9BwDQLwAIDzIYRE7Z6erbY3X73kVwD/5Jwv0OXxCdy/BxEAf4PQnjGIZzIfwBOc81906a4F8A8AAwFUAvgZwP2c83U29/FGAB+b7gcA3MQ5/0St56mc8166axQA76n5Pw7gQIj7fBvnfAFj7FYA9wPoDuA3Na+1pnId77MNFwPYyjlfasozrq7q8ceg6x/qsTMQe4Z5ALYCGMc5v0OXJg/iPboeQA8AOyGEyYc55+WmMq6G0CIOglgiXgLgBZNGeCLE0n8+gDku2kkQCZFWqnaCyHJyIASlXQDuBTANwFAAf9YSqJP5SAj7ry8APAJgf4iJ34mvISbV4/UHGWOdAZwKYLjVhYyxZgAmATgHQrh7GmLyTcTwuzWA2wDMgBB4Hoa4Bz8wxgYnkK+MqyCEivcg7lkvAN+ry3eAaM/r6r+fgbi/7wEAY+wUAL8AaA/gCQihLQ/ATxZOI8MhlnQfAfCamse1EM+2EkLIfAxC8PmFMdbfdL3je6DyPsSz2Kbm+TSAEgB/0BIwxv4J4HMI4XYogBcBnARgBmOsk/xWAQCmS+7HDepxO06A+Mj4r9pGBuBHxthfIITLdyDemeMAfKK/0Md9lpU910U6KYyxAQDGAGim1v3vEDaXJ+rSRAD8APEujQFwJ4RQ9zcAI9TzWtpHAHwF8fH1OMRS62qIPqSnQP17IggiBZDGjiBSR2MA33LOn1B/v8sYmwfgZogJEQAuhJi4H+CcPw8AjLF3ILQkToyE0P5cBWCm7vjlAHIhBD8r/gzgIABXcc6/Uct9H0JD5JdiCO1hdNmXMfYmgHkQQsi4BPI20x1AP855sVoOh7gf5wD4kXM+kTHWBsBdACZyzqeq6SIQAt6vAM7SNJ2MsXch2v4MhEChZwXn/DJdm1pACGCfcM7/pDv+IQAO4FEA1+qud3wPVEHnFgBvc85v1137iiZcMMZ6AHgSwGO6vMAY+xrAUghB6yHZzeKcr2WM/WK+Hy7oD+BgzvkataxiiPv3BMT9L1WPNwLwIGOsL+d8tc/7HEXNrw+EsOWXsyAEycGc85264//U/fsaAOcCOI1zPk1X/lwIAfosCEG0D4QwNwrApRJNaxTO+WbGWDWETSNBJB3S2BFEavnA9PsXCM2OxnkQTg2aoAd10njLKWPO+V4AYwFcwRjT9+2rIISRBTaXnwdgO4D/6fKrAPAfp3Jt6lOnCXWMsSaMsfYQWrzpiC1XBsV3mlCnoi1VHihLrOMwCK3TlwA6MMY6MsY6qvWcCOBYxlhz0zXvmH6fBaHB+1K7Xs0jV63HaZJynd6Dy9W/w8wX6pbZL4X4OB9uKrcUYslbVm6iTNGEOpXf1L/fa0Kd6bjWJj/3WU97ABGIjwW/aPW72NQ/9FwJYCWApaZ7Og1CM6fd00sg5s8n9UIdYHg+eooBdEyg7gThGtLYEUTqqOGcbzUdK4YQCjR6AtimCml6Vros42sAl0Fo/aYyxvaHWJp70uG6ngDWSIy73ZYrhTF2C4Tm6GCIiVnDV/w9Gzbof3DOi1W7sXby5FEOUv9+aJOmAwC9bdUa03ktj4kW15vvqZv3oA+A7SbNkhmt3BUW59daHE+EDabfmrC00eK41iY/91lGxOG8HcMhtKIfAHiWMTYZwmv1G9URSasnA1BkkYfmgNNH/bvUIp2ZCIJ/5wlCCgl2BJE6UuERNwbAXggt3VQIDUQO7JdhvWI1QeUC0C9JXQcxiY4G8ByAHRBevDfBuDQZBHUWx50EAU1z80/EbKHMmCf5Cos8bgSw2aE8ILj3QCt3MOTe0eZ6BoHVfXa6/37us55dEO+dTFC3ex+jcM4rVDu/P0BoqM+BsGO9hzF2sqqhzgGwDML+TsYWmzra0RbCCYMgkg4JdgSRXqwHcBZjrJVJa3eQ1QV61MlrFIDLGGN3QAh4CznnVlodfbmHMcZyTFo7WbnFEBOVmZ4waomuUH9fpF+eYozd5NySlKFp3/Zyzt3YMdrlUZRAHrI8z2WMdeKcWwk8WrkbOOfLfJSRSg1SQveZc17HGFsF1QPZhN37aM6nHuKDZyqA+xljfwXwNsSy9hdqPfMBTLJYUtXQ2nMIHBw6GGPdIDyAl9ulI4igIBs7gkgvxkL0y79qB1R7oNstr4jna4h4XzdBeCe60daNBdAFMdsuzVP2FknaNQCOU8OZaGkvgHBg0KNpcfSehAdC2CelCwUQnoz3qDtTGHDwLNWYAOGt+pD+nnjMw4xm6/iYJD/tfn4HcY8fNRvsq+mcbLq0+HtOy9VBEMR9ngHgKMnxNQDaMMYO1eXXFab3jDHWQXLtPPVvW/XvcIh+8FdzQsZYnq7uP0BoXh9ljOWa0pmfhWZPOhMEkQJIY0cQ6cVoiAns32qMtqUQ8bvae8hjAoQW42X1txvB7gOIWFufMsbyIZYUrwcgC2T8HwgBcDxj7BsIe6PrEW97NgpCEzJK1SJ2gwgbwQEc7qE9SUMNIHszRPiRZYyxjwBsgggxcwqEUGrrhMA536OG+/gCwHzG2FcQjig9IDwsl0Is03qp11Q1PtvfVA9MzYP4eIjYcc+onq3/BPACgJ6MsREQAmZvABdBCCmP2RQzH0IwfJCJ2IkVAH6zi3/nlyDuM4SX802MsUNMsey+hljq/4Ex9jqA5hCC2UoAR+rS/Uv1Nh4DsdNGOwB/gRBwf1TTfA7xbr+lLtv+qtaNQZg1XAERw3ENY+wJiPv7K2Psewj7wCMhQt7oP8TOUtvqO1QLQXiBNHYEkUaoS0UXQggJ10HELtsK4I8e8qgB8D2AVhATdaGLa8oBnAHgJwgB718QGob7JWknQIQrOQgikPLxELsCbDKl+1S9fgBEzLTLIBwp0mo7L875dAjN5mwIwfNNAH8CsBtCYHCTx3AIwWQDxL15HcKOcDnUeHk+uBnAPRCa0Ochnkl7CA9NrdwXIQT/aoi4ei9DaKqmQsRos6vzdojgx+0gBPuvEAviHDgB3OcxEHaaV5ry3QXR5nKI+/RHiLh/o03Xj4QwDfijWvbdEMLtiZzz9Wpe9RAfI/dBvLcvQIQ1OQ5iyXaRrtzH1bwaQ4R7eQpCAJygpVG17ZcD+Ix2nSBSRURRyFGHIAiCSH8YYw9CaOP6pOm2ZwYYY5dCaAH7SDyhCSIpkMaOIAiCyBReg3BE+L+wK+KSBwG8SUIdkUpIY0cQBEEQBJElkMaOIAiCIAgiS2jwXrEFBQWNABwAYFN+fr4syCdBEARBEERa4CS3NHjBDiKI5WoAJxcUFGxySkwQBEEQBBEiB0DsL90X8WGmSLAD0FX9+4ttKoIgCIIgiPShK0iwk7IVAA466CA0aRIXND5QlixZgoEDBya1jHSlIbcdaNjtp7Y3zLYDDbv9DbntQMNuf7LbXl1djZUrVwKq/GKGBDt126MmTZogLy8v6YWloox0pSG3HWjY7ae2N1wacvsbctuBht3+FLW9TnaQvGIJgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAgiC9hStA/XDxuHouKKsKtChAgJdgRBEASRBYyfvR6l+6rxy4JNYVeFCBES7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCCILUBQl7CoQaQAJdgRBEASRVUTCrgARIiTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEEQWEaFoJw0aEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAgiC1CUsGtApAMk2BEEQRBEFkHhTho2JNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCY2cEjDGrgBwHYB8AO0BrAHwDoD3OOf1unSDATwNYACAzQBe5Zy/IcnvXgC3A9gPwFIAD3DOJ5nStALwAoDLATQFMAXAnZzzQlO6fgDeAHASgAoAX6v5lbtoO0EQBEEQRFbhRmM3FEAVgPsAXABgBIDXATynJWCMHQ9gFID5AAYD+BjAq4yxv+gzUoW6ZwC8BeB8AKsAjGGMHWYq8ysAFwK4E8BVAPYHMIkx1lyXV1sIga8VhAA4FMA1AD5y0SaCIAiCyCoUULwTwoXGDsAQznmR7vcUxlhLAHcwxh7hnFcBeBTAPM75zbo0PQAMY4y9zzmvZ4zlAXgEQpP3IgAwxqYBWAzgYQBXqseOhRD6zuecj1WPLYbQFN4I4G21jNsAtANwOOd8p5quFsAXjLEnOedL/dwQgiAIgshsKN5JQ8ZRY2cS6jTmQyyRtlcFttMBDDel+RJiufVI9fcJANpALJdqedcB+AbAYMaY9iaeB6AUwHhdug0AZqjnoEs3SRPqVL6D0C4OdmoXQRAEQRBEtuHXeeJkALsB7ADQB0ATAMtMaTSNWX/178Hq3+WSdC0BdNOlW6G339Ol66/7fbC5TFV7uMaUjiAIgiAIokHgZinWAGPsKAA3AXicc17HGGunnioxJS1W/7ZX/7YDUMU5r7BJt0lNZ85LS9de99ttOlcsWbLE6yW+KCgoSEk56UhDbjvQsNtPbW+4NOT2p7rtO7aXAAA2bdqIgoKSlJYtg559OHgS7Bhj+0Esd/4OnfNENjBw4EDk5eUltYyCggLk5+cntYx0pSG3HWjY7ae2N8y2Aw27/WG0fd6mxQDfhwMO6I78/D4pLdsMPfvktb2qqspWGeV6KZYx1gbAOADlAC7knNeopzSNW1vTJZomb7cuXR5jrKmLdOa8tHS7db/dpiMIgiCI7IecYgm4FOxUYWwUgM4AzuWc79KdXgOgGjEbOo0B6t8V6l/Ntk6Wbi9E7DstHdM5U+jTrdD9Xm7OS3Xk6GNKRxAEQRANhgg5xTZoHAU7xlgjCM/VQwEM5pyv159XHRYmQw1XouMaANsAzFN/z4Twdr1Kl3euet14zrn2rTEWQhN3ji5dd4ggxGN1+Y8FcAZjrIPu2CUA8kzpCIIgCIIgGgRubOzeAjAEwP0AmjPGjtOdW8Y53wPgCQDTGWMfAPgCwIkAbgVwu+bdyjmvYow9BeAZxlgRhMB3C4SG7VotQ875b4yxMQA+ZIwNBaDlvwHAJ7qy34MIYDySMfYkhDbxZQDDOedmD12CIAiCIIisx81SrKY5ex7ALNN/RwIA53wWgIsAHA1gAoTA9g/O+bv6jNTAxA8BuAvCXq8/RCDihaYyrwHwI0Qw4m8hNH9n6rcK45yXQMTP2wfgewCvQMTS+5OLNhEEQRAEQWQdjho7znkvNxmpu0Q4LoGqwt2LDmn2QuwscZtDupUAznVTP4IgCIIgiGzHb4BigiAIgiAIIs0gwY4gCIIgsgCKdkIAJNgRBEEQRFZB0U4aNiTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQRBagKBTwhCDBjiAIgiCyC4p30qAhwY4gCIIgCCJLIMGOIAiCyDh2FJfT0iNBSCDBjiAIgsgo+PrduPmpifjpt/VhV4Ug0g4S7AiCIIiMYuP2fQCAZet2h1wTgkg/SLAjCIIgMgxagiUIK0iwIwiCIDKSCHl/GiF5lwAJdgRBEESGQT4T9kQo3kmDhgQ7giAMlFXU4LclW8OuBkE4QgIMQcRDgh1BEAZe/KIAT338O7bvLg+7KkSCjPplDdZv3RN2NQiCSCGNwq4AQRDpxdadZQCA6pq6kGtCJMoHI5YgJwKMfPGisKsSKLQSSxDWkMaOIAgTYtokw/TsoD4LpSDNxo7eUYKIhzR2BEEYiE2aNGv6pbK6Ftt30VJ28shCaZUgAoI0dgRBGNCmTBLr/PP8f+fijhenoLq2PuyqZDX08WGExF0CIMGOIAgT2v6bNGn6Z8maXQCycxk0HaBwJ/ZQ123YkGBHEIQBsl8KApI8UgG9owQRDwl2BEEYIJEkcaLCcbjVyFroHbWHNJoNGxLsCIIwQkuxBEEQGQsJdgRBGCDnicRJB4WJks1qG/r4sIVuS8OGBDtCypChI/HcZ3PCrgYRAgpJdglDdorJhV5RgrCGBDvCkl8Xbgm7CkQYaNoQmjb9kwbasjSoApFislpLS7iGBDuCIAxEtSEk1/kmplGim5gMSH6xh966hg0JdgRBGKBlxAAJ8R42CNmH3lGCiIMEO4IgTDQIkSCpkEYpydANJghLSLAjCMIA7RUbBJqdYphVyH7hh95QgoiHBDuCIAyQx2HipINMlQZVSBrZ3DaCSBQS7AiCMEKSXcKQA0pyIa0yQVhDgh1BEAbqKdxJVpAOWsNkoaTDUncaksWPnPAACXZE4IybVYiSvVVhV4PwCXnFBkA2S1XpBL2jcqjzNmhIsCMCZUvRPrz9v4V4lnatyGBIKAGAD0YuxvT5mxLO59IHRuPOF6cEUCOv0HMkiIZIo7ArQGQXNbX1AIA9ZdUh14TwC9kvCUZNXwsA+MMRB3i+Vi9S1dTWo3DrnoBqRQAgmZUgbCCNHREoZDSe+dAzTJx0WIlNhzokm4b+8UEQMkiwIwjCiEKG6dlANst12dw2gkgUEuwIgjAQ09iRaEekJ1FzgXCrQRBpCQl2DZAxM9bhq594UvJWSNuT8ZBXbHagZPVaLEl2UrL5kROuIeeJBsi73y8CAFxzNktaGaTtyWRodiAyA4q1KIeG34YNaewIgjBAXrFZQhbL51mtjCSIBCHBjiAIA7SjWHAk6x5WVtVixLQ1qK9PLwnn2c/m4JWv5qWsPPr2IIh4SLAjAoW+pDMfhSS7tOfTscvw4aglmLVkq2WaMLrijIVbMHnuxqSXQ+MMQVhDgh2RFOhLOoPRHGDoIaYt+8prAABV1XWWabLbeYIgCCtIsCMIwgAp7DIHkr0JgjDjyiuWMdYXwL0AjgMwEMAKzvlAU5pPAPxRcvkVnPP/mdLeC+B2APsBWArgAc75JFOaVgBeAHA5gKYApgC4k3NeaErXD8AbAE4CUAHgazW/cjdtI4KFtASZD4U7IdKf7NAq7yguBxSgc/vmgeRHoy8BuNfYHQLgfACrASyzSbcWwPGm/ybrE6hC3TMA3lLzXAVgDGPsMFNeXwG4EMCdAK4CsD+ASYyx5rq82kIIfK0gBMChAK4B8JHLdhFJgsIQZC4KGdkFB93CpJAtAYpvfmoibn56YuD5Zvp9IRLDbRy70ZzzkUBUM3eURboKzvlsq0wYY3kAHgHwKuf8RfXYNACLATwM4Er12LEQQt/5nPOx6rHFANYAuBHA22qWtwFoB+BwzvlONV0tgC8YY09yzpe6bB8REPTFmPnQXrHpjxvFeDYrz7O4aQSRMK40dpzz+oDKOwFAG4jlUi3vOgDfABjMGNOmkvMAlAIYr0u3AcAM9Rx06SZpQp3KdwCqAAwOqM4E0bCg3UOyimwW0LO5bQThl6CdJ/owxkoYYzWMsfmMsatM5w9W/y43HV8KoCWAbrp0KyQC5VIA/U35GZaGOedVEJo9fToi1dCAm7FEtSE0a6aMEdPWYN2W0kDzzGatFtnyEoQ1QQp28yEcLC6GsHfbBOBrxtiNujTtAFRxzitM1xarf9vr0pVIyijWpfGSjkgVNN5mPNliv5RJfDhqCe56aarn62yfEWleCaJBEthesZzz10yHRjLGJgN4HMAnQZWTLJYsWZKScgoKClJSjhvc1MVrfbfsrgYAVJSXx12bTm0Pg0xr/7x5BYF5HWZa2/UEVfcg+9uu3bsAAOvWFaIVdkjTVFSLBQ/FQ75BoS8vGWVv3rwHALBt+3YUFFQGnn9QuG17UPeoqEjoSNZv2ICCvN2B5JkImdzvEyXMtgcm2FnwLYC3GWOdOOdFEJq0PMZYU865vje2U/9qb2IxgB6S/Nrp0mjp2lqkW+GlogMHDkReXp6XSzxTUFCA/Pz8pJbhii83AYB9XdykkdBmYwkwfgeat2huuDZt2h4SGdV+9dkfdZSVj5Q3Mqrtenz2Af21erE4yP42eflcYP1m9O7dC/n53aVp9pZXA//bgoiHfBPG1I5kPfs1xSuBhXvQdb/9kJ8/IPD8g8BV2xN5xyTMWrsAWF2Gnj16ID+/dyB5+iVj+30AJLvtVVVVtsqoVAco1mzrDjYdHwBgL4DNunRM50yhT6cX2Jab81I9b/vAo2BHBIOShLXY14fPx0+/rQ88X4JoEGShraQSjWMXckXSFRc3ZmdJBd75biHq6oLyjSTShaQJdqpQdiWA9aq2DgBmQni7XqVLl6umG88516SCsRCauHN06bpDBCEeqytmLIAzGGMddMcuAZBnSkekiGTYZ038fQPe+GZBgDkSRPqQkCOAzQRO/gWEHW9+uwBjZxZi4aqdzomJjMLtzhPNEQsz0hNAa8bY5ervOerfTyGCCq+GEMpuAXAqgBu0fDjnVYyxpwA8wxgrAjBPTdcHwLW6dL8xxsYA+JAxNhTAHgBPANgAo73eexABjEcyxp4E0BnAywCGc87tAikTyYY+pQnCFb4EMBLaiATR3rt6+gLIOtza2HWGsJfTo/2+CcAoCE3cI2raGgih7ULO+Wj9RZzzFxljAHAXgC4QIUzO55wvNOV/DYAXIYIR50HsMHGFfqswznkJY+x0AK8D+B6xLcXud9kugiCIUElW6I6sDgmSxU1LGfTtnbW4EuzU/VmdXoOL3Baq7jrxokOavRA7S9zmkG4lgHPdlk0QBJFOJFtGyeb5O9P3ig0T7c5l9QdAAyXVzhNElqNQ7CyCiOJG8Ij2GR+dpqH2MxJFEkd7N+leZh8k2BEEQYSIH4cjN5NxQ1DENFTBNlAawHvS0CDBjkg7vp20EisKww+uaWbx6p0YMnQkdu+pRHllDW4YNh6LVhc5X0gQNmjzarKWFbNxtTIqtGZh24LAzW3J0TR2DeELoIFBgh0RKLFJyn8en41djvve+CWQ+gTJ6F/XAgCWF+7G+q17UbKvCv8da972mCC8odRrS7HBSinJiCmZNkRNPkiy84v2umXxW9JgIcGOSAo04DYctu0qo6/+BNDuXI6HLuPKLi/6SLKvL5rftlUbi1FUbN6CnHADdd3sgwQ7gnCJzDGkoY+JfP1u3PrMzxg3qzDsqmQsUaE4G9dMk4x2y+55dTr+9NRP4VYmY2noo1j2QYIdESxZPEbo59/oHJzF7XXD5qIyAGJ5mvBHsuS6bH4100nLdNXDYzD6l7VhV8Mz0aXYNLqXRDCQYEcESsMwao5E25fVdkyuaOjtTxw/S7EabkweslkRGHbTFEVBeWUt3h+xOOSaCLwIaRTuJHshwY5ICmEPuMkm29vnFbof/lF8fA25C3dCU3ayqa9Pz3vsSZhPzyYQCUCCHUF4RD9o0tzpTF29gro0nQDTAe0d8qOxSwd2labeaSGqKQ9ZHants5qTgQ8vQqsOWQsJdkSgNJRBgpYx3PPk15vxwJvpF74mXYh5uGaecLBi/W7c+MRPmDx3Q2oLTg+5DnV1qmAXdkV8EB3DaBDLOkiwI5JCKiepyupaLFu3K+nl0ADoH76+OOwqpC0JOU/YXONnRwuvrN+6BwCwdG1qnWfSpStmtMZO/UtL9tkHCXZEoIQxRrz5zUI88Oav2FFcntRyNG2kYQhv4INitjY/lZNd9L3yItml2X0PSzgIW5zSbOwyUK7LSA0x4Q4S7IiMZ+2WUgBARWVtUsuJaVYiFLXdRLZNEqmUU5IW7iQlbQjnuaeLlimInXbCJk1uJREgJNgRhFcitLOGGW2iLa+swazFW0KuTeKkcq5Luo1dJksdToTcNP3HXjrgReClUJzZCwl2hC3fTV6F8sqasKvhDoex9bOxyzBu5rpAi8zmr10/WpHXhs/HM5/Mwcbte5NQo+wkEa9YO3mioTgyeWXj9r0YPpEHkpdsNxonxs8qxCX3j0qyp7iLGsWM7JJYDyIMSLAjbPlkzDJ8NHqp6/Su9rB0cb2/i+1PfztpFd7+bpH//PXQ564BTWOxY7ewc6yoSu6yeNJJpY2djz1d00VoC3v3Aj+a84fenoHPx69AWUXiH6yKj7XYD0YuQW2dgpqauoTLNzPxd8072fmBaPcuPd4kIkhIsEszdu+pxJChI7Fg5Y7Q6mAWrvzYrvldmsiEj8cI9HJdBlTYJ26eRVyaNFmSSpSULsUiSQb4KWhEWE87EbvE6trgBCo//V97zvVJHOw87UCRtFoQYUGCXZrB14uwAWNmBLtkmCmks5ikF3jTxaYmmfh5FtkSQiEM54mgheJsMOy3IpH3K9Bn6yOkjBYaJZnvmKudSdJ6tCUSgQS7NCXMeTGJq6GW/Hfccnw1YYWvwlM1ccUmykjoS1DpSjYKEclGE1KStles92w901CFBD/CcywwcPw9W72pJJCPIld5pJnjhx8mz92A1ZtKwq5G2kGCHeFMCvYd/ObnlfjyJ57e00NaVy4J+BGys8ZuJ/U2dnYT7NadZfhs7DJPk75fAWHx6p2uPZszUSYIss5+9vnVBHiz88TiNTvxj1emYeT0tQHUyzmNthScLs+wdF8VKqvdm/3U1Nbjla/m44E3aFcbMyTYEXGEOSmTBizD0TSZ9eFWI1FSuhQL5wn2yY9+w7eTVmHrzrKk1+ehd2bgmU/mJL2cIPCjbUr2s527fDse+2CWpWBttRSrOR6t3VySzOpF0a9ApAPXDxuP+153L6Rp0RqqazN8sEkCJNilHenRycIjsyS7TLcls8Ndy4ypssWpJKW1d6Gxq60zTl7p9tqluj5BlDfs/Vn4+ff1CeVRrz6WHN1M+sSHs1GwYoflOxTVapsaEajtnYtMEo1gkAwK1S3q3FCjCnRNGucmqzoZS6OwK0CkIYkYJrvQPrgpOp0GGw39QBi1kwmzQmkIbSzuneiSmIu0cbc19H4SbgUSKZ1vKAbfUIwzj+npO4/YNoOSmlj0AW1sM3vFan0nCG9ZNyHyYnsJh/4S+ULzbm7ahAQ7M6SxIxxJZcdPt+UBGRHdHclmASab2+ZESveKdaGx89Mbsvn5pU3TJCZ2Th99Vpq5aBiUAAIXe9KYBzTUVlTVYsjQkZixKDU7z9TUCI1dHgl2cZBgR8QRro2d90ju0WuDrYp9/ukrd4aCJpPEvIXTZur1R0rDnThrub1UR1EUzFm2Le2M44MkXZYRZa4TTlWKaubq5Ro7P11nReFuY5+T5PH9lFVYu7k0liSBsVbGtl3C/vOrCSsCytEeTWOXR0uxcZBgRwRKwur9dHbBlzrAZbgAY4uf0DPZsUSd2gDFgqDe+ZmLt+KJD3/DiGlrtJwDyTc9Cbdtsg8Yp1BImsbOvORqddyJ35dtw31v/IKxutinMqXfxz8uw99fnhr9HfQ+tzkp7vuaYNwol8QYM3RH0ox0kGeCULb4trFL4Ppk37qYPU2srCCWTTIZy3elYd8WT7jZK9bqlOz4rtIKAMCO4vKE6uWGsDW06TBeAuZ62As4VmYcfpdiNU3Zph37dEfdOE+o9QnqHqb4XdBKyQl8y5bMhwQ7Iq2ILbH4CGMQdGVsmMeLAACbi5IffiIs/IzP0Yk+wyU7N5NTRVVtdFINoiw373yllz14XTyCRAPihiVYpctKv2yfX6d7EtVsxe/H57MS6tUeBRz9h2oQ5ESXmAPK0CXpItwDQE1tHYqKK8KuBgl2hIwERs2ABtx06qwydu+p9H3tnGXbUhKPLAxioRwSy+fbSSt1S4npycPvzMCtz/yccD5uNCfaubtfmaZek3hH+3XhZvzjlWmYNn9zwnmFhb9xIjipULbPbyR2UooWGiWovWJlFiJe9nn2KhBakXLtbZoI93pe/nIe/vTUT6itC7dyFO6EcMTL4JmopiZmo5d+6CfgROr3xIe/IRIBRr14USD1ShZenqQm0EXflQTHtc/GLgcAXHxKn8Qy8okbYWvVxpJAyoqFO7EuzGqulNZPcZEGsaW7Tdv3OlXRkTScY1OCdJ/fqK2Z/K4EHRJIlo8b4Spo54lU29hphKW9raiqxZ6yanRp3zx67Lel20SdwqlSFNLYEXGkh42d+wxSLQRGEElYoxj0YLRwZRFqVC+xdCDjl2LVvyl5t3wUlsrJrK6u3marp2Du0LotpRijM/53Il3eLz/OE1b4H1Piv4Y9bBUbnKNaqm3sQn4FHnz7V9zy9ETDMc0+MmzFBAl2aUrYL21opEkYA1siAQ6GAbBmUwkeeW8mPhy1NNB8/QzQfie1sCneU4lp8zbFn0jBc9Y0drbOEz6qEdQE+8RHv+GKB8c4FJZYGXe9NBXvfr/I/QUS27YwkGl2E62R38em1/h21mmRRJ4ytZ6/cqzICVgTme6s2VQad0zb/zfs6YEEOyKORPplop068K/IANG3LZ2qV1pWDQDYXLTPIaVH/DhPOHgEmqmuqcO6LfEDZKp57D+z8eIXBdhbLu4lAl6msmPDNrGNUrLe+URznbdih3XeIfWD6apdYIUXZ5IkYohjF3UgkhO04KPP77B+HQEATRrlWKaJHUuOEJI6r9j0lSDDnh5IsCOSgt84dmHb2G3cvhfPfTYnbm9OPZE009hp6v+cEOqkDa0/z9kg/uHRxu6t/y3EXS9NRXECzihO1NbVY/Qva1Fn80x3lghPtjrV6DmRsDteeW34AgBOz89LwBPB/JVFvuuU7pTsqwIAbN0ZzMfM356fhP+MXOL5OkWmsoODZBcwbuxBZVUJ+iM66hWbciO7FJeXAZBgl2akg7iQyBdXon0skXAnQfDa8Pn4deEWrHYwjE+H56SRtC9vH9fE5Dp3V/P1xQCAfRU1Pkpzx8hpa/D+iMUYP6vQMo05TIssjEXSsS0qvWevsJbfWjVvEkg+G7fvw8jp9l7YKwp346mPfosutwHyp+KksbPCa9+J1kE3ZlrHlYw/YXae2FFcjh9/XeupbD2xrdIaho2dLbQUS6QdCYRX0jq1X1f+wINmeq6A3Sn9Wmzyq+KWoCPI+ylbI2KxD6YV0Wj7SfzML6usUf9aL9tZaZhTeUuDKipZd1LqKJCkspw48dD9AQAH9WiXsjL//ekc/LZ0m1G7rN4SQ7iTkAYvu1KlGjv14MzFWwEAw96fhfd+WIxSVRvqufxUhzuJFhx/qHRfVUIhqTIdEuzSlHS2H7Bjwuz1AIBFq3f6uj4aNDMkY3G+QWiQ6iSChn67NN9bpiUBTYjODTgCuy/nCY/pc31uo+SWhSuL8OtCsSm5l/YEHQrCDfYCQQI1CUjQCDs2lx7tgyC1MlT82CQLVRPVvKVqKdZNGqmNnfg7flYh1m/bg33l4gPIb18MOoyLW2rr6qO7rWhcP2w8/vj4hJTVYduuMkNg4rBnBxLsiDgS6ZfGbW38Fx62DVtVtU3okEh6OU/Uh+iJZVWmWyFKuz5ZGrtH3psZDQbtqoQ4bXXqbmpOTgQjp6/BkKEjsU9z4nAgmdUbY1qWs7M7TfWHqNX7Vbh1TxLLtDkpfQ6W66LyLHw+TJkphuSz1DYP2/HOJSkXaNWCNmzbixuf+ClUR5pbn/kZf3rqp+jvsOcvEuyynJraOluj8aBxOyFZUR+CpkSG7S4ACL/j6kmrpViPQUqTrbHT42Zi1pK42b81aCIRYMLsQgDArgSWkeKeic983v1hseG3dBzxa1CWIFHDf1PrErERc1um3slFu9dbd5bpQl2401zpu+u+8mp8MHKxdeIEkUc7UWx/e0Vrd6J9ma/f7es6T9vtZTkk2KUZQU/Olz7wIx58e4anaxJZ0pQtYfqheG8VZqhLaE4kQ6Bx2lg6jeQ6XRy0NKqUy9cgOhmkwpXO5r229DlNqY1d/HJeWpGO75epSl620vJcpMRlXz9WLlgpwsL4uU2fjl2ObbvKfdVP/2Fnda2djZ0Zv2YmQdnY3fv6L67SmUtJxcdhpkCCXQNgeaG/L6Aw0G8g/exnc0Krh0xI0g9Y6TTHRcOdBG5j5/0a6w3O7UmFttGdLZIW7iT1k8TiNTtdC7h2t3fR6uSEOQnahjMRErHF9V2mzsY2Vg/JeR9KzERWVfT3Qvv3JC38ULRu1l6xcf/OUPvuoJQK2QAJdoQjqRzOUzGoTJ8v2WHAjN1SbCS9nCeUNNLY+V2ZS4UnnV0Rcbcumja193RzkbAHNAu6+p/3v/ELqtXt42S1m7Nsu/Tawq17ErJDapQbX1pozusSIUscT957JPXYlxQXDdLtoSpePmw2bt8r/QD4fsrqqNZv9pJtsWDbFvWMX7JP7Glq5bUMKASNI6b6p0TrnyGQYJemNFStsrndyRioX/i8wDGNbIhL150ntPEsEnBvTuTOu31sKdW4eGhRzJ4qOXVJhOWFuz05CeyrqMGu0grc+eKUhLTgOTlpOF2kVGNntKEDjO9U9HAS67R6Uwn+9vxkfD91ta5i4k9dvRINtG1GvhQbrI2dFouyZbPG0WOP/2c2PhzlPfCzHxrqnCkjDXsqEQZlFTWoqhFagDA7SNxgE1JdFABTCzZG70k6E67GzviAYlVIfEkxcGzLstD8pJME7xNFAW58Qnjs8QTMMuyE3OkLNvvON12prK5FTW2s/0fN+gw2drF/axovpzdG9s67fc22qxq5lRuKUbBiO1ZuKLZ8rY1OHrKlWN2/9XXxK5lGHY5i189dvh0jptkHft5TVo3rh43zUZyxTWRjF4MEOwIAcPUjY/H3l6aEXY20+epatGonXvpyHj4ZvTR6TB9WINHQBEESRLiTrTvL4u18/MSxS+ONwN3Z2Bl/p6tY5/c9atI4N+CahINVnMFE37tq3YfcFQ+Owd+enxyXecRKYFIPx5wI7MvyM4ZEHaVyInjsg9kY+tp0y3KCtrl1jcdil67dhdJ9iUVTAGgpVg8JdkQUzb4nFUtwltfHueCHgxa2pXhvfBT2CCK+l+iSIfDUS76UvVBUXIE///tnfDJmWWB1ctvM1Bq/23jFupyMg6S6pg41tXKjeaf74ree2SLYaZjvU6LLieZYfZrNGqAzeTCUp6uL6V/JsBfWhJdcK7WhDr1g5yrcSVDV9ZhPUGOpX+eJiqpaTJhdmPodM5IICXYhs2nHXrzw+Vzb4J9mvKTNONKkb9U57ubgU2Pnsz62eeq+4v2gbSG0eI1xtxA/dY3ONy4vTpex1Erzk0zB87J//oi/PjdJXh9dwdt3l8cF/vYbL1Im2A3/eaWra1P5rGYu2hK3m4C8Lql17QKsnSe0Dyu3fcDoZW+/bKqxUt0ZxyC0WaQ1LMVKzhvkoDTph4ngV2P37veL8Oa3C7FsXeZEj3CCBLuQefXr+Zg+fzNWbSgRBxzGqR27y3HJ/aPx02/rk1cp08CSysC3cXYSIc380RAiufLB0fctScZSrGSJyA9ubn3pvipTmASLvNJwpvBy64MSHJy0ANt3l0uP19crUW3eUx/9Fn/e5+3Naxwb8s1V27G7PC44bBjLW+WVNfj3p3Pw1Ee/YUvRPmwpit/NxkrwTuZwYbctl6iM+schSLesm7p9y0b9sjYuD8t4dAYB1NktNrBhPuDporqmTq6NM3vFenj4C1buwIX3jsTe8upoH5Rd/9VPHA+/4y0ObDpAgl3INMoVj2BywUZX6bUv918CMFYeN3MdPhq9NNStWMzECRfhVCMm2ElGO2Fj5y9fd3Zeird9TaN19Vcnt2zbVYbrh43HD1OtjaG92tily1JsNE3Ab5zf3F77ej4ufWA0AATaPxs3sl6KvfnpiXHBYYdP5NF/p6ovllWI9hbvrcJtz07Cbc/KtZpAMDKEWwcprf1GTVj8XfG1jO6xIW608w5RWQwfB0EIxF7z2FK0D9U1dY4fpJf980e8/EV8JINE+ur/Jq+CogBrNpVEn3+eRJv95YQVvvc9DxMS7EJGW+obP6sQG7fvdb4gwInw7e8W4Yepqw2DN5DoAJ7g1QmMMEF+rWtfb4alWL2dtG/nCec0F947ylNYCm3ZOFGv2Hjth7Gy2ibXc5Zvs84jenFCVUk9pgB8YWscubrk9sPU1ZZaPSD+Njttx6QXCNy8Lobg5gF0sJra+ujSv4w9ZdW6gLvWFbR6PvX1CuYu3+5pHNm+q8xVOu0D6relsfffLgSSF+cJJyHMjJNwaa6Qmxh8svqWVdRgyNCR+H7K6viTCVBRVYvbnp2E14cvcPWs3Hhda0oSN8S2P4vtk5vXJHvsTxu5ScQY6wvgXgDHARgIYAXnfKAk3WAATwMYAGAzgFc5529I0t0L4HYA+wFYCuABzvkkU5pWAF4AcDmApgCmALiTc15oStcPwBsATgJQAeBrNT/r0TCN0A+0ldW1+Fj1wmyortvmVptvw7R5m9Bjv1bovX8bd/kpCr4YvwInH9HNUz3qJLs5RAfQiH/52u2EM3PRVg95ir+RJKvstHtRV2fnhJA8w/FE8RegONFCE7v8I51XthsKt7r4OPRA0MPQc5/NwW9Lt2H0SxdJz1/36Di88o9TXNfLLPxNnbcJU+dtwiM3HYNjB3aVX2v67dY2VbvuxS8KcMqRB8Sdj777AXipO2Fot5Vcp/u3NNyJxYX647vVPYt/+m09Lj2tr+d6WlGiOqWtWL8bJx++v688zE2SBdC2Qr9DjhbSprEHwTDdcduSQwCcD2A1AKnrHGPseACjAMwHMBjAxwBeZYz9xZTuXgDPAHhLzXMVgDGMscNMWX4F4EIAdwK4CsD+ACYxxprr8moLIfC1ghAAhwK4BsBHLtsVOmYziKiRtNOAGuCA20IXUFKrh18S9op1uP7FLwpw10tTMWp6/HKgbPAqq6zF8J9X4sG3PNpJqFnJNvyORP9nzZaifRgydGRcINlkiDv6MCx6Zi7agiFDR0YDhzrnY3/eabuw9Vv3RG9LOn6XuFoGN/1NdHJO9m0w3+fGjYKVJvSTfBBt0Wu7rNijhr5wde8t0ui92Z3sBN1qumXvtP4D3Owz4eXd8boC4MZ5wsmWz7KPGpZovQupbpJqu1S0at44MBtuJ43dtl1lKNlbhbf/txDzuNjXV1F0S9LpGtvIB24Fu9Gc8+6c88sBzLNI8yiAeZzzmznnUzjnTwH4EMAwxlgOADDG8gA8AqHJe5FzPhnA9QDWAnhYy4gxdiyE0HcL5/wrzvkYAJcA6AHgRl2ZtwFoB+Aizvl4zvlnAO4CcBVj7BCXbQsV/UvtRkuXjHcvyMC2xw8SX8n7dWjukFJOvNAgvycfjIyPZm5nuuLbk9ji1jgF8dSWDsy2kMmJYyf+mp+jtnyyyWmJX3eZnb2RtvGA/j3Vt2brrrJoXmHLdcV7KqPaBjdky5ie67A7hNeubghi6+Gh7iguxw6bJWTbMrWlWF9XC8ora7CvvBrT5m3CRfeNwtad1sutrjV2uhsQFRalPgla/d23QJ9y3oodlmFwNAwWIm5sRx2kTQWKdDtAL0KqFy19eaX42GzetLFDSrvyjNi9+/P4Dtz6zM+44bHxGDerMJaHR3vmTMGVYMc5t33LVIHtdADDTae+hFhuPVL9fQKANhDLpVredQC+ATCYMaa9PucBKAUwXpduA4AZ6jno0k3inOutG78DUAWhNUx/9B00pCgm8cuf/l/0owd0AQD06NI6gRrp65LY9f9861ctJ28XOgxkTgOdFui0SWNTF0vCGBINWmqqVKNGomynSULPJz/qAzIbz0WXYi2srhVFv09muIPl/z0+AX98fILn67R6h11/vzi9l153FfBrEnLzUxNx89MTLc8nen+tAhRrfPzjMlzzr3HRDyu7LdicNEa/q1pGfY21PmDcUszoOJTI9/KE2YW25105T6gVGPPrWtzw2Pi48/Wmvit9JBZL3l6pqa3D4/+ZHX0OicbelGGXVeGWUulxRdEtnWfN511wzhN9ADRB/DKtNkv0V/8erP5dLknXEkA3XboVEoFyqS4vLZ2hTM55FYA1pnQZgWEQdfKs8ighvPPdQgwZOlJ6LkjTLD/j9QqdgXZC4RUkl2oDSSBeX+rfSCSCH39da5/WYuBK5lKsebBvogp21bUOXn+6ShXvsTZsd/J4/XTM0szdgStJFU++fGgswG95Uyy88o35xX7sKq3Au98vQl29v69Ru27upF3S1ySIZTwn4eJJLdyM7mZou7TInCcMJhsSpE03Ja528NT1IhB9OjY23bpZijVoJh0EaLes2liCucu3461vF6iFSCrkFVP9/bz79Uri1sDzVuxIMIfgceU84YJ26t8S0/Fi9W97Xboqzrk58qQ+3SY1nTkvLV173W+36RxZsiQ1GxUXFBjdtveUxr4kVvCYd2ppaWlcWgBYs00sL+3Zs1d63oqxMwul5QPApk2x5cKCggKUVRoHlV27drkua80GsfxSWlpiuKa8qh4zZs9BU7MGC8BjX26K/nvZcqPMP2/efEPd9Gi/KyoqoteW7mgirZd5uyyn9uzeJYTNoh1F0bTlZWI5Z8WKFdFdOqzy2rpNPNfNmzejoGBvNF1VTb3tdV7qqLFpkxBet2/fhoKCWNcqKxPlruCrgLJN0msBYMtuYe9SXl6O4pxY4NuFCxeiZbNcaTqtbus3xGKMbS4qQ4cWYphct24dWsM44MnaU6a7p/t25sWd9/KOO7Ft2zYUFMiXZ6ur1CDNS5Zgc8tG2L1XhNyora0N7DkF2RaNNWvWokl1zNFmXaH98ufefbFxY8vmmBbr5S/lFjb79sWW8QvmzUdj1UD9q2k7wTdXYsOmZrHzFu2THZ87t8DS2H3VqlUAYs9ElkepOm6uXr0KkXLrd7u4pAQAsHbtGjSp3gIAqDeNBYsXL4r+e/6CBdIxqqCgwCCMzps/H3mNc7B2W+x94pyjYnceqmtUm9aIvO1VlaJd+jmnaEeRIc3GTZtRUGCtZSwqivWtrVvldota2fX1uv1uFSVu3NTqXlsr3vlFixajdXPR77cWiz5fWVnp+P5uKBLtKisri0vL1bmtZM8+fPbDL9GPzr179mD16niPW+16vZBpznP1FmNfXrR4EVo3z8WCteVx16xcUyKt8+rVa1BVLdq4ZMkSbGklF4ns2v7GcHkEg2T0d7cEJdhlPAMHDkReXvzEEiQFBQXIz883HBs1bxawVXTSfv36AT+LDt6mdZu4tACQu3IHMHknWrduFXe+oqoWNbX1aN1CJ9x8aRz0Duh9MITsrDt2wAHAAjFQ5ufni3AE38cmiw4dOiA//0i4obLRFuDX3Wjbtm20fuWVNbjq4bEAIPeG09Wxf//+wPjYoHXEkUcA32yO1k2fVsu/2ZQpQEkNDj64P/p1bwcDanrxZa/EXWtVl44dOwCF5ejUqRPy84VfT/PpU4HdpaKOE2J1lOW1eOtSYNledOvWDfn5B0WffXllDfDtFld1kJ2fPHcjunVqAdazPX78dS2OOWQ/7LdrI7BoD/bv2hX5+QdjyZqdqKtX0KlDPVZu3oIePXsj/3Brr+C534uJbVtxDfr06ARsFAP+oYcdinatmkbTtdlYAozfgebNm0frtq1yHTCnJJqmQ4f2QGE5evXqhfz8Ho7t+fLXacCuEvTv3x+V1XWYUrARd199pO01GhNmF6J1i7yoXacx4/jJvkuXLsjPjznzb99djsqqWvTs2hp54ycCZeUYeMhAdO3YAlt27gNGb0Pjxo18PSdZ+XHpJGm8cuCBByL/0JhX4b7IJmCmdciT1q1i48aq3RxYbC08AECrVq2AHbsAAEcecUR054qxC34DNm9D23btgfXy/qlhaLd6/vAjjjDGDdNd17dvX2DaLjRtmgeUlcfnAWDk3JnA1iL069cP+f27WN7LNm3aAJsr0a9vX+Qfsh8AIOd/W4G6mLAzcOAgYKQQjo44/PCY3Zd5rNH9PuKII9Asr1F0PAaAg/v3R/9e7ZE7YgeAakQk9QaAvJ9+BvbV4pBDDgF+3A4A6NKlM7Ay9pEkxo1+8Q1S69ClSxdgxb7Yv5fF29FqZTf6YTuqVGEzJycSPZ43cRIAkQdjDI1m/Q5UVWPQoEHo2FYI7Ks3lQDjdqCFrs9b0XTtLmBiEVq2bIn8/HyDYMMYAyYWYVtxDb79dTeuOZsB2Ik2bdqgb9/ewPRd0rrX1yvAV5sNx6K02A5MjVlhDRo4CEvW7sKo3+bF5fPYl/LVqt4HHohGCxYBqMKgQQOxX4cWxgQu+njTpk2BPfFBtJ3uVyJUVVXZKqOCWorVNG5tTce1WXa3Ll0eY6ypi3TmvLR0+lHLbbq0xVIT7bQUK9Ef3/7CZFz36Djb62SBjZ3iL3lZ7YjFoIod21Pmfvuj+N0PnBXlm9XI9HZJfavbJVEFEvF2S3Rp7pWv5uHe139B6b4qvPfDYjz63kyd55oo68G3Z+CRd2eisUsbu+3FFloeta7bd5dbb2FlsUuJUzt37C7Hq1/PQ3llbbSoR96diUlz3AXqBoA3v12IZz753XV6M7c8PRF3vDgFQPJCU2SmpV4Mvd2Rvi2xZUd/LVRs1mK95OhkF+WmegZnAZeF3/3yVDzzye/SnSfsPEm37y6POnIYHFPMdXKoiBdnMGtTGw83J8H+YX5OZRUxraZd3lb3YcvOfVHbx2haeJtrohdleieVEJRgtwZANWI2dBoD1L8r1L/aOpss3V6I2HdaOqZzptCnW6H7vdycl+rI0ceULm0xBql0YRBrk0YLIGuHrJMHOalpDiD6dskielte72PrCTfOAUHaOrl5Tpb2KwHVQQucvHtPVTSERPFe49JETLBzF1kfEN6kZm55eiJuf2GKp8HdaWL6fupqTJqzMW4P1LQhQwb7OIHA5XVT523CF+Odh0jj1lWx3DV7Tr/96t7Xp1ufjNqoyV+48soaVFa7f6cBYzvMdrx+hNMtO8swa/FWSOQ6W/u/WYu3eC5Lhj6WpJdAyPpObCVbGwVOzTHLaw3tcWu7J6tiXV09bvv3pKh5UTStoqDeo82nAiUrY8YGItipDguTAVxpOnUNgG2IhUiZCeHtepWWgDGWq143nnOu3eGxEJq4c3TpukMEIR6ry38sgDMYYx10xy4BkGdKR6jIjY3NBv7+X3Sna52MgpPXxVLbebXSauvqsWh1zH4mKG9LbWKtqKrF2s1iGX3C7PWGNFpcJ+k+izr0E6h+I2z9VVahQ8w5u/1IKHMZWy/VaO9vTFGR4IyW6jnD4f3Sns9Lki2avKAZ7/udFNdvsw7BE+0jFrf+qofHxnbEcFzZMGqzAW9e4k7o+3NMW219T+yCexvztT+v19g55WjlaKGv56qNJVFtlyF2oWmZoqKqVpgpmFi5oTjuw9IOqwDTVnXUJ7Mazwwx6TyQqR7wdrjdeaI5YmFGegJozRi7XP09h3O+HsATAKYzxj4A8AWAEwHcCuB2zbuVc17FGHsKwDOMsSIIge8WCA3btVp5nPPfGGNjAHzIGBsKYI+a/wYAn+iq9h5EAOORjLEnAXQG8DKA4ZxzaSDlho5TRyreW5nQZCTrI/pDl/3zR3z37AVRWx3bxPE/Hcq2Th3EZuZW2a/bUop1W/bg9KO6x53TtCJDL+lqm0cyiC6X2bT9X+/OxIJVRZbnnTC3p0Z1DvGyvY8XXvqiAHx9Md5/6EzP19ruPGEKwBfUYJ/0HThM2e+xWjJX8SqoWm1BpR0Pol+Z0bKU1VSLfxath0Nebmone9SV1e7255WOd3aCXb3iKp3Te+P0sablH4lEDM/QqIGN/fvDUXJ7rajHvXrdYx/MwrJ1u+NspYe+ZqOBRfwHn9MHgVZ3swC4cFURtu2Sm44oUHy9j4m+wukYCcCt80RnAN+ajmm/bwLwCed8FmPsIohdJf4PwBYA/+Ccv6u/iHP+ImMMEIGEu0CEMDmfc77QlP81AF4E8DaEBm4KgCv0W4VxzksYY6cDeB3A94htKXa/y3aFRl1dPdZsNsbWWVa4yyK1Po34UvW7MbEshqNezb5qYwn6HdDWV96AOzu06po6S8HO3OG9TLB2KQOZfiwyueulqQDEzgs3DRFxsc31rlW/1AMTGFxko32p19kkthPqXAU+Nd2U0jLhGde6pX9HJKtyd5ZUYOo8/w4HtpOlUa5LCgtW7sDhB3VOYgnAByMcvPs9TkJmQbCouAJzV2wPNP6YZdmSIsxbrDlWIyoY2CXRCVvq33Wb7Z1KbIu0eYkMgp1tJvZlGDR2LrW0gPM2Y+ayY8pTcaVeow+IUFV2MQItcXgudfUKGuVGdDE6xfFH3p1pm6cbgddwidKANXbq/qyOPZlzPhYulkA55y9CCG12afZC7Cxxm0O6lQDOdSoz3fh64kp8PZGjTcuYB+vn4+JtXvaUVRu8XFeqm4P7JddhJDT6jvpAEnXd3HFsN/cOsI/py/War1bFcTMLcfOFA5HXONfxK/r7qaujgp1Vfma27NyHXSWVGNS3Y1ydE8Zs9JMElq41foxUqxo7za7Sysjb1tHF4ty/3rMZ1AMmGWP9v96bhe+fuwCNGwW32Xgq9+RVAAz7YCY2bt+HI5kQUL1OpO5LAmRTTkWlUZPm7Dwh8pq3YgeaNmmEQw7sIEnkq5KG/PX/jm5KIcnXEPfPptzPx6/A2cf2RLvWZj9DgVGws6qbNu5YLMValK0/rlg/CgDAfW/8YpGLPdruD3OWbceh6thnqINWsMslWy2pZ42dTrALamuzdCB7dr3NMAq3Cm1d6T7rpZPl63bjukfHYcbCmMFtohO/dKN4g6o+/ryX5Zto7Wwusf16TqR9pkuDmnNGTI2Ps+QXc/Nu+/ckPPRObB9bfZ2HDB1psM8z5OOiLG3SzQ3Y8nnt5lK8/d1CrN5YgpmLttqmfVMLSGpi5UbrDxSrtun3/0wZiZrYmRrj1sbKLc99NtdTes/NMTlP7C0TS6H1USEmeMHObueGOO1zxGlJUzDql7W6XWjkafSFu53jFdm/XZqEON25d39YZHnO4DxhkZP0qGTv67jrlPi8E9bQ2lz+4aillueiGjsXY5ii+HOESPwVTj+BkAS7kHDzdbBqk5j8lqyJLbt6+SKRaUucBLecSCQh4Upm92zOzm6QSKSP+QmVomfbrljQYf09qZFEmfdal5jyzNm2RM+YGevcF2pCe1dydPZupfuqXGtZ7No7bmahq9ACMxfFewHW1yu2e3daEbRnnp7o81GCXTI3E7YHnletxHrzMlvE8Md1hx0ydCS+nGBckVhRuBtDho7Ehm3GMuzMOfxoZByTSJ6J68ckkez0VTSPwUZvVqexwN25UdMtdsFxaoSLRu4scY60YCYIxVf8nAHsdbAfVRR/Np9JUTqHDAl2IbGv3L1XYPHeKjz2wSzsq6jxJFy8+338F59juJOENRTOam2p1tB0fex34nVxi9Wku7u0Eu/9sEi6P6QVI6evsaiU/LBsiyLZb+cTMbRJpZF6v8sqanD9sPH4yMJQ2isO+80DkA+ac1dst7/Ism2Je6iWV9Zg8Zp4+9RUrcKEPYlUVLlzCtAwa0m1rqv1by8T6Vc/ccNvLabmPO7svDNj4RaU7quSmHU4CEAu+qoxvIc35EKhOLZrTw0uuX+0wS7Uy9L1rMX22nDHukmOeX3PX/hceE+70Zglg6gjTSTiausu2f3dUmQdUkmBEpuz/FVRBPRPM2jniZCoqHIh2Knv6AxV6zF57gZPX/wFko4g05ZFDOddZy/FjcbOrog4wcZL2abUTmPoisLdKNlXheMGCo9VK03ixN83eKiFVd2Mf83U1NYjNzdHItj6lwSiS7Hq1k1lqkfhzEVbcOvFgxLSGAAul0ckD6HWIdzEtPnxQbSBxIUvBcBz/52LeSt24PPH5Wa5Wps320wGXsvUkwwvUi8sL9ztnMgCYbNlfAiJeFRrxD1XiT3Hs5/NwSEHdkALbVcIl/jtPm5tF/WPM2oWpv5jW4nob7MWb8GpRx7gKd8gcFTY+bzOD167bnS81CkKnMcrRZpm2AezHK9LBM9BkVMAaexSyIjZu1Ggaiscl0Qs3jUvgp1sEnGajO3s6eYu347Pxxn3ct22q8zUMSRfsKZjtk1PSENn+u0wid73xi94+uPYzgX6e+P0eJo28WcAbzmIaOEjXGos3dym+jrNPkV0c01wjQ2aLjKxIddGZRcNtSK9zv7mvvKVfN/SILRqhVuEbWu8mUIs8+XrduOZT+aYjvrEdAPqPAZQTTeC1NtEl1xNx7UlN/PzLiqpkI5/Mg2sp3q4cLKynvx116r/NodrsRoeEzMndnNxfBo3z0+Wd1h+BVFzkoiLECmQz3nVNfZ9LmwtejIgwS6FLFhbjsc+mA3AnTFq3PumuBsMFq/ZiXkrdkgHI6lAqd/9Ise6jMf/MxvDf14Z/b1uSylufeZnjNTZeEi9qDx0nLjO6+IrzXVeDrg1EJ7Pi3zLn1ZV0oKmBvm1bHae8Bp7zGnycOOUIXs+fpd1UuW1tlVna1m8twoLA9BKadTXK/j59w3YtMM6QG86MaB3++i/FejMKAKxo5Ko9wG89T9z5CtBbk5E+u7ahcBw059ciUiWAp+Liy3zjF08+hcLOzkL3PThAEzsosjGRqtdNKR23D77bsyRJgLHbyLFaqnbuqGGcCcRYaqxw2qLxQyCBLuQcPOeyzqeVYf+8dfYwPDQ2zMw7INZUsFm9C/xtl/GOEeyTimvn7ZctVwXfy8m1/nsyB7T241vdgPXlIL4PUndChz/HbccVR63NIot08jPX/uvcep583JyIkuxYiTUBLCoxk6ri++cBW7ul+z5+BbsfF0Vw+4jQP+Om6v3no13omOZpt/19cBrw+dHYx+GwYr17pdju3bUbYquKNFnEIiI7fACmsvIicgFOztc9R+DFk1R/xqTrN5U4nSpZyEvkf4nM7Oxyt8qQLFVDYZPXGm504yeNZtKHdP4xezEtKes2nkpFvL50YuAe8+r03HzUxNd1y9dIcEuJJwmt+27y7F6Y4nhmAL5QFVbV4/3flgcd1z2kq+WdEaD74SDMbIsf/3XnOxaLwsCcTZmTpWwC3Vgc+7lL+OX+1KhD7LTgr357QLXzhNunpH29bpx+148/p/Z8c4ffo1wVGy9m6NFyJ5+/HWlLkKZJMPbTkN7l/eV10j6pvj90eilGDJ0ZELla8J2kNtaeeW+1/3FHgOAHep+1EFoT50+As1l5OoC1kbTBNBr3Qx3ljsr6C6urK7F356fHDugmSOEJATIy41gzaYSWyeayXM3xm05J+vrqWiV/nk7yfSKooCvLzYcq6mt91RPt7a1aS7XkWCXKtbqdplYt6U0LoK3mc1F+zBtvjHKvkFtrCMuJIGK649b/VKshwE7KtgZJkJxLKJ7szztHuFtJda206aj7QQvLLY8N2H2+kAnAS2rz8evwNzl27FWtS/TljTMJR3er5On/GUfJ369mp/S2TomDV1dzO/5FjX8ysTf18ed037+EEA8w7DDnSSCf/MD+ZWyfUDtyInEC3ZBoK+fV0N4vSZv1cYSbNzuYYk92a+CJP+a2jrc/co0/PuT3237ZpVpT+/q2jrsM4UbSarAKvn2dN6GDNExTmNL0T7beurP2H0kmDcGII0dAQCGWE1O++rZIRNWrF54t8sWcRo71x5hIp3e1spun0c3eO0wxk24E8vLD7Ko6XY8/7l9QFnzI3OaFO0wp3n1q/mG4+YsWjY3ehwqANbr3luzHZDX+7t1Zxk+HRPuFs5WNT7q4C4AgEP7dYoTWBMJzmq+RWF7xYaBkwOQ27urKArWb/Vmm+j1HTVo3Fzwv8mrdGVZ1UH3b/1xj5Kd112HZLlrWntH72jTxcvW7cY1qrkIIByMUiHbGDR2Dn1H+qw9rEDZMfS16SjRrSqkey8mwS4E/C/DyDc5tnrh3Q5qTjZ2VtRFvS511+jc092UZ8Zc40RCcnj9uvfTWc3CEAAMn8jjjrnNO8g4fmbMAT65ydZK9szueGFK9N+T5xrtEu2qJntuT340G/+bvMrgnJAuaF7OOZEAouwbMN6H5GzBlRr0j9TLLXJssUVm5sPrt+11DFIbV3Yqb3fcMrGn5FIWrirC3OUikoJXhUAyP2zvf/OXlHw4G4RiFzZ2ZiJwuM8e2lBZHVu+dhN6JUxIsEsRQYzniiIX4qwmCz/agR9/XetaColp7GKvkexSu3d84m/rjWm9buKs+7dZKE1F35JtqfX5+Pg9f93eU3Od5/EdWGdaXkgUrYgH355hn87hBsrerxUmGxc9idqVWVVnV2kFvp200tPAv2N3zPPt20krDfZe5hXmSASGJbZEBm3zXqeZhO/JzEllZ0EQ9nNu4vYFNU7EZ2N0VhL/9lbYI+/OxOP/mZ1QveJrJOpqK+8koJOSjQtev5W0HPR5Oc1n1TXxDm3iY9W7gkN+3t6WXE/Yn28k2GUQVjZ2Vi+8exkp9sJOXyAPDivNX2JjJ3eesK7IhNlGwc6rLGq2+9DjeTAxLOsG70pR6SLqv+z56pd7AiGgmcyt7Yq52KCD2L/weQE+G7schRa2psZKiD/6zcs/G7vcaH9nqmAEEdslOr39bFxxphvxwJv+HRfSHa/ym3b8s7EWy/Mpip0WlHZlwUqrfZ1F/jW1ddiwLZgPBN/ox7UEi7d674O0g9TfI6e5QbYvdSQSnK21/nV0bGPIkh0Jdikj8Se9bVeZ9IWy1Nj5WIoF3NdUE+zGzyrEtf8aq16sLcVaZ6g/xePsRrwtRf770zmxfE3tSGRJLWi5ToG75+H2mXmM4mCgzEJr5LXJTnGlVm00PttYvKhgb265uqOGc1gDFzdNshQbMY2S5mze/k4ed02WNoNXYg3ItGmypq3cXGH5YaI9j/IUajFlzhFBPRIn7eCb3y7EotWJBVP2gqKIuWFXaSx0SUR3LtFdMOZbCbIBCHZaHvrx0CnfjdvjPVpzHMI8eKqph9isYXdzEuxSRBAB58fNKpTmY6mxczmLFBV72+hZ29dUL1DuVfe+1S9p+cHrxKd3b99ZWoE/PfUTtqk2XInsbxj4x3QiApvfuiTYBqcq2wqhiogJJatOsDZs3rCazPRtiRPsHOpr18/CHuCDxOgEIFPNxx/7ctoufCEzTXCBq9fE46t03aPj4g8m+SFpt2W5KRKCl2LNoa9clQsbT24fzldukXUH33FNdXn5cjyKONkC65IGaTcaMiTYZRiyzpaoQfbXJmN/pw7960IRcVw2sWvHbDX+kQjKK2uwq1QiUCrmn+7bNnnuRhQVV+An1W7PbT9Nxq4PfnFbh1TsOblT9nz0dbD9Eo4/p9lPBr0U6/aejZ1ZaHlOmzQikfgB3qm6dgJuOrxTiaC3i3TaeivopoYn/idnmTRuBz4PRfzj1WneC1SUuPArsViWri73RZD3zhjHznu+W3eWBRYzUi+cJrrPdrJpFG7xDYlgnrRVgOJkYtZYaDGONK9YDUXo9+OQdYLbX5iCnSXxgkP8lmLu6xlb6Yt4urRkbxU6tWtmOJaMpVhX6ZJkn+IKU5udYnr59Tr2vb2Qw3k32Vo5L+g/jsyaXidTBVuNXdgjfIJMnx+zuXWOI+bxfQjigQZAKj6SgNiezdFyk60phP+PDouh3F25sunI56M0Op54v17bwjMIDMqKNO/WJNiliKBsa2T5pDqEgjaRyTasl64kSg7KhDqR2Pjz9W8WWNbDvMSjD3g6duY6adgRGbm58d5rqUC2J6ZsIA5rDHGMG2XzPSHV6MjsLwMkEbtD7eNIeMXGO0/4LTfNx39PGJbFZO+px8ami9CbdAErGj3A5LWf5LdDUaz7qHDEc5GBD6TCpI+s1m/bg607Y3Zzyblf3vOsrqlDmWrXG1yuwUKCXYoIavCQhjtJssYurg6aYaupLgqMWjNFUfDpmGXo172dIZ3dNGnuvFoMJxnmJeQN6rJDTiSCd75zv79nWDZf42cVxh1z/Z4kIMS4vcBp4q3zKNEkyXfCm22MRZXrPWjsRCaxg3YfVmu3OW+VlikYDNlle3IGXF6oS7EB5KG9N1peidj8+kFRFEthyM0qj2+NnaLgv+OWo7kSe/e368ILucvDGEMzWejHg6kFm6wTIvY8//HqNIN3szzjBCuWICTYpYxgnrR+sl29sQR9u7dFbV2wb5Fs8tMHs9UG9TghU1GgtTMSEQ4V302JN961+9pJxB6iyOc+llGNXRJ7YyK2c9ozX7p2l8dCE0vuZm9GK2RCX+xIsBNcEB9N+onO3C7n98m6AttL7L/sMwmj84TsfLD9J1XfW+Z6+3FUcINZsKupSe4H+YZtex0c45JjJ1avAN/8vNJw7NnP5liklrNph0RwSrKw9IluV5y5y7dbxg90FOqQuuV9K8h5IkUkQ2O3bXeZeiz5Grt7dRuHaxoKs6ZCr7EDrAd6u44RhKGr1wkhSI2dfgsuP1i9J1U1dfjnW7/G0iVUijuclmLtzg97f1b8wQQ0dmN+XWvY0kdGIo8x+i5H4peSnGzs7Pp28b7MDUhsxhBTTKax8/hSWoXLSDU7SyoNv305Ktih3hfzUuxz/7XfXjBRHnpnhqsAzTJK91V53uVDIwgBXz/fhMFHo5fEHUtGbNNkQYJdighqItZPOtqLFrSNnflrwxxYVyvOzs4mWQbybvAaXiH6JR1A4XbLB27Gu0S3hzNck2CDXvlqnn3+njWCmjbX+7vx7g+LPV9jVQsZeo2d2SnIqb6KAqzZVILvJdrplZsrJVdkJkE7T1ja2aoEsfOEG8wmHYDYPD5RamrF/dDui/kD0q/gFBR2j2tHcQX++PgEX/kmYz/kJo1zUZMEk6PPxy2XHs+Nc2H2RtjmoyTYpYqAnrR+0smJAKs3ldhGvg8C824U9RZLscIgVxwbP6vQ4FHnljA2SU/F9OG2VTLbl1Qse/mhzqOmWEueYlMjA1a3LfpBAucPpTjHbUXB3a9Mw8c/LgUgdhhIJTMWbUlJOfp2m7VPycCN/F9Vnfi9lu1YsqKwOOF8xxWUJJxHJpKMoam6pg7DJ650TuiRnaXyDy+ZPWTm6OtIsEsZydDY1dYp+Mcr02zjc/nCobJaHeInQKOO6P0Rci2L3YBtJyzMWhy/L2sQvPntQtTXh20VIbjrpalxx+rrFXwwIn5pwIlky37vfu/eQQWIaSh27wnWocDVVmIqbm6J+ePCrGkxC9XmPC994EfX9QmCZz+dgwoX29Uliv6+VEtMJoJ+3xo1cp6eRk1fE2yhKhXVwd1PBcLBze+yaLJI1vCQHiNpYsg+XGYvcT//hH0HyHkiRSTDxu5Vh6UyvzhVdcS01dhZUhF1OtAoKq5IWJNkpwR65pPfE8rbimnzN+Gc43vGBfMMlARuy4rC3Sh2sC8LAq/Pzq+A9uEo90JqbV09GuXGT/DJCvETicTnbf4QeX34Agzq2zF2QJd8nMTTORVc+dAYjH7poqSWoX8/ZI48QT8RN7avyfp4mbEwOC3ogpVFqE3HveSSdPPCWHUJGplg9/Z3i/DLApfvBS3FNhAC6kR6D1jZV3MqKN1XjTEz1mGvKYDt8J/dqcr//Ym1h9SYGWsTqptfHnp7hqcQKV6prKnHGzYx+dKBdDQOvuT+0dLjK/xqPyzXYmP/NE9M5tsybf4mw4bjeg3F25LYhKliyNCRSc3faQgL2jygsQuNXU5u+r2zMtLBdCJVZIFch1zJxyQALF7jbq/fsG8BaexSRFAPOtm7TADuB6Epkrg/bi4t2Wet6dm2y1u8o2QRtJDz0/xSbNrpz1haZu9hdZ9vfCJm8OxVq5WK5Tw/yN5H2Rd1MM8sgrr6OtMR/wGKswkn54myimBDu7Rv3dQxTSps/QKhgbwjQHYIsam2kw0a0tiliCx4113xXwsvo0wj6MHJr1AHeBNYdumMgb22wS4YdJisksQVK97rz9vUjaw7ae5G4wGH299AurbjGHbrMz+npiI6UuU5myhrkuzg5oek2dhlQYdYuaEkoevDvgck2KWIVi2ahF0F1/h9KSebJ8QMZsLs9WFXIYpMKeFq+6wsGGABYOhr0+OOPWOznO+HtVvExBuJAPNW7DCccwxPnC032oHXhs9PaXnutojLjHuvj0GZ7TSU/mDHO2PD/UimpdgU0altM+dEBCEjDW3f0pXaunpUJhAC4+mP4x10Ig7LffbR/bOHZIdV8kO6mg9kAuWVybl3Tkv2DYG9FeEu5ZJgRxBpzg6P+yxqaFooGXrj/2whAmtni0TzJVKPG23ckjUet9kjkk7pvnADLxO0FJsyzHvnEUQiuJn07IK3ptNSc1AkS0+wbVdZknImCCIb6dEpXNMrEuxSxDy+wzlRmkA2EkQmkqz3toQ0EKFAwxCRqcgiGaS0/FBLJwjCHzTppQzzXskEQRB2hB2FhwQ7Ig6SGYhMJFkanmTtckEQRHYStl0uCXZEHJPmZE/YkmyFRI14GooJwdnH9gy7CgRBpDEk2BFxLF3rbtsUgkgnGoZY13Ci3zQUQZ0ggoYEOyIOWnlKf2jSk9BAbkk67ulLEIQOsrEj0o3Vki2cCCLdyZRdCBKloch1DeNpEkTwkGBHEERW0FCUmDkNRbIjiAwl7B5Kgh1BZCD1tF7eYAl70kgVm3fsC7sKBOETimNHEIRHvvqJh12FtGPrzoaxQ4TT3rXZwioyCSEIX5BgRxAZyKLVRWFXIe148YuCsKuQEpK9EjvsluOSWwBBZDlhW0uQYEcQGUhDsScj4km2jV2fA9okNX+CIJILCXYEkYGQXEckDXq5CCKjIcGOIHzQvUurUMsv2VsVavlEeJBXLEEQdpBgRxA+oLmVCIuDe7dPav6ksCOIxAh7eiDBjiB8EHbHJRouzfIaJTV/2tWEIBKEnCcIgiAIt5C2mCAIO0iwIzKesO3dCCKVkEKNINKbsL+9AtPpM8ZuBPCx5NRbnPM7dOkGA3gawAAAmwG8yjl/Q5LfvQBuB7AfgKUAHuCcTzKlaQXgBQCXA2gKYAqAOznnhQE0icgQBh/fC++PWJzSMmkjdiJbIcGRIDKbZGjszgVwvO6/F7UTjLHjAYwCMB/AYAhB8FXG2F/0GahC3TMA3gJwPoBVAMYwxg4zlfUVgAsB3AngKgD7A5jEGGsefLOIdCW/f2fb8+ed0Cs1FSEIgiCIkL/7k2GFW8A532lx7lEA8zjnN6u/pzDGegAYxhh7n3NezxjLA/AIhCbvRQBgjE0DsBjAwwCuVI8dCyH0nc85H6seWwxgDYAbAbydhLYRaYiT9oz1bIexMwtTUxmCyHBIY0f4pUv75miUG8HmooaxvV+6kjIbO1VgOx3AcNOpLyGWW49Uf58AoA2Ar7UEnPM6AN8AGMwY02bx8wCUAhivS7cBwAz1HBECRw/okvIynVdFadmUINyiUMATwidiLKbxNuw7kAyN3RLGWCcAGwB8AuBpznktgD4AmgBYZkq/VP3bH8BcAAerv5dL0rUE0A3AJjXdCs55vSTdOYk3g/BDJIRX2ilgazL2TCcTO4IgCCMRRJBDLpmhE6RgtxXAMAC/A6iDsKH7F4DeEEuj7dR0JabritW/WtTNdgCqOOcVNuk2qenMeWnpPEfwXLJkiddLCAklpSUpL3PxEnvHicLCwsDLLC83v54EkRpWrlyZ1PwXL06tIxKRPVRVVaG2NuxapAcFBQWhlR2YYMc5nwBggu7QRMZYKYDHGGNPBlVOshg4cCDy8vKSV8CXm5KXdxrRrm1bYPO2lJZ52KGHAiOtyzzwwN7AzN2BltmieXOgpDTQPAnCiUgEOOigg4DJVmbMiTNw4CDb/kQQVjRtmofGjXKA0r1hVyVUIpEI8vPzk5Z/VVWVrTIq2UrTb9S/RyKmcWtrSqNp8rSZtxhAHmOsqYt05ry0dMHO4kRa47QsSqFJiGwhAiR9zy/aeYLwiwIab9OBVK6GrwFQjZgNncYA9e8K9a9mWydLtxci9p2WjumcKfTpVoBoMDjZ2CVlnKGxiwiDBF/mRrlkAEUkF5LrwifZvfxqCCG+gHNeBWAy1HAlOq4BsA3APPX3TAhv16u0BIyxXPW68Zxz7XNyLITG7hxduu4ATlLPESEQRqd2+kJMhkNHTjI8MggiydCkSySbMBzoCCNB7jwxAUJwWwKgHsJ54m8APuScr1WTPQFgOmPsAwBfADgRwK0Abte8WznnVYyxpwA8wxgrghD4boHwqr1WK49z/htjbAyADxljQwHsUfPXvHGJBoLjZJWEcSaXZkgiBBJ969xc77QSmxMB6jNgtbZ1iybYU1YddjUaHBFSCodOkI9gOYA/QdjVjYCIWfcAgOiuEpzzWQAuAnA0hKPFLQD+wTl/V5+RGpj4IQB3ARgHEQrlfM75QlOZ1wD4ESIY8bcQmr8zOeflAbaL8ICd9uy7Zy9IeZkAhTshsodE37tIEJ0hQ17+p/5yQthVaJBkxtuRXHbtDdc1OEiv2LsB3O0i3Vi4WCpVhbsXHdLsBXCb+h+R5jRpnJuUfJ3nKlqKJRoOZx7dAz/P2SA950pj5+CdkSlvPhnxhwPdd2BHSU2o5ZPSNA0ZfHwvtGjWOOxqZAxhaOxIsCPSFTvBzNWc67DMmm7zdqPcNKtQAyfd3o+GCAl2aQjr2Q4XnXxg2NXwRTjOE07nk6Cxo9GLCIVIgh99Qby39O4T1pDzRPiQYJeG1NaZd0nLHFo0Tb2m0VFwo3GGyCL6dm+Lf/3pWMvzds4PkQhwzID9bPPPAL8IV2Rqt7//+qPCroJ/FKC2PnPnr2yBBLs0RFGAfZXhrtH75eqzWMrLJLmOaCho7/oxh9gLZ1YoCvDIn45xSGMv2qWbFUK2xVPusV8r2/OXnto3RTXxx5pN2bsjT9Mm7uzEe3dJ4i5WLiDBLk058+gentIf3Eu+Pe5RB3cJojquadw49a+Uc4Bi+/MtyZ4xaeQ1ycURBzYPuxopp1Xz5LxTbmSqehtJp6a23rE/OApKSTZDGP3SRUnN3yu3XjTQdVrWs51zIiciwBv3nmZ5Ol3se3NyIvjb5YcZBc0UVu2vlx2KT4ed45wwQE4+vJvhd67Fs7jipA6pqI4lJNilKb33bxNIPq1bNAkkH7eEISQ5Bih2GGwOOTDcTpjNRADkSYT9Vs3Fe3nWMd4+YIJk6HXJ28vxlCMPsDyXbMeo+jprySwIMw9zf/IqaNh9tJ53Qi/P9bESZP0o8i6RaMMuOOlAPHv7SfjqqfPw5n3WAhcAdOvU0kepwEmH7R/9dwTBfxhce07/QPMDgMP6dsTg43vhpiGH4NGbVdMABdivQ2o+5Nq0yEP71uadR/3hVng/oLPx+V5xxkEAgL9ceqjhePO8cEUrEuyyHNkXResWTZL2Vdy4UXJCmtjh1nmiRbPGGPXihbjstL6m8+7KCUoA7Nw+uQNf144tkpq/mTuuOMzyXCQSwWmHto473rpFE/zw/BDcccXhSayZNa1bNMGpRx6Ac47rKT3vdsnFEhupwosc9Pb9p3suWm/j1Lypc0SrCxN01BpyUnCOXieZNCJu0Mt1+mVMP3vetpF8COfkRHDIgR3Qsllj9Nwv/l3W41eZuW7LHtdp3bSrp2k5d0Bv+YpOIug/qLvpBJ5zj+sVeFnyCgSXlVkT55brzu2PkS9ciIN6tA2uMgFAgl2Wk8q9If9whL/OkSjOW4qpKAoikUhc1Hyn6x+5SdgklVXE7B4Tses5bqA/+yi3NG6U2m7dsW0zw2/9pBKJyDV27VrnoVFujqW257LT+qZEk9rfYunsDI+mEGYmWsSREyQwI+neVav7U1RcEf13n25tHbO88syDDL+dBAdz7d0IM6ccEdNg2i0VD0zwmV+XgGaqzwFtEn7ufj1CNxftw/+dJ7ZHb9e6KTq0aeZwhT3P/O0kw++ke/HrHmluBu5H3LpFE5x+VHfpuUF9Otpem5MTCUWhYUfmPYEsxEpr4IVO7eQDQa4kxpPbPj7sluM81eE+G2+uls3Eq3bCoV095ekGxx3F1ASN1QDJ5onLadBrp6r795RV4e6rj/BVRz11NktlAHD94P5o18q/8W2TFAt25nn6jisPtzzndLxX19b4+F9n44bzBqBZXmDx0y2pqpEvTd584SGJ5VtdZ3kuJ4HHo39VrQS7VRtLov92CjYsw6uJ3fkn9o5LY/640NfDTnDUPrL8yiF6odHp48u8JH7PNUeiaV5iE3Qi8tPlp/fDd89eEDVTkHH4QZ1c5WX+Xkq2XV5NrehHrVo0lq4S+f1Is4tRaNeiPw1x339vvXigrTAaXWa2oVdXe01uqiHBLg2444rDceP5AxLK44bBB0uPd2lvvSz3wUNn2uZ5aF/7LxUv/HWwcOK4/4aj0UKyPGSeSJ+87XjXeTsNpi3VgVLTKNabVHZO17dtKYSsft3bRQVoBYrtFmmP/9m6/v17tTdoMMyclt8dHdp4sx3RL7/Kvh69LMV4tQFrqbMHatsqD/17xsqqqva2tc4RrDM6tm2G3JyIr6U0r5xpYeOX6Be4+X0+UWdDVesg2Gv03j9+sjj72NhH4CkmDfkFJwkBy24s6XuAs+1uba1R2L3+XKMWTK/hHv3SRehk0tge1q8j/nHNkZb561tvXjKUlaHxxRODLfPUaK4Lt+T0/uT37xxXrya65965fXN0tOmH+mdhxklr/rfLDo07FolEHHfnefgma49mg1mD6f7ZjXFu7qsezVGvdcuYANpjv1a46syD8OAfj5EKY3q7Oy9C0N1Xx96jl/7+B9fXebLzc/yScZ9VukCCXQg8+Mejo//+6knRqS47vV/0mJ/pzGoguegPMfsXs+Hnfh1aYPRLF1l6zgYZ2LdFUzFg5eZEcLtuALrijH4Y/dJFuPgUo93b4QcZB10Zrw89FW/ee5pjPVs0bYzPhp2DP10gJluzYOeksWvcKAdv3Xca7jUZ2+sH4SNZZ3z/XEzQO8Lmy/rUIw/AvdcHa7j/xr2n4cBuYtKWaUW9LMmfeXQP9OraGh89crb0/BCTTVb/nu2jE5V2J6PLswm8Q7J+cCRzfi/MPP3XE3DxKX2k5/Ia5+ICicYJAIY/fR7+drm1/aAd5vdZb8Mj0+Y9d8dJUcFM4/iBxuf4+ePnGoy8e+zX2jCWdGrbHKNfusgwlui1wx8+fBae/uuJjnWvNmkxI5EIjuxvfd/N/e/YQ7rGPzzdb0WX/aH9OuGea+OFQJlQZuUINuyW4/DS3/+A+284Ckcc1AlfPDEYnw47x3Ycvf+Go+K174rQbGmCyYcPn4WPH7X2upRpzzTNmJOzwgCfGqymTeRa7Iv+0MdoAmFRLzOd2zf37GD31F9OwJCTD8SfLx4UKy8SwfWDD0bHts2QI1FJ33JRLK2dTa6ZZnmNog4pzfIaGYTQPNUO9o+SDxmZTD/yhQt1dTjcdR3080OmhNYhwS4E9J0sL1EjbQcikQi6dhDaHNmSif21yaiRe0NVq2XPvt3bYvRLF6H3/m3Q0+XXX7vWTaPqdvOA7GaZosd+rdFUtzRotqVp3CjHoOVJSCi2GDxusfHcymucG71uoMQmxMotX0b71k3xxr2nWS7v5/fvjC4mBxAtrprW7DvVgVOmdQKME/eb950WtW8xTOiy++DjtuZEInHPQ++1etul8doTQGh/nDSnfsItyDxTB/TugM7t7LUMbVrmeX6v9GV1bt/coNGyorrWKHial3PNH5E5ORHDx6ECJc6OTjH8O/YrAqC5ZMndbgI1a/laNmuMg3q0w8mHd0MkEkHrFk2Et6RNHicf3s1SK/vds0MwQicEWGH3KGSrEgYCFhBYj3aGvmHO3urj1esKbf+e7dCkcS7+fPEgyyVjmcauWZNc9OveFoC3sdEs4Ldu0QTfPHM+brtkUPQj73Ldh0z0Okle+nHeSzdKl/AyXiDBLgSSYchq11natRZLiVYvqNWSRdivs6Uhs8fPJvOtOdakCXF6HLLSzJOd287vOuyApFJul2dlNfFm0Ox8f51sfZxslfSPsOd+raXehjIjez/vZCRiXNZ9959n4OYLXcYmc7gVXgRmDbNXts+i49IcNyjeKafO7CkkwTx21JiWYs2PQeYgoF/6UpT4utfbCOzNPSz9f/jIWXj+zpNt6xcrxp/0lJMTcfVcZfdBG9sVAJedELwnql+sBTtv768bMw2re+dXltWqqPXhZnmNcMFJB9oLiA6Feem2+nu0dN0u9xeGCAl2KeLlu2P2Afr3MTcRS2odbt5Tq45g2QdSsB/qhm17PV8TtDZcJpRpHmpmrLze3C51XnO2y505ZEKNU2BZmzuT6o3SA9kvUtIcP5pQ80TTspncwFuGnQenLG83yJaOgPjupijOXVCzAbv2bIb9O8bHUDObHcho1byxQZtfXSNx/FCzufAPB8afg/F17dCmafzHou6n+Z568YTt3C5e62gZxy6MZTNNCKlXMKiXtQY2kapJ2xUxLnGbE1l9eHrtT27SS+e0SCRaJy9FKroyvTxPZ6Fet7zqkFJ/7+Ys2+6+EiFCgl2K6Nc9FlZB3zlSqeb1WpIsvdfwKUf274y7dF6SGprn5m9Lt3msVfCYv1pbNGuMP9g4NwDxg4yfCd4re8qqXaWTDb5BhyAwt79tyzwc3Kt91Ng5kiNPZ4XmZJGns1v0q3Exk5NjXIpNZClIlneiHJhAMHLWsz1e+vsfcKXFVn49usidE/ReipFIxBBgNW4ZVffzSNZZOjBo1xx7yH448dD94+6b0Ss2drxZXiPp8zDfk2NttlCzekZ2nsmJcPygeBvW5+44CQ/deLRBY2eHH693W6cDxeQRbDptZYMdkF7BgCwSAxCrk/6jz2kVomPbZjGNnYc6OI07siHAzbBgDkScrpBgFwJBLcUathFzkaVlsWonMAdZlKV32sfQzOO3Ho+zJN5jiUyIijxChW+0ieWmCw7Ba/eciq+fOg9d2jePDr6GicOi2vrBTLMB8rIVkRsqKr15mOppFOAIrijxk2lubg6ev/NkHKHavTg9XfP1eapRePcu+gCz8df56Trma7zk4aTwct2X1Xw6t2sWJ8g8pHo6mrVr/Xu1QycHuzsAOKhHO8sPi7uvOQK9urbGq/84xXDcSuADgOMGdsWlp/aNOXPEhQeKv+bKMw7CoX074u6rj1CXvo3njSuxsR9XmGLoaTz11xMMv+3scq0eUXebNnrJxxL1PuREhI3k8YP2j773ToJFm5Z5+N+zF3iysTZvM2YW1vTvj7l8q3vhdcXITRgR2cd/BLoxISKCYv/9qsOjbXj/QXmEhr4HtNXdU+ub+sD/GUNt1TnssCL7mHDySAaAwcf3wu2qQ1UqwjH5hQS7EAhqhbONzt1c/xVk9orU+kOOhUpbC/kwoLdxSUSq2ZD0rYF9OuCMo+XBHa3QjLplAp75i9gcly0S8Fur1aFZXm7UsxSQPyfN/qixafDSBsjXh56KZ28XwUEv/EMfTx5gg4/vBUDsFnDpqRKDYIfZwu50x7ZN8dCN9pu/a+zvYlskrSwr7zKvSzxDTu6N+284yhDkOqiltJyI/9Ap+usO72ftBelIREw+z90h7MNOPfIAHHFQJ1x6al90Vp1UzLZt+f27RDWYXiLb6+vcuFEu3rj3NPQ5wHi9zBGne5eWuOy0vmiUm4ObhhyC6889GGcc3R0XmTyKZRNgx7bN8PRfT4yGFiq3+QjR97E8i8m0VfMmeEUnjPoZM9u2yjPssHPducFsq6V5abZtmYerzjoIL98dq6dmL+hG25zXOBeKzZeDlXOCdoXZrEO/y4hM6JCZY1hp1/QxTD9THYT269DclbDc0mI7tPtvOBoXnNQbB+7fBrdePAhnHhP/wS/DzVhi9haukpkT6Ni/Uyw8lNZdrDyO9eTkRHDu8b3w2K3H4am/nOCYPizSV+TMYpzsdtyiKMDVZzGUV9YYjj/4x2MwZOjIuPTN1I5vnkBOPrwbTj68G0ZMW+1Y5p8vGYSnP/4de8vFsqDZ7d0tfQ5oC76+2BBC5Lk7TkIL07Y9r9x9Ctq1zsONT/wEQNgnnXjo/nH5aYx4fgguvn+04ZjT5KuNG27skbTlHfPkpmlMzHv86uPLmXdoMPPnSwbhijMOQsvmTXDyEd3w/OdzDeed3ptjB+6Hwq174pZ5br7wEJx/Ym80bpSL9q2bYveeyrhr+x7QBqs3lQIQGhuNa85m+OonHpdem7isbm1UuLA0bDfSuFFunFbmxEO7YvGanYZjfmz3zM/fS/fTP897rhPLzDW19aisqsVvS7cZ3oPjB3XFrMVbo7+bNMpBta6vnXRYrH2yfWpr1I+dS07ta3Cw+Pqp89BEsnuHFW7aJxPO3r7/DMPvFs0aG+KIaWzbVe6Y/wmHdsX7Ixajc9vG2FFSA0VRMPzp8zB/ZRGOH9gVn49bEXeNef7ue0Bb5PfvjIIVO+LSfjrsHPzx8QkAgEN6u7PR269DMFvtXXdufxzatyMOObBDXODd0/IPAF+/Gycd1g3rVpWgSeNcg83iFWf0M3jj2g05Hzx0Jq5+ZGzcce1DwKzN1QvvjRvlYOQLF+Ki+0ZFj33++OC4/Mwa57OO6YH8g7sYvJzbtW6KP188KOr57kT/nu3xt8sPQ5tIEf797RYA4tl27dgCt13ifynT7r3O798Z3Tq1xOaifQBEyJ03v11oW0cz2kfGgd3aYO3mUtu65PePDxF24P5tsHZLKbp1Su2WjjJIYxcC7VoJuwLN/duMl6Xa687tj1svHhQdFO1svbR4RVZBWbUtJo8ZsF80eLGmPTvq4C74/PFzcciBHQyBIv0IdUBMXd9VN9gO6N0hzjuyb/e26NCmGW66YADOOa4nLj+9n+VeqAN6t5faksnCSHymC1Oh3e9mLkJBaLEATzzMqFU8V9W2mTm0b0zL06dbG2kajUa5OYYQI6NfusjTnr7Xnt0fnz9+Ltq1bmrYTP3iU/pGJxOzkKPFaTvCIj6clW2TNshavaqa8NhetaH5dNg5+ORRXVw8F8LH+ScdiM8fP9dwTF/e3686wjkTyMOduKVbp5bRD4m8xrlo16opOrdrjh77tY5uAP72/afjnmuPjAvo7dWuUQsM3LpFE7RpGRPOWzRr7ClgctSWyaHJ157TH6fl29uSRlHzcuNlCwAd2jTD6JcuwmmDRH9WFBE+5sRD95d+aN115eF46774fXG1pTrzvdRv/u7FrMPOVk+2T6yMRrk5lv3lgM6t8NRfTozWz7yd3v+dN8CgadPGk1YSLVeLZo2lQeLPO6E3ht1yXFyAajPm+yLzaNXmi/Zq5ISuHVtIP5yHnHxgXIgju3IHH99LupWgH2IrTdbvXiQSMezg1LaVWOo2k5sTwUV/kMe01PaHlsUgffimY3C/zc5KAPDEbcfjyduOx7v/tA/8nwpIsEshD1/VDV89ORi992+N68/tj3/qgosCwH3X56Nj22bRmF53Xnk4/mWznYl56RSwF+xyc3Iw/OnzLEM9aB2nW+eW0a/bv191BG67ZBAevfnY6GQThG3BXy87FENOPhC9HYQdjUtP62cbVPLjf50d3e3BTYTydq2b4tC+HTH0unxce05/XH56v7i9bo8fJAY4fXu7d2mFb545H6cfZRSOD3TRjjtNTiSP3XpcNHq9m51HmMW+pgP7iPcgJycSfUZ/vUy+BHymLoRMs7xcnH1MD/xpyCG4+iyG14eeahS+ABzQpRW6d2mJq3XG+U0a50S1glbhD/Iai3umCbPt1f0vtWUst4o3vYCjp1O7ZoYPFP1+p41ycww7HzRpnIuObWOCQFMHuybz7gz3XHsk3nngdMsYcN27tMJp+d3jtDf6pa+uLrRFJ6gT6jED5AHD3ZITEfa3D5rGFzPXnM1wz7XuAmVfqgZc7tOtDf6qBqN20880ecy85PfYrccZTAPOOrandJlP26VDFuvODw/deIwhRp32AXTHFYdFtw4MEm3pHRABr81oO0nceaX8I0Ubq1vq+pkWN9DNx8pNFxyCoyXvk7abiGZ28e/bT0LblnmG+I5B8PwdJxs+Ms1o5bXSCdUnH94Npx55QOzjOxruxL4sTbB77FaxjCxb5h/xwoVxZghHMCHInXdib5xxdHdcdno/dO9iNEc5bmBXnOwgSLdpmecqsH5KUBSlQf83d+7cXnPnzlUqKyuVZDN37lxf123cvkeZt2K7UlZRrfy2ZKuyt7xaWbu5RKmvr4+mqa6pVW56coLy/ZRViqIoSsneSqWqulZRFEV545v5ygX3jFDWbSm1Laesolp5+csCZW9ZlWOdtu0qU2pr66TnVm0sVh59b6ZywT0josf8tt0vF9wzQrnv9enK+z8s8p1HbW2dsntPhW2aucu3KRu27bFNc9u/fzbcCz319fWO1/++dKvyn5GLFUVRlMWri5QL7hkR/a++vt7wHuhZs6lEWbu5xHCstq5e2bRjr3LL0z8pMxZuti3XjL7M6ppaZeJvhZZl19fXKzMXbVFqa+sMz762rl559/uFytad+1yXO3nuhmjZS9fuVPaWVysVVTWKoijKzEWblS/GL1dqa+uiaTRWFO5Svpu8UlEURamrq1cmzC5UyitrpGUMfXWa8ta3C5QL7hmhfDlhheu6mVm3pVTZXLRXURRF+e/YZXF1amjMnP278spXBcrOknJf1+/YXa68/8Mi6VizZM1OZcmanY55LFmzU7nt3z8re8ur485VVNYoo39ZY/keJ4L23tfV1Su1df7znzJ3g+U49Mg7M5QL7hmhlOwV89cF94xQbn16omVeKzfsVkr3ifF99uIt0X4k4+ff1ysvf1ngu95uxvy6unqlQu2T2jxlfk7T5m2UHnfDlqJ9yoVD5X2wYMV227HXbmx1ItnzXWVlpTJ37lxl7ty5vRSJXBNRQgn2kz4UFBT0ArBu4MCByMvzv/G6y7KQnx/sVlJuqK6pw9K1uyyXD1JBWG1PB8orazDr93k44w/Om0m7oaKqFne/PBWXntYX5xzXK5A83TBq+hoM7NPRlXZSTxDPvrKqFjuKy9FDEshY45f5m3FQz3aul4xkbC7ah64dWgQSxkRRFMz+fS76DxgUNb9oaDTkfh9G20v3VaFJ49y08NhMl2dfXVOH+nrFsHNQskl226uqqrBkyRIA6J2fn19oPh/+0yeSTpPGuaEKdQ2d5k0bo22L4Lpas7xGeM8iPEAyudDCNiUVNM1rZCvUAXBcKnFDNxcewW6JRCJo0iinwQp1ROqxMl1oyLgJY5JtkI0dQRAEQRBElkCCHUEQBEEQRJZAgh1BEARBEESWQIIdQRAEQRBElkCCHUEQBEEQRJZAgh1BEARBEESWQIIdQRAEQRBElkCCHUEQBEEQRJZAgh1BEARBEESWQDtPALkAUF1dnZLCqqqqUlJOOtKQ2w407PZT2xsuDbn9DbntQMNufzLbrpNXpNtq0F6xBQUnAfgl7HoQBEEQBEF44OT8/PxfzQdJYwfMAXAygK0A6kKuC0EQBEEQhB25ALpCyC9xNHiNHUEQBEEQRLZAzhMEQRAEQRBZAgl2BEEQBEEQWQIJdgRBEARBEFkCCXYEQRAEQRBZAgl2BEEQBEEQWQIJdgRBEARBEFkCCXYEQRAEQRBZAgl2BEEQBEEQWQLtPJFkGGP9ALwB4CQAFQC+BvAA57w81Iq5hDF2BYDrAOQDaA9gDYB3ALzHOa9X03wC4I+Sy6/gnP/PlN+9AG4HsB+ApRD3YpIpTSsALwC4HEBTAFMA3Mk5LwysYS5gjN0I4GPJqbc453fo0g0G8DSAAQA2A3iVc/6GJL+Mabtal6kATrE4/SDn/FnG2GMAhknO38c5f9GU3/8BeAhAL4j36AnO+XBTmsYAnoB4n9pCRFb/O+d8gd92uIEx1hfAvQCOAzAQwArO+UBJupQ/62SPIU5tZ4zlAhgK4HyIdjcCsBjA45I2FQLoKSmmE+d8py5dWrRdLcPx2Yc1xoX97NU0drsYHM85n62mmwr5eHE053yuLj9XfZwxth+A1wCcC0AB8COAu/XvUSK4mdvUdBnX50ljl0QYY20hHlwriAc5FMA1AD4KsVpeGQqgCsB9AC4AMALA6wCeM6VbC+B403+T9QnUF/8ZAG9BTBKrAIxhjB1myusrABcCuBPAVQD2BzCJMdY8qEZ55FwY2xUVWBhjxwMYBWA+gMEQguCrjLG/6DPI0Lb/DfHP9G313FhdugpJui/0GTHGLgfwKYAfIO7TzwC+UgdNPa9ADI7DAFwEoBqi/fsH1io5h0A8l9UAlskShPGsUzSGOLW9GYRAvgDATQCuhpjgJjLGLpCk/x/i34cSU5p0aTvg4tmrpHSMS5NnD8S3+XgAswHsADDXlHaGJO1yUxrHPs4YawRgPIBBAP4PwC0ATgAwijEW8dFOGY5zW6b2edLYJZfbALQDcLj2lcEYqwXwBWPsSc750lBr544hnPMi3e8pjLGWAO5gjD3COa9Sj1doX24yGGN5AB6B+Np5UT02DeLL/2EAV6rHjoXoGOdzzseqxxZDfE3diJhgkUoKbL4SHwUwj3N+s/p7CmOsB4BhjLH3Oef1mdp2znncQM8Yex3AYs75It3hertnr/IkgG855w+qv6cwxg4G8DiAcWre3QD8BcBdnPMP1GOzAawDcDeA+xNojhOjOecj1TI/AXCUJE0YzzoVY4hT2ysA9OacF2sHGGM/ATgIYtL50ZR+u8NYkE5tB9w9eyD1Y1w6PHuY26wKHkcAeJ9zXmtKXuJwj9z28csAHAZgoNZOxtgWCMFxMIwfln5xM7dlZJ8njV1yOQ/AJJNQ8B3EV4JZU5GWmF58jfkQauT2HrI6AUAbCJWylncdgG8ADNZ9hZ0HoBTia01LtwGiQ5/nqfJJRu3QpwMYbjr1JYQ6/kj1d1a0XV0eOBrA5x6v6w2gP3TtV/kSwNGMsU7q77MhNreO3k/O+V4IwSGp7dcvvcgI8VknfQxxajvnvE4v1KnHFAgNnh9Natq0XS3btv0eyLpnb8EVAPLgcRxQcdvHz4P4gFyqSzcTwHoENBY4zW2Z3OdJsEsuB8Ok3la/AtZATHSZyskAdkOo4jX6MMZKGGM1jLH5jLGrTNccrP41q+WXAmgJoJsu3QrJgLMU4d2zJYyxOsbYOsbYMHWZAAD6AGiC+CUMbTDS6pvJbddzPYB6iIFNTzPG2A7GWC1jbAVj7HbTea39VveJ6dJt55zvkqQ7iDEW5ngV1rNOyzFEfRYnIL6dAHAdY6ySMVbGGJvAGDvSdD5T257qMS7d2q9xPYCVnPPfJedOYYztU5//r4yxM0zn3fbxuLbr0iWz7fq5LWP7PAl2yaUd4m1LAKAY3rRdaQNj7CgIO5tX1K8SQHzl3AvgYgibgE0AvmbC+UCjHYAqznmFKUtNE9Bel65EUnQY92wrhB3IjRB2dj8A+BeA/6jn26l/S0zXydqUaW2XcR2AaZzzTbpjqwE8AGEDciGAWQDeZMKpQsPLfTKn0dI1hhgkwyKsZ52u78SdEAL5S6bjowDcAeAsiCWlAwD8whgboEuTiW0PY4xLp/YDANRlyJNhsqFVmQaxnHoegBsARAD8xBg7XZfGbR9Pedslc1vG9nmysSNcw4SX0ncAfofOwJRz/pop6UjG2GQI+6lPUlbBgOGcTwAwQXdoImOsFMBjjLEnQ6pWKDDGjoP4gn1Gf5xzbl6OGcsYA4AHGGMvcM7LUlRFIkUwxk4B8DyAFznnv+jPcc7v0v38hTE2DsAKAP+EMILPSLJ1jPPBtRACW9wyLOfc4B3PGBsFYCGAx2ByMkk3rOa2TIU0dsmlGMKd20w7CHVvxsAYawNh5F4O4ELOeY3DJd8C6KGznyoGkMcYa2pKp30V7dalayvJL13u2Tfq3yMR+yJra0oja1Omt/16AJUQHo9OfANhp6JpabzcJ3MaLV0NgH3uqpoUwnrWafVOMMYOBTASwoPwAaf06pLbZIiQEhoZ2XYJyR7j0rH91wGYxTlf65RQXT4cCffPXt/HU9Z2m7ktY/s8CXbJZTli6+8AokbYfSC+YjMC9YUdBaAzgHMl9hFu0OwPDjYdHwBgL0T4BC0dY/Eu7QOQfvdsDYSrvqxNQKy+Gd121abwKggPuj0+srBrPwBwXbrOjDHzcsMACJueoIzc/RDWs06bMYQx1gdCgz0PwA2qA4UfMq7tLsnaZ6+WfThErDs/ThMabvt4XNt16QJru8PclrF9ngS75DIWwBmMsQ66Y5dAeBQF4a6ddNRJ/RsAhwIYzDlf7+KaCISL93qd59FMCI+gq3TpctV043WTxFiIr5VzdOm6QwRqTId7djVEsMwC9Yt0MlR3dh3XANgGMQECmd/2cwB0hPsB/WqIEBlLAYBzvg5iQDIbm18DYI7uHfkJwjkjej/V8ANDEPKzD/FZp8UYoi5V/QTR1os559Uur+sI4AyIILQaGdV2GSka49Kt/ddBaNXMXqJSVGHkYhifvds+PhbAICZCImnpjoMIbh5I253mtkzu8xFF8fvRRTjBRLyfJQAKIeJ4dQbwMoQr89Xh1cw9jLH3APwZIr7QL6bTyyDUw59CBF5cDfHi3gLhbHCD3gaLxYI4PgjRKW6BiFd0LOd8oS7djxBxkoYC2AMRpbwdgEE8hTt2MMYmQHTsJRCD0WCIoL0fc85vVdMcD2A6hJ3NFwBOVOt7O+f8XV1eGdV2PYyxryCM4bual+AZYwUQz59DeJBdBTEBPMI5f1qX7gqICeHfACZCBCb9O0RMp3G6dG9CGF4PhQhtcC9EbK1BnPMtSWxjc8RCDtwO8XV8j/p7Dud8fRjPOhVjiFPbITwEZ6nHrwewXX89j+08cA1EoNdxEFqKXhDLtd0gdh+IahvSpe1qOU7tB0IY49Lh2WvCjuqtugEiptuFknxOhgj0+4Na3/0g+nc+gLM451N1aR37uCp0zYVwqHgQwh/gBYh378QEtMX6OtvObZzzPZna58l5IolwzktUj6DXAXyP2NYgyQy0GjTa18XzknOnAVgE8bXyCMQLWAPxYl/IOR+tT8w5f1E1rL8LQBcIjc75+hdf5RqI3R3ehvhKmQKxdU+qBZvlAP4E4dnXCCKa+AMAXtUScM5nMcYugujU/wdgC4B/6Du9mi7T2g4g+jV9IYBPLewqV0N4wnVVfy8F8CfOuWErNs75t+ok8hDEQL4GwLV6oU7lHxB2Nk9BxIaaA+DMZAp1Kp0hbKb0aL9vAvBJGM86RWOIU9unQgSLBYRtnRltaWkdRFy7lyEmq1IIT8nL9UKdSrq0HXBu/yiEMMalybP/RP33qRAC+j2QsxXiw+4ZAB0g7NVmAziVcz7DlNaxj3POaxlj50JsKfY5YluK/T0IoU7FaW6bmql9njR2BEEQBEEQWQLZ2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCf8Pn/9H/koRoC8AAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df1 = df[df[\"name\"].isin([\"Latency\"])]\n",
+    "ax = df1['issue_to_done'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('Inference time (usec)');\n",
+    "#ax.set(xlim=(0, 25000))\n",
+    "plt.xticks(rotation=60)\n",
+    "plt.show()\n",
+    "\n",
+    "ax = df1['issue_to_done'].plot(figsize=figsize)\n",
+    "ax.set_title('Individual inference time (usec)');\n",
+    "#ax.set(ylim=(0, 200))\n",
+    "plt.show()\n",
+    "\n",
+    "\n",
+    "# df1['issue_to_done'].describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAApwAAAFKCAYAAACwxI8KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA8y0lEQVR4nO3deZgkRZn48e8A0oBcw+24ooL4Ao6iDKyKuIsiIqDIKoegIigs7iIeC4oiIiIiyqh4gNeq4G9REF1PEA9AVJDDwUWG4wURmMVZEbkVbJCZ3x+RNZMkNdPdNZVd3T3fz/P0U12ZUZmRkUe9FRkROW3hwoVIkiRJbVlh0BmQJEnS1GbAKUmSpFYZcEqSJKlVBpySJElqlQGnJEmSWmXAKUmSpFYZcGpSi4hjI2JCje0VEQsj4nODzofUi4g4LSJuGXQ+JqqIeEp1jh8w6Lw0RcS0iLgqIj446LyMRkScHRHfGHQ+ND5WGnQGpKYxBJAHtpqRCSIiVgPeBfwsM3824Oy0KiLWA44EXglsDDwAXAF8MjPPHWTexiIitgSOAZ4LPAG4C7gRuDAzjx1g1iaMiHg58GbgH4G1gbuBy4CvZOa3B5i11kTEDOBfge9k5v+0sIp9gacBn2xh2W34MPDriNgqM68adGbULgNOTUSvb7z/V+B5wBsb0y8B/gs4cTwyNUCrAe+v/v/ZAPPRqogI4HxgPeArwBxgOrAfcE5EfCQz3z3ALI5KRDwfuBCYD5wG/AGYAcwC3g0cO6i8TQQRMQ34PHAw8Fvg05QyWh/YFfjviHhtZn5tcLlszQzKuXwL8D8tLP+dwDcz888tLLvvMvPKiPg1cASPve5rijHg1ISTmf9Vfx8RLwH+sTm95u/t50ptiojHAd+kBJj/lJmX1+Z9HDgDODIi5mTm2eOct8dn5l/H8JGjgb8C22bmnY1lbdjXzE1O76AEm58B3paZC2rzPhIRLwMeN5CcTWIR8Rzg2ZTjbzI5CzguIg7NzPsGnRm1x4BTk1pEHAu8PzOn1abdAlxPqfmcDTwDuAl4a2ZeEBF7AMcBTweuBQ7OzDmN5T4dOB7YEXg8cB3wocz85hjytg+lNmMTIIEjM/O8Rpq1qjR7AhsBt1Fq907IzEci4inAzVXy90dEp6bzdOBjlBqiPTPzW9Xyotr232XmZrX1/D9KIPfk2rRtgQ8ALwBWptQovi8zL2zk8QnAB4GXUwLC3wOfyszP1tLsQKnV2w94KvDvlJrKi4FDMvN3IxTXq4GZwDH1YBOgKodDgJ2r/J7dWOeL6k0NamV2YGaeVps+4j6t2uV9pUqzB7APsEH12RuAwzPz443yeRZwFfDvVZlsClzbDDarbbm98dndKcHXcyg1fH8EvkHZD3+rpTsNeA3lmD0V2AG4DzgxMz8VEc8APkW5E3An8N7M/H9dtuvFlLJ+DTAE/BA4rJmvbiJiP0qwOBP4G/BT4F2ZeXMtzdOAE4AXAutUefkV8JbM/L+IWBU4inI+vKMRbHbKqHmOrFctc3fKrfffUZpYfLGW5imUff6eqlyOoJxPlwBvAuZV8/6Nclz+BHhjvSawdt34KHAS5boxDzg+M786ivJZ6nlSO14BvhIRX6n+/0CnmcUyXnf2AB4BLmjk61ga18hq+gGUY+KpmXlLNW3rav3bAmsAtwM/B/41Mx+s0kwD3kK587QZpby/T7m+/bmxjp0o5b4NMI1yDn02M/+zluwnlOv0zlTntqYmOw1pqtoE+DpwDuU25trA96ovzU8BX6O0sdsEODsiVux8MCK2oLQleybly+dwyhfn2RHxulGu/wXAZynBw3uBVYDvR8T2tfWsSvkCOoDSNOAtlC+LYym3HAHuoHxJAnybctvp9dX8uZR2b/9UW+8/AQuAp1VfgB0vpHxxdNb9z8AvKEHBcZR2k0PAj6svxk66DYBLgZdRAp23Ves9NSK61aS8C3gV5Qvkw5QA6IwlltJir6heu36xZ+a9wHeBLSJi01Es71F62KefBrYGPkQJgm+kBE7d0r4OeIhSUwPldulzImKrUWTtQGCYcky+lbL/30G5Fd+0AnAu5fbzOykBzScj4kDgR8CVlP14H3BaFfw1fZJya/844AuUIOXHEbHy0jIZEe+mHKM3U8puNrA9cHFErF+leVyVj+2BUyg/Ok4FNqTcSoZyXqwLfC0zR7wzERGrUM6RA4Ezq+2+HfhClaem1wCHUWpPP0Y57s+m/FB5BWXff54SFH68y+c3Ab5FadpxJHAPcHr143Fp+RzNeXId5ZoDpew75/J/V8tY1uvOdpQfOg+OIm23bVifEvxtWq3/LZTjcCYl+O34LKXsLqu28wuUH8wXVvurs7zXU46HDavlvQu4HNitseprgQcpx4amMGs4NVVtRqnR+wVARFxHufh9GdiiUysTEfdQvoBeRKmxgfKlPB/YpnbxPiUifgycGBFnZOZIHZtmAttl5q+q9ZxG6TRyIuULGUpgsTmwdWZeX037QkTcDBwfESdlZkbENykX+d92aW5wMY8OOF9IqbXaoZp+VkQ8CXgyJXiqt6H7JbBTZ1uqnvW/odQmbVct73hKIPrMzLyjmva5iPgicFREfCYz76mtfxVgq8x8qFrm3ZSgaGZmzl1KeW0J3JuZty4lzVW1tDctJV03Y92nfwF2aARFXwU+GxFbZua1ABGxAqWjxjmZeVeV7qPATkCnfdovKEHT+fVay8prM/OB2vvPR8SNlP3/zsz839q8xwFnZeYHq3V/vdqmLwGvz8wzquk/odTUHUD326s7ZOZwlfaa6vP7A//ZJS0RsTGl5u7YzDyuNv1M4BrKcXwUZb9sAuzVqJE7vvb/ltXrb7utq4t/pZxLB2Tm6dV6T6Wcy8dGxBcbNcn/ADytc0xWPyTfQ2kH/ZzMfLiavgHwmog4pBGgbQbsl5lfr9J9gXJOnBQRZ3erka1t40jnye0R8UNKsP+rLk2ElvW6sznlLkWvtqP8AN05M39dm965q0JEbAccAryhXusbEedRjvP9KdewNSlB/5XAC+tlXF1/FsnMv0fE/7L42NAUZQ2npqobOsFm5bLq9Wf1W4C16ZsARMQ6wEsoNZOPj4j1On/AecATKbc1R/LrTrAJUH0pfg14QURMrybvTQn6/txYTyfw3WEU6/kF8Kzq1jyUIPMCSm1LJxB9YS0twFZAVPlZt7beNSk1HM+NiNWqL4Y9KbXECxt5/DGwKqUXdt1XO8FmY52bjLAdawD3j5CmM3+NEdI9So/79ItdauDOotRG1js37EAJchbdvs7MCyhl/gNKsHR49f/tVW0ktbQPVHlcISLWqvL0S8rtx627bM5/1j57D+XW9N8otfmd6UmpmetW5p/vBJuVr1ZpX94lbcerKJUTZzXK7l7gasqPNSg1qwA7R8TjuywHyjEGI+/rjt0otfyLgrPMfAQ4mRLgvaSR/luNH0Cd8/u/OsFmbfrjgCc1Pv8nFtdUUwVK/1mle1a3DPZ4njSX0Y/rzrqUOx69urd6fXlVW93N3pQfY+c18ng9pea5cyy8lLKvT2zWuC4haL6b0tRBU5gBp6aqefU31S1ZgP9tpOtM7wSBT6N82R9L+aKr/32sSrPBKNZ/Y5dpN1SvnXaUT6fUhDXXc+kY1vMLynm8fa0m8+fVXz3g/FOtFrXzxfWlLut+W7W8dSltCqdTRgdopuuMndfM47zG+84X4HSW7n5GDiQ78/80QrqmXvbpY2pQM/Nu4HvAfrVamtdRhjw6p5H2ksx8JaUpx7MpNY0LgS9HxIs76SJiZkScS/kSv6fK00XV7LV4tIcz8/8a0+4F/tCl5u1eupf5o47LKqi+GXhKl7QdnePleh5bfttQlV31Q+7jwEGUH1E/jYi3RcS6tWV1gtLR/mh4MqU98iON6ddVr818N4+/0Z73HTd1KcvOedtcV0cv50lTv64700ZOskQXUTruvR+4MyK+HxEHN348PB1YnRJcNvO5YS2PnWYvS7ur0cz3hBpPWf3nLXVNVc0vqJGmdy7UnR9hn6C0l+tmtBfRkaxAqY388BLm/34Uy/g1pf3TP1GCm/sptwDXoNxyXIcScP6ysV4obVuXdAvujmp5UGrPvryEdNc03o9UvktyLfDsiNg4M5tBQ0enhqlTLkv6glqx8b6XfbqkdnBfBfYC/ikiLqN0wPlao1Z3kapW7Srgqoj4FaVt4OuAC6pa6QspPdrfS+kM8yClNus0HlshsKTbub2W+Wh18rEL3UeEWFRWmXl4RHyZ0sHnpZRg6eiI+OeqGUInUHwm8J0+5a+u1/N+WXTKZyznyZKWsSzXnT/T/UfGqM6TquZxr4j4R0qN906U9pnviYjnZeafqnzeSWkr202vNazTWdw5UlOUAaf0aJ1g5u+Z+dOlply6zbpM69QUddop3gSsMYr1LPGXf2Y+HBGd2+drAZdUvbovpQQHr6S0jfpi7WOd2rv7l7buiLiDEsCutIxlMRrfp/Rw359Ht/nr5GVNyrZcmZmdfdT5clu7kfzJjff92qdQbm/+iXJbfUPKbcP/t9RPLNbpfd/pQPMiym3EPTOzU6vZ6dnbls0ot3k761qJMqrARUv8xOLjZV6n7erSZOY1lADrw1F68M9h8VBIv6Tst/0i4oQuNZdNt1I6YK3YSLt59XrLSPkZo00jYoVGLWfnvF3SusZynizpXO7HMXodZV823Q0QEWs3mhs0zxMAsowScTlwTETsQgmAD6a0Ab+JEohempl/WUpeOsfMTErN+BJVx+CTWHKgrSnCW+pSTfUr/kLg4Ih4YnN+1ZNzNLaJMgB453PrUgKqS6pbs1Daim0bEbt2Wc8aETFUve10KlnSbelfUHoe70TVE71qN/VrSk/badR6qFMCgN8B/xERj7m12dnG6gv+m8Ae0aXH9RjKYjS+RQlS3h0R2zTWsyKl09R0qo5PlVspNVf1TlNQekcv0sd92rkFfQalzd6bKLd7L2ks78VVZ6Kmzn7ufAF3Aqj6kF4rAP8x2vz04JDacQUlwF+bRpOAhm9R8npMs8MHLBq2iIhYswoe6q6j1ICuDYuOyw9TAsaPLWF5L43yFCIobV/Xp5w7nfkrUJp+DLO4vXO/bEAZCquzrlUpTQRuYwkdncZ4nnTGc33UudynY/RiYMsqz3Wd4G/ReVLdJn9DYx3Tu+yPK6vXtavXsyhxwzGNdETEirX26T+mNJ94dzM/XdaxJaWz4SVoSrOGU3qsf6NcvH9b9TK9ifJF9FzKxbHbcDNNc4EfRMSnKbUf/0q5zf2eWpqTKEO1fDciTqcEgqtSagX2otx2vCUzH4zSm/g1EXED5ZbWzZnZ6RDxCxYP8VQPLH/O4mFyFj02LjMXRMSbKLV111a3QG+j1Lz9MyUA6jT+fzelY8yvqrK4hvJl+WzgXyhfFMusqql9NaWJwS+rPNWfNPQcyniI/137zL0RcTZwWJTHod5EuRXYra1bP/Zpx1cpNXYvpftTgz4FrB4R36YEXCtQOgC9nrLvTq7SXVy9P706Th6mBLKrjyEvvbgwSg/3p1CGEJpLGde1q8z8fZQhiE4CnhwR36G0N30qpdb5LEo5vJjSq/qblM5M0yjB2xrUOuJQhlTanBI07lDtw/mUwHLnajmdAPOLlHPnS1EGNv89ZSinHYH3ZJexTpfRjZShjJ5DOSdeR+lg99ql9FCH0Z8nN1FqHP8tIv5CuTbMrUZwWNZj9LuU4Z9ezKN/QPyY0rb1SxFxEuXHQ6e96ca1dG8ADq2O25so16IDq/TfBMjMn0fEKcA7q9rrH1EC/6dRjt1jgNMy876IeBulicGvI+JrlGP9GZQmI6+qrXcnyo+SH42wfZrkrOGUGqpevttQOojsz+IxBVcC3jfKxVxcfWYfyjBDw8AembkoIKxqe3YAPkKpfTiZMrzMFpRhaP5YW96bKLf0PkZpK/ZvtXm/otw+/xuLb9vC4h7iFze/LKt8PI/SQenfKUOYvJHSAeYjtXR/onzh/Sfli/4zlGBrI0rv676pyn2rah07UcYzPIkSbL4hM7uV/WGUL9o3U27Fz6NRc1Nb9rLu086y/ofFtV3dnn51BKW3/86U4OqT1facQXn60C3Vcu6i9ML+X0qgcBSl1/f+Y8nPGL2N0sb3/ZThbb4HvHRJbVA7MnM2Zf8/ROkA9XFKIPUzFg/WfRVlSK5dKdv9QUrQuUd9mKTMXJiZb6IEq/OBt1PaCh5O6Ty1e2dYomoYqRdRAuLXUo7/J1AGIm/jkba/p7TL3ZEyvNV0ygMElvqYzdGeJ1Wb3tdTztVTKOfyntW8ZTpGszyL/EpKT/L69Icp++omyj55a5XPzzQWcRHl+rE35Zg9inINenHtxy2Z+RbK9Wgdyh2HEyk/vr5BbdD5LA9deDnlmnIUpTyfT2k+U7c38O1ax05NUdMWLrRjmKSJKSKeSQmcb6WMqzohvpQi4grgocycFINVx+Knyjw/My8dIflyKaonDWXmywadl15FxL6UWuEnt1D723dRnmz0a2BWZv5m0PlRu6zhlDRhZebVlJqwAL4dIzwRZzxExLMpNVFLvA0tDciZlJrMtw84H6P1HuCbBpvLB9twSprQqh7cfWkruiwiYialc9Y7KD3Vu91OlwamGtpoNI9UnRAyc69B50HjxxpOSRqdPSm3pVcFXpOPfiSlJGkpbMMpSZKkVnlLvQdz5swZArYF/o8lP8FCkiRpIliRMsLDFbNmzRoeRAYMOHuzLYuHnJEkSZoMmo86HjcGnL35P4CnP/3prLxye51m586dy8yZM1tb/mRhORSWQ2E5FJZDYTkUlsNilkVRL4eHHnqIG264Aar4ZRAMOHvzCMDKK6/M0NDQSGmXSdvLnywsh8JyKCyHwnIoLIfCcljMsii6lMPAmgHaS12SJEmtMuCUJElSqww4JUmS1CoDTkmSJLXKgFOSJEmtMuCUJElSqww4JUmS1CoDTkmSJLXKgFOSJEmt8klDE9jqa63H7Xc9MOhs9MVqQyuxxuPbewyoJEmauAw4J7BHFk7j/CvmDTobfbHjthsbcEqStJzylrokSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWrVSoNacUTsBbwWmAWsA9wEfBb4fGYuqNKcBryhy8f3ysxvNpZ3BHAosBFwDXBkZp7fSLMGcBKwJ7AKcCFwWGbe0rcNkyRJ0qMMsobzcGAYeCfwcuA7wKeAjzTS/R54fuPvgnqCKtg8ATgF2A24ETgnIrZqLOvrwO7AYcA+wAzg/IhYrV8bJUmSpEcbWA0n8IrMvKP2/sKIWB14S0QcnZnD1fQHM/PSJS0kIoaAo4GTM3N2Ne0i4GrgvcDe1bTnUoLR3TLz3Gra1ZSa1QOAU/u5cZIkSSoGVsPZCDY7fkO51b3OGBa1HbAWcGZt2Y8A3wB2iYhp1eRdgXuB82rp5gEXV/MkSZLUgkHWcHbzQuAu4E+1aZtGxD3A44G5wImZeVZt/hbV63WNZV0DrA48EbitSnd9p31oI93Ofcm9JEmSHmPCBJwRsQ1wIPCBqoYSSo3nFZSgcC3gIODMiFg1M0+r0kwHhjPzwcYi765e16EEnNOBe7qs+m7GVqO6yNy5c3v52Kituub6zJ8/v9V1jJc771yN227uVqk9OnPmzOljbiYvy6GwHArLobAcCsthMcuimEjlMCECzojYCPgWcDm1TkOZ+clG0u9GxAXAB4DTxi2DSzBz5kyGhoZaW/61N85jxowZrS1/PK277npsuNnGPX12zpw5zJo1q885mnwsh8JyKCyHwnIoLIfFLIuiXg7Dw8OtV5KNZODjcEbEWsAPgQeA3TPz4RE+cjawcUSsX72/GxiKiFUa6aZXr3fV0q3dZXnTa2kkSZLUZwMNOKsg8XvABsDLMvPOHhbTabu5RWP6lsD9wB9q6aLWiaie7voe1itJkqRRGFjAGRErUXqSPwvYJTNvHcVnplGGObq11sv9Ekrv831q6Vas0p2XmQuryedSajh3rqV7ErB9NU+SJEktGGQbzlOAVwDvAlaLiOfV5l1LudV9OmWw9t9RgsWDgB2A13cSZuZwRBwPnBARdwBXVuk2BfarpbssIs4BvhQRhwP3AccB85gA7UElSZKmqkEGnJ2axo92mfci4LeUmsujKbfcH6YEk7tn5vfriTNzdkQAvBXYkNKrfbfMvKqx3H2B2ZRB3ocoj7bcKzMf6McGSZIk6bEGFnBm5lNGkeyVY1jebEowubQ09wOHVH+SJEkaBwPvpS5JkqSpzYBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrVppUCuOiL2A1wKzgHWAm4DPAp/PzAW1dLsAHwK2BP4AnJyZn+6yvCOAQ4GNgGuAIzPz/EaaNYCTgD2BVYALgcMy85Z+b58kSZKKQdZwHg4MA+8EXg58B/gU8JFOgoh4PvA94DfALsBXgJMj4s31BVXB5gnAKcBuwI3AORGxVWOdXwd2Bw4D9gFmAOdHxGp93jZJkiRVBlbDCbwiM++ovb8wIlYH3hIRR2fmMHAMcGVmvqmWZmPg/RHxhcxcEBFDwNGUms/ZABFxEXA18F5g72racynB6G6ZeW417WpKzeoBwKktb68kSdJyaWA1nI1gs+M3lFvd61SB5IuBsxppvka5bb519X47YC3gzNqyHwG+AewSEdOqybsC9wLn1dLNAy6u5kmSJKkFE63T0AuBu4A/AZsCKwPXNtJcU71uXr1uUb1e1yXd6sATa+mur7cPraXbHEmSJLViwgScEbENcCDwiaqGcno1655G0rur13Wq1+nAcGY+OIp0zWV10q3TZbokSZL6YJBtOBeJiI2AbwGXU+s0NNHNnTu31eWvuub6zJ8/v9V1jJc771yN227u1opidObMmdPH3ExelkNhORSWQ2E5FJbDYpZFMZHKYeABZ0SsBfwQeADYPTMfrmZ1aijXbnykU/N5Vy3dUESskpl/GyHdxl2yML2WZkxmzpzJ0NBQLx8dlWtvnMeMGTNaW/54Wnfd9dhws27FP7I5c+Ywa9asPudo8rEcCsuhsBwKy6GwHBazLIp6OQwPD7deSTaSgd5Sj4hVKMMebQC8LDPvrM2+CXiIxW00O7asXq+vXjttN7ulu58ydmcnXdQ6EdXTXY8kSZJaMbCAMyJWovQkfxawS2beWp9fDYt0AdWwRjX7An8ErqzeX0Lpfb5PbdkrVp87LzMXVpPPpdSW7lxL9yRg+2qeJEmSWjDIW+qnAK8A3gWsFhHPq827NjPvA44Dfh4RXwTOAF4AHAwc2ultnpnDEXE8cEJE3EEJRA+i9HLfr7PAzLwsIs4BvhQRhwOd5c8DTmt1SyVJkpZjgww4OzWNH+0y70XAzzLzVxHxSspThPYH5gPvyMzP1RNn5uyIAHgrsCFlqKPdMvOqxnL3BWZTBnkfojzacq/MfKA/myRJkqSmgQWcmfmUUaY7l1Hc8q6eMjR7hDT3A4dUf5IkSRoHE2YcTkmSJE1NBpySJElqlQGnJEmSWmXAKUmSpFYZcEqSJKlVBpySJElq1ZgDzojYucvjISVJkqSueqnh/CFwW0ScFBFb9TtDkiRJmlp6CTj3AC4GDgWujIjfRsQRETGjrzmTJEnSlDDmgDMzv5eZe1MeIXkwcAdwInBrRPw4Il4XEav1OZ+SJEmapHruNJSZ92fmlzNzR+DJwFHABsDpwO0R8dWI2LFP+ZQkSdIk1a9e6isCjwOGgGnAg8BLgJ9ExG8iYmaf1iNJkqRJZqVePxgRawF7A68DXgD8HTgHeHf1ugDYHfgE8BVg22XNrCRJkiafMQecEbEHJcjcFVgFuAJ4G/D1zLyrkfw7EbEecOoy5lOSJEmTVC81nP8N/AH4JHB6Zl4/QvrfAmf0sB5JkiRNAb0EnC8Fzs/MhaNJnJmXA5f3sB5JkiRNAWMOODPzp21kRJIkSVNTL4+2/ERE3LiU+TdExEnLli1JkiRNFb0Mi7QbcNZS5p8FvKK37EiSJGmq6SXgfBJwy1Lm31qlkSRJknoKOO8DnrqU+ZtQBn6XJEmSego4LwAOiYiNmzMi4inAIVUaSZIkqadhkY4BdgHmRsRXgGuq6TOBA4BHgPf1JXeSJEma9HoZFunGiHgBcApwWGP2RcBhmZn9yJwkSZImv56epZ6Z1wA7VI+t3KSafFNm3tm3nEmSJGlK6Cng7MjMPwN/7lNeJEmSNAX1FHBGxIrAzpTazenAtEaShZn5wWXMmyRJkqaAMQecEbEN8C3gH3hsoNmxEDDglCRJUk81nKcCqwJ7AL/IzHv6mSFJkiRNLb0EnM8C3puZ3+93ZiRJkjT19DLw+20s+Va6JEmS9Ci9BJwnAgdHxJr9zowkSZKmnl5uqa8D/BX4XUR8E/hfytOF6hZm5knLmjlJkiRNfr0EnCfW/n/zEtIsBAw4JUmS1FPA+dR+rTwingYcATyP8iz26zNzZiPNacAbunx8r8z8ZiPtEcChwEaUZ7wfmZnnN9KsQQmG9wRWAS6kPI7zlj5skiRJkhp6eZb6rX1c/zOA3YDLKO1Jl9Sm9PfAaxvTbqi/qYLNE4CjgCuBg4FzIuK5mXlVLenXga0pz4G/DzgOOD8inpmZDyzb5kiSJKmp50dbRsRmwA7ABsAZmXlLRKxMqV38Y2Y+NIrFfD8zv1st7zRgmyWkezAzL11KXoaAo4GTM3N2Ne0i4GrgvcDe1bTnUgLc3TLz3Gra1cBNwAGUMUYlSZLUR2PupR4RK0TEF4Drgc9Tagg3qWavTAnyDhvNsjJzwVjXvwTbAWsBZ9aW/QjwDWCXiOgM47QrcC9wXi3dPODiap4kSZL6rJdhkY4C3gi8D3g+tTE5M/MvlMdevqovuVts04i4JyIejojfRMQ+jflbVK/XNaZfA6wOPLGW7vouge41wOZ9zbEkSZKA3gLOA4EvZ+YJwO+6zL8a2GyZcvVov6F0LNqD0tHnNuDMiDiglmY6MJyZDzY+e3f1uk4t3T1d1nF3LY0kSZL6qJc2nP8AXL6U+Q8Ca/SWncfKzE82Jn03Ii4APgCc1q/19GLu3LmtLn/VNddn/vz5ra5jvNx552rcdvMdPX9+zpw5fczN5GU5FJZDYTkUlkNhOSxmWRQTqRx6CTj/CDx5KfNnAf3syd7N2cCpEbF+Zt5BqaEciohVMvNvtXTTq9e7qte7gY27LG96Lc2ozZw5k6GhobF+bNSuvXEeM2bMaG3542nddddjw826Ff3I5syZw6xZs/qco8nHcigsh8JyKCyHwnJYzLIo6uUwPDzceiXZSHq5pf4t4N+qXuodCwEiYhdgf0pnnfHUabu5RWP6lsD9wB9q6aLWiaie7vr2sidJkrT86iXgPBaYR2lbeQYl2DwqIi4FfgBcBXy4XxlsqoLFvYFbq9pNgEsovc/3qaVbsUp3XmYurCafC6wN7FxL9yRg+2qeJEmS+qyXgd/vi4jtgP8A9gL+RgnYbqIEoyc1bmsvUUSsxuLhiJ4MrBkRe1bvr6heT6cM1v47SrB4EGX8z9fX8jQcEccDJ0TEHZSB3w8CNgX2q6W7LCLOAb4UEYezeOD3eQy4PagkSdJU1dPA71VAeUL1tyw2oLTHrOu8PxD4HqXm8ugq7cOUYHL3zPx+I0+zIwLgrcCGlKGOdms8ZQhgX2A2ZZD3IcqjLffyKUOSJEnt6PlJQ/1QPb+82Z6y6ZVjWN5sSjC5tDT3A4dUf5IkSWrZmAPOiPjyKJItzMw39ZAfSZIkTTG91HC+mKpXes2KwBOq1zuAvy5jviRJkjRF9NJp6CndpkfE4yi3qd8O7LRMuZIkSdKU0cuwSF1l5sOZ+Rngx8Bn+rVcSZIkTW59CzhrrgL+qYXlSpIkaRJqI+DcCXCIIUmSJAG99VI/Zgmz1qbUbG4NnLgMeZIkSdIU0ksv9WOXMP1uytOG3gx8sdcMSZIkaWrppZd6G7fhJUmSNEUZPEqSJKlVvbTh3LiXFWXmvF4+J0mSpMmtlzact/DYJw2Nxoo9fEaSJEmTXC8B50HAW4EnAV8DbqimB7AvMA/4FLCgHxmUJEnS5NZLwPkEYAh4WmbeXZ8REe8HLgY2yswP9yF/kiRJmuR66TT0ZuALzWATIDPvpAyJ9G/LmjFJkiRNDb0EnOsCqy9l/uOrNJIkSVJPAeelwNsiYlZzRkRsA7wNuGxZMyZJkqSpoZc2nG8BfgZcHhFXADdW0zcDtgXuAg7rS+4kSZI06Y25hjMzrwWeSemJvjawZ/W3NvBJ4JmZeU3/sihJkqTJrJcaTjLzduAd1Z8kSZK0RD0FnB0RsRmwATA3M+/tT5YkSZI0lfQUcEbEfsCJwBOrSTsBF0TEesAlwNGZ+Y3+ZFFTwYIFC7n9rgd6+uyqa67f82fbsNrQSqzx+JUHnQ1JkiaNXp6l/mrgv4CfACcDszvzMvPPEXEdsD9gwKlFhh9+hEt+O7+nz86fP58ZMyZOwLnjthsbcEqSNAa9DIv0XuCnmbkzcHqX+ZcBWy1TriRJkjRl9BJwbgF8eynz/wSs31t2JEmSNNX0EnD+laU/aWhT4M+9ZUeSJElTTS8B5wXAARHxmEZsETEDOBj40bJmTJIkSVNDr204nwD8Gvh3YCGwa0ScCFwNLAA+0LccSpIkaVLr5UlDNwIvAP4IHAtMA/4DeBfwP8D2mTmvf1mUJEnSZDamYZEiYkXK2Ju3Z+ZLI2I68DRK4Pr7zLyjhTxKkiRpEhvrOJwrADcBRwIfz8y7gSv6nitJkiRNGWO6pZ6ZDwPzKe02JUmSpBH10mnoK5Re6qv0OzOSJEmaenp5lvoNwIrA9RFxOvB74MFmIp+lLkmSJOgt4Pyv2v/vW0KahfgsdUmSJDHKgDMiPgWcnplzgBdVk1en1Gw+0uvKI+JpwBHA84CZwPWZObNLul2ADwFbAn8ATs7MT3dJdwRwKLARcA1wZGae30izBnASsCewCnAhcFhm3tLrdkiSJGnJRlvD+RbgUmBOZl4UEetSnpm+U2ZetAzrfwawG3AZpT3pY9qURsTzge8BXwUOp4wBenJEPJyZn6ulOwI4ATgKuJLyxKNzIuK5mXlVbZFfB7YGDgPuA44Dzo+IZ2bmA8uwLZIkSeqil1vqHdP6sP7vZ+Z3ASLiNGCbLmmOAa7MzDdV7y+MiI2B90fEFzJzQUQMAUdTaj5nV8u7iPLko/cCe1fTnksJcHfLzHOraVdThno6ADi1D9skSZKkml56qfdNZi5Y2vwqkHwxcFZj1tcot823rt5vB6wFnFlb9iOUdqS7REQnON4VuBc4r5ZuHnBxNU+SJEl9NtCAcxQ2BVYGrm1Mv6Z63bx63aJ6va5LutUpT0fqpLu+S6B7TW1ZkiRJ6qOx3FLfJCL+sfp/rep184j4S7fEmXn5MuWsmF693tOYfnf1uk4t3XBmNodnqqe7rUrXXFYn3Tpdpi/V3Llzx/qRMVl1zfWZP39+q+sYL8OxzjJty0QqhzvvXI3bbh7MU1znzJkzkPVONJZDYTkUlkNhOSxmWRQTqRzGEnB+oPqre0xPcUrbzoWUsTqntJkzZzI0NNTa8q+9cR4zZsxobfnjaWholZ63Zf78+ROqHNZddz023GzjcV/vnDlzmDVr1rivd6KxHArLobAcCsthMcuiqJfD8PBw65VkIxltwHlgq7lYsk4N5dqN6Z2az7tq6YYiYpXM/NsI6bpFCtNraSRJktRHowo4M/P0tjOyBDcBD1HaXp5Xm75l9Xp99dppu7kF8JtGuvspY3d20u0UEdMyc2Ej3fVIkiSp7yZ0p6HMHAYuoBrWqGZf4I+U8TYBLqH0Pt+nkyAiVqw+d14tuDyXUlu6cy3dk4Dtq3mSJEnqs2UZh3OZRcRqLB6O6MnAmhGxZ/X+isy8lTIw+88j4ovAGZSB3w8GDu30Ns/M4Yg4HjghIu6gBKIHUXq579dZX2ZeFhHnAF+KiMNZPPD7POC0VjdWkiRpOTXQgBPYADi7Ma3z/kDgtMz8VUS8kvIUof2B+cA76k8ZAsjM2REB8FZgQ8pQR7s1njIEpXZ0NmWQ9yHKoy338ilDkiRJ7RhowFk9v3zEJxZVTwUa8ZZ39ZSh2SOkuR84pPqTJElSyyZ0G05JkiRNfgackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVSsNOgPSZLNgwUJuv+uBcV/vqmuu39f1rja0Ems8fuW+LU+SpCWZ8AFnRBwAfKXLrFMy8y21dLsAHwK2BP4AnJyZn+6yvCOAQ4GNgGuAIzPz/Bayrilq+OFHuOS388d9vfPnz2fGjP4FnDtuu7EBpyRpXEymW+ovA55f+5vdmRERzwe+B/wG2IUSoJ4cEW+uL6AKNk8ATgF2A24EzomIrcZjAyRJkpZHE76Gs2ZOZv55CfOOAa7MzDdV7y+MiI2B90fEFzJzQUQMAUdTaj5nA0TERcDVwHuBvVvOvyRJ0nJpMtVwdlUFki8GzmrM+hrltvnW1fvtgLWAMzsJMvMR4BvALhExrf3cSpIkLX8mUw3n3IhYH5gHnAZ8KDP/DmwKrAxc20h/TfW6OfBrYIvq/XVd0q0OPBG4rf/ZliRJWr5NhoDz/4D3A5cDj1DaaL4PeCpwADC9SndP43N3V6/rVK/TgeHMfHAp6cYUcM6dO3csycds1TXXZ/788e+c0obhWGeZtmUilcOybsuy6Od677xzNW67+Y6+LW88zZkzZ9BZmBAsh8JyKCyHxSyLYiKVw4QPODPzR8CPapN+EhH3AsdGxAcHlC0AZs6cydDQUGvLv/bGecyYMaO15Y+noaFVet6W0jt74pTDsmzLsuh3Oay77npsuNnGfVveeJkzZw6zZs0adDYGznIoLIfCcljMsijq5TA8PNx6JdlIJmsbzm9Ur1uzuIZy7UaaTs3nXdXr3cBQRKwyQjpJkiT10WQNOOtuAh5icRvNji2r1+ur107bzW7p7qeM3SlJkqQ+m6wB52uAhZShkoaBC3jssEb7An8ErqzeXwLcC+zTSRARK1afOy8zF7adaUmSpOXRhG/DGRE/ogSUc4EFlE5D/w58KTN/XyU7Dvh5RHwROAN4AXAwcGhmLgDIzOGIOB44ISLuoASiB1F6ue83jpskSZK0XJnwASflVvgbgX+g5PdG4Ejg5E6CzPxVRLyS8hSh/YH5wDsy83P1BWXm7IgAeCuwIWVIpN0y86r2N0OSJGn5NOEDzsx8O/D2UaQ7Fzh3FOlmU3sspiRJkto1WdtwSpIkaZIw4JQkSVKrDDglSZLUKgNOSZIktcqAU5IkSa0y4JQkSVKrDDglSZLUKgNOSZIktcqAU5IkSa0y4JQkSVKrDDglSZLUKgNOSZIktcqAU5IkSa0y4JQkSVKrDDglSZLUKgNOSZIktWqlQWdA0mAsWLCQ2+96YNDZGLNV11z/MflebWgl1nj8ygPKkSRpJAac0nJq+OFHuOS38wedjTGbP38+M2Y8OuDccduNDTglaQLzlrokSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVU+S13SpLdgwUJuv+uBkRNOAqsNreRz4SVNOQackia94Ycf4ZLfzh90Nvpix203NuCUNOV4S12SJEmtMuCUJElSq7ylLkkTyGjbo6665voTut2qbVEl1S13AWdEbAZ8GtgeeBA4EzgyMyfulVvScmO07VHnz5/PjBkT97JlW1RJdctVwBkRawMXArcCewIbAB8H1gdeM7icSZIkTV3LVcAJHAJMB56dmX8GiIi/A2dExAcz85qB5k6SpojxGqpqPJoW2DxAWnbLW8C5K3B+J9isfAv4MrALYMApSX0wXkNVjUfTApsHSMtueQs4t6AEl4tk5nBE3ARsPoblrAjw0EMP9TFrj7VgwSOstMKCVtcxXh75+8M9b8sqj5s2ocphWbZlWfS7HAa1HcuqWzlM1m3pZrTbMtHOi6bx2ifjUQ5/f/ghhodXbHUd/TA8PDzoLEwYlkXRKYdavDKwA3nawoULB7XucRcRDwPvy8wTG9N/CfwpM181muXMmTNne+AXLWRRkiSpLS+cNWvWLwex4uWthrNfrgBeCPwf8MiA8yJJkrQ0KwJPoMQvA7G8BZx3A2t3mT4duH60C5k1a9YwMJBfCJIkST24aZArX96eNHQdpR3nIhExBGzKGAJOSZIkjd7yFnCeC+wYEevWpv0LMFTNkyRJUp8tb52G1gbmArcAH2TxwO/nZ6YDv0uSJLVguarhzMx7gBcDfwH+G/gEcBbwxgFmS5IkaUpbrmo4JUmSNP6WqxpOSZIkjT8DTkmSJLXKgFOSJEmtWt4Gfp/wImIz4NPA9sCDwJnAkZn5wEAzNoKI2At4LTALWIcywOxngc9n5oJaul2ADwFbAn8ATs7MT3dZ3hHAocBGwDWUMji/kWYN4CRgT2AV4ELgsMy8pZFuYGUaEatTxnh9IrBtZv66Nm9/4CjgKZTyOi4zz2p8/nHAccAbKA8tuAJ4W2b+TyPdRsAngZcBC4EfAG/PzD830v0jZWSGWcBdwH9W623liVkR8Xrg7ZT9/QBwJbBvJ1/Lw/EQEXtQ9vMWwF+Bi4F3Z+aNjXRT5niIiKcBRwDPA2YC12fmzC7pJuz+H23elqUcImJF4HBgt2o9KwFXAx9obt9ULocu6WcBlwMPZubqjXkDOQdGc36OZAznxSrAu4HXA/8A/Bk4NzMPbqSbVMeDNZwTSDVs04XAGpSD43BgX+DLA8zWaB0ODAPvBF4OfAf4FPCRToKIeD7wPeA3wC7AV4CTI+LN9QVVJ9EJwCmUC/GNwDkRsVVjnV8HdgcOA/YBZgDnR8RqtWWtzWDL9Fi6/LCLiD2B04FvU8rip8DXq5O57hOUC8r7gVcCD1G2cUZtWSsB5wHPBPYHDgK2A74XEdNq6Tap1nMXZR+dQNlfH+rDdj5GRLyX8qPjvynb+CbKRXGomj/lj4eI2JGy/dcDr6rytjnw04hYs5Zuqh0Pz6Dsq98B13ZLMJH3/2jzNgojlcOqlCDmf4ADgddQvsR/EhEvb+RpKpdDfZ0rUK4bdywhybifA2M4P0cymvNiBcr35/5Vfl4KvIsyuk493aQ7HqzhnFgOoTxm89m1GqC/A2dExAcz85qB5m7pXpGZ9QvEhVXt3lsi4ujMHAaOAa7MzDfV0mwMvD8ivpCZC6onPx1N+dU0GyAiLqL86n8vsHc17bmUk2y3zDy3mnY15ZfnAcCp1ToGVqYRMRN4M/AfwOcbsz8InJ2Z76neXxgRWwAfAH5Yff6J1effmplfrKZdCtxMqTV8V/XZVwNbATM72xMR8yk1abuw+KEG7wTuAfaq9sf5EbEWcExEfDQz7+rjtgcl2P6XzPxBbdZ3av8vD8fDvsCtwBsyc2G1vluBy4AXUO1rpt7x8P3M/G617tOAbbqkmcj7f8S89akcHgSempl3dyZExI+Bp1O+9H9QTZvq5VB3MLAWJdh5a33GAM+BEc/PPpbDgcDzgS0z8w+16WfUymFSHg/WcE4su1IGoa9X+X+LUnM41l9S46oRbHb8hlKFv051gryYMu5p3dcotwO2rt5vR7nYnFlb9iPAN4Bdar9OdwXupfyK7aSbR7mg7Fpb/iDL9BTgM8AN9YkR8VRKLdeZjfRfA7aNiPWr9y8FVqRWZpl5P+VLqLmNV9eDpcy8hBLoNNN9p7qw1tfZ2Tf9dCBwayPYXGQ5Oh4eB9zfCTYr91Sv02BqHg8jffFM5P0/hryNaKRyyMxH6sFmNW0hpcZzRm3ylC6HjohYj1Jr9zZKzWXTuJ8DYzg/RzTKcjiYEtz+YSlpJuXxYMA5sWxBo5q9OhFuohzwk80LKbcq/kR5Xv3KPPY2QueC0Nm+zrPur+uSbnVKW8hOuuu7nMDX8OiyGkiZRmm7+DTg+C6zO9u4pLKIWrrbM/POLumeXt166aTrdntmUVlExOOBjZvpqnY8D9D/snge8NuIODoi/hgRD0fE5RHxz9X85eV4OA3YIiIOi4i1I+IpwGzK9nTaWi0Px0PTRN7/o81bK6r9uB2P3ublpRw+AvwyM89bwvxBnAOjPT+XWZT2qVsDt0TE6RHxl4j4a0R8p6pJ7JiUx4MB58QyncW1H3V3UzriTBoRsQ2llusT1S+v6dWsexpJO7/uO9s3HRjOzAdHka65rE66elmNe5lWt2ROAt6VmX/pkmQsZdFM00n3OMqFZaR0nWWtvYR1NtP1y0bATpRj4K3AK4D7gPOqoGu5OB4y80JK280PVeu4GXgqsFOtVmV5OB6aJvL+H23e2nIYJYj5WG3alC+Hqn3gvsA7lpJsEOfAeJbDupTtOJJyDX01pe37VsC5UdqmdvI06Y4HA071XZTegd+i9DL8yAjJp6LjgRsz84wRU05dK1Au/q/OzG9UNRa7U4LOdw40Z+MoIrYDvgp8iXI7ai9gAaXzwqqDzJsmnuoOwEeB2Zn5i0HnZ7xE6a1/KvDxzPz9oPMzQJ2Y7C/AHpn5o8w8k3LdeAbwLwPLWR8YcE4sd7P4V1fddMqt6Qmvqt37IeWWxO6Z+XA1q/MraO3GRzq/mu6qpRuKMizESOmay+qkq5fVuJZpRDyD0qj9fdUt1LVZ/Kt79ShDVIylLJppOukeZnGvxdFs4z1LWGczXb/cDdyZtaFKsgyvcSllOJDl4nigjNRwYWa+IzMvzMxvUhrxP4cy5EknT3TJ11Q6Hpom8v4fbd76KiKeBXyX0rHuyMbsqV4OBwNPAE6tXTdXgdKDuvbjbBDnwHiWwz2UIZwurtdeZhlO7z7KtbOTp0l3PBhwTizXsbhtBrCooe6mlGFVJrTq4P8esAHwskY7m5sojcC3aHxsy+q1s32dNind0t1PGTKkky5qjaPr6eplNd5luhll9IcLKSfo3cD3q3kXAr9g6dsIkNXrdcAGEdG8VbElcEOtXc5jtrGW7nqAzPwrMK+ZLiKeDKxG/8tiab29V2H5OR62pHQAWSQzb6OMq7dpLU8088XUOh6aJvL+H23e+iYiNgV+RBmn9vWNTmYw9cthc2BDynZ0rptHAo+v/v9wLd/jfQ6M9vxcZtWP8luWMHshVRA+Qp4m7PFgwDmxnAvsGBHr1qb9C6XH3LndPzIxVG1LvgE8C9glM2+tz6/aq11ANVxDzb7AHykXWoBLKL3q9qkte8Xqc+fVLsTnUn5p7VxL9yTKoLX1shrvMv0l8KLGX6dN0puBgzLzZsrJuU/js/sCV9R6/P+Ycvt1UZlFGWrqFTx2G59ZDdPRSfc8ygDFzXR7RMTKjXUOs7gDS7/8AFg3Ihb1XKwa6T8fmLMcHQ+3UgaUXqT6QluP6otlOTkeHmUi7/8x5K0vqiZIP66WvUdmduudPdXL4TM89rp5OvC36v/PVOnG/RwYw/nZLz8Atq83uYkyOP1awJxq0qQ8HqYtXNj8IaVBqW4jzKV8EX2QUlP4ccpwBa8ZXM5GFhGfB/6VMg5as+3RtZl5X9Uo/OeUnrtnUMYhPA44NDM/V1tWZ0Db91AO4oMojaefm5lX1dL9gHJr8nDK7YbjKNX7z6x+KU6IMo2IHSi1m4ueNBTlyUxnUX65/4QygPHbKOOl/bD22c9Qbr0eTglejqCM3fbMzJxfpVkJ+DWlsfl7KDWsJwG3Ay/IxeM/bkKpbbuA8kSJqNJ9OjPf3edtXgH4FbA+ZVy4+6tt2JYy1tvvlofjISLeQinrz1Bul65LGT9vfeAZnbsAU+14iDKodGfYlUMpNSX/Ub2/IjNvncj7f7R5W9ZyoIzg8atq+uso+2iRzLx0eSiHZgVF9ZljgSPysU8aGvdzYLTnZz/KoQoIr6Ls409QAsYTKPty604ztcl4PFjDOYFk5j2UjgV/oTyd5BOUg/yNA8zWaHV+QX2UcgGt/20NkJm/opyo21JuHx0EvKN5oGYZyPYoSu/mH1Jut+xWP4kq+1J+DZ4KnE35lfWSrD2Ka6KWaWaeTenBvSelLHYG9uty8XoH5akbx1OaK6xK2cb5tWX9nfL4trnAf1Ge/nAppQ3twlq63wMvoQQ751ACn49RAsJ+b98CSlvFn7N4/wDskJm/q9IsD8fDKZRBlV9IaZt3MuUpIy+qNzmZgsfDBpR9cDawA/Ck2vsXVeufsPt/tHnrQzlsSOmBvDrl+GheO5eXchiLcT8HxnB+jmQ058X/Vv9Pq6Z/hlKJ85Jc3CdiUh4P1nBKkiSpVdZwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVv1/xVCuu0pwNnUAAAAASUVORK5CYII=\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoIAAAFKCAYAAACJoz5RAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAw2ElEQVR4nO3debxtc/348deNXMksEpUy9Ea3+HWpDPVrki6l4WuIypeifJMkU5Mh5Kf4lpKiCRWRVF8iDcZKGU4D1/CmK3y5kXmI7sW9vz8+a3eWbZ9pn2Gfe9br+Xicx7p7rc9a+7M++3PPfp/PtKYtXLgQSZIkNc8zep0BSZIk9YaBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISuMgIg6NCNdm0tNExOsiYmFEvK7XeZmsIuLiiLi41/noJCL2jYibI2LxXudlKBGxdUQ8EhEr9zovmrwmfUWWJosRBHa7jmtGRiEipgHvA3YDNgCeCcwBzgC+mJmP9jB7wxYRzwb2AbYH1gQeB+4Afgd8KTNv6GH2JoWIWAPYH9gSeD7wJHA98FPg+Mx8oGeZG0cR8WHg0cw8eRyuvQzwSeCgzHxirK8/1jLz3IiYQ8nzx3udH01OBoLS8L2v7fUHgVcD72/bfxnwfeCoicjUcEXEYsBplODpN8DBwGPAa4HPAttHxJsy8x+9y+XQIuKZwCXADOB7wNeAZwHrAlsDvwcaHQhGxJbAWZTg73vA1ZTf9xtRgoL/C7y5ZxkcXx8G7gFOHodrv59S1747DtceLycCR0fEoZn5UK8zo8nHQFAapsz8fv11RLwJeGX7/prJ1mJwACUIPCYz96/t/0ZE/BD4CXASJZiaMFUr5ZKZ+dgwT3kHMBPYtb3Vp+quW25MM7iIiYgXAWdSWkjfkJl3tB3/FKVFWCP3fuC8zPxnrzMyAj8CvkL5v/+tHudFk5CBoDQOIuJQ4JDMnFbbdwulpeoo4BjgpZRu2Y9m5oUR8Q7gMOAlwHXA7pnZ13bdlwBHAG8Enk3p6vtcZv5oiPw8i9JNeCOlRegpMvPsiDgF2DUiXpmZV9TyfHFm7tJ2vYur815X2zcd+ATwXuCFlFaZHwKfrnc5V13sJwIXAZ8BAvhgROwOLJOZL++Q/z8Cj2fmq4C1qt2/6XAfTwD31s5bgxIAvwFYA5gP/Bb4ZGZeU0v3uio/7wHWobT2Lgf8EvgA8E/K5/YeSrmfBexRD17b7usQSpd1Agdm5vntee1wj0N+tlWg+wlgZ+AFlBbdm4DPZ+aPq2QHAMsAW7UHgVUZ3Vm9T/299wA+AqwNPAD8T1VG99XSXAysCmwHfBXYGLizSndGRGxOqdcbALdR6vUvaucfWpXLSymf+9bAAkod2TczHxmifKZVefwg5TN6CDiHUr73VGluoXzO9aEct2bmi6p9w6qjA7z/i4GXA8e37X8R8Dc6/2GyEPhsZh5avV4aOBT4D2C16h6uBQ7OzEtr521MaaXfDFgC6KN0R1/Udv3nVdfbGlgZ+DvwK+DjmfkwQGb+IyKuBt6JgaA6cLKINLHWBH4AnEv5QloeODsidqL81X4apct2TeDMqjsXgIhYD7gceBnwBWBfStBzZkS8d4j33RxYAThtkLFNre6ut430pqov6Z9QgpBzgb0oX7AfBn5aHa97LeUL9Szgo5QA+RTgZRHxlECwuu//U8vfLdV25w7Xbbdx9V4/AvYGvgS8Arik+hJtdwAwixL0fZvS+vgN4JvA+pQv57MogdjTAmrKF/fXq3v/NLAkcE4VJA1oBJ/tIZQ/Fi6hlNthlLJ7ZS3NNsDfMvO3g71n7b0/U+X5LsofC6dTWr4urAKnuuUon++VlLJ6FDg1It5NKePzKfV6qSrvnVpnT6fUxU9V53yQUl5D+TrwRUo57U35XLYFLoqIJas0HwNup5TJ+6qfj1X3OdI62m7TanvVMPI62D3sVeXjw8DngbspwTNVPv8v5Y+cFSmf74HAdOCX9QlGEbEqcAWlLp5VXfdkSl1Yqe19+4BNhnGPaiBbBKWJtQ7w2sz8DUBEXA/8AvgOsF5m/q3a/wClden1wK+rc78MzAU2qrVEHR8RvwSOiohTM3OgCS3rV9u/DJK31rH1B0kzkB2BtwCvz8xLWjsj4irKeMktKK1rLesCr8jMP9fS3kC5x/dSvqxb3keZDHJG9fqnlC/6gyktmBdTWvnO7dACdm57a2lEfI/S4voB2lrGKK0vr8zM+VXalYF3U1pZ3lKV79ciYh1KsHRw2/kzgE0z8/fV+SdTWuyOogTjAxnuZ/tWStfk7p0uEhHLAqtTWvSGVN3fQcAFwJaZ+WS1/8+UYQK7U1r/WlYFds7M71XpfkX5LE4DXpOZv6v2t+r1djy9FeoOSmvlwirt34GDqvGpv6aDiNgU+BDwn5n53dr+8ylB087ANzLzpxFxBHBPhyEbI62j7dattjcPkmYobwW+mZkdJ25UgdqJlPq8Ra2MTgD+BBxJf0B6FKVVcdPMvLx2mUM7BHw3U4Lv51HqmfRvtghKE+vGVhBYaf0Cv7gVBLbtXxMgIlYE3kRpwXh2RDyn9UNphVmd0qU8kGWq7cODpGkdW2aQNAPZntLtfG1b3i4BFlIC2rrL6kEgQDWL9Wxgp4h4Bvz7i3En4Oet7r/M/BclqDoGaM2CPhH434j4fhUMta5Z77pdKiJWonTHJWWcYbvvtYLAyuXVe5zUFmRfDqzWocXsqlYQWL3/vZQgabOIWKHD+430s30QeGnVjdxJ694H+5zr3kQJfr/cCgIr36O0ELaPF30MOLV2f0npSr6xFQRWnlJ/23y1rSy/Um3fOkg+twceAc5vK58bqny216+BrjGSOtpuJUpX9oPDeK+BPAi8KiJWH+D4BpShEqcBK9XyuCzlj5FXVfX4GZSu3p+3BYEAdPiD8P5q+5xR5F1TlC2C0sS6rf4iMx+MCID/bUvX+rJpBQ9rUwKSQ6ufTlahBDidDCfIax3rZtbwSyhfYHcPcHyVttdzBkh3CuUL+/WUVqrXUMZ87VdPVAVY+wP7R8TzKd2/e1PG8C2gtBBRdRkeRmllbO8Kvpenu63tdetz6PT5TKN07d9V239Th2veWG3XoP8LuW4kn+3BlBbRjIjrKK1up2Vmq7uyNSt0uMH8GtX2KfUmM5+MiJuAF7WlvyMzF7Tte5C28qnV607B701tae+JiPs7vFfdS4CleWpZ17XXr4GuMZI62sm06qfbNUL3p9Tx2yLiT5RA/3tVQN3KI5RhCQNZiTLWdVlg9jDft9VC6NqmehoDQWliPTnC/a1f4K3W+y8B5w2QdrAvheuq7cspgUQnrbF59a6vgb44FuOpeX5G9R57D5C+vTtqoBnCv6B82b+XEgi+l9LidM4A6cnM24HTIuJHlIH3746I91djIY+jdOEeR1nW5wFKoHgsnXtEuv18RmPYn21mXhoRa1HGcb6ZEvB+LCI+kZlfyMyHImIuZazheOhF+UApo3sp3fSddAqwO11jJHW03T2U+1mu7f06/h+pj+9tycwzI+I3wNspn99HgQMiYpfMPI3+uvAJyri+Tu5m5DPjWwH5PSM8Tw1gICgtGlrB2RMDjaMawu8oQdBOEfG5tm7Alp2r7Zm1ffdTWr3arcFTA8Y5lK7WCwYZpzikqiXqVGD3iNiHMhngzMycN4xz50fEXygtbM+hzGjdDvhuZn6snrbqph2PL8V1OuxrtfLcOsA5I/psM/N+ysSZ70aZDX4e8NmI+O/qcz0b2CMiNmvrru2klaegv+WSqutxHcq4tLG2Ttt7PYcSqNwyyDlzKGP4/jDU7GIG/uNltHX0+mr7Yp4aCLb+vXxb+jXooJq1fSJwYkQsD/yBMgnpNPpbyh8erC5ExHxK6++MYea9lec7h5leDeIYQWkRUC3yfBElQHra+KIY4hFS1dIYX6B84X+uw/lbA7sAZ9eXVaF8Mb06IpaopX0rZemSujOA5wL/1eHa06M8kWG4TqF0bZ5ICRCesnhvRGzQ6X6rL9VNgPvo7/57krZWqYjYkTLIfjxsFBGb1N5rJcoYx8uqAO5pRvLZVtern/sYZZzckpSFjgGOpoyn+3ZEPO0+I+K51UxhKOPO5gMfbY3LrLyH8nn+bPDb7cpH2iYzfLTanjvIOWdQvq/aJ+cQEYu1jb/8J527pEdbR1tB9Ub1ndUizfdQhifUfbhDPp/SkleNi/0b/UFkH/BX4OOd8tOqC1X3/E+AWRHxqg7p2ltiZ1KCaLuG9TS2CEqLjv+ifBldHRHfpARpqwCvosz0XXuI878AbAgcGBGvBn4MtCZevAe4hhIM1n2L0ip3fpRFp9eidNe2j/H7fpXu+CjLX/yWEoAFZczfdsDFw7nJzLy6atnbnvIl2d6qtQVweEScQ3mKyIOUwHRnSoC3V63F82zKMjMPUbpXNwR2YHQzPwczG/hZRBxHGZf5QUpQ22mpmbrhfrbXR8SllOVb7qFMLtgN+FmrpSwzb46IHSgtu9dVs6RbTxZ5BaV79bIq7T0RcThwOGV5kp9SJnh8hDKLfDzWnVsdOC8iflblf3fgl5n5q4FOqLrEj6eMCX05ZQjBPEq5bEsJEE+ukl8FfDgiDqG0PD6SmecwyjqambdVs6m3oCxdU/ct4BMR8a3q/V/L0ydvLQPcERFnUcr2IcpyQ2+hmpmdmQsi4gOUsYPXRcR3KMvhrEZ5Gsw0+ie1fLLKy8URcSKl2/u5wLsoE0luAYiIVSjDPk4Y6N7UbLYISouIakD5RlTBDWUdvg9TvuAPGsb5T1KCgF2qc46gtLq9j9Iy9Mr2VqtqQeB9KV9qx1Ja3N5K+XKqp1tA+QLanxK4HE3p7no15RFwV4/wdk+ptt/v0IpxFmXpjNUpa6ydSGlVuhl4Z2bWlzvZmzLwfgfKGLyXUr542yd/jJXfUT6THShLfcwD3lFfLLiTEXy2x1KeG3wgJXh4C6Usdmy73nmUbsPTgK0oYySPpgTCn6MEPa20R1AC0ecB/035o+Bk4I3D6ZLvwo6Ubsoj6X/axXaDnlHy+RHKkj8rUu7hKMo4ux8CF9aSHkYZU/pxyv0fV50/FnX0O8BWUZ51XXcYpZ5tS/mDazHKepR1j1I+15dRPtNjKZ/5fpTnZrfu89IqT3+g1IGvUsa53kdZd7CV7u+UPxROp5RpazzsFTx12MN/UFp9z0DqYNrChbYUS00V5bm951CevLH1YK0yEyki9qR8AUZm3jhU+skgqieLZOYevc7LZBT9TxZ5XjVObpFTddfeTHkSyNd7nZ/hqFoxL24fJyu12CIoNVhmPk5pMfgLcFZEvKLHWWrZDfj9ohIEqhmqx7YdReminvRDq6qxv2tTWl+ljiZ9RZY0vjLzn5RHsfVU1d22DWUs1IaUbjZpUsnM/6Z0oU96mXkuZf1FaUAGgpImi5UpY7oeAL6QmWf1NjuSNPU5RlCSJKmhbBHsQl9f33RKV9rfGXhFfUmSpMlgMcrKAFfOnDnzKasBGAh2Z2PgN73OhCRJ0gi8hrKG5r8ZCHbn7wAveclLWGKJJQZNOHv2bGbMGO5TgKYuy6GwHArLoZ9lUVgOheVQWA79xqIs5s+fz4033ghV/FJnINidJwGWWGIJpk+fPmTi4aRpAsuhsBwKy6GfZVFYDoXlUFgO/cawLJ42nM11BCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIayieLTGIP/3M+j857otfZGBNLL/ecXmdBkiS1mTSBYEQsDdwArA5snJlX1Y7tDHwKeBEwBzgsM89oO/+ZwGHAfwLLA1cCe2fmn9vSrQp8GXgLsBD4GfCxzLxnPO5rNB6d9wQXXHlbr7MxJjZcc6leZ0GSJLWZTF3Dh9IhMI2IbYFTgJ8As4BfAz+IiFltSb8E7AkcArwdmA9cEBGr1a61OHA+8DJgZ2A3YFPg7IiYNsb3I0mSNKlNihbBiJgB7AF8HDix7fDhwJmZ+cnq9UURsR7wWeDn1fmrV+d/NDO/We37A/A34GPAAdW5/wFsAMzIzGurdHOB31GCzPPG4/4kSZImo8nSIng88FXgxvrOiHgxsC5welv604CNI2Ll6vWbgcWAf3cXZ+bDlG7frWrnbQVc0woCq3SXAbe2pZMkSZryeh4IRsT7gLWBIzocXq/aXte2vxXIRS3dXZl5b4d0L4mIZ9TStV+rlW7dkeRbkiRpUdfTruGIWA44Gtg3Mx+JiPYkK1TbB9r2319tV6yla0/TSvdMYGngoSHSrT/8nBezZ88eVrq+vr6RXhqAZy27MnPnzu3q3MlmwzXX7rocphrLobAc+lkWheVQWA6F5dBvPMui12MEjwBuysxTe5yPrsyYMYPp06cPmqavr4+ZM2d2df277nuU1VZ7tKtzJ6Nuy2EqGU19mEosh36WRWE5FJZDYTn0G4uymDdv3oCNVz0LBCPipZQJHltExPLV7qVb24hYhv6Wv+WBO2unt1oK76u291dp2q0APA48Mox093XYL0mSNGX1cozgOpRA9CJKgHY/cE517CLgN8D11ev12s5tdeNmtb0eWCUiVuyQ7sbMXFBL136tVroburgHSZKkRVYvA8HfAq9v+9mnOrYHsFtm/o0SoO3Qdu6OwJWZeXf1+pfAAmD7VoJqgeq38dQlYc4DXlYtP9NK92rKQtUuHSNJkhqlZ13D1ZM8Lq7vq00W6as9WeRg4IyImAP8irJY9JuBrWvXuiMiTgA+HxFPUJaD2Q+YBhxbe4uzgKuBH0XEJyn3fzTwe6o1CSVJkpqi58vHDCUzzwR2BbYFfgFsCeyUme2B2z7A1ykTUM4GngW8KTPn1q71BOXRcrOB7wMnAX8AtsnMheN8K5IkSZNKr2cNP0VmXkxpxWvffwrlMXODnfs48InqZ7B0d/L0rmZJkqTGmfQtgpIkSRofBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNtXiv3jgi3gV8HFgXWBq4A/gJcHhmPlhLNwv4HLB+lebYzDyuw/X2A/YEVgWuBQ7MzAva0iwDHA1sCywJXATslZm3jPX9SZIkTXa9bBFcEbgU+CDwFuDLwPuBM1sJImIT4GzgT8As4CTg2IjYo36hKgg8Ejge2Bq4CTg3IjZoe88fANsAewE7AKsBF0TEUmN9c5IkSZNdz1oEM/Nbbbsujoh/ASdGxGqZORc4GPhjZn6gSnNRRLwQOCQivpGZCyJiOvAZSkvhMQARcQlwDfBpYPtq36soQeLWmXlete8aYA6wC/C1cbxdSZKkSWeyjRG8p9ouUQV4bwDOaEtzGqX79xXV602B5YDTWwky80ngh8CsiJhW7d4KeBA4v5buNuB31TFJkqRG6VmLYEtELAY8E3gppQXw7My8JSLWB5YArms75dpquy5wFbBe9fr6DumWBlYHbq/S3ZCZCzqk23IMbkWSJGmR0vNAELiX0qIHpbVup+rfK1TbB9rS319tV6ylm5eZjw2S7vYqXfu1WulW7LB/SLNnzx5Wur6+vm4uz7OWXZm5c+d2de5ks+Gaa3ddDlON5VBYDv0si8JyKCyHwnLoN55lMRkCwdcBSwEzKGP9zomILXqao2GaMWMG06dPHzRNX18fM2fO7Or6d933KKut9mhX505G3ZbDVDKa+jCVWA79LIvCcigsh8Jy6DcWZTFv3rwBG696Hghm5p+rf14WEX2U7t530t8lvHzbKa2Wwvuq7f3A9IhYMjP/NUS6F3bIwgq1NJIkSY0x2SaL/BlYAKxNmc07n/4xgC3rV9sbqm1rbGCndA9T1h5spYva5JF6uhuQJElqmMkWCG5CydPNmTkPuJBq+ZeaHYE7gT9Wry+jzAbeoZWgmoCyPXB+Zi6sdp9HaV3cspbuBcDm1TFJkqRG6eWTRX4BXECZtfsvYENgf+Bq4KdVssOASyPim8CpwGbA7sCerdm/mTkvIo4AjoyIuykB4m7AWvRPPCEzL4+Ic4FvR8S+wEPV9W8DTh7Pe5UkSZqMejlG8ArgvcCLq9e3ACcAX8zM+QCZ+fuIeDvlqSE7A3OBfTLzhPqFMvOYiAD4KPBcSnC5dWb+pe09dwSOoSwePZ3yiLntMnPqzMiQJEkapl4+WeQg4KBhpDuPYXTdVk8VOWaINA8DH6p+JEmSGm2yjRGUJEnSBDEQlCRJaigDQUmSpIYyEJQkSWooA0FJkqSGMhCUJElqKANBSZKkhhpxIBgRW3Z4Xq8kSZIWMd20CP4cuD0ijo6IDcY6Q5IkSZoY3QSC7wB+B+wJ/DEiro6I/SJitTHNmSRJksbViAPBzDw7M7enPNN3d+Bu4Cjg1oj4ZUS8NyKWGuN8SpIkaYx1PVkkMx/OzO9k5huBNYBPAasApwB3RcR3I+KNY5RPSZIkjbGxmjW8GPBMYDowDXgMeBPwq4j4U0TMGKP3kSRJ0hhZvNsTI2I5YHvgvcBmwBPAucAnqu0CYBvgS8BJwMajzawkSZLGzogDwYh4ByX42wpYErgS2Bv4QWbe15b8pxHxHOBro8ynJEmSxlg3LYI/Bu4Avgyckpk3DJH+auDULt5HkiRJ46ibQPDNwAWZuXA4iTPzCuCKLt5HkiRJ42jEgWBm/no8MiJJkqSJ1c0j5r4UETcNcvzGiDh6dNmSJEnSeOtm+ZitgTMGOX4G8LbusiNJkqSJ0k0g+ALglkGO31qlkSRJ0iTWTSD4EPDiQY6vSVlQWpIkSZNYN4HghcCHIuKF7Qci4kXAh6o0kiRJmsS6WT7mYGAWMDsiTgKurfbPAHYBngQOGpPcSZIkadx0s3zMTRGxGXA8sFfb4UuAvTIzxyJzkiRJGj9dPWs4M68FXlc9Pm7NaveczLx3zHImSZKkcdVVINiSmfcA94xRXiRJkjSBugoEI2IxYEtKa+AKwLS2JAsz8/BR5k2SJEnjaMSBYERsBJwFPJ+nB4AtCwEDQUmSpEmsmxbBrwHPAt4B/CYzHxjLDEmSJGlidBMIvhz4dGaeM9aZkSRJ0sTpZkHp2xm4S1iSJEmLiG4CwaOA3SNi2bHOjCRJkiZON13DKwL/BP4aET8C/pfyNJG6hZl59GgzJ0mSpPHTTSB4VO3fewyQZiFgIChJkjSJdRMIvnjMcyFJkqQJ182zhm8dj4xIkiRpYnX9iLmIWAd4HbAKcGpm3hIRSwCrAndm5vyxyaIkSZLGQzdPFnkGcALwAcoyMguB3wO3AEsA1wCHAf89ZrmUJEnSmOtm+ZhPAe8HDgI2obamYGY+Qnn83LvGJHeSJEkaN90EgrsC38nMI4G/djh+DbDOqHIlSZKkcddNIPh84IpBjj8GLNNddiRJkjRRugkE7wTWGOT4TMCZxZIkSZNcN4HgWcB/VbOGWxYCRMQsYGfgh2OQN0mSJI2jbgLBQ4HbgD8Bp1KCwE9FxB+AnwF/Af7fWGVQkiRJ42PEgWBmPgRsChwJPBf4F7A5sDQlSHxtZj42hnmUJEnSOOhqQenM/BclEDxybLMjSZKkidJN17AkSZKmgG6eLPKdYSRbmJkf6CI/kiRJmiDddA2/gWqWcM1iwPOq7d3AP0eZL0mSJI2zEQeCmfmiTvsj4pnAh4CPAVsMdZ2I2A54D2XdwRWBOcDXgRMzc0Et3Szgc8D6wB3AsZl5XIfr7QfsCawKXAscmJkXtKVZBjga2BZYErgI2Cszbxkqv5IkSVPNmI0RzMzHM/OrwC+Brw7jlH2BecD+wFuBnwJfAT7fShARmwBnU5aqmQWcBBwbEXvUL1QFgUcCxwNbAzcB50bEBm3v+QNgG2AvYAdgNeCCiFhqJPcqSZI0FXQ1a3gIfwHeN4x0b8vMu2uvL4qIpYGPRMRnMnMecDDwx9p4w4si4oXAIRHxjcxcEBHTgc9QWgqPAYiISyjPPP40sH2171WUIHHrzDyv2ncNpSVyF+Bro7prSZKkRcx4zBreAnh0qERtQWDLnyhdtitWAd4bgDPa0pxG6f59RfV6U2A54PTatZ+kPN1kVkRMq3ZvBTwInF9Ldxvwu+qYJElSo3Qza/jgAQ4tD7yWEqAd1WV+XgPcB/wDCGAJ4Lq2NNdW23WBq4D1qtfXd0i3NLA6cHuV7ob6+MNaui27zK8kSdIiq5uu4UMH2H8/pZt1D+CbI71oRGwE7Ap8NjOfjIgVqkMPdHgfKBNMAFYA5nV4mkk93e1VuvZrtdKt2GG/JEnSlNbNrOEx706OiFWBs4ArqE0Wmexmz549rHR9fX1dXf9Zy67M3Llzuzp3stlwzbW7LoepxnIoLId+lkVhORSWQ2E59BvPshiPySIjEhHLAT+njCvcJjMfrw61WvSWbzul1VJ4Xy3d9IhYsnr03WDpXtghCyvU0ozIjBkzmD59+qBp+vr6mDlzZjeX5677HmW11YYcbrnI6LYcppLR1IepxHLoZ1kUlkNhORSWQ7+xKIt58+YN2HjVzRjBTsHUkKqJGe3XWpKyPMwqwKaZeW/t8BxgPmVs3/m1/etX2xuqbWts4HqUySb1dA9T1h5spdsiIqZl5sK2dDcgSZLUMN10894C/K2Ln6eIiMUpM3tfDszKzFvrx6vlYy6kWv6lZkfgTuCP1evLKLOBd6hde7HqvPNrQd95lNbFLWvpXgBsXh2TJElqlG66hncDPgq8gLKUy43V/qAEabdRFoZun53b7njgbcABwFIR8erasesy8yHgMODSiPgmcCqwGbA7sGdr9m9mzouII4AjI+JuSoC4G7AWsFPrgpl5eUScC3w7IvYFWte/DTi5i3KQJElapHUTCD4PmA6snZn31w9ExCGUdflWzcz/N8R1Wi1zX+hw7PXAxZn5+4h4O+WpITsDc4F9MvOEeuLMPCYioASoz6UsCbN1Zv6l7bo7AsdQFo+eTnnE3HaZOXUG4kmSJA1TN4HgHsAX24NAgMy8t2q92xsYNBAc6JnFHdKdxzC6bqunihwzRJqHKc9D/tBw3luSJGkq62aM4EqUhZoH8uwqjSRJkiaxbgLBPwB7R8TT5jJXi0LvDVw+2oxJkiRpfHXTNfwR4GLgioi4Erip2r8OsDFlTb69xiR3kiRJGjcjbhHMzOuAl1FmBi8PbFv9LA98GXhZZl470PmSJEmaHLp6skhm3gXsU/1IkiRpETSqR8xFxDqUp4LMzswHxyZLkiRJmgjdTBYhInaKiNsoj2a7FJhZ7X9ORNwYEe1PA5EkSdIkM+JAMCL+A/g+5dm9+wPTWscy855q/85jlUFJkiSNj25aBD8N/DoztwRO6XD8cmCDUeVKkiRJ466bQHA94CeDHP8HsHJ32ZEkSdJE6SYQ/CeDP1lkLeCe7rIjSZKkidJNIHghsEtELNF+ICJWA3YHfjHajEmSJGl8dTtG8HnAVcCHgYXAVhFxFHANsAD47JjlUJIkSeOimyeL3ARsBtwJHEqZNfxx4ADgz8DmmXnb2GVRkiRJ42FEC0pHxGLA6sBdmfnmiFgBWJsSUN6cmXePQx4lSZI0Dkb6ZJFnAHOAA4EvZub9wJVjnitJkiSNuxF1DWfm48BcyrhASZIkLcK6mSxyEmXW8JJjnRlJkiRNnJF2DQPcCCwG3BARpwA3A4+1J8rMH44yb5IkSRpH3QSC36/9+6AB0iwEDAQlSZImsWEFghHxFeCUzOwDXl/tXprSEvjkOOVNkiRJ42i4LYIfAf4A9GXmJRGxEuWZwltk5iXjljtJkiSNm24mi7RMG7NcSJIkacKNJhCUJEnSIsxAUJIkqaFGMmt4zYh4ZfXv5artuhHxSKfEmXnFqHImSZKkcTWSQPCz1U/dcR3STaMsH7NYt5mSJEnS+BtuILjruOZCkiRJE25YgWBmnjLeGZEkSdLEcrKIJElSQxkISpIkNZSBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISpIkNZSBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISpIkNZSBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISpIkNZSBoCRJUkMZCEqSJDXU4r1884hYG9gPeDUwA7ghM2d0SDcL+BywPnAHcGxmHtch3X7AnsCqwLXAgZl5QVuaZYCjgW2BJYGLgL0y85axuzNJkqTJr9ctgi8Ftgb+ClzXKUFEbAKcDfwJmAWcBBwbEXu0pdsPOBI4vrrmTcC5EbFB2yV/AGwD7AXsAKwGXBARS43RPUmSJC0SetoiCJyTmf8DEBEnAxt1SHMw8MfM/ED1+qKIeCFwSER8IzMXRMR04DOUlsJjqutdAlwDfBrYvtr3KkqQuHVmnlftuwaYA+wCfG1c7lKSJGkS6mmLYGYuGOx4FeC9ATij7dBplO7fV1SvNwWWA06vXftJ4IfArIiYVu3eCngQOL+W7jbgd9UxSZKkxuh11/BQ1gKW4OndxtdW23Wr7XrV9voO6ZYGVq+lu6FDAHpt7VqSJEmN0Ouu4aGsUG0faNt/f7VdsZZuXmY+Nki626t07ddqpVuxw/5BzZ49e1jp+vr6RnppAJ617MrMnTu3q3Mnmw3XXLvrcphqLIfCcuhnWRSWQ2E5FJZDv/Esi8keCE5qM2bMYPr06YOm6evrY+bMmV1d/677HmW11R7t6tzJqNtymEpGUx+mEsuhn2VRWA6F5VBYDv3GoizmzZs3YOPVZO8abrXoLd+2v9VSeF8t3fSIWHIY6dqv1Up3X4f9kiRJU9ZkDwTnAPPpHwPYsn61vaHatsYGdkr3MGXtwVa6qE0eqae7AUmSpAaZ1IFgZs4DLqRa/qVmR+BO4I/V68sos4F3aCWIiMWq887PzIXV7vMoLYJb1tK9ANi8OiZJktQYvX6yyFL0L9uyBrBsRGxbvb4yM28FDgMujYhvAqcCmwG7A3u2Zv9m5ryIOAI4MiLupgSIu1FmHe/Uer/MvDwizgW+HRH7Ag9V178NOHlcb1aSJGmS6fVkkVWAM9v2tV7vCpycmb+PiLdTnhqyMzAX2CczT6iflJnHRATAR4HnUpaE2Toz/9J2/R2BYyiLR0+nPGJuu8ycOrMyJqEll3wWd903NYp4qemLs8yzl+h1NiRJGrWeBoLV833bx+t1Sncew+i6rZ4qcswQaR4GPlT9aII8/uRCLrjytl5nY0y8ceMXGghKkqaEST1GUJIkSePHQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhFu91BiZaRKwDHAdsDjwGnA4cmJmP9jRjkiRJE6xRgWBELA9cBNwKbAusAnwRWBl4d+9yJkmSNPEaFQgCHwJWADbMzHsAIuIJ4NSIODwzr+1p7iRJkiZQ08YIbgVc0AoCK2cB84BZvcmSJElSbzStRXA94Dv1HZk5LyLmAOuO4DqLAcyfP39YiefNmzeCS/d74vH5LP6MBV2dO9ksePKJKXMvTzw+n3nzFuv6/G7rw1RjOfSzLArLobAcCsuh32jLohavPO3La9rChQtHdfFFSUQ8DhyUmUe17f8t8I/MfNdwrtPX17c58JtxyKIkSdJ4ec3MmTN/W9/RtBbBsXIl8Brg78CTPc6LJEnSYBYDnkeJX56iaYHg/cDyHfavANww3IvMnDlzHvDbIRNKkiRNDnM67WzaZJHrKeME/y0ipgNrMYJAUJIkaSpoWiB4HvDGiFiptu+dwPTqmCRJUmM0bbLI8sBs4BbgcPoXlL4gM11QWpIkNUqjWgQz8wHgDcAjwI+BLwFnAO/vYbYkSZJ6olEtgpIkSerXqBZBSZIk9TMQlCRJaigDQUmSpIZq2oLSEyIi1gGOAzYHHgNOBw7MzEd7mrEJFBG7ACd1OHR8Zn5kgrMzYSJibWA/4NXADOCGzJzRId0s4HPA+sAdwLGZedxE5nU8DaccIuJk4D87nL5dZv5o3DM5ASJiO+A9wExgRcqCrl8HTszMBbV0U70+DFkOTagPABHxLuDjlOfbL035vH8CHJ6ZD9bSTfU6MWQ5NKVO1EXE0pR1jVcHNs7Mq2rHdgY+BbyI8n/osMw8Y7TvaSA4xqolai4CbgW2pX+JmpWBJi5R8xbgwdrrO3uVkQnyUmBr4HJKi/vTWt0jYhPgbOC7wL7AZsCxEfF4Zp4wgXkdT0OWQ+VmSoBQd+M45mui7Uv5XbA/cBfweuArwJrVvqbUhyHLoTLV6wOUQPhSyvfCfcDLgUOr7ZuhMXViyHKoNKFO1B1Kh9gsIrYFTgGOAn4JvAP4QUQ8lJk/H80bOmt4jEXEgcDBwBqZeU+1byfgVGBGZl7by/xNlFqL4MqtcmiCiHhGWwvHRh1awn4OrJiZr6rt+wbwNmD1ekvRomqY5dBx/1QSEStn5t1t+74I/BewfGbOa0h9GE45nMwUrw8DiYgPAidSPu+5TagTnXQoh5NpUJ2IiBnAHygtpSdSaxGMiOuBazJz+1r6X1L+/7xyNO/rGMGxtxVlgep68HMWMA+Y1ZssaaIM9Qu6eqThGyjrV9adBqwKvGKcsjahpuoX1Ui1Bz+VPwFLAis2qD4MWg4TnJ3JqPV9sURT6sQA/l0OPc1F7xwPfJW2Fs+IeDGlC/30tvSnARtHxMqjeVO7hsfeesB36juqv3bnUD7IppldVdLbgJOBz2XmE73NUk+tRfkld13b/lZL8brAVTTHWhHxAPBsylN/jhqLMS+T3GsoXWH/AILm1od6ObQ0pj5ExGLAMynDKA4Gzs7MWyJifRpUJwYqh1qSRtSJiHgfsDZlSM1GbYfXq7YD1YkAOv2xNSy2CI69FYAHOuy/n2b95ft34BBgF8o4wZ8ABwHf6mGeJoMVqu0Dbfvvr7ZNqiN/okwoeQdlPO3twOnVsIIpKSI2AnYFvpSZT9LQ+tChHKB59eFeymTCqyi/L3eq9jetTgxUDtCQOhERywFHAwdk5iMdkoxrnbBFUOMiM38B/KK261cR8SBwaEQcnplzepQ1TRKZ+eW2Xf8TERcCn6W0Hk8pEbEqZZjIFcDne5ydnhmoHJpWH4DXAUtRZtV/BjgnIrboaY5643V0KIfMfLJBdeII4KbMPLUXb26L4Ni7H1i+w/4VKN0gTfbDajuVx7gMpfUX3PJt+1t/8TW9jpwJvHC0Y14mm+ov/p8DjwLbZObj1aFG1YdBymEgU7I+AGTmnzPzssz8BvBOykzqd9KwOjFIOQxkStWJiHgpsAdwUEQsX608snR1eOmIWIZxrhMGgmPvevr784F/TxBYi7I2kJptDjCftjpCWSsMrCNTTkQsSVkKZBXgLZl5b+1wY+rDEOXQdH8GFlDGiDWmTnTwZ/rLoSnWofTOXkQJ+O4HzqmOXQT8hhJXwMB1IkeTAQPBsXce8MaIWKm2753A9OpYk70bWAj09TojvZKZ84ALge3bDu1IWWPxjxOeqUkiIqZRyuXWAWaZLnIiYnFKS/jLgVmZeWv9eFPqw1DlMMA5U64+DGITyvfxzU2pEwP4dzl0OjhF68RvKa2g9Z99qmN7ALtl5t8ofwDs0HbujsCVoy0LxwiOvROBvShjGQ6nf0HpMzKzfcbPlBURv6D8MptN+QtvFvBh4NuZ2fE/+VQQEUtRlhACWANYtloIFMp/2FuBw4BLI+KblPUlNwN2B/acKsuuDFUO1fYU4AfAXyldHrtRxgu9b8IyOv6Op6z9dgCwVES8unbsusx8iAbUB4YoB0oXVxPqQ+t34wWUGZ//AjakLKp9NfDTKtmUrxNDlUNErEED6kS11NzF9X0R0fpnX+3JIgcDZ1QrkPwKeDtl4e2tR5sHA8ExlpkPRMQbKKvm/5j+R8wd0NOMTbzrgfcDz6fUs5uAA4Fje5inibAKZQxLXev1rsDJmfn7iHg7cCSwMzAX2GcKPTEAhi6HsylPnPlMlfZxSkvHNpl5DlPHltX2Cx2OvR64uCH1YahyuJpm1Acok2TeC7y4en0LcALwxcycD9CQOjFoOUTEwzSnTgwpM8+s/sD+FGUm9Rxgp9E+VQR8sogkSVJjOUZQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGur/A0w6fGYym9kTAAAAAElFTkSuQmCC\n",
+      "text/plain": [
+       "<Figure size 720x360 with 1 Axes>"
+      ]
+     },
+     "metadata": {
+      "needs_background": "light"
+     },
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df1 = df[df[\"name\"].isin([\"QuerySamplesComplete\"])]\n",
+    "df1['delta'] = df1['ts'].diff()\n",
+    "ax = df1['delta'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('Time between QuerySamplesComplete (usec)');\n",
+    "plt.show()\n",
+    "\n",
+    "ax = df1['dur'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
+    "ax.set_title('Time QuerySamplesComplete (usec)');"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.3"
+  },
+  "varInspector": {
+   "cols": {
+    "lenName": 16,
+    "lenType": 16,
+    "lenVar": 40
+   },
+   "kernels_config": {
+    "python": {
+     "delete_cmd_postfix": "",
+     "delete_cmd_prefix": "del ",
+     "library": "var_list.py",
+     "varRefreshCmd": "print(var_dic_list())"
+    },
+    "r": {
+     "delete_cmd_postfix": ") ",
+     "delete_cmd_prefix": "rm(",
+     "library": "var_list.r",
+     "varRefreshCmd": "cat(var_dic_list()) "
+    }
+   },
+   "types_to_exclude": [
+    "module",
+    "function",
+    "builtin_function_or_method",
+    "instance",
+    "_Feature"
+   ],
+   "window_display": false
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc
new file mode 100644
index 000000000..de74eb820
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc
@@ -0,0 +1,124 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "utils.h"
+
+#include <chrono>
+#include <ctime>
+#include <fstream>
+#include <iomanip>
+#include <sstream>
+
+#include "logging.h"
+
+namespace mlperf {
+
+std::string DoubleToString(double value, int precision) {
+  std::stringstream ss;
+  ss.precision(precision);
+  ss << std::fixed << value;
+  return ss.str();
+}
+
+bool FileExists(const std::string filename) {
+  std::ifstream file_object(filename);
+  return file_object.good();
+}
+
+namespace {
+
+std::string DateTimeString(const char* format,
+                           std::chrono::system_clock::time_point tp,
+                           bool append_ms, bool utc) {
+  std::time_t tp_time_t = std::chrono::system_clock::to_time_t(tp);
+  std::tm date_time =
+      utc ? *std::gmtime(&tp_time_t) : *std::localtime(&tp_time_t);
+  constexpr size_t kDateTimeMaxSize = 256;
+  char date_time_cstring[kDateTimeMaxSize];
+  std::strftime(date_time_cstring, kDateTimeMaxSize, format, &date_time);
+  std::string date_time_string(date_time_cstring);
+  if (!append_ms) {
+    return date_time_string;
+  }
+
+  auto tp_time_t_part = std::chrono::system_clock::from_time_t(tp_time_t);
+  auto tp_remainder = tp - tp_time_t_part;
+  auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp_remainder)
+                .count();
+  if (ms < 0 || ms >= 1000) {
+    LogDetail([ms](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+      std::stringstream ss;
+      ss << "WARNING: Unexpected milliseconds getting date and time."
+         << " ms: " << ms;
+      MLPERF_LOG_WARNING(detail, "warning_generic_message", ss.str());
+#else
+      detail("WARNING: Unexpected milliseconds getting date and time.", "ms",
+             ms);
+#endif
+    });
+  }
+  std::string ms_string = std::to_string(ms);
+  // Prefix with zeros so length is always 3.
+  ms_string.insert(0, std::min<size_t>(2, 3 - ms_string.length()), '0');
+  return date_time_string + "." + ms_string;
+}
+
+}  // namespace
+
+std::string CurrentDateTimeISO8601() {
+  return DateTimeString("%FT%TZ", std::chrono::system_clock::now(), false,
+                        false);
+}
+
+std::string DateTimeStringForPower(std::chrono::system_clock::time_point tp) {
+  return DateTimeString("%m-%d-%Y %T", tp, true, true);
+}
+
+std::string EscapeStringJson(const std::string& in) {
+  std::stringstream ss;
+  for (auto c = in.cbegin(); c != in.cend(); c++) {
+    int c_val = static_cast<int>(*c);
+    switch (*c) {
+      case '"':
+        ss << "\\\"";
+        break;
+      case '\\':
+        ss << "\\\\";
+        break;
+      case '\b':
+        ss << "\\b";
+        break;
+      case '\f':
+        ss << "\\f";
+        break;
+      case '\n':
+        ss << "\\n";
+        break;
+      case '\r':
+        ss << "\\r";
+        break;
+      case '\t':
+        ss << "\\t";
+        break;
+      default:
+        if (c_val >= 0x00 && c_val < 0x20) {
+          ss << "\\u" << std::hex << std::setw(4) << std::setfill('0') << c_val;
+        } else {
+          ss << *c;
+        }
+    }
+  }
+  return ss.str();
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h
new file mode 100644
index 000000000..c587e0cbe
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h
@@ -0,0 +1,70 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Various shared utility functions.
+
+#ifndef MLPERF_LOADGEN_UTILS_H
+#define MLPERF_LOADGEN_UTILS_H
+
+#include <algorithm>
+#include <chrono>
+#include <string>
+
+#include "query_sample.h"
+
+namespace mlperf {
+
+template <typename T>
+void RemoveValue(T* container, const typename T::value_type& value_to_remove) {
+  container->erase(std::remove_if(container->begin(), container->end(),
+                                  [&](typename T::value_type v) {
+                                    return v == value_to_remove;
+                                  }),
+                   container->end());
+}
+
+template <typename CountT, typename RatioT>
+double DurationToSeconds(
+    const std::chrono::duration<CountT, RatioT>& chrono_duration) {
+  return std::chrono::duration_cast<std::chrono::duration<double>>(
+             chrono_duration)
+      .count();
+}
+
+inline double QuerySampleLatencyToSeconds(QuerySampleLatency qsl) {
+  return static_cast<double>(qsl) / std::nano::den;
+}
+
+template <typename DurationT>
+inline DurationT SecondsToDuration(double seconds) {
+  return std::chrono::duration_cast<DurationT>(
+      std::chrono::duration<double>(seconds));
+}
+
+std::string CurrentDateTimeISO8601();
+
+/// \brief Uses a format that matches the one used by SPEC power
+/// measurement logging.
+std::string DateTimeStringForPower(std::chrono::system_clock::time_point tp);
+
+std::string DoubleToString(double value, int precision = 2);
+
+bool FileExists(const std::string filename);
+
+// \brief Escape special characters in a string for JSON.
+// Don't use this in performance critical path.
+std::string EscapeStringJson(const std::string& in);
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_UTILS_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc
new file mode 100644
index 000000000..3216c9d72
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc
@@ -0,0 +1,85 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Non-generated version logic.
+
+#include "version.h"
+
+#include "logging.h"
+#include "utils.h"
+
+namespace mlperf {
+
+/// Helper function to split a string based on a delimiting character.
+std::vector<std::string> splitString(const std::string& input,
+                                     const std::string& delimiter) {
+  std::vector<std::string> result;
+  size_t start = 0;
+  size_t next = 0;
+  while (next != std::string::npos) {
+    next = input.find(delimiter, start);
+    result.emplace_back(input, start, next - start);
+    start = next + 1;
+  }
+  return result;
+}
+
+/// Converts the hash-filename pairs to a dict.
+std::map<std::string, std::string> LoadgenSha1OfFilesToDict(
+    const std::string& in) {
+  std::map<std::string, std::string> result;
+  auto files = splitString(in, "\n");
+  for (const auto& file : files) {
+    auto hash_and_name = splitString(file, " ");
+    assert(hash_and_name.size() > 1);
+    result[hash_and_name[1]] = hash_and_name[0];
+  }
+  return result;
+}
+
+void LogLoadgenVersion() {
+  LogDetail([](AsyncDetail& detail) {
+#if USE_NEW_LOGGING_FORMAT
+    MLPERF_LOG(detail, "loadgen_version",
+               LoadgenVersion() + " @ " + LoadgenGitRevision());
+    MLPERF_LOG(detail, "loadgen_build_date_local", LoadgenBuildDateLocal());
+    MLPERF_LOG(detail, "loadgen_build_date_utc", LoadgenBuildDateUtc());
+    MLPERF_LOG(detail, "loadgen_git_commit_date", LoadgenGitCommitDate());
+    MLPERF_LOG(detail, "loadgen_git_log_message",
+               EscapeStringJson(LoadgenGitLog()));
+    MLPERF_LOG(detail, "loadgen_git_status_message",
+               EscapeStringJson(LoadgenGitStatus()));
+    if (!LoadgenGitStatus().empty() && LoadgenGitStatus() != "NA") {
+      MLPERF_LOG_ERROR(detail, "error_uncommitted_loadgen_changes",
+                       "Loadgen built with uncommitted changes!");
+      ;
+    }
+    MLPERF_LOG(detail, "loadgen_file_sha1",
+               LoadgenSha1OfFilesToDict(LoadgenSha1OfFiles()));
+#else
+    detail("LoadgenVersionInfo:");
+    detail("version : " + LoadgenVersion() + " @ " + LoadgenGitRevision());
+    detail("build_date_local : " + LoadgenBuildDateLocal());
+    detail("build_date_utc   : " + LoadgenBuildDateUtc());
+    detail("git_commit_date  : " + LoadgenGitCommitDate());
+    detail("git_log :\n\n" + LoadgenGitLog() + "\n");
+    detail("git_status :\n\n" + LoadgenGitStatus() + "\n");
+    if (!LoadgenGitStatus().empty() && LoadgenGitStatus() != "NA") {
+      detail.Error("Loadgen built with uncommitted changes!");
+    }
+    detail("SHA1 of files :\n\n" + LoadgenSha1OfFiles() + "\n");
+#endif
+  });
+}
+
+}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h
new file mode 100644
index 000000000..87c3409aa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h
@@ -0,0 +1,39 @@
+/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+/// \file
+/// \brief Declares the version-related strings that will be defined in
+/// a version_generated.cc as created by version_generator.py.
+
+#ifndef MLPERF_LOADGEN_VERSION_H
+#define MLPERF_LOADGEN_VERSION_H
+
+#include <string>
+
+namespace mlperf {
+
+// Non-generated.
+void LogLoadgenVersion();
+
+// Definitions generated at compile time.
+const std::string& LoadgenVersion();
+const std::string& LoadgenGitRevision();
+const std::string& LoadgenBuildDateLocal();
+const std::string& LoadgenBuildDateUtc();
+const std::string& LoadgenGitCommitDate();
+const std::string& LoadgenGitStatus();
+const std::string& LoadgenGitLog();
+const std::string& LoadgenSha1OfFiles();
+
+}  // namespace mlperf
+
+#endif  // MLPERF_LOADGEN_VERSION_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py
new file mode 100644
index 000000000..2e7524330
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py
@@ -0,0 +1,141 @@
+# Copyright 2019 The MLPerf Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+
+# \file
+#  \brief A script run by the build to generate the version definitions
+#  expected at link time.
+
+import datetime
+import errno
+import hashlib
+import os
+import sys
+import subprocess
+
+
+# Creates a C++ raw string literal using a delimiter that is very
+# unlikely to show up in a git stats.
+def make_raw_string(str):
+    delimeter = "LGVG_RSLD"
+    return 'R"' + delimeter + "(" + str + ")" + delimeter + '"'
+
+
+def func_def(name, string):
+    return (
+        "const std::string& Loadgen"
+        + name
+        + "() {\n"
+        + "  static const std::string str = "
+        + string
+        + ";\n"
+        + "  return str;\n"
+        + "}\n\n"
+    )
+
+
+# For clients that build the loadgen from the git respository without
+# any modifications.
+def generate_loadgen_version_definitions_git(ofile, git_command):
+    git_rev = os.popen(git_command + "rev-parse --short=10 HEAD").read()
+    git_commit_date = os.popen(
+        git_command +
+        "log --format=\"%cI\" -n 1").read()
+    git_status = os.popen(git_command + "status -s -uno .").read()
+    git_log = subprocess.Popen(
+        git_command + "log --pretty=oneline -n 16 --no-decorate", stdout=subprocess.PIPE, shell=True, encoding='ascii', errors="ignore").stdout.read()
+    ofile.write(func_def("GitRevision", "\"" + git_rev[0:-1] + "\""))
+    ofile.write(func_def("GitCommitDate", "\"" + git_commit_date[0:-1] + "\""))
+    ofile.write(func_def("GitStatus", make_raw_string(git_status[0:-1])))
+    ofile.write(func_def("GitLog", make_raw_string(git_log[0:-1])))
+
+
+# For clients that might not import the loadgen code as the original git
+# repository.
+def generate_loadgen_verstion_definitions_git_stubs(ofile):
+    na = '"NA"'
+    ofile.write(func_def("GitRevision", na))
+    ofile.write(func_def("GitCommitDate", na))
+    ofile.write(func_def("GitStatus", na))
+    ofile.write(func_def("GitLog", na))
+
+
+# Always log the sha1 of the loadgen files, regardless of whether we are
+# in the original git repository or not.
+def generate_loadgen_version_definitions_sha1(ofile, loadgen_root):
+    """Writes definition for Sha1OfFiles."""
+    sha1s = ""
+    loadgen_files = [
+        "/bindings/" + s for s in os.listdir(loadgen_root + "/bindings")
+    ] + ["/" + s for s in os.listdir(loadgen_root)]
+    for fn in sorted(loadgen_files):
+        full_fn = loadgen_root + fn
+        if not os.path.isfile(full_fn):
+            continue
+        file_data = open(full_fn, "rb").read()
+        sha1s += hashlib.sha1(file_data).hexdigest() + " " + fn + "\n"
+
+    ofile.write(func_def("Sha1OfFiles", make_raw_string(sha1s[0:-1])))
+
+
+# Outputs version function definitions to cc_filename.
+# Includes SHA1's of the relevant dirs in the loadgen_root directory.
+def generate_loadgen_version_definitions(cc_filename, loadgen_root):
+    """Generates the C++ source file with the loadgen version info."""
+    try:
+        os.makedirs(os.path.dirname(cc_filename))
+    except OSError as exc:
+        if exc.errno != errno.EEXIST:
+            raise
+    ofile = open(cc_filename, "w")
+    ofile.write("// DO NOT EDIT: Autogenerated by version_generator.py.\n\n")
+    ofile.write("#include <string>\n\n")
+    ofile.write("namespace mlperf {\n\n")
+    # Open and read the VERSION.txt file
+    with open(os.path.join(loadgen_root, "VERSION.txt"), "r") as version_file:
+        # Read and strip any extra whitespace/newlines
+        version_contents = version_file.read().strip()
+
+    # Write the version into the function definition
+    ofile.write(func_def("Version", f"\"{version_contents}\""))
+
+    date_time_now_local = datetime.datetime.now().isoformat()
+    date_time_now_utc = datetime.datetime.utcnow().isoformat()
+    ofile.write(func_def("BuildDateLocal", '"' + date_time_now_local + '"'))
+    ofile.write(func_def("BuildDateUtc", '"' + date_time_now_utc + '"'))
+
+    git_dir = '--git-dir="' + loadgen_root + '/../.git" '
+    git_work_tree = '--work-tree="' + loadgen_root + '/.." '
+    git_command = "git " + git_dir + git_work_tree
+    git_status = os.popen(git_command + "status")
+    git_status.read()
+    is_git_repo = git_status.close() is None
+    if is_git_repo:
+        generate_loadgen_version_definitions_git(ofile, git_command)
+    else:
+        generate_loadgen_verstion_definitions_git_stubs(ofile)
+    generate_loadgen_version_definitions_sha1(ofile, loadgen_root)
+
+    ofile.write("}  // namespace mlperf\n")
+    ofile.close()
+
+
+def main():
+    if len(sys.argv) != 3:
+        raise ValueError("Incorrect command-line arguments.")
+    generate_loadgen_version_definitions(sys.argv[1], sys.argv[2])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py
new file mode 100644
index 000000000..cb558726b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py
@@ -0,0 +1,70 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+TorchScript-friendly boundary types for the HSTU sparse <-> dense interface.
+
+The eager path uses ``Dict[str, SequenceEmbedding]`` (a NamedTuple of
+``lengths`` and ``embedding`` tensors). TorchScript supports ``NamedTuple`` but
+does not script cleanly through ``Dict[str, NamedTuple]`` once the dict crosses
+device boundaries. The packaged sparse / dense modules instead exchange two
+parallel ``Dict[str, Tensor]`` dicts -- one of jagged values, one of lengths.
+
+These helpers convert between the two representations so we can keep the
+existing eager code unchanged while the scripted modules use only TS-friendly
+types at their boundaries.
+"""
+
+from typing import Dict, Tuple
+
+import torch
+from generative_recommenders.modules.dlrm_hstu import SequenceEmbedding
+
+
+# Per-feature jagged values (concatenated across batch, [L_total, table_dim]).
+SeqEmbValues = Dict[str, torch.Tensor]
+# Per-feature per-batch lengths ([B]).
+SeqEmbLengths = Dict[str, torch.Tensor]
+
+
+def flatten_seq_embeddings(
+    seq_embeddings: Dict[str, SequenceEmbedding],
+) -> Tuple[SeqEmbValues, SeqEmbLengths]:
+    """Split ``Dict[str, SequenceEmbedding]`` into parallel value/length dicts.
+
+    Lossless and zero-copy -- the returned tensors alias the inputs.
+    """
+    values: Dict[str, torch.Tensor] = {}
+    lengths: Dict[str, torch.Tensor] = {}
+    for k, v in seq_embeddings.items():
+        values[k] = v.embedding
+        lengths[k] = v.lengths
+    return values, lengths
+
+
+def unflatten_seq_embeddings(
+    values: SeqEmbValues,
+    lengths: SeqEmbLengths,
+) -> Dict[str, SequenceEmbedding]:
+    """Inverse of :func:`flatten_seq_embeddings`.
+
+    Reconstructs ``Dict[str, SequenceEmbedding]`` for code paths (e.g.
+    ``DlrmHSTU.main_forward``) that still consume the NamedTuple form.
+    """
+    out: Dict[str, SequenceEmbedding] = {}
+    for k, val in values.items():
+        out[k] = SequenceEmbedding(lengths=lengths[k], embedding=val)
+    return out
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf b/recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf
new file mode 100644
index 000000000..c6ca854f9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf
@@ -0,0 +1,5 @@
+# Please set these fields depending on the performance of your system to
+# override default LoadGen settings.
+*.Server.target_latency = 80
+# *.Server.min_duration = 20000
+# *.Offline.min_duration = 20000
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py b/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py
new file mode 100644
index 000000000..488af712d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py
@@ -0,0 +1,211 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+import argparse
+import logging
+import os
+import tarfile
+from typing import Dict, List
+from urllib.request import urlretrieve
+
+import numpy as np
+import pandas as pd
+
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("main")
+
+"""
+Usage: mkdir -p data/ && python3 preprocess_public_data.py --dataset kuairand-1k
+"""
+
+SUPPORTED_DATASETS = ["kuairand-1k", "kuairand-27k"]
+
+
+def get_feature_merge_weights(dataset: str = "debug") -> Dict[str, int]:
+    if "kuairand" in dataset:
+        return {
+            "is_click": 1,
+            "is_like": 2,
+            "is_follow": 4,
+            "is_comment": 8,
+            "is_forward": 16,
+            "is_hate": 32,
+            "long_view": 64,
+            "is_profile_enter": 128,
+        }
+    else:
+        return {"dummy": 1}
+
+
+class DataProcessor:
+    def __init__(
+        self,
+        download_url: str,
+        data_path: str,
+        file_name: str,
+        prefix: str,
+    ) -> None:
+        self._download_url = download_url
+        self._data_path = data_path
+        self._file_name = file_name
+        self._prefix = prefix
+
+    def download(self) -> None:
+        return
+
+    def preprocess(self) -> None:
+        return
+
+    def file_exists(self, name: str) -> bool:
+        return os.path.isfile("%s/%s" % (os.getcwd(), name))
+
+
+class DLRMKuaiRandProcessor(DataProcessor):
+    def __init__(
+        self,
+        download_url: str,
+        data_path: str,
+        file_name: str,
+        prefix: str,
+    ) -> None:
+        super().__init__(download_url, data_path, file_name, prefix)
+        if prefix == "KuaiRand-1K":
+            self._log_files: List[str] = [
+                f"{data_path}{prefix}/data/log_standard_4_08_to_4_21_1k.csv",
+                f"{data_path}{prefix}/data/log_standard_4_22_to_5_08_1k.csv",
+            ]
+            self._user_features_file: str = (
+                f"{data_path}{prefix}/data/user_features_1k.csv"
+            )
+        elif prefix == "KuaiRand-27K":
+            self._log_files: List[str] = [
+                f"{data_path}{prefix}/data/log_standard_4_08_to_4_21_27k_part1.csv",
+                f"{data_path}{prefix}/data/log_standard_4_08_to_4_21_27k_part2.csv",
+                f"{data_path}{prefix}/data/log_standard_4_22_to_5_08_27k_part1.csv",
+                f"{data_path}{prefix}/data/log_standard_4_22_to_5_08_27k_part2.csv",
+            ]
+            self._user_features_file: str = (
+                f"{data_path}{prefix}/data/user_features_27k.csv"
+            )
+        self._output_file: str = f"{data_path}{prefix}/data/processed_seqs.csv"
+        self._event_merge_weight: Dict[str, int] = get_feature_merge_weights(
+            prefix.lower()
+        )
+
+    def download(self) -> None:
+        file_path = f"{self._data_path}{self._file_name}"
+        if not self.file_exists(file_path):
+            log.info(f"Downloading {self._download_url}")
+            urlretrieve(self._download_url, file_path)
+            log.info(f"Downloaded to {file_path}")
+            with tarfile.open(file_path, "r:*") as tar_ref:
+                tar_ref.extractall(path=self._data_path)
+                log.info("Data files extracted")
+            os.remove(file_path)
+            log.info("Tar file removed")
+
+    def preprocess(self) -> None:
+        self.download()
+        log.info("Preprocessing data...")
+        seq_cols = [
+            "video_id",
+            "time_ms",
+            "action_weights",
+            "play_time_ms",
+            "duration_ms",
+        ]
+        df = None
+        for idx, log_file in enumerate(self._log_files):
+            log.info(f"Processing {log_file}...")
+            log_df = pd.read_csv(
+                log_file,
+                delimiter=",",
+            )
+            df_grouped_by_user = log_df.groupby("user_id").agg(list).reset_index()
+
+            for event, weight in self._event_merge_weight.items():
+                df_grouped_by_user[event] = df_grouped_by_user[event].apply(
+                    lambda seq: np.where(np.array(seq) == 0, 0, weight)
+                )
+
+            events = list(self._event_merge_weight.keys())
+            df_grouped_by_user["action_weights"] = df_grouped_by_user.apply(
+                lambda row: [int(sum(x)) for x in zip(*[row[col] for col in events])],
+                axis=1,
+            )
+            df_grouped_by_user = df_grouped_by_user[["user_id"] + seq_cols]
+
+            if idx == 0:
+                df = df_grouped_by_user
+            else:
+                df = df.merge(df_grouped_by_user, on="user_id", suffixes=("_x", "_y"))
+                for col in seq_cols:
+                    df[col] = df.apply(
+                        lambda row: row[col + "_x"] + row[col + "_y"], axis=1
+                    )
+                    df = df.drop(columns=[col + "_x", col + "_y"])
+
+        max_seq_len = df["video_id"].apply(len).max()
+        min_seq_len = df["video_id"].apply(len).min()
+        average_seq_len = df["video_id"].apply(len).mean()
+        log.info(f"{max_seq_len=}, {min_seq_len=}, {average_seq_len=}")
+
+        log.info("Merging user features...")
+        user_features_df = pd.read_csv(self._user_features_file, delimiter=",")
+
+        def _one_hot_encode(row):
+            mapping = {category: i + 1 for i, category in enumerate(row.unique())}
+            row = row.map(mapping)
+            return row
+
+        for col in [
+            "user_active_degree",
+            "follow_user_num_range",
+            "fans_user_num_range",
+            "friend_user_num_range",
+            "register_days_range",
+        ]:
+            user_features_df[col] = _one_hot_encode(user_features_df[col])
+
+        final_df = pd.merge(df, user_features_df, on="user_id")
+        final_df.to_csv(self._output_file, index=False, sep=",")
+        log.info(f"Processed file saved to {self._output_file}")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset", choices=SUPPORTED_DATASETS, help="dataset")
+    args = parser.parse_args()
+    if args.dataset == "kuairand-1k":
+        kuairand_processor = DLRMKuaiRandProcessor(
+            download_url="https://zenodo.org/records/10439422/files/KuaiRand-1K.tar.gz",
+            data_path="data/",
+            file_name="KuaiRand-1K.tar.gz",
+            prefix="KuaiRand-1K",
+        )
+        kuairand_processor.preprocess()
+    elif args.dataset == "kuairand-27k":
+        kuairand_processor = DLRMKuaiRandProcessor(
+            download_url="https://zenodo.org/records/10439422/files/KuaiRand-27K.tar.gz",
+            data_path="data/",
+            file_name="KuaiRand-27K.tar.gz",
+            prefix="KuaiRand-27K",
+        )
+        kuairand_processor.preprocess()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/streaming_synthetic_data.py b/recommendation_v4/generative_recommenders/dlrm_v3/streaming_synthetic_data.py
new file mode 100644
index 000000000..bb9e508af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/streaming_synthetic_data.py
@@ -0,0 +1,664 @@
+# pyre-strict
+"""
+Streaming synthetic data generator for DLRMv3.
+
+This module generates synthetic streaming recommendation data for benchmarking
+and testing purposes. It creates user-item interaction histories with timestamps,
+ratings, and category-based item distributions.
+"""
+
+import csv
+import logging
+import math
+import multiprocessing
+import os
+import random
+import shutil
+import time
+from typing import Dict, List, Tuple
+
+import numpy as np
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class StreamingSyntheticDataGenerator:
+    """
+    Generator for streaming synthetic recommendation data.
+
+    Creates realistic user-item interaction data with temporal dynamics,
+    category preferences, and rating distributions for benchmarking
+    recommendation systems.
+
+    Args:
+        num_categories: Number of item categories.
+        categories_per_user: Number of categories each user is interested in.
+        num_users: Total number of users to generate.
+        num_items: Total number of items in the catalog.
+        num_timestamps: Number of time periods in the streaming data.
+        avg_samples_per_item: Average number of interactions per item.
+        train_ratio: Fraction of timestamps used for training.
+        user_sampling_ratio: Probability of sampling a user at each timestamp.
+        num_eval_candidates: Number of candidates for evaluation.
+        num_inference_candidates: Number of candidates for inference.
+        debug: If True, use deterministic ratings for debugging.
+        rank: Process rank for distributed generation.
+    """
+
+    def __init__(
+        self,
+        num_categories: int,
+        categories_per_user: int,
+        num_users: int,
+        num_items: int,
+        num_timestamps: int,
+        avg_samples_per_item: int,
+        train_ratio: float,
+        user_sampling_ratio: float,
+        num_eval_candidates: int,
+        num_inference_candidates: int,
+        debug: bool = False,
+        rank: int = 0,
+    ) -> None:
+        self.num_categories = num_categories
+        self.categories_per_user = categories_per_user
+        self.num_users = num_users
+        self.num_items = num_items
+        self.num_timestamps = num_timestamps
+        self.avg_samples_per_item = avg_samples_per_item
+        self.avg_seq_len_per_timestamp = int(
+            num_items * avg_samples_per_item / num_users / num_timestamps
+        )
+        self.items_per_category: int = num_items // num_categories
+        self.category_to_start_end_item_idx: Dict[int, Tuple[int, int]] = {}
+        for i in range(num_categories):
+            start_idx = i * self.items_per_category
+            end_idx = (i + 1) * self.items_per_category
+            self.category_to_start_end_item_idx[i] = (start_idx, end_idx)
+        self.alpha_range = (1, 500)
+        self.min_seq_len: int = num_eval_candidates + 1
+        self.train_ratio = train_ratio
+        self.num_eval_candidates = num_eval_candidates
+        self.num_inference_candidates = num_inference_candidates
+        self.debug = debug
+        self.total_cnt = 0
+        self.rank = rank
+        logger.warning(f"rank {self.rank}: start generating item rating")
+        np.random.seed(1001)
+        self.item_rating = np.random.choice(  # pyre-ignore [4]
+            [5.0, 4.0, 3.0, 2.0, 1.0], size=num_items, p=[0.2, 0.25, 0.25, 0.2, 0.1]
+        )
+        logger.warning(f"rank {self.rank}: finish generating item rating")
+        self.user_sampling_ratio = user_sampling_ratio
+
+    def generate_one_timestamp(
+        self,
+        category_to_cnt: Dict[int, int],
+        categories: List[int],
+        t: int,
+        id: int,
+        output_folder: str,
+        uih_seq_len: int,
+        eval: bool,
+        inference: bool,
+        file_idx: int,
+        ts_buffers: Dict[int, List[int]],
+    ) -> Tuple[List[int], List[float], List[int], List[float], Dict[int, int]]:
+        """
+        Generate interaction data for a single user at one timestamp.
+
+        Args:
+            category_to_cnt: Running count of interactions per category.
+            categories: Categories this user is interested in.
+            t: Current timestamp index.
+            id: User ID.
+            output_folder: Output directory for files.
+            uih_seq_len: Length of user interaction history to generate.
+            eval: Whether this is for evaluation.
+            inference: Whether this is for inference.
+            file_idx: File index for output.
+            ts_buffers: Buffer for timestamp data.
+
+        Returns:
+            Tuple of (uih_item_ids, uih_ratings, candidate_ids, candidate_ratings,
+            updated_category_counts).
+        """
+        if t >= 0 and (not eval):
+            if t not in ts_buffers:
+                ts_buffers[t] = []
+            ts_buffers[t].append(id)
+        seq_len: int = self.num_inference_candidates if inference else uih_seq_len
+        self.total_cnt += seq_len
+        alpha = random.randint(self.alpha_range[0], self.alpha_range[1])
+        total_cnt = sum(category_to_cnt.values())
+        p = np.array(
+            [
+                (alpha / len(categories) + category_to_cnt[c]) / (alpha + total_cnt)
+                for c in categories
+            ]
+        )
+        item_categories = np.random.choice(categories, size=seq_len, p=p)
+        unique, counts = np.unique(item_categories, return_counts=True)
+        for cat, cnt in zip(unique, counts):
+            category_to_cnt[cat] += int(cnt)
+        sample_end_idx = int(
+            self.items_per_category * max((t + 1), 1) / self.num_timestamps
+        )
+        sample_inds = np.random.randint(0, sample_end_idx, size=seq_len)
+        offsets = np.array(
+            [self.category_to_start_end_item_idx[cat][0] for cat in item_categories]
+        )
+        sample_inds = sample_inds + offsets
+        num_categories = len(categories)
+        quarter = num_categories // 4
+        half = num_categories // 2
+        three_quarter = num_categories // 4 * 3
+        category_to_ratings = {}
+        cos1 = math.cos(t * math.pi / 4)
+        cos2 = math.cos((t + 2) * math.pi / 4)
+        cos3 = math.cos((t + 4) * math.pi / 4)
+        for i, cat in enumerate(categories):
+            if i < quarter:
+                if self.debug:
+                    ratings = np.full(seq_len, 5.0)
+                else:
+                    ratings = np.random.choice(
+                        [4.5 + 0.5 * cos1, 4.0 + 0.5 * cos2],
+                        size=seq_len,
+                        p=[0.8, 0.2],
+                    )
+            elif i < half:
+                if self.debug:
+                    ratings = np.full(seq_len, 4.0)
+                else:
+                    ratings = np.random.choice(
+                        [4.5 + 0.5 * cos1, 4.0 + 0.5 * cos2, 3.5 + 0.5 * cos3],
+                        size=seq_len,
+                        p=[0.1, 0.8, 0.1],
+                    )
+            elif i < three_quarter:
+                if self.debug:
+                    ratings = np.full(seq_len, 3.0)
+                else:
+                    ratings = np.random.choice(
+                        [3.5 + 0.5 * cos1, 3.0 + 0.5 * cos2, 2.5 + 0.5 * cos3],
+                        size=seq_len,
+                        p=[0.1, 0.8, 0.1],
+                    )
+            else:
+                if self.debug:
+                    ratings = np.full(seq_len, 2.0)
+                else:
+                    ratings = np.random.choice(
+                        [2.5 + 0.5 * cos1, 2.0 + 0.5 * cos2, 1.5 + 0.5 * cos3],
+                        size=seq_len,
+                        p=[0.1, 0.8, 0.1],
+                    )
+            category_to_ratings[cat] = ratings
+        sample_inds = sample_inds.tolist()
+        sample_ratings = [
+            (
+                category_to_ratings[item_categories[i]][i]
+                + self.item_rating[sample_inds[i]]
+            )
+            / 2
+            for i in range(seq_len)
+        ]
+        if not inference:
+            sub_indices = random.sample(range(seq_len), self.num_eval_candidates)
+            sample_candidate_inds = [sample_inds[i] for i in sub_indices]
+            sample_candidate_ratings = [sample_ratings[i] for i in sub_indices]
+            sample_uih_inds = sample_inds
+            sample_uih_ratings = sample_ratings
+        else:
+            sub_indices = random.sample(range(seq_len), uih_seq_len)
+            sample_uih_inds = [sample_inds[i] for i in sub_indices]
+            sample_uih_ratings = [sample_ratings[i] for i in sub_indices]
+            sample_candidate_inds = sample_inds
+            sample_candidate_ratings = sample_ratings
+        return (
+            sample_uih_inds,
+            sample_uih_ratings,
+            sample_candidate_inds,
+            sample_candidate_ratings,
+            category_to_cnt,
+        )
+
+    def gen_rand_seq_len(self) -> int:
+        """
+        Generate a random sequence length from a Gaussian distribution.
+
+        Returns:
+            Sequence length, guaranteed to be at least min_seq_len.
+        """
+        seq_len = round(
+            random.gauss(
+                self.avg_seq_len_per_timestamp, self.avg_seq_len_per_timestamp // 4
+            )
+        )
+        seq_len = self.min_seq_len if seq_len < self.min_seq_len else seq_len
+        return seq_len
+
+    def get_timestamp_sample(self, t: int) -> int:
+        """
+        Determine if a user should be sampled at this timestamp.
+
+        Args:
+            t: Timestamp index. Base timestamp (-1) is always sampled.
+
+        Returns:
+            1 if the user should be sampled, 0 otherwise.
+        """
+        if t == -1:
+            sample = 1
+        else:
+            sample = np.random.choice(
+                [1, 0],
+                size=1,
+                p=[self.user_sampling_ratio, 1 - self.user_sampling_ratio],
+            )[0]
+        return sample
+
+    def generate_one_user(
+        self,
+        id: int,
+        output_folder: str,
+        file_idx: int,
+        ts_buffers: Dict[int, List[int]],
+    ) -> List[str]:
+        """
+        Generate complete interaction history for one user.
+
+        Creates training, evaluation, and inference data for a single user
+        across all timestamps.
+
+        Args:
+            id: User ID.
+            output_folder: Output directory.
+            file_idx: File index for output.
+            ts_buffers: Buffer for timestamp metadata.
+
+        Returns:
+            List of CSV row values for this user's data.
+        """
+        categories = random.sample(range(self.num_categories), self.categories_per_user)
+        category_to_cnt = {c: 0 for c in categories}
+        out_list: List[str] = []
+        # t = -1 as base UIH
+        (
+            sample_inds,
+            sample_ratings,
+            sample_candidate_inds,
+            sample_candidate_ratings,
+            category_to_cnt,
+        ) = self.generate_one_timestamp(
+            category_to_cnt=category_to_cnt,
+            categories=categories,
+            t=-1,
+            id=id,
+            output_folder=output_folder,
+            uih_seq_len=self.gen_rand_seq_len(),
+            eval=False,
+            inference=False,
+            file_idx=file_idx,
+            ts_buffers=ts_buffers,
+        )
+        out_list.append(",".join([str(ind) for ind in sample_candidate_inds]))
+        out_list.append(",".join([str(rat) for rat in sample_candidate_ratings]))
+        out_list.append(",".join([str(ind) for ind in sample_inds]))
+        out_list.append(",".join([str(rat) for rat in sample_ratings]))
+        # train
+        for t in range(int(self.num_timestamps * self.train_ratio)):
+            if self.get_timestamp_sample(t):
+                (
+                    sample_inds,
+                    sample_ratings,
+                    sample_candidate_inds,
+                    sample_candidate_ratings,
+                    category_to_cnt,
+                ) = self.generate_one_timestamp(
+                    category_to_cnt=category_to_cnt,
+                    categories=categories,
+                    t=t,
+                    id=id,
+                    output_folder=output_folder,
+                    uih_seq_len=self.gen_rand_seq_len(),
+                    eval=False,
+                    inference=False,
+                    file_idx=file_idx,
+                    ts_buffers=ts_buffers,
+                )
+                out_list.append(",".join([str(ind) for ind in sample_candidate_inds]))
+                out_list.append(
+                    ",".join([str(rat) for rat in sample_candidate_ratings])
+                )
+                out_list.append(",".join([str(ind) for ind in sample_inds]))
+                out_list.append(",".join([str(rat) for rat in sample_ratings]))
+            else:
+                out_list += ["", "", "", ""]
+        # eval
+        (
+            sample_inds,
+            sample_ratings,
+            sample_candidate_inds,
+            sample_candidate_ratings,
+            category_to_cnt,
+        ) = self.generate_one_timestamp(
+            category_to_cnt=category_to_cnt,
+            categories=categories,
+            t=int(self.num_timestamps * self.train_ratio),
+            id=id,
+            output_folder=output_folder,
+            uih_seq_len=self.num_eval_candidates,
+            eval=True,
+            inference=False,
+            file_idx=file_idx,
+            ts_buffers=ts_buffers,
+        )
+        out_list.append(",".join([str(ind) for ind in sample_candidate_inds]))
+        out_list.append(",".join([str(rat) for rat in sample_candidate_ratings]))
+        out_list.append(",".join([str(ind) for ind in sample_inds]))
+        out_list.append(",".join([str(rat) for rat in sample_ratings]))
+        # inference
+        for t in range(
+            int(self.num_timestamps * self.train_ratio), self.num_timestamps
+        ):
+            if self.get_timestamp_sample(t):
+                (
+                    sample_inds,
+                    sample_ratings,
+                    sample_candidate_inds,
+                    sample_candidate_ratings,
+                    category_to_cnt,
+                ) = self.generate_one_timestamp(
+                    category_to_cnt=category_to_cnt,
+                    categories=categories,
+                    t=t,
+                    id=id,
+                    output_folder=output_folder,
+                    uih_seq_len=self.gen_rand_seq_len(),
+                    eval=False,
+                    inference=True,
+                    file_idx=file_idx,
+                    ts_buffers=ts_buffers,
+                )
+                out_list.append(",".join([str(ind) for ind in sample_candidate_inds]))
+                out_list.append(
+                    ",".join([str(rat) for rat in sample_candidate_ratings])
+                )
+                out_list.append(",".join([str(ind) for ind in sample_inds]))
+                out_list.append(",".join([str(rat) for rat in sample_ratings]))
+            else:
+                out_list += ["", "", "", ""]
+        return out_list
+
+    def write_dataset(
+        self, output_folder: str, num_files: int, file_idx: int, seed: int
+    ) -> None:
+        """
+        Write dataset for a single file partition.
+
+        Args:
+            output_folder: Output directory path.
+            num_files: Total number of files in the dataset.
+            file_idx: Index of this file partition.
+            seed: Random seed for reproducibility.
+        """
+        t0 = time.time()
+        num_users_per_file = self.num_users // num_files
+        user_id: int = num_users_per_file * file_idx
+        random.seed(seed + file_idx)
+        np.random.seed(seed + file_idx)
+        # Buffer timestamp data in memory to avoid excessive file I/O
+        ts_buffers: Dict[int, List[int]] = {}
+        output_file = output_folder + f"{file_idx}.csv"
+        with open(output_file, "w") as file:
+            writer = csv.writer(file)
+            for i in range(num_users_per_file):
+                out_list = self.generate_one_user(
+                    id=user_id,
+                    output_folder=output_folder,
+                    file_idx=file_idx,
+                    ts_buffers=ts_buffers,
+                )
+                user_id += 1
+                writer.writerow(out_list)
+                if i % 10000 == 0:
+                    logger.warning(
+                        f"rank {self.rank}: Done with users {i} for file {file_idx + 1} / {num_files}, total_cnt = {self.total_cnt}, spends {time.time() - t0} seconds."
+                    )
+        # Write buffered timestamp data after all users are processed
+        for ts, user_ids in ts_buffers.items():
+            ts_file = output_folder + f"ts_{file_idx}_{ts}.csv"
+            with open(ts_file, "w") as f:
+                writer = csv.writer(f)
+                for uid in user_ids:
+                    writer.writerow([uid])
+        logger.warning(
+            f"rank {self.rank}: Wrote {len(ts_buffers)} timestamp files for file {file_idx}"
+        )
+
+
+def worker(
+    rank: int,
+    world_size: int,
+    num_files: int,
+    num_users: int,
+    num_items: int,
+    num_categories: int,
+    categories_per_user: int,
+    num_timestamps: int,
+    avg_samples_per_item: int,
+    num_eval_candidates: int,
+    num_inference_candidates: int,
+    train_ratio: float,
+    user_sampling_ratio: float,
+    output_folder: str,
+) -> None:
+    """
+    Worker function for parallel data generation.
+
+    Each worker generates a subset of the dataset files.
+
+    Args:
+        rank: Worker rank.
+        world_size: Total number of workers.
+        num_files: Total files to generate.
+        num_users: Total users in dataset.
+        num_items: Total items in catalog.
+        num_categories: Number of item categories.
+        categories_per_user: Categories per user.
+        num_timestamps: Number of time periods.
+        avg_samples_per_item: Average interactions per item.
+        num_eval_candidates: Eval candidates count.
+        num_inference_candidates: Inference candidates count.
+        train_ratio: Training data fraction.
+        user_sampling_ratio: User sampling probability.
+        output_folder: Output directory.
+    """
+    generator = StreamingSyntheticDataGenerator(
+        num_categories=num_categories,
+        categories_per_user=categories_per_user,
+        num_users=num_users,
+        num_items=num_items,
+        num_timestamps=num_timestamps,
+        avg_samples_per_item=avg_samples_per_item,
+        train_ratio=train_ratio,
+        user_sampling_ratio=user_sampling_ratio,
+        num_eval_candidates=num_eval_candidates,
+        num_inference_candidates=num_inference_candidates,
+        debug=False,
+        rank=rank,
+    )
+    num_files_per_rank = num_files // world_size
+    file_indices = [i + rank * num_files_per_rank for i in range(num_files_per_rank)]
+    for file_idx in file_indices:
+        logger.warning(f"rank {rank}: start generating file {file_idx}")
+        generator.write_dataset(
+            output_folder=output_folder,
+            num_files=num_files,
+            file_idx=file_idx,
+            seed=1001,
+        )
+        logger.warning(f"rank {rank}: finish generating file {file_idx}")
+
+
+def write_offset(output_folder: str, num_files: int, num_users: int) -> None:
+    """
+    Write file byte offsets for random access to user data.
+
+    Creates an offset.csv file containing byte positions for each user
+    within their respective data files.
+
+    Args:
+        output_folder: Directory containing data files.
+        num_files: Number of data files.
+        num_users: Total number of users.
+    """
+    with open(output_folder + "offset.csv", "a") as output_file:
+        writer = csv.writer(output_file)
+        for i in range(num_files):
+            input_file = output_folder + f"{i}.csv"
+            offsets = []
+            with open(input_file, "r") as f:
+                while True:
+                    offset = f.tell()
+                    line = f.readline()
+                    if not line:
+                        break
+                    offsets.append(offset)
+            assert len(offsets) == num_users // num_files, (
+                f"num_users {num_users // num_files} != {len(offsets)}"
+            )
+            logger.warning(f"offsets for file {i} finished")
+            writer.writerow([",".join([str(offset) for offset in offsets])])
+
+
+def write_ts_metadata(output_folder: str, total_ts: int, num_files: int) -> None:
+    """
+    Write timestamp metadata for streaming simulation.
+
+    Creates files tracking which users are active at each timestamp
+    and cumulative counts for efficient streaming access.
+
+    Args:
+        output_folder: Output directory path.
+        total_ts: Total number of timestamps.
+        num_files: Number of data files.
+    """
+    with open(output_folder + "requests_per_ts.csv", "w") as file_requests:
+        with open(output_folder + "users_cumsum_per_ts.csv", "w") as file_cumsum:
+            requests_writer = csv.writer(file_requests)
+            cumsum_writer = csv.writer(file_cumsum)
+            for ts in range(total_ts):
+                requests = []
+                num_users_per_file = []
+                for file in range(num_files):
+                    with open(f"{output_folder}ts_{file}_{ts}.csv", "r") as file:
+                        reader = csv.reader(file)
+                        size = 0
+                        for row in reader:
+                            requests.append(int(row[0]))
+                            size += 1
+                        num_users_per_file.append(size)
+                cumsum = np.cumsum(num_users_per_file).tolist()
+                assert cumsum[-1] == len(requests)
+                requests_writer.writerow([",".join([str(r) for r in requests])])
+                cumsum_writer.writerow([",".join([str(s) for s in cumsum])])
+                logger.warning(f"ts {ts} finished")
+    with open(
+        output_folder + "requests_per_ts_offset.csv", "w"
+    ) as file_requests_offset:
+        writer = csv.writer(file_requests_offset)
+        input_file = output_folder + "requests_per_ts.csv"
+        offsets = []
+        with open(input_file, "r") as f:
+            while True:
+                offset = f.tell()
+                line = f.readline()
+                if not line:
+                    break
+                offsets.append(offset)
+        assert len(offsets) == total_ts, f"total_ts {total_ts} != {len(offsets)}"
+        logger.warning("offsets for file requests_per_ts.csv finished")
+        writer.writerow([",".join([str(offset) for offset in offsets])])
+
+
+def copy_sub_dataset(src_folder: str) -> None:
+    """
+    Copy a subset of dataset files for quick testing.
+
+    Creates a sampled_data subdirectory with essential files.
+
+    Args:
+        src_folder: Source folder containing full dataset.
+    """
+    dst_folder = src_folder + "sampled_data/"
+    files_to_copy = [
+        "0.csv",
+        "offset.csv",
+        "requests_per_ts.csv",
+        "requests_per_ts_offset.csv",
+        "users_cumsum_per_ts.csv",
+    ]
+    os.makedirs(dst_folder, exist_ok=True)
+    for filename in files_to_copy:
+        src_path = os.path.join(src_folder, filename)
+        dst_path = os.path.join(dst_folder, filename)
+        shutil.copy2(src_path, dst_path)
+    logger.warning("Files copied successfully.")
+
+
+def main() -> None:
+    """
+    Main entry point for synthetic data generation.
+
+    Configures and launches parallel workers to generate a complete
+    streaming recommendation dataset.
+    """
+    processes = []
+    num_files = 100
+    num_users = 5_000_000
+    num_items = 1_000_000_000
+    num_categories = 128
+    categories_per_user = 4
+    num_timestamps = 100
+    avg_samples_per_item = 50
+    num_eval_candidates = 32
+    num_inference_candidates = 2048
+    train_ratio = 0.9
+    user_sampling_ratio = 0.7
+    world_size = 5
+    username = os.getlogin()
+    output_folder = f"/home/{username}/data/streaming-100b/"
+    for i in range(world_size):
+        p = multiprocessing.Process(
+            target=worker,
+            args=(
+                i,
+                world_size,
+                num_files,
+                num_users,
+                num_items,
+                num_categories,
+                categories_per_user,
+                num_timestamps,
+                avg_samples_per_item,
+                num_eval_candidates,
+                num_inference_candidates,
+                train_ratio,
+                user_sampling_ratio,
+                output_folder,
+            ),
+        )
+        processes.append(p)
+        p.start()
+    for p in processes:
+        p.join()
+    write_offset(output_folder, num_files, num_users)
+    write_ts_metadata(output_folder, num_timestamps, num_files)
+    copy_sub_dataset(src_folder=output_folder)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/debug.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/debug.gin
new file mode 100644
index 000000000..9261dc222
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/debug.gin
@@ -0,0 +1,35 @@
+batch_size = 16
+dataset = "debug"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.75
+
+# train loop variables
+train_loop.num_batches = 10
+train_loop.num_epochs = 1000
+train_loop.output_trace = True
+train_loop.metric_log_frequency = 10
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin
new file mode 100644
index 000000000..46d8e1272
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin
@@ -0,0 +1,41 @@
+batch_size = 16
+dataset = "kuairand-1k"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.75
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+
+# train loop variables
+train_loop.num_epochs = 5
+train_loop.output_trace = True
+train_loop.metric_log_frequency = 10
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
+
+# checkpoint
+# save_dmp_checkpoint.path = "/home/linjianma/ckpts/kuairand_1k"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/kuairand_1k/2025_01_12_17_56_43/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin
new file mode 100644
index 000000000..e2f371de4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin
@@ -0,0 +1,41 @@
+batch_size = 128
+dataset = "movielens-13b"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.75
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+
+# train loop variables
+train_loop.num_epochs = 1
+train_loop.output_trace = True
+train_loop.metric_log_frequency = 10
+train_eval_loop.num_epochs = 1
+train_eval_loop.output_trace = True
+train_eval_loop.metric_log_frequency = 10
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
+save_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_13b"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin
new file mode 100644
index 000000000..094271b57
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin
@@ -0,0 +1,56 @@
+batch_size = 64
+dataset = "movielens-18b"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+make_train_test_dataloaders.num_blocks = 20
+
+# train loop variables
+train_loop.num_epochs = 200
+train_loop.output_trace = False
+train_loop.metric_log_frequency = 40
+train_loop.checkpoint_frequency = 4000
+train_loop.start_batch_idx = 0
+
+# eval loop variables
+eval_loop.metric_log_frequency = 40
+
+# train eval loop variables
+train_eval_loop.num_epochs = 20
+train_eval_loop.output_trace = False
+train_eval_loop.start_train_batch_idx = 0
+train_eval_loop.start_eval_batch_idx = 0
+train_eval_loop.num_eval_batches = 200
+train_eval_loop.metric_log_frequency = 40
+train_eval_loop.checkpoint_frequency = 2000
+train_eval_loop.eval_frequency = 500
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/movielens_18b/"
+save_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin
new file mode 100644
index 000000000..2b6cd6b64
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin
@@ -0,0 +1,38 @@
+batch_size = 128
+dataset = "movielens-1m"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.9, 0.98)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.75
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+
+# train-eval loop variables
+train_eval_loop.num_epochs = 101
+train_eval_loop.output_trace = True
+train_eval_loop.metric_log_frequency = 10
+train_eval_loop.eval_frequency = 1
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin
new file mode 100644
index 000000000..c01fab5af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin
@@ -0,0 +1,56 @@
+batch_size = 64
+dataset = "movielens-20m"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = 2
+make_train_test_dataloaders.prefetch_factor = 4
+make_train_test_dataloaders.num_blocks = 1
+
+# train loop variables
+train_loop.num_epochs = 200
+train_loop.output_trace = False
+train_loop.metric_log_frequency = 40
+train_loop.checkpoint_frequency = 4000
+train_loop.start_batch_idx = 0
+
+# eval loop variables
+eval_loop.metric_log_frequency = 10
+
+# train eval loop variables
+train_eval_loop.num_epochs = 20
+train_eval_loop.output_trace = False
+train_eval_loop.start_train_batch_idx = 0
+train_eval_loop.start_eval_batch_idx = 0
+train_eval_loop.num_eval_batches = 100
+train_eval_loop.metric_log_frequency = 40
+train_eval_loop.checkpoint_frequency = 2000
+train_eval_loop.eval_frequency = 200
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/movielens_20m/"
+# save_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/0.5T"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin
new file mode 100644
index 000000000..7d1df4bce
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin
@@ -0,0 +1,52 @@
+batch_size = 64
+num_workers = 2
+prefetch_factor = 4
+dataset = "streaming-100b"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = %num_workers
+make_train_test_dataloaders.prefetch_factor = %prefetch_factor
+make_train_test_dataloaders.num_blocks = 20
+
+get_dataset.name = %dataset
+get_dataset.new_path_prefix = "/home/linjianma"
+
+make_streaming_dataloader.batch_size = %batch_size
+make_streaming_dataloader.num_workers = %num_workers
+make_streaming_dataloader.prefetch_factor = %prefetch_factor
+
+# train eval loop variables
+streaming_train_eval_loop.num_train_ts = 90
+streaming_train_eval_loop.output_trace = False
+streaming_train_eval_loop.num_eval_batches = 500
+streaming_train_eval_loop.metric_log_frequency = 40
+streaming_train_eval_loop.checkpoint_frequency = 3
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/streaming_100b/run4/"
+save_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_100b/"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_100b/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin
new file mode 100644
index 000000000..872019962
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin
@@ -0,0 +1,63 @@
+batch_size = 64
+num_workers = 2
+prefetch_factor = 4
+dataset = "streaming-200b"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = %num_workers
+make_train_test_dataloaders.prefetch_factor = %prefetch_factor
+make_train_test_dataloaders.num_blocks = 20
+
+get_dataset.name = %dataset
+get_dataset.new_path_prefix = "/home/linjianma"
+
+make_streaming_dataloader.batch_size = %batch_size
+make_streaming_dataloader.num_workers = %num_workers
+make_streaming_dataloader.prefetch_factor = %prefetch_factor
+
+# train loop variables
+train_loop.num_epochs = 200
+train_loop.output_trace = False
+train_loop.metric_log_frequency = 40
+train_loop.checkpoint_frequency = 4000
+train_loop.start_batch_idx = 0
+
+# eval loop variables
+eval_loop.metric_log_frequency = 40
+
+# train eval loop variables
+streaming_train_eval_loop.num_train_ts = 90
+streaming_train_eval_loop.output_trace = False
+streaming_train_eval_loop.num_train_batches = 5000
+streaming_train_eval_loop.num_eval_batches = 200
+streaming_train_eval_loop.metric_log_frequency = 40
+streaming_train_eval_loop.checkpoint_frequency = 2000
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/streaming_200b/"
+# save_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_200b/"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_400m/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin
new file mode 100644
index 000000000..eba17bc23
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin
@@ -0,0 +1,61 @@
+batch_size = 64
+num_workers = 2
+prefetch_factor = 4
+dataset = "streaming-400m"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.80
+make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
+make_train_test_dataloaders.num_workers = %num_workers
+make_train_test_dataloaders.prefetch_factor = %prefetch_factor
+make_train_test_dataloaders.num_blocks = 20
+
+get_dataset.name = %dataset
+get_dataset.new_path_prefix = "/home/linjianma"
+
+make_streaming_dataloader.batch_size = %batch_size
+make_streaming_dataloader.num_workers = %num_workers
+make_streaming_dataloader.prefetch_factor = %prefetch_factor
+
+# train loop variables
+train_loop.num_epochs = 200
+train_loop.output_trace = False
+train_loop.metric_log_frequency = 40
+train_loop.checkpoint_frequency = 4000
+train_loop.start_batch_idx = 0
+
+# eval loop variables
+eval_loop.metric_log_frequency = 40
+
+# train eval loop variables
+streaming_train_eval_loop.num_train_ts = 8
+streaming_train_eval_loop.output_trace = False
+streaming_train_eval_loop.metric_log_frequency = 40
+streaming_train_eval_loop.checkpoint_frequency = 2000
+
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/streaming_400m/"
+# save_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_400m/"
+# load_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_400m/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
new file mode 100644
index 000000000..a483f8766
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -0,0 +1,50 @@
+batch_size = 32
+num_workers = 1
+prefetch_factor = 2
+dataset = "yambda-5b"
+
+# model parameters
+make_model.dataset = %dataset
+
+# dense model optimizer
+dense_optimizer_factory_and_class.learning_rate = 0.001
+dense_optimizer_factory_and_class.optimizer_name = "Adam"
+dense_optimizer_factory_and_class.momentum = 0
+dense_optimizer_factory_and_class.weight_decay = 0
+dense_optimizer_factory_and_class.eps = 1e-8
+dense_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# sparse model optimizer
+sparse_optimizer_factory_and_class.learning_rate = 0.001
+sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
+sparse_optimizer_factory_and_class.momentum = 0
+sparse_optimizer_factory_and_class.weight_decay = 0
+sparse_optimizer_factory_and_class.eps = 1e-8
+sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
+
+# dataloader configs
+make_train_test_dataloaders.batch_size = %batch_size
+make_train_test_dataloaders.eval_batch_size = 32
+make_train_test_dataloaders.dataset_type = %dataset
+make_train_test_dataloaders.train_split_percentage = 0.90
+make_train_test_dataloaders.new_path_prefix = "/apps/chcai/dlrm_data"
+make_train_test_dataloaders.num_workers = %num_workers
+make_train_test_dataloaders.prefetch_factor = %prefetch_factor
+make_train_test_dataloaders.num_blocks = 1
+
+get_dataset.name = %dataset
+get_dataset.new_path_prefix = "/apps/chcai/dlrm_data"
+
+# train-eval loop variables (yambda is non-streaming)
+train_eval_loop.num_epochs = 1
+train_eval_loop.output_trace = False
+train_eval_loop.metric_log_frequency = 50
+train_eval_loop.eval_frequency = 5000
+train_eval_loop.num_eval_batches = 500
+train_eval_loop.checkpoint_frequency = 1000000000  # disable mid-training checkpoints (disk-full guard)
+
+# logger variables
+MetricsLogger.tensorboard_log_path = "/tmp/tb/yambda_5b/"
+MetricsLogger.world_size = 8
+MetricsLogger.auc_threshold = 0.80275
+save_dmp_checkpoint.path = "/apps/chcai/ckpts/yambda_5b/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/train_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/train_test.py
new file mode 100644
index 000000000..dfd58f6e5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/train_test.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+import unittest
+
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.dlrm_v3.train.train_ranker import main
+
+
+class DLRMV3TrainTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_e2e(self) -> None:
+        main()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
new file mode 100644
index 000000000..d17e2992d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -0,0 +1,190 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+import argparse
+import logging
+
+logging.basicConfig(level=logging.INFO)
+import os
+import sys
+import traceback
+
+import gin
+import torch
+from generative_recommenders.dlrm_v3.checkpoint import load_dmp_checkpoint
+from generative_recommenders.dlrm_v3.train.utils import (
+    cleanup,
+    eval_loop,
+    make_model,
+    make_optimizer_and_shard,
+    make_train_test_dataloaders,
+    setup,
+    streaming_train_eval_loop,
+    train_eval_loop,
+    train_loop,
+)
+from generative_recommenders.dlrm_v3.utils import MetricsLogger
+from torch import multiprocessing as mp
+from torchrec.test_utils import get_free_port
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+SUPPORTED_CONFIGS = {
+    "debug": "debug.gin",
+    "kuairand-1k": "kuairand_1k.gin",
+    "movielens-1m": "movielens_1m.gin",
+    "movielens-20m": "movielens_20m.gin",
+    "movielens-13b": "movielens_13b.gin",
+    "movielens-18b": "movielens_18b.gin",
+    "streaming-400m": "streaming_400m.gin",
+    "streaming-200b": "streaming_200b.gin",
+    "streaming-100b": "streaming_100b.gin",
+    "yambda-5b": "yambda_5b.gin",
+}
+
+
+def _main_func(
+    rank: int,
+    world_size: int,
+    master_port: int,
+    gin_file: str,
+    mode: str,
+) -> None:
+    device = torch.device(f"cuda:{rank}")
+    logger.info(f"rank: {rank}, world_size: {world_size}, device: {device}")
+    setup(
+        rank=rank,
+        world_size=world_size,
+        master_port=master_port,
+        device=device,
+    )
+    # parse all arguments
+    gin.parse_config_file(gin_file)
+
+    model, model_configs, embedding_table_configs = make_model()
+    model, optimizer = make_optimizer_and_shard(
+        model=model, device=device, world_size=world_size
+    )
+    train_dataloader, test_dataloader = make_train_test_dataloaders(
+        hstu_config=model_configs,
+        embedding_table_configs=embedding_table_configs,
+    )
+    metrics = MetricsLogger(
+        multitask_configs=model_configs.multitask_configs,
+        batch_size=train_dataloader.batch_size,
+        window_size=2500,
+        device=device,
+        rank=rank,
+    )
+    load_dmp_checkpoint(
+        model=model, optimizer=optimizer, metric_logger=metrics, device=device
+    )
+
+    # train loop
+    try:
+        if mode == "train":
+            train_loop(
+                rank=rank,
+                model=model,
+                dataloader=train_dataloader,
+                optimizer=optimizer,
+                metric_logger=metrics,
+                device=device,
+            )
+        elif mode == "eval":
+            # reinit metrics logger for eval
+            metrics = MetricsLogger(
+                multitask_configs=model_configs.multitask_configs,
+                batch_size=train_dataloader.batch_size,
+                window_size=1000,
+                device=device,
+                rank=rank,
+            )
+            eval_loop(
+                rank=rank,
+                model=model,
+                dataloader=test_dataloader,
+                metric_logger=metrics,
+                device=device,
+            )
+        elif mode == "train-eval":
+            train_eval_loop(
+                rank=rank,
+                model=model,
+                train_dataloader=train_dataloader,
+                eval_dataloader=test_dataloader,
+                optimizer=optimizer,
+                metric_logger=metrics,
+                device=device,
+            )
+        elif mode == "streaming-train-eval":
+            streaming_train_eval_loop(
+                rank=rank,
+                model=model,
+                optimizer=optimizer,
+                metric_logger=metrics,
+                device=device,
+                hstu_config=model_configs,
+                embedding_table_configs=embedding_table_configs,
+            )
+    except Exception as e:
+        logger.info(traceback.format_exc())
+        cleanup()
+        raise Exception(e)
+
+
+def get_args():  # pyre-ignore [3]
+    """Parse commandline."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dataset", default="debug", choices=SUPPORTED_CONFIGS.keys(), help="dataset"
+    )
+    parser.add_argument(
+        "--mode",
+        default="train",
+        choices=["train", "eval", "train-eval", "streaming-train-eval"],
+        help="mode",
+    )
+    args, unknown_args = parser.parse_known_args()
+    logger.warning(f"unknown_args: {unknown_args}")
+    return args
+
+
+def main() -> None:
+    args = get_args()
+    logger.info(args)
+    assert args.dataset in SUPPORTED_CONFIGS, f"Unsupported dataset: {args.dataset}"
+    assert args.mode in [
+        "train",
+        "eval",
+        "train-eval",
+        "streaming-train-eval",
+    ], f"Unsupported mode: {args.mode}"
+    WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1))
+    MASTER_PORT = str(get_free_port())
+    gin_path = f"{os.path.dirname(__file__)}/gin/{SUPPORTED_CONFIGS[args.dataset]}"
+
+    mp.start_processes(
+        _main_func,
+        args=(WORLD_SIZE, MASTER_PORT, gin_path, args.mode),
+        nprocs=WORLD_SIZE,
+        join=True,
+        start_method="spawn",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
new file mode 100644
index 000000000..21d2baa6e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -0,0 +1,902 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+import logging
+import os
+from collections.abc import Iterator
+from datetime import timedelta
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Iterable,
+    List,
+    Optional,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)
+
+import gin
+import torch
+import torchrec
+from generative_recommenders.dlrm_v3.checkpoint import save_dmp_checkpoint
+from generative_recommenders.dlrm_v3.configs import (
+    get_embedding_table_config,
+    get_hstu_configs,
+)
+from generative_recommenders.dlrm_v3.datasets.dataset import collate_fn, Dataset
+from generative_recommenders.dlrm_v3.utils import get_dataset, MetricsLogger, Profiler
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.modules.dlrm_hstu import DlrmHSTU, DlrmHSTUConfig
+from torch import distributed as dist
+from torch.distributed.optim import (
+    _apply_optimizer_in_backward as apply_optimizer_in_backward,
+)
+from torch.optim.optimizer import Optimizer
+from torch.utils.data import DataLoader, Dataset as TorchDataset
+from torch.utils.data.distributed import _T_co, DistributedSampler
+from torchrec.distributed.model_parallel import DistributedModelParallel
+from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
+from torchrec.distributed.sharding_plan import get_default_sharders
+from torchrec.distributed.types import ShardedTensor, ShardingEnv
+from torchrec.modules.embedding_configs import EmbeddingConfig
+from torchrec.modules.embedding_modules import (
+    EmbeddingBagCollection,
+    EmbeddingCollection,
+)
+from torchrec.optim.keyed import CombinedOptimizer, KeyedOptimizerWrapper
+from torchrec.optim.optimizers import in_backward_optimizer_filter
+from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+TORCHREC_TYPES: Set[Type[Union[EmbeddingBagCollection, EmbeddingCollection]]] = {
+    EmbeddingBagCollection,
+    EmbeddingCollection,
+}
+
+
+def setup(
+    rank: int, world_size: int, master_port: int, device: torch.device
+) -> dist.ProcessGroup:
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = str(master_port)
+
+    BACKEND = dist.Backend.NCCL
+    TIMEOUT = 1800
+
+    # initialize the process group
+    if not dist.is_initialized():
+        dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+    pg = dist.new_group(
+        backend=BACKEND,
+        timeout=timedelta(seconds=TIMEOUT),
+    )
+
+    # set device
+    torch.cuda.set_device(device)
+
+    return pg
+
+
+def cleanup() -> None:
+    dist.destroy_process_group()
+
+
+class HammerToTorchDataset(TorchDataset):
+    def __init__(
+        self,
+        dataset: Dataset,
+    ) -> None:
+        self.dataset: Dataset = dataset
+
+    def __getitem__(self, idx: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
+        self.dataset.load_query_samples([idx])
+        sample = self.dataset.get_sample(idx)
+        self.dataset.unload_query_samples([idx])
+        return sample
+
+    def __getitems__(
+        self, indices: List[int]
+    ) -> List[Tuple[KeyedJaggedTensor, KeyedJaggedTensor]]:
+        self.dataset.load_query_samples(indices)
+        samples = [self.dataset.get_sample(i) for i in indices]
+        self.dataset.unload_query_samples(indices)
+        return samples
+
+
+class _ChainedRanges:
+    """O(1) __len__ + O(log K) __getitem__ over a sequence of `range`s.
+
+    Lets `torch.utils.data.Subset(dataset, _ChainedRanges([r1, r2, ...]))`
+    avoid materializing a Python list of all per-block indices (which at
+    multi-billion totals is ~28 B/int and dominates host RAM).
+    """
+
+    def __init__(self, ranges: List[range]) -> None:
+        self._ranges: List[range] = list(ranges)
+        offsets = [0]
+        for r in self._ranges:
+            offsets.append(offsets[-1] + len(r))
+        self._offsets: List[int] = offsets
+
+    def __len__(self) -> int:
+        return self._offsets[-1]
+
+    def __getitem__(self, idx: int) -> int:
+        import bisect
+        if idx < 0:
+            idx += self._offsets[-1]
+        if idx < 0 or idx >= self._offsets[-1]:
+            raise IndexError(idx)
+        bucket = bisect.bisect_right(self._offsets, idx) - 1
+        return self._ranges[bucket][idx - self._offsets[bucket]]
+
+
+class ChunkDistributedSampler(DistributedSampler[_T_co]):
+    """
+    Each rank reads a contiguous chunk (trunk) of the input data
+    """
+
+    def __init__(
+        self,
+        dataset: TorchDataset,
+        num_replicas: Optional[int] = None,
+        rank: Optional[int] = None,
+        shuffle: bool = True,
+        seed: int = 1,
+        drop_last: bool = False,
+    ) -> None:
+        super().__init__(
+            dataset=dataset,
+            num_replicas=num_replicas,
+            rank=rank,
+            shuffle=shuffle,
+            seed=seed,
+            drop_last=drop_last,
+        )
+
+    def __iter__(self) -> Iterator[_T_co]:
+        if self.shuffle:
+            g = torch.Generator()
+            g.manual_seed(self.seed + self.epoch * 1001 + int(self.rank))
+            indices_t = torch.randperm(self.num_samples, generator=g)
+        else:
+            indices_t = torch.arange(self.num_samples)
+        assert self.drop_last is True, (
+            "drop_last must be True for ChunkDistributedSampler"
+        )
+        indices_t = indices_t + (self.num_samples * int(self.rank))
+        assert indices_t.numel() == self.num_samples
+        # Iterate via the numpy view, NOT directly over the tensor: iter(Tensor)
+        # calls Tensor.unbind(0) which eagerly materializes one zero-dim Tensor
+        # object per element (~600 B each). For 40 M eval / 525 M train samples
+        # that's 24 GB / 315 GB of [heap] growth per rank, blowing host RAM
+        # before the first batch. numpy's iter yields one Python int at a time
+        # with O(1) extra memory.
+        indices_np = indices_t.numpy()
+        return (int(x) for x in indices_np)
+
+    def set_epoch(self, epoch: int) -> None:
+        logger.warning(f"Setting epoch to {epoch}")
+        self.epoch = epoch
+
+
+@gin.configurable
+def make_model(
+    dataset: str,
+) -> Tuple[torch.nn.Module, DlrmHSTUConfig, Dict[str, EmbeddingConfig]]:
+    hstu_config = get_hstu_configs(dataset)
+    table_config = get_embedding_table_config(dataset)
+
+    model = DlrmHSTU(
+        hstu_configs=hstu_config,
+        embedding_tables=table_config,
+        is_inference=False,
+        bf16_training=False,
+    )
+
+    # Triton on ROCm fails to compile some jagged kernels at our shapes
+    # (PassManager::run failed at make_ttgir). Allow the PyTorch backend as a
+    # global override so AMD smoke runs end-to-end. CUDA paths default to TRITON.
+    kernel_override = os.environ.get("HSTU_HAMMER_KERNEL", "").upper()
+    if kernel_override:
+        model.set_hammer_kernel(HammerKernel[kernel_override])
+        logger.warning(f"HSTU_HAMMER_KERNEL override: {kernel_override}")
+
+    return (
+        model,
+        hstu_config,
+        table_config,
+    )
+
+
+@gin.configurable()
+def dense_optimizer_factory_and_class(
+    optimizer_name: str,
+    betas: Tuple[float, float],
+    eps: float,
+    weight_decay: float,
+    momentum: float,
+    learning_rate: float,
+) -> Tuple[
+    Type[Optimizer], Dict[str, Any], Callable[[Iterable[torch.Tensor]], Optimizer]
+]:
+    kwargs: Dict[str, Any] = {"lr": learning_rate}
+    if optimizer_name == "Adam":
+        optimizer_cls = torch.optim.Adam
+        kwargs.update({"betas": betas, "eps": eps, "weight_decay": weight_decay})
+    elif optimizer_name == "SGD":
+        optimizer_cls = torch.optim.SGD
+        kwargs.update({"weight_decay": weight_decay, "momentum": momentum})
+    elif optimizer_name == "AdamW":
+        optimizer_cls = torch.optim.AdamW
+        kwargs.update({"betas": betas, "eps": eps, "weight_decay": weight_decay})
+    else:
+        raise Exception("Unsupported optimizer!")
+
+    optimizer_factory = lambda params: optimizer_cls(params, **kwargs)
+
+    return optimizer_cls, kwargs, optimizer_factory
+
+
+@gin.configurable()
+def sparse_optimizer_factory_and_class(
+    optimizer_name: str,
+    betas: Tuple[float, float],
+    eps: float,
+    weight_decay: float,
+    momentum: float,
+    learning_rate: float,
+) -> Tuple[
+    Type[Optimizer], Dict[str, Any], Callable[[Iterable[torch.Tensor]], Optimizer]
+]:
+    kwargs: Dict[str, Any] = {"lr": learning_rate}
+    if optimizer_name == "Adam":
+        optimizer_cls = torch.optim.Adam
+        beta1, beta2 = betas
+        kwargs.update(
+            {"beta1": beta1, "beta2": beta2, "eps": eps, "weight_decay": weight_decay}
+        )
+    elif optimizer_name == "SGD":
+        optimizer_cls = torchrec.optim.SGD
+        kwargs.update({"weight_decay": weight_decay, "momentum": momentum})
+    elif optimizer_name == "RowWiseAdagrad":
+        optimizer_cls = torchrec.optim.RowWiseAdagrad
+        beta1, beta2 = betas
+        kwargs.update(
+            {
+                "eps": eps,
+                "beta1": beta1,
+                "beta2": beta2,
+                "weight_decay": weight_decay,
+            }
+        )
+    else:
+        raise Exception("Unsupported optimizer!")
+
+    optimizer_factory = lambda params: optimizer_cls(params, **kwargs)
+
+    return optimizer_cls, kwargs, optimizer_factory
+
+
+def make_optimizer_and_shard(
+    model: torch.nn.Module,
+    device: torch.device,
+    world_size: int,
+) -> Tuple[DistributedModelParallel, torch.optim.Optimizer]:
+    dense_opt_cls, dense_opt_args, dense_opt_factory = (
+        dense_optimizer_factory_and_class()
+    )
+
+    sparse_opt_cls, sparse_opt_args, sparse_opt_factory = (
+        sparse_optimizer_factory_and_class()
+    )
+    # Fuse sparse optimizer to backward step
+    for k, module in model.named_modules():
+        if type(module) in TORCHREC_TYPES:
+            for _, param in module.named_parameters(prefix=k):
+                if param.requires_grad:
+                    apply_optimizer_in_backward(
+                        sparse_opt_cls, [param], sparse_opt_args
+                    )
+    sharders = get_default_sharders()
+    # MI350X has 288 GiB HBM3e per GPU; the 160 GiB cap was sized for older parts.
+    # Matches Primus-DLRM (hbm_cap_gb: 260) which runs the same 5b cross-feat
+    # table set on the same hardware without host materialization.
+    planner = EmbeddingShardingPlanner(
+        topology=Topology(
+            local_world_size=world_size,
+            world_size=world_size,
+            compute_device="cuda",
+            hbm_cap=260 * 1024 * 1024 * 1024,
+            ddr_cap=32 * 1024 * 1024 * 1024,
+        )
+    )
+    pg = dist.GroupMember.WORLD
+    env = ShardingEnv.from_process_group(pg)  # pyre-ignore [6]
+    pg = env.process_group
+
+    plan = planner.collective_plan(model, sharders, pg)
+
+    # Shard model
+    model = DistributedModelParallel(
+        module=model,
+        device=device,
+        plan=plan,
+        sharders=sharders,
+    )
+    # Create keyed optimizer
+    all_optimizers = []
+    all_params = {}
+    non_fused_sparse_params = {}
+    for k, v in in_backward_optimizer_filter(model.named_parameters()):
+        if v.requires_grad:
+            if isinstance(v, ShardedTensor):
+                non_fused_sparse_params[k] = v
+            else:
+                all_params[k] = v
+
+    if non_fused_sparse_params:
+        all_optimizers.append(
+            (
+                "sparse_non_fused",
+                KeyedOptimizerWrapper(
+                    params=non_fused_sparse_params, optim_factory=sparse_opt_factory
+                ),
+            )
+        )
+
+    if all_params:
+        all_optimizers.append(
+            (
+                "dense",
+                KeyedOptimizerWrapper(
+                    params=all_params,
+                    optim_factory=dense_opt_factory,
+                ),
+            )
+        )
+    output_optimizer = CombinedOptimizer(all_optimizers)
+    output_optimizer.init_state(set(model.sparse_grad_parameter_names()))
+    return model, output_optimizer
+
+
+@gin.configurable
+def make_streaming_dataloader(
+    dataset: HammerToTorchDataset,
+    ts: int,
+    batch_size: int,
+    num_workers: int,
+    prefetch_factor: int,
+) -> DataLoader:
+    dataset.dataset.set_ts(ts)  # pyre-ignore [16]
+    total_items = dataset.dataset.get_item_count()
+    subset = torch.utils.data.Subset(dataset, range(total_items))
+    dataloader = DataLoader(
+        dataset=subset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=collate_fn,
+        drop_last=True,
+        num_workers=num_workers,
+        prefetch_factor=prefetch_factor,
+        sampler=DistributedSampler(subset, drop_last=True),
+    )
+    return dataloader
+
+
+@gin.configurable
+def make_train_test_dataloaders(
+    batch_size: int,
+    dataset_type: str,
+    hstu_config: DlrmHSTUConfig,
+    train_split_percentage: float,
+    embedding_table_configs: Dict[str, EmbeddingConfig],
+    new_path_prefix: str = "",
+    num_workers: int = 0,
+    num_blocks: int = 1,
+    prefetch_factor: Optional[int] = None,
+    eval_batch_size: Optional[int] = None,
+) -> Tuple[DataLoader, DataLoader]:
+    dataset_class, kwargs = get_dataset(
+        name=dataset_type, new_path_prefix=new_path_prefix
+    )
+    kwargs["embedding_config"] = embedding_table_configs
+
+    # Create dataset
+    dataset = HammerToTorchDataset(
+        dataset=dataset_class(hstu_config=hstu_config, is_inference=False, **kwargs)
+    )
+    total_items = dataset.dataset.get_item_count()
+    items_per_block = total_items // num_blocks
+    train_size_per_block = round(train_split_percentage * items_per_block)
+    # Avoid `extend(range(...))` which materializes a Python list of all sample
+    # indices — at 3.2B yambda samples × 28 bytes/int ≈ 90 GB/rank just for
+    # train_inds. Subset accepts any sequence with O(1) __len__ and __getitem__,
+    # so pass range objects (or a tiny chained view) directly.
+    if num_blocks == 1:
+        train_inds = range(0, train_size_per_block)
+        test_inds = range(train_size_per_block, items_per_block)
+    else:
+        train_inds = _ChainedRanges([
+            range(i * items_per_block, i * items_per_block + train_size_per_block)
+            for i in range(num_blocks)
+        ])
+        test_inds = _ChainedRanges([
+            range(i * items_per_block + train_size_per_block, (i + 1) * items_per_block)
+            for i in range(num_blocks)
+        ])
+    train_set = torch.utils.data.Subset(dataset, train_inds)
+    test_set = torch.utils.data.Subset(dataset, test_inds)
+
+    # When the parent rank is started via mp.start_processes(start_method="spawn"),
+    # torch.multiprocessing's default Process context is also "spawn". DataLoader
+    # then pickles `self._dataset` to send to each worker — which for our mmap'd
+    # 211 GB yambda store materializes the entire dataset into the parent's anon
+    # memory (~230 GB/rank). Forcing "fork" lets workers inherit the parent's
+    # mmap'd pages via COW with zero extra anon.
+    mp_ctx = "fork" if num_workers and num_workers > 0 else None
+    train_dataloader = DataLoader(
+        dataset=train_set,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=collate_fn,
+        drop_last=True,
+        num_workers=num_workers,
+        prefetch_factor=prefetch_factor,
+        sampler=ChunkDistributedSampler(train_set, drop_last=True, shuffle=True),
+        multiprocessing_context=mp_ctx,
+    )
+    test_dataloader = DataLoader(
+        dataset=test_set,
+        batch_size=eval_batch_size if eval_batch_size is not None else batch_size,
+        shuffle=False,
+        collate_fn=collate_fn,
+        drop_last=True,
+        num_workers=num_workers,
+        prefetch_factor=prefetch_factor,
+        sampler=ChunkDistributedSampler(test_set, drop_last=True, shuffle=True),
+        multiprocessing_context=mp_ctx,
+    )
+    return train_dataloader, test_dataloader
+
+
+@gin.configurable
+def train_loop(
+    rank: int,
+    model: torch.nn.Module,
+    dataloader: torch.utils.data.DataLoader,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    num_epochs: int,
+    num_batches: Optional[int] = None,
+    output_trace: bool = False,
+    metric_log_frequency: int = 1,
+    checkpoint_frequency: int = 100,
+    start_batch_idx: int = 0,
+    # lr_scheduler: to-do: Add a scheduler
+) -> None:
+    model.train()
+    batch_idx: int = start_batch_idx
+    profiler = Profiler(rank, active=10) if output_trace else None
+
+    for epoch in range(num_epochs):
+        dataloader.sampler.set_epoch(epoch)  # pyre-ignore [16]
+        for sample in dataloader:
+            optimizer.zero_grad()
+            sample.to(device)
+            (
+                _,
+                _,
+                aux_losses,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = model.forward(
+                sample.uih_features_kjt,
+                sample.candidates_features_kjt,
+            )
+            # pyre-ignore
+            sum(aux_losses.values()).backward()
+            optimizer.step()
+            metric_logger.update(
+                mode="train",
+                predictions=mt_target_preds,
+                labels=mt_target_labels,
+                weights=mt_target_weights,
+                num_candidates=sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0],
+            )
+            if batch_idx % metric_log_frequency != 0:
+                metric_logger.compute_and_log(
+                    mode="train",
+                    additional_logs={
+                        "losses": aux_losses,
+                    },
+                )
+            if batch_idx % checkpoint_frequency == 0 and batch_idx > 0:
+                save_dmp_checkpoint(
+                    model=model,
+                    optimizer=optimizer,
+                    metric_logger=metric_logger,
+                    rank=rank,
+                    batch_idx=batch_idx,
+                )
+            batch_idx += 1
+            if output_trace:
+                assert profiler is not None
+                profiler.step()
+            if num_batches is not None and batch_idx >= num_batches:
+                break
+        if num_batches is not None and batch_idx >= num_batches:
+            break
+
+
+@gin.configurable
+def eval_loop(
+    rank: int,
+    model: torch.nn.Module,
+    dataloader: torch.utils.data.DataLoader,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    metric_log_frequency: int = 1,
+    num_batches: Optional[int] = None,
+    output_trace: bool = False,
+    # lr_scheduler: to-do: Add a scheduler
+) -> None:
+    model.eval()
+    batch_idx: int = 0
+    profiler = Profiler(rank, active=10) if output_trace else None
+    metric_logger.reset(mode="eval")
+    with torch.no_grad():
+        for sample in dataloader:
+            sample.to(device)
+            (
+                _,
+                _,
+                _,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = model.forward(
+                sample.uih_features_kjt,
+                sample.candidates_features_kjt,
+            )
+            metric_logger.update(
+                mode="eval",
+                predictions=mt_target_preds,
+                labels=mt_target_labels,
+                weights=mt_target_weights,
+                num_candidates=sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0],
+            )
+            if batch_idx % metric_log_frequency != 0:
+                metric_logger.compute_and_log(mode="eval")
+            batch_idx += 1
+            if output_trace:
+                assert profiler is not None
+                profiler.step()
+            if num_batches is not None and batch_idx >= num_batches:
+                break
+    metric_logger.compute_and_log(mode="eval")
+    for k, v in metric_logger.compute(mode="eval").items():
+        print(f"{k}: {v}")
+
+
+@gin.configurable
+def train_eval_loop(
+    rank: int,
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    num_epochs: int,
+    num_train_batches: Optional[int] = None,
+    num_eval_batches: Optional[int] = None,
+    train_dataloader: Optional[torch.utils.data.DataLoader] = None,
+    eval_dataloader: Optional[torch.utils.data.DataLoader] = None,
+    output_trace: bool = False,
+    metric_log_frequency: int = 1,
+    checkpoint_frequency: int = 100,
+    eval_frequency: int = 1,
+    start_train_batch_idx: int = 0,
+    start_eval_batch_idx: int = 0,
+    # lr_scheduler: to-do: Add a scheduler
+) -> None:
+    train_batch_idx: int = start_train_batch_idx
+    eval_batch_idx: int = start_eval_batch_idx
+    profiler = Profiler(rank, active=10) if output_trace else None
+    assert train_dataloader is not None and eval_dataloader is not None
+
+    eval_data_iterator = iter(eval_dataloader)
+    train_data_iterator = iter(train_dataloader)
+
+    for epoch in range(num_epochs):
+        train_dataloader.sampler.set_epoch(epoch)  # pyre-ignore [16]
+        while True:
+            model.train()
+            try:
+                sample = next(train_data_iterator)
+            except StopIteration:
+                train_data_iterator = iter(train_dataloader)
+                break
+            optimizer.zero_grad()
+            sample.to(device)
+            (
+                _,
+                _,
+                aux_losses,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = model.forward(
+                sample.uih_features_kjt,
+                sample.candidates_features_kjt,
+            )
+            # pyre-ignore
+            sum(aux_losses.values()).backward()
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+            optimizer.step()
+            metric_logger.update(
+                mode="train",
+                predictions=mt_target_preds,
+                labels=mt_target_labels,
+                weights=mt_target_weights,
+                num_candidates=sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0],
+            )
+            if train_batch_idx % metric_log_frequency == 0:
+                metric_logger.compute_and_log(
+                    mode="train",
+                    additional_logs={
+                        "losses": aux_losses,
+                    },
+                )
+            if train_batch_idx % checkpoint_frequency == 0 and train_batch_idx > 0:
+                save_dmp_checkpoint(
+                    model=model,
+                    optimizer=optimizer,
+                    metric_logger=metric_logger,
+                    rank=rank,
+                    batch_idx=train_batch_idx,
+                )
+            train_batch_idx += 1
+            if output_trace:
+                assert profiler is not None
+                profiler.step()
+            if train_batch_idx % eval_frequency == 0:
+                model.eval()
+                eval_batch_idx: int = 0
+                with torch.no_grad():
+                    while True:
+                        try:
+                            sample = next(eval_data_iterator)
+                        except StopIteration:
+                            eval_data_iterator = iter(eval_dataloader)
+                            sample = next(eval_data_iterator)
+                        sample.to(device)
+                        (
+                            _,
+                            _,
+                            _,
+                            mt_target_preds,
+                            mt_target_labels,
+                            mt_target_weights,
+                        ) = model.forward(
+                            sample.uih_features_kjt,
+                            sample.candidates_features_kjt,
+                        )
+                        metric_logger.update(
+                            mode="eval",
+                            predictions=mt_target_preds,
+                            labels=mt_target_labels,
+                            weights=mt_target_weights,
+                            num_candidates=sample.candidates_features_kjt.lengths().view(
+                                len(sample.candidates_features_kjt.keys()), -1
+                            )[0],
+                        )
+                        eval_batch_idx += 1
+                        if output_trace:
+                            assert profiler is not None
+                            profiler.step()
+                        if eval_batch_idx % metric_log_frequency == 0:
+                            metric_logger.compute_and_log(mode="eval")
+                        if (
+                            num_eval_batches is not None
+                            and eval_batch_idx >= num_eval_batches
+                        ):
+                            break
+                    for k, v in metric_logger.compute(mode="eval").items():
+                        print(f"{k}: {v}")
+                model.train()
+            if num_train_batches is not None and train_batch_idx >= num_train_batches:
+                break
+
+
+@gin.configurable
+def streaming_train_eval_loop(
+    rank: int,
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    metric_logger: MetricsLogger,
+    device: torch.device,
+    num_train_ts: int,
+    hstu_config: DlrmHSTUConfig,
+    embedding_table_configs: Dict[str, EmbeddingConfig],
+    num_train_batches: Optional[int] = None,
+    num_eval_batches: Optional[int] = None,
+    output_trace: bool = False,
+    metric_log_frequency: int = 1,
+    checkpoint_frequency: int = 100,
+) -> None:
+    profiler = Profiler(rank, active=10) if output_trace else None
+    dataset_class, kwargs = get_dataset()
+    kwargs["embedding_config"] = embedding_table_configs
+    dataset = HammerToTorchDataset(
+        dataset=dataset_class(hstu_config=hstu_config, is_inference=False, **kwargs)
+    )
+    for train_ts in range(num_train_ts):
+        train_batch_idx: int = 0
+        train_dataloader = make_streaming_dataloader(dataset=dataset, ts=train_ts)
+        train_data_iterator = iter(train_dataloader)
+        while True:
+            model.train()
+            try:
+                sample = next(train_data_iterator)
+            except StopIteration:
+                break
+            optimizer.zero_grad()
+            sample.to(device)
+            (
+                _,
+                _,
+                aux_losses,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = model.forward(
+                sample.uih_features_kjt,
+                sample.candidates_features_kjt,
+            )
+            # pyre-ignore
+            sum(aux_losses.values()).backward()
+            optimizer.step()
+            metric_logger.update(
+                mode="train",
+                predictions=mt_target_preds,
+                labels=mt_target_labels,
+                weights=mt_target_weights,
+                num_candidates=sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0],
+            )
+            if train_batch_idx % metric_log_frequency == 0:
+                metric_logger.compute_and_log(
+                    mode="train",
+                    additional_logs={
+                        "losses": aux_losses,
+                    },
+                )
+            train_batch_idx += 1
+            if output_trace:
+                assert profiler is not None
+                profiler.step()
+            if num_train_batches is not None and train_batch_idx >= num_train_batches:
+                break
+        eval_ts = train_ts + 1
+        dataset.dataset.is_eval = True  # pyre-ignore [16]
+        model.eval()
+        eval_batch_idx: int = 0
+        eval_dataloader = make_streaming_dataloader(dataset=dataset, ts=eval_ts)
+        eval_data_iterator = iter(eval_dataloader)
+        with torch.no_grad():
+            while True:
+                try:
+                    sample = next(eval_data_iterator)
+                except StopIteration:
+                    break
+                sample.to(device)
+                (
+                    _,
+                    _,
+                    _,
+                    mt_target_preds,
+                    mt_target_labels,
+                    mt_target_weights,
+                ) = model.forward(
+                    sample.uih_features_kjt,
+                    sample.candidates_features_kjt,
+                )
+                metric_logger.update(
+                    mode="eval",
+                    predictions=mt_target_preds,
+                    labels=mt_target_labels,
+                    weights=mt_target_weights,
+                    num_candidates=sample.candidates_features_kjt.lengths().view(
+                        len(sample.candidates_features_kjt.keys()), -1
+                    )[0],
+                )
+                eval_batch_idx += 1
+                if output_trace:
+                    assert profiler is not None
+                    profiler.step()
+                if eval_batch_idx % metric_log_frequency == 0:
+                    metric_logger.compute_and_log(mode="eval")
+                if num_eval_batches is not None and eval_batch_idx >= num_eval_batches:
+                    break
+            for k, v in metric_logger.compute(mode="eval").items():
+                print(f"{k}: {v}")
+        if (
+            train_ts % checkpoint_frequency == 0 and train_ts > 0
+        ) or train_ts == num_train_ts - 1:
+            save_dmp_checkpoint(
+                model=model,
+                optimizer=optimizer,
+                metric_logger=metric_logger,
+                rank=rank,
+                batch_idx=train_ts,
+            )
+
+    eval_ts = num_train_ts
+    dataset.dataset.is_eval = True
+    model.eval()
+    eval_batch_idx: int = 0
+    eval_dataloader = make_streaming_dataloader(dataset=dataset, ts=eval_ts)
+    eval_data_iterator = iter(eval_dataloader)
+    with torch.no_grad():
+        while True:
+            try:
+                sample = next(eval_data_iterator)
+            except StopIteration:
+                break
+            sample.to(device)
+            (
+                _,
+                _,
+                _,
+                mt_target_preds,
+                mt_target_labels,
+                mt_target_weights,
+            ) = model.forward(
+                sample.uih_features_kjt,
+                sample.candidates_features_kjt,
+            )
+            metric_logger.update(
+                mode="eval",
+                predictions=mt_target_preds,
+                labels=mt_target_labels,
+                weights=mt_target_weights,
+                num_candidates=sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0],
+            )
+            eval_batch_idx += 1
+            if output_trace:
+                assert profiler is not None
+                profiler.step()
+            if eval_batch_idx % metric_log_frequency == 0:
+                metric_logger.compute_and_log(mode="eval")
+            if num_eval_batches is not None and eval_batch_idx >= num_eval_batches:
+                break
+        for k, v in metric_logger.compute(mode="eval").items():
+            print(f"{k}: {v}")
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
new file mode 100644
index 000000000..52091d8dd
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -0,0 +1,652 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""
+mlperf dlrm_v3 inference benchmarking tool.
+"""
+
+import contextlib
+import logging
+import os
+import time
+from typing import Callable, Dict, List, Optional
+
+import gin
+import tensorboard  # @manual=//tensorboard:lib  # noqa: F401 - required implicit dep when using torch.utils.tensorboard
+import torch
+from generative_recommenders.dlrm_v3.datasets.dataset import DLRMv3RandomDataset
+from generative_recommenders.dlrm_v3.datasets.kuairand import DLRMv3KuaiRandDataset
+from generative_recommenders.dlrm_v3.datasets.movie_lens import DLRMv3MovieLensDataset
+from generative_recommenders.dlrm_v3.datasets.synthetic_movie_lens import (
+    DLRMv3SyntheticMovieLensDataset,
+)
+from generative_recommenders.dlrm_v3.datasets.synthetic_streaming import (
+    DLRMv3SyntheticStreamingDataset,
+)
+from generative_recommenders.dlrm_v3.datasets.yambda import DLRMv3YambdaDataset
+from generative_recommenders.modules.multitask_module import (
+    MultitaskTaskType,
+    TaskConfig,
+)
+from torch.profiler import profile, profiler, ProfilerActivity  # pyre-ignore [21]
+from torch.utils.tensorboard import SummaryWriter
+from torchrec.metrics.accuracy import AccuracyMetricComputation
+from torchrec.metrics.auc import AUCMetricComputation, compute_auc
+from torchrec.metrics.gauc import GAUCMetricComputation
+from torchrec.metrics.mae import MAEMetricComputation
+from torchrec.metrics.metrics_namespace import MetricName, MetricPrefix
+from torchrec.metrics.mse import MSEMetricComputation
+from torchrec.metrics.ne import NEMetricComputation
+from torchrec.metrics.rec_metric import (
+    MetricComputationReport,
+    RecMetricComputation,
+)
+
+
+class LifetimeAUCMetricComputation(AUCMetricComputation):
+    """AUC over all predictions seen so far (uncapped buffer); emits with the LIFETIME prefix."""
+
+    def _compute(self) -> List[MetricComputationReport]:
+        from typing import cast as _cast
+        from torchrec.metrics.auc import LABELS, PREDICTIONS, WEIGHTS
+        return [
+            MetricComputationReport(
+                name=MetricName.AUC,
+                metric_prefix=MetricPrefix.LIFETIME,
+                value=compute_auc(
+                    self._n_tasks,
+                    _cast(List[torch.Tensor], getattr(self, PREDICTIONS)),
+                    _cast(List[torch.Tensor], getattr(self, LABELS)),
+                    _cast(List[torch.Tensor], getattr(self, WEIGHTS)),
+                    self._apply_bin,
+                ),
+            )
+        ]
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger("utils")
+
+
+def _on_trace_ready_fn(
+    rank: Optional[int] = None,
+) -> Callable[[torch.profiler.profile], None]:
+    """
+    Create a callback function for handling profiler trace output.
+
+    Args:
+        rank: Optional process rank for distributed training (included in filename).
+
+    Returns:
+        A callback function that exports profiler traces to Manifold storage.
+    """
+
+    def handle_fn(p: torch.profiler.profile) -> None:
+        bucket_name = "hammer_gpu_traces"
+        pid = os.getpid()
+        rank_str = f"_rank_{rank}" if rank is not None else ""
+        file_name = f"libkineto_activities_{pid}_{rank_str}.json"
+        manifold_path = "tree/dlrm_v3_bench"
+        target_object_name = manifold_path + "/" + file_name + ".gz"
+        path = f"manifold://{bucket_name}/{manifold_path}/{file_name}"
+        logger.warning(
+            p.key_averages(group_by_input_shape=True).table(
+                sort_by="self_cuda_time_total"
+            )
+        )
+        logger.warning(
+            f"trace url: https://www.internalfb.com/intern/perfdoctor/trace_view?filepath={target_object_name}&bucket={bucket_name}"
+        )
+        p.export_chrome_trace(path)
+
+    return handle_fn
+
+
+def profiler_or_nullcontext(enabled: bool, with_stack: bool):
+    """
+    Create a profiler context manager or null context based on enabled flag.
+
+    Args:
+        enabled: Whether to enable profiling.
+        with_stack: Whether to include stack traces in profile.
+
+    Returns:
+        Either a torch.profiler.profile context manager or nullcontext.
+    """
+    return (
+        profile(
+            # pyre-fixme[16]: Module `profiler` has no attribute `ProfilerActivity`.
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            on_trace_ready=_on_trace_ready_fn(),
+            with_stack=with_stack,
+        )
+        if enabled
+        else contextlib.nullcontext()
+    )
+
+
+class Profiler:
+    """
+    Wrapper around PyTorch profiler with scheduled profiling.
+
+    Implements a wait-warmup-active schedule for controlled profiling that
+    avoids startup noise and captures representative performance data.
+
+    Args:
+        rank: Process rank for trace file naming.
+        active: Number of active profiling steps (default: 50).
+    """
+
+    def __init__(self, rank, active: int = 50) -> None:
+        self.rank = rank
+        self._profiler: profiler.profile = torch.profiler.profile(
+            schedule=torch.profiler.schedule(
+                wait=10,
+                warmup=20,
+                active=active,
+                repeat=1,
+            ),
+            on_trace_ready=_on_trace_ready_fn(self.rank),
+            # pyre-fixme[16]: Module `profiler` has no attribute `ProfilerActivity`.
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            record_shapes=True,
+            profile_memory=False,
+            with_stack=False,
+            with_flops=False,
+            with_modules=False,
+        )
+
+    def step(self) -> None:
+        """Advance the profiler to the next step."""
+        self._profiler.step()
+
+
+@gin.configurable
+class MetricsLogger:
+    """
+    Logger for tracking and computing recommendation metrics.
+
+    Supports both classification metrics (NE, Accuracy, GAUC) and regression
+    metrics (MSE, MAE) based on multitask configuration.
+
+    Args:
+        multitask_configs: List of task configurations defining metric types.
+        batch_size: Batch size for metric computation.
+        window_size: Window size for running metric aggregation.
+        device: Device to place metric tensors on.
+        rank: Process rank for distributed training.
+        tensorboard_log_path: Optional path for TensorBoard logging.
+    """
+
+    def __init__(
+        self,
+        multitask_configs: List[TaskConfig],
+        batch_size: int,
+        window_size: int,
+        device: torch.device,
+        rank: int,
+        tensorboard_log_path: str = "",
+        world_size: int = 1,
+        auc_threshold: Optional[float] = None,
+    ) -> None:
+        self.multitask_configs: List[TaskConfig] = multitask_configs
+        all_classification_tasks: List[str] = [
+            task.task_name
+            for task in self.multitask_configs
+            if task.task_type != MultitaskTaskType.REGRESSION
+        ]
+        all_regression_tasks: List[str] = [
+            task.task_name
+            for task in self.multitask_configs
+            if task.task_type == MultitaskTaskType.REGRESSION
+        ]
+        assert all_classification_tasks + all_regression_tasks == [
+            task.task_name for task in multitask_configs
+        ]
+        self.task_names: List[str] = all_classification_tasks + all_regression_tasks
+
+        self.class_metrics: Dict[str, List[RecMetricComputation]] = {
+            "train": [],
+            "eval": [],
+        }
+        if all_classification_tasks:
+            for mode in ["train", "eval"]:
+                self.class_metrics[mode].append(
+                    NEMetricComputation(
+                        my_rank=rank,
+                        batch_size=batch_size,
+                        n_tasks=len(all_classification_tasks),
+                        window_size=window_size,
+                    ).to(device)
+                )
+                self.class_metrics[mode].append(
+                    AccuracyMetricComputation(
+                        my_rank=rank,
+                        batch_size=batch_size,
+                        n_tasks=len(all_classification_tasks),
+                        window_size=window_size,
+                    ).to(device)
+                )
+                self.class_metrics[mode].append(
+                    GAUCMetricComputation(
+                        my_rank=rank,
+                        batch_size=batch_size,
+                        n_tasks=len(all_classification_tasks),
+                        window_size=window_size,
+                    ).to(device)
+                )
+                self.class_metrics[mode].append(
+                    AUCMetricComputation(
+                        my_rank=rank,
+                        batch_size=batch_size,
+                        n_tasks=len(all_classification_tasks),
+                        window_size=window_size,
+                    ).to(device)
+                )
+                self.class_metrics[mode].append(
+                    LifetimeAUCMetricComputation(
+                        my_rank=rank,
+                        batch_size=batch_size,
+                        n_tasks=len(all_classification_tasks),
+                        window_size=10_000_000,
+                    ).to(device)
+                )
+
+        self.regression_metrics: Dict[str, List[RecMetricComputation]] = {
+            "train": [],
+            "eval": [],
+        }
+        if all_regression_tasks:
+            for mode in ["train", "eval"]:
+                self.regression_metrics[mode].append(
+                    MSEMetricComputation(
+                        my_rank=rank,
+                        batch_size=batch_size,
+                        n_tasks=len(all_regression_tasks),
+                        window_size=window_size,
+                    ).to(device)
+                )
+                self.regression_metrics[mode].append(
+                    MAEMetricComputation(
+                        my_rank=rank,
+                        batch_size=batch_size,
+                        n_tasks=len(all_regression_tasks),
+                        window_size=window_size,
+                    ).to(device)
+                )
+
+        self.global_step: Dict[str, int] = {"train": 0, "eval": 0}
+        self.tb_logger: Optional[SummaryWriter] = None
+        if tensorboard_log_path != "":
+            self.tb_logger = SummaryWriter(log_dir=tensorboard_log_path, purge_step=0)
+            self.tb_logger.flush()
+
+        # Throughput / time-to-target tracking. Counters are train-only; eval
+        # samples are not relevant for headline samples/sec numbers.
+        self._world_size: int = max(1, int(world_size))
+        self._auc_threshold: Optional[float] = auc_threshold
+        self._time_to_target_logged: bool = False
+        self._perf_t_start: float = time.perf_counter()
+        self._perf_t_window: float = self._perf_t_start
+        self._perf_steps_in_window: int = 0
+        self._perf_total_samples: int = 0
+        self._perf_samples_counter: torch.Tensor = torch.zeros(
+            1, dtype=torch.long, device=device
+        )
+
+    @property
+    def all_metrics(self) -> Dict[str, List[RecMetricComputation]]:
+        """
+        Get all metrics for train and eval modes.
+
+        Returns:
+            Dictionary mapping mode ('train'/'eval') to list of metric computations.
+        """
+        return {
+            "train": self.class_metrics["train"] + self.regression_metrics["train"],
+            "eval": self.class_metrics["eval"] + self.regression_metrics["eval"],
+        }
+
+    def update(
+        self,
+        predictions: torch.Tensor,
+        weights: torch.Tensor,
+        labels: torch.Tensor,
+        num_candidates: torch.Tensor,
+        mode: str = "train",
+    ) -> None:
+        """
+        Update metrics with new batch of predictions and labels.
+
+        Args:
+            predictions: Model prediction tensor.
+            weights: Sample weight tensor.
+            labels: Ground truth label tensor.
+            num_candidates: Number of candidates per sample (for GAUC).
+            mode: Either 'train' or 'eval'.
+        """
+        for metric in self.all_metrics[mode]:
+            if isinstance(metric, GAUCMetricComputation):
+                metric.update(
+                    predictions=predictions,
+                    labels=labels,
+                    weights=weights,
+                    num_candidates=num_candidates,
+                )
+            else:
+                metric.update(
+                    predictions=predictions,
+                    labels=labels,
+                    weights=weights,
+                )
+        self.global_step[mode] += 1
+        if mode == "train":
+            # Accumulate on-device to avoid a per-step GPU->CPU sync; we read
+            # the counter only at compute_and_log boundaries.
+            self._perf_samples_counter += num_candidates.sum().to(
+                self._perf_samples_counter.dtype
+            )
+            self._perf_steps_in_window += 1
+
+    def compute(self, mode: str = "train") -> Dict[str, float]:
+        """
+        Compute and return all metrics for the current window.
+
+        Args:
+            mode: Either 'train' or 'eval'.
+
+        Returns:
+            Dictionary mapping metric names to their computed values.
+        """
+        all_computed_metrics = {}
+
+        for metric in self.all_metrics[mode]:
+            computed_metrics = metric.compute()
+            for computed in computed_metrics:
+                all_values = computed.value.cpu()
+                for i, task_name in enumerate(self.task_names):
+                    key = f"metric/{str(computed.metric_prefix) + str(computed.name)}/{task_name}"
+                    all_computed_metrics[key] = all_values[i]
+
+        logger.info(
+            f"{mode} - Step {self.global_step[mode]} metrics: {all_computed_metrics}"
+        )
+        return all_computed_metrics
+
+    def compute_and_log(
+        self,
+        mode: str = "train",
+        additional_logs: Optional[Dict[str, Dict[str, torch.Tensor]]] = None,
+    ) -> Dict[str, float]:
+        """
+        Compute metrics and log to TensorBoard.
+
+        Args:
+            mode: Either 'train' or 'eval'.
+            additional_logs: Optional additional data to log.
+
+        Returns:
+            Dictionary mapping metric names to their computed values.
+
+        Raises:
+            AssertionError: If TensorBoard logger is not configured.
+        """
+        assert self.tb_logger is not None
+        all_computed_metrics = self.compute(mode)
+        for k, v in all_computed_metrics.items():
+            self.tb_logger.add_scalar(  # pyre-ignore [16]
+                f"{mode}_{k}",
+                v,
+                global_step=self.global_step[mode],
+            )
+
+        if additional_logs is not None:
+            for tag, data in additional_logs.items():
+                for data_name, data_value in data.items():
+                    self.tb_logger.add_scalar(
+                        f"{tag}/{mode}_{data_name}",
+                        data_value.detach().clone().cpu(),
+                        global_step=self.global_step[mode],
+                    )
+
+        # Throughput metrics (train only). One GPU->CPU sync per call.
+        if mode == "train" and self._perf_steps_in_window > 0:
+            now = time.perf_counter()
+            dt = max(now - self._perf_t_window, 1e-6)
+            n_samples = int(self._perf_samples_counter.item())
+            self._perf_total_samples += n_samples
+            local_sps = n_samples / dt
+            global_sps = local_sps * self._world_size
+            step_ms = dt * 1000.0 / self._perf_steps_in_window
+            elapsed = now - self._perf_t_start
+            step = self.global_step["train"]
+            self.tb_logger.add_scalar(
+                "perf/train_samples_per_sec_local", local_sps, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/train_samples_per_sec_global", global_sps, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/train_step_time_ms", step_ms, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/train_total_samples", self._perf_total_samples, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/train_elapsed_sec", elapsed, global_step=step
+            )
+            logger.info(
+                f"train - Step {step} perf: local_sps={local_sps:.1f} "
+                f"global_sps={global_sps:.1f} step_ms={step_ms:.2f} "
+                f"elapsed_sec={elapsed:.1f} total_samples={self._perf_total_samples}"
+            )
+            self._perf_t_window = now
+            self._perf_steps_in_window = 0
+            self._perf_samples_counter.zero_()
+
+        # Time-to-target: latch wall-clock once any task's AUC crosses threshold.
+        # Matches MLPerf DLRM-DCNv2 reporting style (default upstream target 0.80275).
+        if (
+            self._auc_threshold is not None
+            and not self._time_to_target_logged
+        ):
+            for key, val in all_computed_metrics.items():
+                metric_short = key.split("/")[-2] if "/" in key else key
+                if metric_short.endswith("auc") and not metric_short.endswith("gauc"):
+                    if float(val) >= self._auc_threshold:
+                        ttt = time.perf_counter() - self._perf_t_start
+                        self.tb_logger.add_scalar(
+                            f"perf/time_to_auc_{self._auc_threshold:.5f}_sec",
+                            ttt,
+                            global_step=self.global_step[mode],
+                        )
+                        logger.info(
+                            f"REACHED AUC>={self._auc_threshold} on {key}="
+                            f"{float(val):.6f} at elapsed_sec={ttt:.2f} "
+                            f"step={self.global_step[mode]}"
+                        )
+                        self._time_to_target_logged = True
+                        break
+
+        return all_computed_metrics
+
+    def reset(self, mode: str = "train"):
+        """
+        Reset all metrics for a given mode.
+
+        Args:
+            mode: Either 'train' or 'eval'.
+        """
+        for metric in self.all_metrics[mode]:
+            metric.reset()
+
+
+# the datasets we support
+SUPPORTED_DATASETS = [
+    "debug",
+    "movielens-1m",
+    "movielens-20m",
+    "movielens-13b",
+    "movielens-18b",
+    "kuairand-1k",
+    "streaming-400m",
+    "streaming-200b",
+    "streaming-100b",
+    "sampled-streaming-100b",
+    "yambda-5b",
+]
+
+
+@gin.configurable
+def get_dataset(name: str, new_path_prefix: str = ""):
+    """
+    Get dataset class and configuration by name.
+
+    Args:
+        name: Dataset identifier (must be in SUPPORTED_DATASETS).
+        new_path_prefix: Optional prefix to prepend to data paths.
+
+    Returns:
+        Tuple of (dataset_class, kwargs_dict) for dataset instantiation.
+
+    Raises:
+        AssertionError: If dataset name is not supported.
+    """
+    assert name in SUPPORTED_DATASETS, f"dataset {name} not supported"
+    if name == "debug":
+        return DLRMv3RandomDataset, {}
+    if name == "movielens-1m":
+        return (
+            DLRMv3MovieLensDataset,
+            {
+                "ratings_file": os.path.join(
+                    new_path_prefix, "data/ml-1m/sasrec_format.csv"
+                ),
+            },
+        )
+    if name == "movielens-20m":
+        return (
+            DLRMv3MovieLensDataset,
+            {
+                "ratings_file": os.path.join(
+                    new_path_prefix, "data/ml-20m/sasrec_format.csv"
+                ),
+            },
+        )
+    if name == "movielens-13b":
+        return (
+            DLRMv3SyntheticMovieLensDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/ml-13b/16x16384"
+                ),
+            },
+        )
+    if name == "movielens-18b":
+        return (
+            DLRMv3SyntheticMovieLensDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/ml-18b/20x36864"
+                ),
+            },
+        )
+    if name == "kuairand-1k":
+        return (
+            DLRMv3KuaiRandDataset,
+            {
+                "seq_logs_file": os.path.join(
+                    new_path_prefix, "data/KuaiRand-1K/data/processed_seqs.csv"
+                ),
+            },
+        )
+    if name == "streaming-400m":
+        return (
+            DLRMv3SyntheticStreamingDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/streaming-400m/"
+                ),
+                "train_ts": 8,
+                "total_ts": 10,
+                "num_files": 3,
+                "num_users": 150_000,
+                "num_items": 1_500_000,
+                "num_categories": 128,
+            },
+        )
+    if name == "streaming-200b":
+        return (
+            DLRMv3SyntheticStreamingDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/streaming-200b/"
+                ),
+                "train_ts": 90,
+                "total_ts": 100,
+                "num_files": 100,
+                "num_users": 10_000_000,
+                "num_items": 1_000_000_000,
+                "num_categories": 128,
+            },
+        )
+    if name == "streaming-100b":
+        return (
+            DLRMv3SyntheticStreamingDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/streaming-100b/"
+                ),
+                "train_ts": 90,
+                "total_ts": 100,
+                "num_files": 100,
+                "num_users": 5_000_000,
+                "num_items": 1_000_000_000,
+                "num_categories": 128,
+            },
+        )
+    if name == "yambda-5b":
+        from generative_recommenders.dlrm_v3.configs import YAMBDA_5B_CROSS_SPECS
+
+        return (
+            DLRMv3YambdaDataset,
+            {
+                # Layout: <new_path_prefix>/processed_5b/{train_sessions.parquet,...}
+                # and <new_path_prefix>/shared_metadata/{artist,album}_item_mapping.parquet.
+                # The dataset auto-builds a MAP_SHARED-mmap'd cache of the
+                # flat columns + LISTEN-anchor positions under
+                # <processed_dir>/hstu_cache_L<history_length>/ on first use;
+                # all ranks on a node share the same physical pages.
+                "processed_dir": os.path.join(new_path_prefix, "processed_5b"),
+                "metadata_dir": os.path.join(new_path_prefix, "shared_metadata"),
+                "history_length": 4096,
+                "scan_window": 20000,
+                "cross_specs": YAMBDA_5B_CROSS_SPECS,
+            },
+        )
+    if name == "sampled-streaming-100b":
+        return (
+            DLRMv3SyntheticStreamingDataset,
+            {
+                "ratings_file_prefix": os.path.join(
+                    new_path_prefix, "data/streaming-100b/sampled_data/"
+                ),
+                "train_ts": 90,
+                "total_ts": 100,
+                "num_files": 1,
+                "num_users": 50_000,
+                "num_items": 1_000_000_000,
+                "num_categories": 128,
+            },
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/action_encoder.py b/recommendation_v4/generative_recommenders/modules/action_encoder.py
new file mode 100644
index 000000000..13b65557e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/action_encoder.py
@@ -0,0 +1,112 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from generative_recommenders.common import HammerModule
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+
+
+class ActionEncoder(HammerModule):
+    def __init__(
+        self,
+        action_embedding_dim: int,
+        action_feature_name: str,
+        action_weights: List[int],
+        watchtime_feature_name: str = "",
+        watchtime_to_action_thresholds_and_weights: Optional[
+            List[Tuple[int, int]]
+        ] = None,
+        embedding_init_std: float = 0.1,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._watchtime_feature_name: str = watchtime_feature_name
+        self._action_feature_name: str = action_feature_name
+        self._watchtime_to_action_thresholds_and_weights: List[Tuple[int, int]] = (
+            watchtime_to_action_thresholds_and_weights
+            if watchtime_to_action_thresholds_and_weights is not None
+            else []
+        )
+        self.register_buffer(
+            "_combined_action_weights",
+            torch.tensor(
+                action_weights
+                + [x[1] for x in self._watchtime_to_action_thresholds_and_weights]
+            ),
+        )
+        self._num_action_types: int = len(action_weights) + len(
+            self._watchtime_to_action_thresholds_and_weights
+        )
+        self._action_embedding_dim = action_embedding_dim
+        self._action_embedding_table: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty((self._num_action_types, action_embedding_dim)).normal_(
+                mean=0, std=embedding_init_std
+            ),
+        )
+        self._target_action_embedding_table: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty((1, self._num_action_types * action_embedding_dim)).normal_(
+                mean=0, std=embedding_init_std
+            ),
+        )
+
+    @property
+    def output_embedding_dim(self) -> int:
+        return self._action_embedding_dim * self._num_action_types
+
+    def forward(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        uih_offsets: torch.Tensor,
+        target_offsets: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        seq_actions = seq_payloads[self._action_feature_name]
+        if len(self._watchtime_to_action_thresholds_and_weights) > 0:
+            watchtimes = seq_payloads[self._watchtime_feature_name]
+            for threshold, weight in self._watchtime_to_action_thresholds_and_weights:
+                seq_actions = torch.bitwise_or(
+                    seq_actions, (watchtimes >= threshold).to(torch.int64) * weight
+                )
+        exploded_actions = (
+            torch.bitwise_and(
+                seq_actions.unsqueeze(-1), self._combined_action_weights.unsqueeze(0)
+            )
+            > 0
+        )
+        action_embeddings = (
+            exploded_actions.unsqueeze(-1) * self._action_embedding_table.unsqueeze(0)
+        ).view(-1, self._num_action_types * self._action_embedding_dim)
+        total_targets: int = seq_embeddings.size(0) - action_embeddings.size(0)
+        action_embeddings = concat_2D_jagged(
+            max_seq_len=max_uih_len + max_targets,
+            values_left=action_embeddings,
+            values_right=self._target_action_embedding_table.tile(
+                total_targets,
+                1,
+            ),
+            max_len_left=max_uih_len,
+            max_len_right=max_targets,
+            offsets_left=uih_offsets,
+            offsets_right=target_offsets,
+            kernel=self.hammer_kernel(),
+        )
+        return action_embeddings
diff --git a/recommendation_v4/generative_recommenders/modules/content_encoder.py b/recommendation_v4/generative_recommenders/modules/content_encoder.py
new file mode 100644
index 000000000..acca82dbf
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/content_encoder.py
@@ -0,0 +1,109 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Dict, List, Optional
+
+import torch
+from generative_recommenders.common import HammerModule
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+
+
+class ContentEncoder(HammerModule):
+    def __init__(
+        self,
+        input_embedding_dim: int,
+        additional_content_features: Optional[Dict[str, int]] = None,
+        target_enrich_features: Optional[Dict[str, int]] = None,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._input_embedding_dim: int = input_embedding_dim
+        self._additional_content_features: Dict[str, int] = (
+            additional_content_features
+            if additional_content_features is not None
+            else {}
+        )
+        self._target_enrich_features: Dict[str, int] = (
+            target_enrich_features if target_enrich_features is not None else {}
+        )
+        self._target_enrich_dummy_embeddings: torch.nn.ParameterDict = (
+            torch.nn.ParameterDict(
+                {
+                    name: torch.nn.Parameter(
+                        torch.empty((1, dim)).normal_(mean=0, std=0.1),
+                    )
+                    for name, dim in self._target_enrich_features.items()
+                }
+            )
+        )
+
+    @property
+    def output_embedding_dim(self) -> int:
+        return self._input_embedding_dim + sum(
+            list(self._additional_content_features.values())
+            + list(self._target_enrich_features.values())
+        )
+
+    def forward(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        uih_offsets: torch.Tensor,
+        target_offsets: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        content_embeddings_list: List[torch.Tensor] = [seq_embeddings]
+        if len(self._additional_content_features) > 0:
+            content_embeddings_list = content_embeddings_list + [
+                (seq_payloads[x].to(seq_embeddings.dtype))
+                for x in self._additional_content_features.keys()
+            ]
+
+        if self._target_enrich_dummy_embeddings:
+            total_seq_len: int = seq_embeddings.size(0)
+            for name, param in self._target_enrich_dummy_embeddings.items():
+                enrich_embeddings_target = seq_payloads[name].to(seq_embeddings.dtype)
+                total_targets: int = enrich_embeddings_target.size(0)
+                total_uih_len: int = total_seq_len - total_targets
+                enrich_embeddings_uih = param.tile(total_uih_len, 1).to(
+                    seq_embeddings.dtype
+                )
+                enrich_embeddings = concat_2D_jagged(
+                    max_seq_len=max_uih_len + max_targets,
+                    values_left=enrich_embeddings_uih,
+                    values_right=enrich_embeddings_target,
+                    max_len_left=max_uih_len,
+                    max_len_right=max_targets,
+                    offsets_left=uih_offsets,
+                    offsets_right=target_offsets,
+                    kernel=self.hammer_kernel(),
+                )
+                content_embeddings_list.append(enrich_embeddings)
+
+        if (
+            len(self._target_enrich_features) == 0
+            and len(self._additional_content_features) == 0
+        ):
+            return seq_embeddings
+        else:
+            content_embeddings = torch.cat(
+                content_embeddings_list,
+                dim=1,
+            )
+            return content_embeddings
diff --git a/recommendation_v4/generative_recommenders/modules/contextual_interleave_preprocessor.py b/recommendation_v4/generative_recommenders/modules/contextual_interleave_preprocessor.py
new file mode 100644
index 000000000..fff0d72f0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/contextual_interleave_preprocessor.py
@@ -0,0 +1,357 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from math import sqrt
+from typing import Callable, Dict, Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_unwrap_optional_tensor
+from generative_recommenders.modules.action_encoder import ActionEncoder
+from generative_recommenders.modules.content_encoder import ContentEncoder
+from generative_recommenders.modules.contextualize_mlps import (
+    ContextualizedMLP,
+    ParameterizedContextualizedMLP,
+)
+from generative_recommenders.modules.preprocessors import (
+    get_contextual_input_embeddings,
+    InputPreprocessor,
+)
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+
+
+class ContextualInterleavePreprocessor(InputPreprocessor):
+    def __init__(
+        self,
+        input_embedding_dim: int,
+        output_embedding_dim: int,
+        contextual_feature_to_max_length: Dict[str, int],
+        contextual_feature_to_min_uih_length: Dict[str, int],
+        content_encoder: ContentEncoder,
+        content_contextualize_mlp_fn: Callable[
+            [int, int, int, bool], ContextualizedMLP
+        ],
+        action_encoder: ActionEncoder,
+        action_contextualize_mlp_fn: Callable[[int, int, int, bool], ContextualizedMLP],
+        pmlp_contextual_dropout_ratio: float = 0.0,
+        enable_interleaving: bool = False,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._input_embedding_dim: int = input_embedding_dim
+        self._output_embedding_dim: int = output_embedding_dim
+        self._contextual_feature_to_max_length: Dict[str, int] = (
+            contextual_feature_to_max_length
+        )
+        self._max_contextual_seq_len: int = sum(
+            contextual_feature_to_max_length.values()
+        )
+        self._contextual_feature_to_min_uih_length: Dict[str, int] = (
+            contextual_feature_to_min_uih_length
+        )
+        std = 1.0 * sqrt(2.0 / float(input_embedding_dim + output_embedding_dim))
+        self._batched_contextual_linear_weights = torch.nn.Parameter(
+            torch.empty(
+                (
+                    self._max_contextual_seq_len,
+                    input_embedding_dim,
+                    output_embedding_dim,
+                )
+            ).normal_(0.0, std)
+        )
+        self._pmlp_contextual_dropout_ratio: float = pmlp_contextual_dropout_ratio
+        self._batched_contextual_linear_bias = torch.nn.Parameter(
+            torch.empty((self._max_contextual_seq_len, 1, output_embedding_dim)).fill_(
+                0.0
+            )
+        )
+        contextual_embedding_dim: int = (
+            self._max_contextual_seq_len * input_embedding_dim
+        )
+        self._content_encoder: ContentEncoder = content_encoder
+        self._content_embedding_mlp: ContextualizedMLP = content_contextualize_mlp_fn(
+            self._content_encoder.output_embedding_dim,
+            output_embedding_dim,
+            contextual_embedding_dim,
+            is_inference,
+        )
+        self._action_encoder: ActionEncoder = action_encoder
+        self._action_embedding_mlp: ContextualizedMLP = action_contextualize_mlp_fn(
+            self._action_encoder.output_embedding_dim,
+            output_embedding_dim,
+            contextual_embedding_dim,
+            is_inference,
+        )
+        self._enable_interleaving: bool = enable_interleaving
+
+    def combine_embeddings(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        content_embeddings: torch.Tensor,
+        action_embeddings: torch.Tensor,
+        contextual_embeddings: Optional[torch.Tensor],
+        num_targets: torch.Tensor,
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+    ]:
+        if self._enable_interleaving:
+            output_seq_timestamps = seq_timestamps.repeat_interleave(2)
+            output_seq_embeddings = torch.stack(
+                [content_embeddings, action_embeddings], dim=1
+            ).reshape(-1, self._output_embedding_dim)
+            if self.interleave_targets():
+                output_seq_lengths = seq_lengths * 2
+                output_max_seq_len = (max_uih_len + max_targets) * 2
+                output_num_targets = num_targets * 2
+                output_total_uih_len = total_uih_len * 2
+                output_total_targets = total_targets * 2
+            else:
+                seq_lengths_by_2 = seq_lengths * 2
+                output_seq_lengths = seq_lengths_by_2 - num_targets
+                output_max_seq_len = 2 * max_uih_len + max_targets
+                indices = torch.arange(
+                    2 * (max_uih_len + max_targets), device=seq_lengths.device
+                ).view(1, -1)
+                valid_mask = torch.logical_and(
+                    indices < seq_lengths_by_2.view(-1, 1),
+                    torch.logical_or(
+                        indices < (output_seq_lengths - num_targets).view(-1, 1),
+                        torch.remainder(indices, 2) == 0,
+                    ),
+                )
+                jagged_valid_mask = (
+                    torch.ops.fbgemm.dense_to_jagged(
+                        valid_mask.int().unsqueeze(-1),
+                        [
+                            torch.ops.fbgemm.asynchronous_complete_cumsum(
+                                seq_lengths_by_2
+                            )
+                        ],
+                    )[0]
+                    .to(torch.bool)
+                    .squeeze(1)
+                )
+                output_seq_embeddings = output_seq_embeddings[jagged_valid_mask]
+                output_seq_timestamps = output_seq_timestamps[jagged_valid_mask]
+                output_num_targets = num_targets
+                output_total_uih_len = total_uih_len * 2
+                output_total_targets = total_targets
+        else:
+            output_max_seq_len = max_uih_len + max_targets
+            output_seq_lengths = seq_lengths
+            output_num_targets = num_targets
+            output_seq_timestamps = seq_timestamps
+            output_seq_embeddings = content_embeddings + action_embeddings
+            output_total_uih_len = total_uih_len
+            output_total_targets = total_targets
+
+        # concat contextual embeddings
+        output_seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            output_seq_lengths
+        )
+        if self._max_contextual_seq_len > 0:
+            output_seq_embeddings = concat_2D_jagged(
+                max_seq_len=self._max_contextual_seq_len + output_max_seq_len,
+                values_left=fx_unwrap_optional_tensor(contextual_embeddings).reshape(
+                    -1, self._output_embedding_dim
+                ),
+                values_right=output_seq_embeddings,
+                max_len_left=self._max_contextual_seq_len,
+                max_len_right=output_max_seq_len,
+                offsets_left=None,
+                offsets_right=output_seq_offsets,
+                kernel=self.hammer_kernel(),
+            )
+            output_seq_timestamps = concat_2D_jagged(
+                max_seq_len=self._max_contextual_seq_len + output_max_seq_len,
+                values_left=torch.zeros(
+                    (output_seq_lengths.size(0) * self._max_contextual_seq_len, 1),
+                    dtype=output_seq_timestamps.dtype,
+                    device=output_seq_timestamps.device,
+                ),
+                values_right=output_seq_timestamps.unsqueeze(-1),
+                max_len_left=self._max_contextual_seq_len,
+                max_len_right=output_max_seq_len,
+                offsets_left=None,
+                offsets_right=output_seq_offsets,
+                kernel=self.hammer_kernel(),
+            ).squeeze(-1)
+            output_max_seq_len = output_max_seq_len + self._max_contextual_seq_len
+            output_total_uih_len = (
+                output_total_uih_len
+                + self._max_contextual_seq_len * output_seq_lengths.size(0)
+            )
+            output_seq_lengths = output_seq_lengths + self._max_contextual_seq_len
+            output_seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+                output_seq_lengths
+            )
+
+        return (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+        )
+
+    def forward(  # noqa C901
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+    ]:
+        max_seq_len = max_uih_len + max_targets
+        with torch.autocast(
+            "cuda",
+            dtype=torch.bfloat16,
+            enabled=(not self.is_inference and self._training_dtype == torch.bfloat16),
+        ):
+            # get contextual_embeddings
+            contextual_embeddings: Optional[torch.Tensor] = None
+            pmlp_contextual_embeddings: Optional[torch.Tensor] = None
+            if self._max_contextual_seq_len > 0:
+                contextual_input_embeddings = get_contextual_input_embeddings(
+                    seq_lengths=seq_lengths,
+                    seq_payloads=seq_payloads,
+                    contextual_feature_to_max_length=self._contextual_feature_to_max_length,
+                    contextual_feature_to_min_uih_length=self._contextual_feature_to_min_uih_length,
+                    dtype=seq_embeddings.dtype,
+                )
+                if isinstance(
+                    self._action_embedding_mlp, ParameterizedContextualizedMLP
+                ) or isinstance(
+                    self._action_embedding_mlp, ParameterizedContextualizedMLP
+                ):
+                    pmlp_contextual_embeddings = torch.nn.functional.dropout(
+                        contextual_input_embeddings,
+                        p=self._pmlp_contextual_dropout_ratio,
+                        training=self.training,
+                    )
+                contextual_embeddings = torch.baddbmm(
+                    self._batched_contextual_linear_bias.to(
+                        contextual_input_embeddings.dtype
+                    ),
+                    contextual_input_embeddings.view(
+                        -1, self._max_contextual_seq_len, self._input_embedding_dim
+                    ).transpose(0, 1),
+                    self._batched_contextual_linear_weights.to(
+                        contextual_input_embeddings.dtype
+                    ),
+                ).transpose(0, 1)
+
+            # content embeddings
+            seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(seq_lengths)
+            target_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(num_targets)
+            uih_offsets = seq_offsets - target_offsets
+            content_embeddings = self._content_encoder(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                uih_offsets=uih_offsets,
+                target_offsets=target_offsets,
+                seq_embeddings=seq_embeddings,
+                seq_payloads=seq_payloads,
+            )
+            content_embeddings = self._content_embedding_mlp(
+                seq_embeddings=content_embeddings,
+                seq_offsets=seq_offsets,
+                max_seq_len=max_seq_len,
+                contextual_embeddings=pmlp_contextual_embeddings,
+            )
+
+            # action embeddings
+            action_embeddings = self._action_encoder(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                uih_offsets=uih_offsets,
+                target_offsets=target_offsets,
+                seq_embeddings=seq_embeddings,
+                seq_payloads=seq_payloads,
+            ).to(seq_embeddings.dtype)
+            action_embeddings = self._action_embedding_mlp(
+                seq_embeddings=action_embeddings,
+                seq_offsets=seq_offsets,
+                max_seq_len=max_seq_len,
+                contextual_embeddings=pmlp_contextual_embeddings,
+            )
+
+            (
+                output_max_seq_len,
+                output_total_uih_len,
+                output_total_targets,
+                output_seq_lengths,
+                output_seq_offsets,
+                output_seq_timestamps,
+                output_seq_embeddings,
+                output_num_targets,
+            ) = self.combine_embeddings(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                total_uih_len=total_uih_len,
+                total_targets=total_targets,
+                seq_lengths=seq_lengths,
+                seq_timestamps=seq_timestamps,
+                content_embeddings=content_embeddings,
+                action_embeddings=action_embeddings,
+                contextual_embeddings=contextual_embeddings,
+                num_targets=num_targets,
+            )
+
+        return (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+            seq_payloads,
+        )
+
+    def interleave_targets(self) -> bool:
+        return self.is_train and self._enable_interleaving
diff --git a/recommendation_v4/generative_recommenders/modules/contextualize_mlps.py b/recommendation_v4/generative_recommenders/modules/contextualize_mlps.py
new file mode 100644
index 000000000..dc49effeb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/contextualize_mlps.py
@@ -0,0 +1,141 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+import abc
+from typing import Optional
+
+import torch
+from generative_recommenders.common import HammerModule, init_mlp_weights_optional_bias
+from generative_recommenders.ops.jagged_tensors import jagged_dense_bmm_broadcast_add
+from generative_recommenders.ops.layer_norm import LayerNorm, SwishLayerNorm
+from libfb.py.pyre import none_throws
+
+
+class ContextualizedMLP(HammerModule):
+    @abc.abstractmethod
+    def forward(
+        self,
+        max_seq_len: int,
+        seq_embeddings: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        contextual_embeddings: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Args:
+            seq_embeddings: (L, D)
+            seq_offsets: (B + 1,)
+            max_seq_len: int
+            contextual_embeddings: (B, D')
+        """
+        pass
+
+
+class SimpleContextualizedMLP(ContextualizedMLP):
+    def __init__(
+        self,
+        sequential_input_dim: int,
+        sequential_output_dim: int,
+        hidden_dim: int,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._mlp: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=sequential_input_dim,
+                out_features=hidden_dim,
+            ),
+            SwishLayerNorm(hidden_dim, is_inference=is_inference),
+            torch.nn.Linear(
+                in_features=hidden_dim,
+                out_features=sequential_output_dim,
+            ),
+            LayerNorm(sequential_output_dim),
+        ).apply(init_mlp_weights_optional_bias)
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        max_seq_len: int,
+        contextual_embeddings: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        return self._mlp(seq_embeddings)
+
+
+class ParameterizedContextualizedMLP(ContextualizedMLP):
+    def __init__(
+        self,
+        contextual_embedding_dim: int,
+        sequential_input_dim: int,
+        sequential_output_dim: int,
+        hidden_dim: int,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+
+        self._sequential_input_dim: int = sequential_input_dim
+        self._sequential_output_dim: int = sequential_output_dim
+
+        self._dense_features_compress: torch.nn.Module = torch.nn.Linear(
+            in_features=contextual_embedding_dim,
+            out_features=hidden_dim,
+        ).apply(init_mlp_weights_optional_bias)
+
+        self._attn_raw_weights: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=hidden_dim,
+                out_features=sequential_input_dim * sequential_output_dim,
+            ),
+        ).apply(init_mlp_weights_optional_bias)
+
+        self._attn_weights_norm: torch.nn.Module = torch.nn.LayerNorm(
+            [sequential_input_dim, sequential_output_dim]
+        )
+
+        self._res_weights: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=hidden_dim,
+                out_features=hidden_dim,
+            ),
+            SwishLayerNorm(hidden_dim),
+            torch.nn.Linear(
+                in_features=hidden_dim,
+                out_features=sequential_output_dim,
+            ),
+        ).apply(init_mlp_weights_optional_bias)
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        max_seq_len: int,
+        contextual_embeddings: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        shared_input = self._dense_features_compress(none_throws(contextual_embeddings))
+        attn_weights = self._attn_weights_norm(
+            self._attn_raw_weights(shared_input).reshape(
+                -1, self._sequential_input_dim, self._sequential_output_dim
+            )
+        )
+        return jagged_dense_bmm_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=seq_embeddings,
+            dense=attn_weights.to(seq_embeddings.dtype),
+            bias=self._res_weights(shared_input),
+            kernel=self.hammer_kernel(),
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py b/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
new file mode 100644
index 000000000..af2edc998
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
@@ -0,0 +1,626 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+
+import logging
+from dataclasses import dataclass, field
+from typing import Dict, List, NamedTuple, Optional, Tuple
+
+import torch
+from generative_recommenders.common import (
+    fx_infer_max_len,
+    fx_mark_length_features,
+    HammerKernel,
+    HammerModule,
+    init_mlp_weights_optional_bias,
+    set_static_max_seq_lens,
+)
+from generative_recommenders.modules.hstu_transducer import HSTUTransducer
+from generative_recommenders.modules.multitask_module import (
+    DefaultMultitaskModule,
+    MultitaskTaskType,
+    TaskConfig,
+)
+from generative_recommenders.modules.positional_encoder import HSTUPositionalEncoder
+from generative_recommenders.modules.postprocessors import (
+    LayerNormPostprocessor,
+    TimestampLayerNormPostprocessor,
+)
+from generative_recommenders.modules.preprocessors import ContextualPreprocessor
+from generative_recommenders.modules.stu import STU, STULayer, STULayerConfig, STUStack
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+from generative_recommenders.ops.layer_norm import LayerNorm, SwishLayerNorm
+from torch.autograd.profiler import record_function
+from torchrec import KeyedJaggedTensor
+from torchrec.modules.embedding_configs import EmbeddingConfig
+from torchrec.modules.embedding_modules import EmbeddingCollection
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+torch.fx.wrap("fx_infer_max_len")
+torch.fx.wrap("len")
+
+
+class SequenceEmbedding(NamedTuple):
+    lengths: torch.Tensor
+    embedding: torch.Tensor
+
+
+@dataclass
+class DlrmHSTUConfig:
+    max_seq_len: int = 16384
+    max_num_candidates: int = 10
+    max_num_candidates_inference: int = 5
+    hstu_num_heads: int = 1
+    hstu_attn_linear_dim: int = 256
+    hstu_attn_qk_dim: int = 128
+    hstu_attn_num_layers: int = 12
+    hstu_embedding_table_dim: int = 192
+    hstu_preprocessor_hidden_dim: int = 256
+    hstu_transducer_embedding_dim: int = 0
+    hstu_group_norm: bool = False
+    hstu_input_dropout_ratio: float = 0.2
+    hstu_linear_dropout_rate: float = 0.2
+    contextual_feature_to_max_length: Dict[str, int] = field(default_factory=dict)
+    contextual_feature_to_min_uih_length: Dict[str, int] = field(default_factory=dict)
+    candidates_weight_feature_name: str = ""
+    candidates_watchtime_feature_name: str = ""
+    candidates_querytime_feature_name: str = ""
+    causal_multitask_weights: float = 0.2
+    multitask_configs: List[TaskConfig] = field(default_factory=list)
+    user_embedding_feature_names: List[str] = field(default_factory=list)
+    item_embedding_feature_names: List[str] = field(default_factory=list)
+    uih_post_id_feature_name: str = ""
+    uih_action_time_feature_name: str = ""
+    uih_weight_feature_name: str = ""
+    hstu_uih_feature_names: List[str] = field(default_factory=list)
+    hstu_candidate_feature_names: List[str] = field(default_factory=list)
+    merge_uih_candidate_feature_mapping: List[Tuple[str, str]] = field(
+        default_factory=list
+    )
+    action_weights: Optional[List[int]] = None
+    action_embedding_init_std: float = 0.1
+    enable_postprocessor: bool = True
+    use_layer_norm_postprocessor: bool = False
+
+
+def _get_supervision_labels_and_weights(
+    supervision_bitmasks: torch.Tensor,
+    watchtime_sequence: torch.Tensor,
+    task_configs: List[TaskConfig],
+) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:
+    supervision_labels: Dict[str, torch.Tensor] = {}
+    supervision_weights: Dict[str, torch.Tensor] = {}
+    for task in task_configs:
+        if task.task_type == MultitaskTaskType.REGRESSION:
+            supervision_labels[task.task_name] = watchtime_sequence.to(torch.float32)
+        elif task.task_type == MultitaskTaskType.BINARY_CLASSIFICATION:
+            supervision_labels[task.task_name] = (
+                torch.bitwise_and(supervision_bitmasks, task.task_weight) > 0
+            ).to(torch.float32)
+        else:
+            raise RuntimeError("Unsupported MultitaskTaskType")
+    return supervision_labels, supervision_weights
+
+
+class DlrmHSTU(HammerModule):
+    def __init__(  # noqa C901
+        self,
+        hstu_configs: DlrmHSTUConfig,
+        embedding_tables: Dict[str, EmbeddingConfig],
+        is_inference: bool,
+        is_dense: bool = False,
+        bf16_training: bool = True,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        logger.info(f"Initialize HSTU module with configs {hstu_configs}")
+        self._hstu_configs = hstu_configs
+        self._bf16_training: bool = bf16_training
+        set_static_max_seq_lens([self._hstu_configs.max_seq_len])
+
+        if not is_dense:
+            self._embedding_collection: EmbeddingCollection = EmbeddingCollection(
+                tables=list(embedding_tables.values()),
+                need_indices=False,
+                device=torch.device("meta"),
+            )
+
+        # multitask configs must be sorted by task types
+        self._multitask_configs: List[TaskConfig] = hstu_configs.multitask_configs
+        self._multitask_module = DefaultMultitaskModule(
+            task_configs=self._multitask_configs,
+            embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+            prediction_fn=lambda in_dim, num_tasks: torch.nn.Sequential(
+                torch.nn.Linear(in_features=in_dim, out_features=512),
+                SwishLayerNorm(512),
+                torch.nn.Linear(in_features=512, out_features=num_tasks),
+            ).apply(init_mlp_weights_optional_bias),
+            causal_multitask_weights=hstu_configs.causal_multitask_weights,
+            is_inference=self._is_inference,
+        )
+        self._additional_embedding_features: List[str] = [
+            uih_feature_name
+            for (
+                uih_feature_name,
+                candidate_feature_name,
+            ) in self._hstu_configs.merge_uih_candidate_feature_mapping
+            if (
+                candidate_feature_name
+                in self._hstu_configs.item_embedding_feature_names
+            )
+            and (uih_feature_name in self._hstu_configs.user_embedding_feature_names)
+            and (uih_feature_name is not self._hstu_configs.uih_post_id_feature_name)
+        ]
+
+        # preprocessor setup
+        preprocessor = ContextualPreprocessor(
+            input_embedding_dim=hstu_configs.hstu_embedding_table_dim,
+            hidden_dim=hstu_configs.hstu_preprocessor_hidden_dim,
+            output_embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+            contextual_feature_to_max_length=hstu_configs.contextual_feature_to_max_length,
+            contextual_feature_to_min_uih_length=hstu_configs.contextual_feature_to_min_uih_length,
+            action_embedding_dim=8,
+            action_feature_name=self._hstu_configs.uih_weight_feature_name,
+            action_weights=self._hstu_configs.action_weights,
+            action_embedding_init_std=self._hstu_configs.action_embedding_init_std,
+            additional_embedding_features=self._additional_embedding_features,
+            is_inference=is_inference,
+        )
+
+        # positional encoder
+        positional_encoder = HSTUPositionalEncoder(
+            num_position_buckets=8192,
+            num_time_buckets=2048,
+            embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+            contextual_seq_len=sum(
+                dict(hstu_configs.contextual_feature_to_max_length).values()
+            ),
+            is_inference=self._is_inference,
+        )
+
+        if hstu_configs.enable_postprocessor:
+            if hstu_configs.use_layer_norm_postprocessor:
+                postprocessor = LayerNormPostprocessor(
+                    embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+                    eps=1e-5,
+                    is_inference=self._is_inference,
+                )
+            else:
+                postprocessor = TimestampLayerNormPostprocessor(
+                    embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+                    time_duration_features=[
+                        (60 * 60, 24),  # hour of day
+                        (24 * 60 * 60, 7),  # day of week
+                        # (24 * 60 * 60, 365), # time of year (approximate)
+                    ],
+                    eps=1e-5,
+                    is_inference=self._is_inference,
+                )
+        else:
+            postprocessor = None
+
+        # construct HSTU
+        stu_module: STU = STUStack(
+            stu_list=[
+                STULayer(
+                    config=STULayerConfig(
+                        embedding_dim=hstu_configs.hstu_transducer_embedding_dim,
+                        num_heads=hstu_configs.hstu_num_heads,
+                        hidden_dim=hstu_configs.hstu_attn_linear_dim,
+                        attention_dim=hstu_configs.hstu_attn_qk_dim,
+                        output_dropout_ratio=hstu_configs.hstu_linear_dropout_rate,
+                        use_group_norm=hstu_configs.hstu_group_norm,
+                        causal=True,
+                        target_aware=True,
+                        max_attn_len=None,
+                        attn_alpha=None,
+                        recompute_normed_x=True,
+                        recompute_uvqk=True,
+                        recompute_y=True,
+                        sort_by_length=True,
+                        contextual_seq_len=0,
+                    ),
+                    is_inference=is_inference,
+                )
+                for _ in range(hstu_configs.hstu_attn_num_layers)
+            ],
+            is_inference=is_inference,
+        )
+        self._hstu_transducer: HSTUTransducer = HSTUTransducer(
+            stu_module=stu_module,
+            input_preprocessor=preprocessor,
+            output_postprocessor=postprocessor,
+            input_dropout_ratio=hstu_configs.hstu_input_dropout_ratio,
+            positional_encoder=positional_encoder,
+            is_inference=self._is_inference,
+            return_full_embeddings=False,
+            listwise=False,
+        )
+
+        # item embeddings
+        self._item_embedding_mlp: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=hstu_configs.hstu_embedding_table_dim
+                * len(self._hstu_configs.item_embedding_feature_names),
+                out_features=512,
+            ),
+            SwishLayerNorm(512),
+            torch.nn.Linear(
+                in_features=512,
+                out_features=hstu_configs.hstu_transducer_embedding_dim,
+            ),
+            LayerNorm(hstu_configs.hstu_transducer_embedding_dim),
+        ).apply(init_mlp_weights_optional_bias)
+
+    def _construct_payload(
+        self,
+        payload_features: Dict[str, torch.Tensor],
+        seq_embeddings: Dict[str, SequenceEmbedding],
+    ) -> Dict[str, torch.Tensor]:
+        if len(self._hstu_configs.contextual_feature_to_max_length) > 0:
+            contextual_offsets: List[torch.Tensor] = []
+            for x in self._hstu_configs.contextual_feature_to_max_length.keys():
+                contextual_offsets.append(
+                    torch.ops.fbgemm.asynchronous_complete_cumsum(
+                        seq_embeddings[x].lengths
+                    )
+                )
+        else:
+            # Dummy, offsets are unused
+            contextual_offsets = torch.empty((0, 0))
+        if torch.jit.is_scripting():
+            # Explicit loops are TS-clean (avoid the dict-merge / dict-comp
+            # idioms below, which TorchScript cannot script).
+            out: Dict[str, torch.Tensor] = {}
+            for k, v in payload_features.items():
+                out[k] = v
+            for x in self._hstu_configs.contextual_feature_to_max_length.keys():
+                out[x] = seq_embeddings[x].embedding
+            i = 0
+            for x in self._hstu_configs.contextual_feature_to_max_length.keys():
+                # pyre-ignore[6]
+                out[x + "_offsets"] = contextual_offsets[i]
+                i += 1
+            for x in self._additional_embedding_features:
+                out[x] = seq_embeddings[x].embedding
+            return out
+        return {
+            **payload_features,
+            **{
+                x: seq_embeddings[x].embedding
+                for x in self._hstu_configs.contextual_feature_to_max_length.keys()
+            },
+            **{
+                x + "_offsets": contextual_offsets[i]
+                for i, x in enumerate(
+                    list(self._hstu_configs.contextual_feature_to_max_length.keys())
+                )
+            },
+            **{
+                x: seq_embeddings[x].embedding
+                for x in self._additional_embedding_features
+            },
+        }
+
+    def _user_forward(
+        self,
+        max_uih_len: int,
+        max_candidates: int,
+        seq_embeddings: Dict[str, SequenceEmbedding],
+        payload_features: Dict[str, torch.Tensor],
+        num_candidates: torch.Tensor,
+        total_uih_len: Optional[int] = None,
+        total_targets: Optional[int] = None,
+    ) -> torch.Tensor:
+        source_lengths = seq_embeddings[
+            self._hstu_configs.uih_post_id_feature_name
+        ].lengths
+        source_timestamps = concat_2D_jagged(
+            max_seq_len=max_uih_len + max_candidates,
+            max_len_left=max_uih_len,
+            offsets_left=payload_features["uih_offsets"],
+            values_left=payload_features[
+                self._hstu_configs.uih_action_time_feature_name
+            ].unsqueeze(-1),
+            max_len_right=max_candidates,
+            offsets_right=payload_features["candidate_offsets"],
+            values_right=payload_features[
+                self._hstu_configs.candidates_querytime_feature_name
+            ].unsqueeze(-1),
+            kernel=self.hammer_kernel(),
+        ).squeeze(-1)
+        if total_targets is None:
+            total_targets = int(num_candidates.sum().item())
+        if total_uih_len is None:
+            total_uih_len = source_timestamps.numel() - total_targets
+        embedding = seq_embeddings[
+            self._hstu_configs.uih_post_id_feature_name
+        ].embedding
+        dtype = embedding.dtype
+        if (not self.is_inference) and self._bf16_training:
+            embedding = embedding.to(torch.bfloat16)
+        if torch.jit.is_scripting():
+            # TorchScript does not support ``with torch.autocast(...)``.
+            # In script-mode inference the dense path is already in bf16
+            # (move_sparse_output_to_device upcasts on the C++ side), so
+            # autocast is a no-op for the path the predictor exercises.
+            candidates_user_embeddings, _ = self._hstu_transducer(
+                max_uih_len=max_uih_len,
+                max_targets=max_candidates,
+                total_uih_len=total_uih_len,
+                total_targets=total_targets,
+                seq_embeddings=embedding,
+                seq_lengths=source_lengths,
+                seq_timestamps=source_timestamps,
+                seq_payloads=self._construct_payload(
+                    payload_features=payload_features,
+                    seq_embeddings=seq_embeddings,
+                ),
+                num_targets=num_candidates,
+            )
+        else:
+            with torch.autocast(
+                "cuda",
+                dtype=torch.bfloat16,
+                enabled=(not self.is_inference) and self._bf16_training,
+            ):
+                candidates_user_embeddings, _ = self._hstu_transducer(
+                    max_uih_len=max_uih_len,
+                    max_targets=max_candidates,
+                    total_uih_len=total_uih_len,
+                    total_targets=total_targets,
+                    seq_embeddings=embedding,
+                    seq_lengths=source_lengths,
+                    seq_timestamps=source_timestamps,
+                    seq_payloads=self._construct_payload(
+                        payload_features=payload_features,
+                        seq_embeddings=seq_embeddings,
+                    ),
+                    num_targets=num_candidates,
+                )
+        candidates_user_embeddings = candidates_user_embeddings.to(dtype)
+
+        return candidates_user_embeddings
+
+    def _item_forward(
+        self,
+        seq_embeddings: Dict[str, SequenceEmbedding],
+    ) -> torch.Tensor:  # [L, D]
+        all_embeddings = torch.cat(
+            [
+                seq_embeddings[name].embedding
+                for name in self._hstu_configs.item_embedding_feature_names
+            ],
+            dim=-1,
+        )
+        item_embeddings = self._item_embedding_mlp(all_embeddings)
+        return item_embeddings
+
+    def preprocess(
+        self,
+        uih_features: KeyedJaggedTensor,
+        candidates_features: KeyedJaggedTensor,
+    ) -> Tuple[
+        Dict[str, SequenceEmbedding],
+        Dict[str, torch.Tensor],
+        int,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+    ]:
+        # embedding lookup for uih and candidates
+        merged_sparse_features = KeyedJaggedTensor.from_lengths_sync(
+            keys=uih_features.keys() + candidates_features.keys(),
+            values=torch.cat(
+                [uih_features.values(), candidates_features.values()],
+                dim=0,
+            ),
+            lengths=torch.cat(
+                [uih_features.lengths(), candidates_features.lengths()],
+                dim=0,
+            ),
+        )
+        seq_embeddings_dict = self._embedding_collection(merged_sparse_features)
+        num_candidates = fx_mark_length_features(
+            candidates_features.lengths().view(len(candidates_features.keys()), -1)
+        )[0]
+        max_num_candidates = fx_infer_max_len(num_candidates)
+        uih_seq_lengths = uih_features[
+            self._hstu_configs.uih_post_id_feature_name
+        ].lengths()
+        max_uih_len = fx_infer_max_len(uih_seq_lengths)
+
+        # prepare payload features
+        payload_features: Dict[str, torch.Tensor] = {}
+        for (
+            uih_feature_name,
+            candidate_feature_name,
+        ) in self._hstu_configs.merge_uih_candidate_feature_mapping:
+            if (
+                candidate_feature_name
+                not in self._hstu_configs.item_embedding_feature_names
+                and uih_feature_name
+                not in self._hstu_configs.user_embedding_feature_names
+            ):
+                values_left = uih_features[uih_feature_name].values()
+                if self._is_inference and (
+                    candidate_feature_name
+                    == self._hstu_configs.candidates_weight_feature_name
+                    or candidate_feature_name
+                    == self._hstu_configs.candidates_watchtime_feature_name
+                ):
+                    total_candidates = torch.sum(num_candidates).item()
+                    values_right = torch.zeros(
+                        total_candidates,  # pyre-ignore
+                        dtype=torch.int64,
+                        device=values_left.device,
+                    )
+                else:
+                    values_right = candidates_features[candidate_feature_name].values()
+                payload_features[uih_feature_name] = values_left
+                payload_features[candidate_feature_name] = values_right
+        payload_features["uih_offsets"] = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            uih_seq_lengths
+        )
+        payload_features["candidate_offsets"] = (
+            torch.ops.fbgemm.asynchronous_complete_cumsum(num_candidates)
+        )
+
+        seq_embeddings = {
+            k: SequenceEmbedding(
+                lengths=seq_embeddings_dict[k].lengths(),
+                embedding=seq_embeddings_dict[k].values(),
+            )
+            for k in self._hstu_configs.user_embedding_feature_names
+            + self._hstu_configs.item_embedding_feature_names
+        }
+
+        return (
+            seq_embeddings,
+            payload_features,
+            max_uih_len,
+            uih_seq_lengths,
+            max_num_candidates,
+            num_candidates,
+        )
+
+    def main_forward(
+        self,
+        seq_embeddings: Dict[str, SequenceEmbedding],
+        payload_features: Dict[str, torch.Tensor],
+        max_uih_len: int,
+        uih_seq_lengths: torch.Tensor,
+        max_num_candidates: int,
+        num_candidates: torch.Tensor,
+        total_uih_len: Optional[int] = None,
+        total_targets: Optional[int] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
+        # merge uih and candidates embeddings
+        for (
+            uih_feature_name,
+            candidate_feature_name,
+        ) in self._hstu_configs.merge_uih_candidate_feature_mapping:
+            if uih_feature_name in seq_embeddings:
+                seq_embeddings[uih_feature_name] = SequenceEmbedding(
+                    lengths=uih_seq_lengths + num_candidates,
+                    embedding=concat_2D_jagged(
+                        max_seq_len=max_uih_len + max_num_candidates,
+                        max_len_left=max_uih_len,
+                        offsets_left=torch.ops.fbgemm.asynchronous_complete_cumsum(
+                            uih_seq_lengths
+                        ),
+                        values_left=seq_embeddings[uih_feature_name].embedding,
+                        max_len_right=max_num_candidates,
+                        offsets_right=torch.ops.fbgemm.asynchronous_complete_cumsum(
+                            num_candidates
+                        ),
+                        values_right=seq_embeddings[candidate_feature_name].embedding,
+                        kernel=self.hammer_kernel(),
+                    ),
+                )
+
+        with record_function("## item_forward ##"):
+            candidates_item_embeddings = self._item_forward(
+                seq_embeddings,
+            )
+        with record_function("## user_forward ##"):
+            candidates_user_embeddings = self._user_forward(
+                max_uih_len=max_uih_len,
+                max_candidates=max_num_candidates,
+                seq_embeddings=seq_embeddings,
+                payload_features=payload_features,
+                num_candidates=num_candidates,
+                total_uih_len=total_uih_len,
+                total_targets=total_targets,
+            )
+        with record_function("## multitask_module ##"):
+            supervision_labels, supervision_weights = (
+                _get_supervision_labels_and_weights(
+                    supervision_bitmasks=payload_features[
+                        self._hstu_configs.candidates_weight_feature_name
+                    ],
+                    watchtime_sequence=payload_features[
+                        self._hstu_configs.candidates_watchtime_feature_name
+                    ],
+                    task_configs=self._multitask_configs,
+                )
+            )
+            mt_target_preds, mt_target_labels, mt_target_weights, mt_losses = (
+                self._multitask_module(
+                    encoded_user_embeddings=candidates_user_embeddings,
+                    item_embeddings=candidates_item_embeddings,
+                    supervision_labels=supervision_labels,
+                    supervision_weights=supervision_weights,
+                )
+            )
+
+        aux_losses: Dict[str, torch.Tensor] = {}
+        if not self._is_inference and self.training:
+            for i, task in enumerate(self._multitask_configs):
+                aux_losses[task.task_name] = mt_losses[i]
+
+        return (
+            candidates_user_embeddings,
+            candidates_item_embeddings,
+            aux_losses,
+            mt_target_preds,
+            mt_target_labels,
+            mt_target_weights,
+        )
+
+    def forward(
+        self,
+        uih_features: KeyedJaggedTensor,
+        candidates_features: KeyedJaggedTensor,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
+        with record_function("## preprocess ##"):
+            (
+                seq_embeddings,
+                payload_features,
+                max_uih_len,
+                uih_seq_lengths,
+                max_num_candidates,
+                num_candidates,
+            ) = self.preprocess(
+                uih_features=uih_features,
+                candidates_features=candidates_features,
+            )
+
+        with record_function("## main_forward ##"):
+            return self.main_forward(
+                seq_embeddings=seq_embeddings,
+                payload_features=payload_features,
+                max_uih_len=max_uih_len,
+                uih_seq_lengths=uih_seq_lengths,
+                max_num_candidates=max_num_candidates,
+                num_candidates=num_candidates,
+            )
diff --git a/recommendation_v4/generative_recommenders/modules/dynamic_stu.py b/recommendation_v4/generative_recommenders/modules/dynamic_stu.py
new file mode 100644
index 000000000..e1fe8ad16
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/dynamic_stu.py
@@ -0,0 +1,304 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+import abc
+import contextlib
+from typing import Any, Generator, Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_infer_max_len
+from generative_recommenders.modules.stu import STU
+from generative_recommenders.ops.jagged_tensors import (
+    hstu_concat_l2_embeddings,
+    hstu_split_l2_embeddings,
+)
+
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+@contextlib.contextmanager
+# pyre-ignore[3]
+def _freeze_rng_state() -> Generator[Any, None, None]:
+    rng_state = torch.get_rng_state()
+    if torch.cuda.is_available():
+        cuda_rng_state = torch.cuda.get_rng_state()
+    try:
+        yield
+    finally:
+        if torch.cuda.is_available():
+            # pyre-ignore[61]
+            torch.cuda.set_rng_state(cuda_rng_state)
+        torch.set_rng_state(rng_state)
+
+
+class DynamicSTU(STU):
+    def __init__(self, stu: STU, is_inference: bool) -> None:
+        super().__init__(is_inference)
+        self._stu = stu
+
+    @abc.abstractmethod
+    def _preprocess(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+        int,
+        Optional[torch.Tensor],
+    ]:
+        pass
+
+    @abc.abstractmethod
+    def _postprocess(
+        self,
+        stu_output: torch.Tensor,
+    ) -> torch.Tensor:
+        pass
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        (
+            x,
+            x_lengths,
+            x_offsets,
+            max_seq_len,
+            num_targets,
+            max_kv_caching_len,
+            kv_caching_lengths,
+        ) = self._preprocess(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+            max_kv_caching_len=max_kv_caching_len,
+            kv_caching_lengths=kv_caching_lengths,
+        )
+
+        stu_output = self._stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+            max_kv_caching_len=max_kv_caching_len,
+            kv_caching_lengths=kv_caching_lengths,
+        )
+
+        return self._postprocess(
+            stu_output=stu_output,
+        )
+
+
+class SDSTU(DynamicSTU):
+    def __init__(
+        self,
+        stu: STU,
+        is_inference: bool,
+        dropout_ratio: float = 0.5,
+        seed: int = 0,
+    ) -> None:
+        """
+        Stochastic Depth STU
+        """
+        super().__init__(stu=stu, is_inference=is_inference)
+        self._dropout_ratio: float = dropout_ratio
+        self._iter: int = 0
+        self._seed: int = seed
+        self._skip_x: Optional[torch.Tensor] = None
+
+    def _preprocess(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+        int,
+        Optional[torch.Tensor],
+    ]:
+        if self.training:
+            with _freeze_rng_state():
+                torch.manual_seed(self._iter + self._seed)
+                prob = torch.rand(1)
+                if prob.item() <= self._dropout_ratio:
+                    new_x = torch.empty(size=(0, x.shape[1]), device=x.device)
+                    self._skip_x = x
+                    new_x_lengths = torch.zeros_like(x_lengths)
+                    new_x_offsets = torch.zeros_like(x_offsets)
+                    new_max_seq_len = 1
+                else:
+                    new_x = x
+                    new_x_lengths = x_lengths
+                    new_x_offsets = x_offsets
+                    new_max_seq_len = max_seq_len
+            self._iter += 1
+        else:
+            new_x = x
+            new_x_lengths = x_lengths
+            new_x_offsets = x_offsets
+            new_max_seq_len = max_seq_len
+        return (
+            new_x,
+            new_x_lengths,
+            new_x_offsets,
+            new_max_seq_len,
+            num_targets,
+            max_kv_caching_len,
+            kv_caching_lengths,
+        )
+
+    def _postprocess(
+        self,
+        stu_output: torch.Tensor,
+    ) -> torch.Tensor:
+        if self.training and self._skip_x is not None:
+            ret = self._skip_x
+            self._skip_x = None
+            return ret
+        else:
+            return stu_output
+
+
+@torch.fx.wrap
+def _fx_unwrap_optional_tuple_tensor(
+    optional: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]],
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    assert optional is not None, "Expected optional to be non-None"
+    return optional
+
+
+class L2STU(DynamicSTU):
+    def __init__(
+        self,
+        stu: STU,
+        max_l2_len: int,
+        is_inference: bool,
+        contextual_seq_len: int = 0,
+    ) -> None:
+        """
+        Stochastic Depth STU
+        """
+        super().__init__(stu=stu, is_inference=is_inference)
+        self._max_l2_len: int = max_l2_len
+        self._contextual_seq_len: int = contextual_seq_len
+        self._saved_tensors: Optional[
+            Tuple[torch.Tensor, torch.Tensor, torch.Tensor]
+        ] = None
+        self._runtime_max_l2_len: int = 0
+        self._runtime_prefix_len: int = 0
+
+    def _preprocess(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        int,
+        torch.Tensor,
+        int,
+        Optional[torch.Tensor],
+    ]:
+        prefix_lengths = (
+            x_lengths - self._max_l2_len - num_targets - self._contextual_seq_len
+        )
+        prefix_lengths = torch.clamp(prefix_lengths, min=0)
+        prefix_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(prefix_lengths)
+        l2_lengths = x_lengths - prefix_lengths
+        l2_offsets = x_offsets - prefix_offsets
+        self._runtime_max_l2_len: int = fx_infer_max_len(l2_lengths)
+        self._runtime_prefix_len: int = fx_infer_max_len(prefix_lengths)
+        prefix_x, l2_x = hstu_split_l2_embeddings(
+            max_seq_len=max_seq_len,
+            x=x,
+            prefix_offsets=prefix_offsets,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=self._contextual_seq_len,
+            kernel=self.hammer_kernel(),
+        )
+        self._saved_tensors = (
+            prefix_offsets,
+            prefix_x,
+            l2_offsets,
+        )
+        return (
+            l2_x,
+            l2_lengths,
+            l2_offsets,
+            self._runtime_max_l2_len,
+            num_targets,
+            max_kv_caching_len,
+            kv_caching_lengths,
+        )
+
+    def _postprocess(
+        self,
+        stu_output: torch.Tensor,
+    ) -> torch.Tensor:
+        (
+            prefix_offsets,
+            prefix_x,
+            l2_offsets,
+        ) = _fx_unwrap_optional_tuple_tensor(self._saved_tensors)
+        self._saved_tensors = None
+        return hstu_concat_l2_embeddings(
+            max_prefix_len=self._runtime_prefix_len,
+            prefix_x=prefix_x,
+            prefix_offsets=prefix_offsets,
+            max_l2_len=self._runtime_max_l2_len,
+            l2_x=stu_output,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=self._contextual_seq_len,
+            kernel=self.hammer_kernel(),
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/hstu_transducer.py b/recommendation_v4/generative_recommenders/modules/hstu_transducer.py
new file mode 100644
index 000000000..ce91a67c9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/hstu_transducer.py
@@ -0,0 +1,330 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import logging
+from typing import Dict, Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_unwrap_optional_tensor, HammerModule
+from generative_recommenders.modules.positional_encoder import HSTUPositionalEncoder
+from generative_recommenders.modules.postprocessors import (
+    L2NormPostprocessor,
+    OutputPostprocessor,
+)
+from generative_recommenders.modules.preprocessors import InputPreprocessor
+from generative_recommenders.modules.stu import STU
+from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+from torch.profiler import record_function
+
+logger: logging.Logger = logging.getLogger(__name__)
+torch.fx.wrap("len")
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+@torch.fx.wrap
+def default_seq_payload(
+    seq_payloads: Optional[Dict[str, torch.Tensor]],
+) -> Dict[str, torch.Tensor]:
+    if seq_payloads is None:
+        return {}
+    else:
+        return torch.jit._unwrap_optional(seq_payloads)
+
+
+class HSTUTransducer(HammerModule):
+    def __init__(
+        self,
+        stu_module: STU,
+        input_preprocessor: InputPreprocessor,
+        output_postprocessor: Optional[OutputPostprocessor] = None,
+        input_dropout_ratio: float = 0.0,
+        positional_encoder: Optional[HSTUPositionalEncoder] = None,
+        is_inference: bool = True,
+        return_full_embeddings: bool = False,
+        listwise: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._stu_module = stu_module
+        self._input_preprocessor: InputPreprocessor = input_preprocessor
+        self._output_postprocessor: OutputPostprocessor = (
+            output_postprocessor
+            if output_postprocessor is not None
+            else L2NormPostprocessor(is_inference=is_inference)
+        )
+        assert self._is_inference == self._input_preprocessor._is_inference, (
+            f"input_preprocessor must have the same mode; self: {self._is_inference} vs input_preprocessor {self._input_preprocessor._is_inference}"
+        )
+        self._positional_encoder: Optional[HSTUPositionalEncoder] = positional_encoder
+        self._input_dropout_ratio: float = input_dropout_ratio
+        self._return_full_embeddings: bool = return_full_embeddings
+        self._listwise_training: bool = listwise and self.is_train
+
+        for name, m in self.named_modules():
+            if "_stu_module" in name:
+                continue
+            elif isinstance(m, torch.nn.Linear):
+                torch.nn.init.xavier_normal_(m.weight)
+            elif isinstance(m, torch.nn.LayerNorm):
+                if m.weight.dim() >= 2:
+                    torch.nn.init.xavier_normal_(m.weight)
+                if m.bias is not None and m.bias.dim() >= 2:
+                    torch.nn.init.xavier_normal_(m.bias)
+
+    def _preprocess(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+    ]:
+        seq_payloads = default_seq_payload(seq_payloads)
+
+        with record_function("hstu_input_preprocessor"):
+            (
+                output_max_seq_len,
+                output_total_uih_len,
+                output_total_targets,
+                output_seq_lengths,
+                output_seq_offsets,
+                output_seq_timestamps,
+                output_seq_embeddings,
+                output_num_targets,
+                output_seq_payloads,
+            ) = self._input_preprocessor(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                total_uih_len=total_uih_len,
+                total_targets=total_targets,
+                seq_lengths=seq_lengths,
+                seq_timestamps=seq_timestamps,
+                seq_embeddings=seq_embeddings,
+                num_targets=num_targets,
+                seq_payloads=seq_payloads,
+            )
+
+        with record_function("hstu_positional_encoder"):
+            if self._positional_encoder is not None:
+                output_seq_embeddings = self._positional_encoder(
+                    max_seq_len=output_max_seq_len,
+                    seq_lengths=output_seq_lengths,
+                    seq_offsets=output_seq_offsets,
+                    seq_timestamps=output_seq_timestamps,
+                    seq_embeddings=output_seq_embeddings,
+                    num_targets=(
+                        None if self._listwise_training else output_num_targets
+                    ),
+                )
+
+        output_seq_embeddings = torch.nn.functional.dropout(
+            output_seq_embeddings,
+            p=self._input_dropout_ratio,
+            training=self.training,
+        )
+
+        return (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+            output_seq_payloads,
+        )
+
+    def _hstu_compute(
+        self,
+        max_seq_len: int,
+        seq_lengths: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+    ) -> torch.Tensor:
+        with record_function("hstu"):
+            seq_embeddings = self._stu_module(
+                max_seq_len=max_seq_len,
+                x=seq_embeddings,
+                x_lengths=seq_lengths,
+                x_offsets=seq_offsets,
+                num_targets=(None if self._listwise_training else num_targets),
+            )
+        return seq_embeddings
+
+    def _postprocess(
+        self,
+        max_seq_len: int,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[Optional[torch.Tensor], torch.Tensor]:
+        with record_function("hstu_output_postprocessor"):
+            if self._return_full_embeddings:
+                seq_embeddings = self._output_postprocessor(
+                    seq_embeddings=seq_embeddings,
+                    seq_timestamps=seq_timestamps,
+                    seq_payloads=seq_payloads,
+                )
+            uih_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+                seq_lengths - num_targets
+            )
+            candidates_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+                num_targets
+            )
+            _, candidate_embeddings = split_2D_jagged(
+                values=seq_embeddings,
+                max_seq_len=max_seq_len,
+                total_len_left=total_uih_len,
+                total_len_right=total_targets,
+                max_len_left=max_uih_len,
+                max_len_right=max_targets,
+                offsets_left=uih_offsets,
+                offsets_right=candidates_offsets,
+                kernel=self.hammer_kernel(),
+            )
+            interleave_targets: bool = self._input_preprocessor.interleave_targets()
+            if interleave_targets:
+                candidate_embeddings = candidate_embeddings.view(
+                    -1, 2, candidate_embeddings.size(-1)
+                )[:, 0, :]
+            if not self._return_full_embeddings:
+                _, candidate_timestamps = split_2D_jagged(
+                    values=seq_timestamps.unsqueeze(-1),
+                    max_seq_len=max_seq_len,
+                    total_len_left=total_uih_len,
+                    total_len_right=total_targets,
+                    max_len_left=max_uih_len,
+                    max_len_right=max_targets,
+                    offsets_left=uih_offsets,
+                    offsets_right=candidates_offsets,
+                    kernel=self.hammer_kernel(),
+                )
+                candidate_timestamps = candidate_timestamps.squeeze(-1)
+                if interleave_targets:
+                    candidate_timestamps = candidate_timestamps.view(-1, 2)[:, 0]
+                candidate_embeddings = self._output_postprocessor(
+                    seq_embeddings=candidate_embeddings,
+                    seq_timestamps=candidate_timestamps,
+                    seq_payloads=seq_payloads,
+                )
+
+            return (
+                seq_embeddings if self._return_full_embeddings else None,
+                candidate_embeddings,
+            )
+
+    def forward(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+    ]:
+        orig_dtype = seq_embeddings.dtype
+        if not self._is_inference:
+            seq_embeddings = seq_embeddings.to(self._training_dtype)
+
+        (
+            max_seq_len,
+            total_uih_len,
+            total_targets,
+            seq_lengths,
+            seq_offsets,
+            seq_timestamps,
+            seq_embeddings,
+            num_targets,
+            seq_payloads,
+        ) = self._preprocess(
+            max_uih_len=max_uih_len,
+            max_targets=max_targets,
+            total_uih_len=total_uih_len,
+            total_targets=total_targets,
+            seq_lengths=seq_lengths,
+            seq_timestamps=seq_timestamps,
+            seq_embeddings=seq_embeddings,
+            num_targets=num_targets,
+            seq_payloads=seq_payloads,
+        )
+
+        encoded_embeddings = self._hstu_compute(
+            max_seq_len=max_seq_len,
+            seq_lengths=seq_lengths,
+            seq_offsets=seq_offsets,
+            seq_timestamps=seq_timestamps,
+            seq_embeddings=seq_embeddings,
+            num_targets=num_targets,
+        )
+
+        encoded_embeddings, encoded_candidate_embeddings = self._postprocess(
+            max_seq_len=max_seq_len,
+            max_uih_len=max_uih_len,
+            max_targets=max_targets,
+            total_uih_len=total_uih_len,
+            total_targets=total_targets,
+            seq_lengths=seq_lengths,
+            seq_embeddings=encoded_embeddings,
+            seq_timestamps=seq_timestamps,
+            num_targets=num_targets,
+            seq_payloads=seq_payloads,
+        )
+
+        if not self._is_inference:
+            encoded_candidate_embeddings = encoded_candidate_embeddings.to(orig_dtype)
+            if self._return_full_embeddings:
+                encoded_embeddings = fx_unwrap_optional_tensor(encoded_embeddings).to(
+                    orig_dtype
+                )
+        return (
+            encoded_candidate_embeddings,
+            encoded_embeddings,
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/multitask_module.py b/recommendation_v4/generative_recommenders/modules/multitask_module.py
new file mode 100644
index 000000000..3cb11996f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/multitask_module.py
@@ -0,0 +1,288 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import abc
+import logging
+from dataclasses import dataclass
+from enum import IntEnum
+from typing import Callable, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from generative_recommenders.common import HammerModule
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class MultitaskTaskType(IntEnum):
+    BINARY_CLASSIFICATION = 0
+    REGRESSION = 1
+
+
+@dataclass
+class TaskConfig:
+    task_name: str
+    task_weight: int
+    task_type: MultitaskTaskType
+
+
+class MultitaskModule(HammerModule):
+    @abc.abstractmethod
+    def forward(
+        self,
+        encoded_user_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        supervision_labels: Dict[str, torch.Tensor],
+        supervision_weights: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
+        """
+        Computes multi-task predictions.
+
+        Args:
+            encoded_user_embeddings: (L, D) x float.
+            item_embeddings: (L, D) x float.
+            supervision_labels: Dict[T, L] x float or int
+            supervision_weights: Dict[T', L] x float or int, T' <= T
+        Returns:
+            (T, L) x float, predictions, labels, weights, losses
+        """
+        pass
+
+
+def _compute_pred_and_logits(
+    prediction_module: torch.nn.Module,
+    encoded_user_embeddings: torch.Tensor,
+    item_embeddings: torch.Tensor,
+    task_offsets: List[int],
+    has_multiple_task_types: bool,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    mt_logits = prediction_module(encoded_user_embeddings * item_embeddings).transpose(
+        0, 1
+    )
+    mt_preds_list: List[torch.Tensor] = []
+    for task_type in MultitaskTaskType:
+        logits = mt_logits[
+            task_offsets[task_type] : task_offsets[task_type + 1],
+            :,
+        ]
+        if task_offsets[task_type + 1] - task_offsets[task_type] > 0:
+            if task_type == MultitaskTaskType.REGRESSION:
+                mt_preds_list.append(logits)
+            else:
+                mt_preds_list.append(F.sigmoid(logits))
+    if has_multiple_task_types:
+        mt_preds: torch.Tensor = torch.concat(mt_preds_list, dim=0)
+    else:
+        mt_preds: torch.Tensor = mt_preds_list[0]
+
+    return mt_preds, mt_logits
+
+
+def _compute_labels_and_weights(
+    supervision_labels: Dict[str, torch.Tensor],
+    supervision_weights: Dict[str, torch.Tensor],
+    task_configs: List[TaskConfig],
+    device: torch.device,
+    dtype: torch.dtype = torch.float32,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    first_label: torch.Tensor = list(supervision_labels.values())[0]
+    default_supervision_weight = torch.ones_like(
+        first_label,
+        dtype=dtype,
+        device=device,
+    )
+    mt_lables_list: List[torch.Tensor] = []
+    mt_weights_list: List[torch.Tensor] = []
+    for task in task_configs:
+        mt_lables_list.append(supervision_labels[task.task_name])
+        mt_weights_list.append(
+            supervision_weights.get(task.task_name, default_supervision_weight)
+        )
+    if len(task_configs) > 1:
+        mt_labels = torch.stack(mt_lables_list, dim=0)
+        mt_weights = torch.stack(mt_weights_list, dim=0)
+    else:
+        mt_labels = mt_lables_list[0].unsqueeze(0)
+        mt_weights = mt_weights_list[0].unsqueeze(0)
+    return mt_labels, mt_weights
+
+
+def _compute_loss(
+    task_offsets: List[int],
+    causal_multitask_weights: float,
+    mt_logits: torch.Tensor,
+    mt_labels: torch.Tensor,
+    mt_weights: torch.Tensor,
+    has_multiple_task_types: bool,
+) -> torch.Tensor:
+    mt_losses_list: List[torch.Tensor] = []
+    for task_type in MultitaskTaskType:
+        if task_offsets[task_type + 1] - task_offsets[task_type] > 0:
+            logits = mt_logits[
+                task_offsets[task_type] : task_offsets[task_type + 1],
+                :,
+            ]
+            labels = mt_labels[
+                task_offsets[task_type] : task_offsets[task_type + 1],
+                :,
+            ]
+            weights = mt_weights[
+                task_offsets[task_type] : task_offsets[task_type + 1],
+                :,
+            ]
+            if task_type == MultitaskTaskType.REGRESSION:
+                mt_losses_list.append(
+                    F.mse_loss(logits, labels, reduction="none") * weights
+                )
+            else:
+                mt_losses_list.append(
+                    F.binary_cross_entropy_with_logits(
+                        input=logits, target=labels, reduction="none"
+                    )
+                    * weights
+                )
+
+    if has_multiple_task_types:
+        mt_losses = torch.concat(mt_losses_list, dim=0)
+    else:
+        mt_losses = mt_losses_list[0]
+    mt_losses = (
+        mt_losses.sum(-1) / mt_weights.sum(-1).clamp(min=1.0) * causal_multitask_weights
+    )
+    return mt_losses
+
+
+class DefaultMultitaskModule(MultitaskModule):
+    def __init__(
+        self,
+        task_configs: List[TaskConfig],
+        embedding_dim: int,
+        prediction_fn: Callable[[int, int], torch.nn.Module],
+        causal_multitask_weights: float,
+        is_inference: bool,
+    ) -> None:
+        super().__init__(is_inference)
+        assert sorted(task_configs, key=lambda x: x.task_type) == task_configs, (
+            "task_configs must be sorted by task_type."
+        )
+        assert len(task_configs) > 0, "task_configs must be non-empty."
+        self._task_configs: List[TaskConfig] = task_configs
+        self._task_offsets: List[int] = [0] * (len(MultitaskTaskType) + 1)
+        for task in self._task_configs:
+            self._task_offsets[task.task_type + 1] += 1
+        self._has_multiple_task_types: bool = self._task_offsets.count(0) < len(
+            MultitaskTaskType
+        )
+        self._task_offsets[1:] = np.cumsum(self._task_offsets[1:]).tolist()
+        self._causal_multitask_weights: float = causal_multitask_weights
+        self._prediction_module: torch.nn.Module = prediction_fn(
+            embedding_dim, len(task_configs)
+        )
+
+    def forward(
+        self,
+        encoded_user_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        supervision_labels: Dict[str, torch.Tensor],
+        supervision_weights: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        torch.Tensor,
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+        Optional[torch.Tensor],
+    ]:
+        orig_dtype = encoded_user_embeddings.dtype
+        if not self._is_inference:
+            encoded_user_embeddings = encoded_user_embeddings.to(self._training_dtype)
+            item_embeddings = item_embeddings.to(self._training_dtype)
+
+        if torch.jit.is_scripting():
+            # Script-mode fast path: skip torch.autocast (unsupported in TS)
+            # and inline _compute_pred_and_logits to avoid its
+            # `torch.nn.Module` parameter annotation (TS only knows
+            # concrete module types). The dense module is already in bf16
+            # at this point, so autocast is a no-op for the predictor path.
+            mt_logits = self._prediction_module(
+                encoded_user_embeddings * item_embeddings
+            ).transpose(0, 1)
+            mt_preds_list: List[torch.Tensor] = []
+            # MultitaskTaskType is an IntEnum (BINARY_CLASSIFICATION=0,
+            # REGRESSION=1) but TorchScript treats it as an opaque Enum.
+            # Iterate by the integer task indices directly.
+            for task_type in range(len(self._task_offsets) - 1):
+                start = self._task_offsets[task_type]
+                end = self._task_offsets[task_type + 1]
+                logits = mt_logits[start:end, :]
+                if end - start > 0:
+                    # 1 == MultitaskTaskType.REGRESSION
+                    if task_type == 1:
+                        mt_preds_list.append(logits)
+                    else:
+                        mt_preds_list.append(F.sigmoid(logits))
+            if self._has_multiple_task_types:
+                mt_preds: torch.Tensor = torch.concat(mt_preds_list, dim=0)
+            else:
+                mt_preds: torch.Tensor = mt_preds_list[0]
+            return mt_preds, None, None, None
+
+        with torch.autocast(
+            "cuda",
+            dtype=torch.bfloat16,
+            enabled=(not self.is_inference and self._training_dtype == torch.bfloat16),
+        ):
+            mt_preds, mt_logits = _compute_pred_and_logits(
+                prediction_module=self._prediction_module,
+                encoded_user_embeddings=encoded_user_embeddings,
+                item_embeddings=item_embeddings,
+                task_offsets=self._task_offsets,
+                has_multiple_task_types=self._has_multiple_task_types,
+            )
+
+        # losses are always computed in fp32
+        mt_labels: Optional[torch.Tensor] = None
+        mt_weights: Optional[torch.Tensor] = None
+        mt_losses: Optional[torch.Tensor] = None
+        if not self._is_inference:
+            mt_labels, mt_weights = _compute_labels_and_weights(
+                supervision_labels=supervision_labels,
+                supervision_weights=supervision_weights,
+                task_configs=self._task_configs,
+                device=encoded_user_embeddings.device,
+            )
+            mt_losses = _compute_loss(
+                task_offsets=self._task_offsets,
+                causal_multitask_weights=self._causal_multitask_weights,
+                mt_logits=mt_logits.to(mt_labels.dtype),
+                mt_labels=mt_labels,
+                mt_weights=mt_weights,
+                has_multiple_task_types=self._has_multiple_task_types,
+            )
+            mt_preds = mt_preds.to(orig_dtype)
+
+        return (
+            mt_preds,
+            mt_labels,
+            mt_weights,
+            mt_losses,
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/positional_encoder.py b/recommendation_v4/generative_recommenders/modules/positional_encoder.py
new file mode 100644
index 000000000..99d904fd4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/positional_encoder.py
@@ -0,0 +1,75 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from math import sqrt
+from typing import Optional
+
+import torch
+from generative_recommenders.common import HammerModule
+from generative_recommenders.ops.position import add_timestamp_positional_embeddings
+
+
+class HSTUPositionalEncoder(HammerModule):
+    def __init__(
+        self,
+        num_position_buckets: int,
+        num_time_buckets: int,
+        embedding_dim: int,
+        contextual_seq_len: int,
+        is_inference: bool = True,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._embedding_dim: int = embedding_dim
+        self._contextual_seq_len: int = contextual_seq_len
+        self._position_embeddings_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty(num_position_buckets, embedding_dim).uniform_(
+                -sqrt(1.0 / num_position_buckets),
+                sqrt(1.0 / num_position_buckets),
+            ),
+        )
+        self._timestamp_embeddings_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty(num_time_buckets + 1, embedding_dim).uniform_(
+                -sqrt(1.0 / num_time_buckets),
+                sqrt(1.0 / num_time_buckets),
+            ),
+        )
+
+    def forward(
+        self,
+        max_seq_len: int,
+        seq_lengths: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        seq_embeddings = add_timestamp_positional_embeddings(
+            alpha=self._embedding_dim**0.5,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=self._contextual_seq_len,
+            position_embeddings_weight=self._position_embeddings_weight,
+            timestamp_embeddings_weight=self._timestamp_embeddings_weight,
+            seq_offsets=seq_offsets,
+            seq_lengths=seq_lengths,
+            seq_embeddings=seq_embeddings,
+            timestamps=seq_timestamps,
+            num_targets=num_targets,
+            interleave_targets=False,
+            kernel=self.hammer_kernel(),
+        )
+        return seq_embeddings
diff --git a/recommendation_v4/generative_recommenders/modules/postprocessors.py b/recommendation_v4/generative_recommenders/modules/postprocessors.py
new file mode 100644
index 000000000..7958e3fa9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/postprocessors.py
@@ -0,0 +1,182 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from abc import abstractmethod
+from typing import Dict, List, Tuple
+
+import torch
+from generative_recommenders.common import HammerModule, init_mlp_weights_optional_bias
+
+
+@torch.fx.wrap
+def _cast_dtype(t: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+    if t.dtype != dtype:
+        return t.to(dtype)
+    return t
+
+
+class OutputPostprocessor(HammerModule):
+    """An abstract class for post-processing user embeddings after HSTU layers."""
+
+    @abstractmethod
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Args:
+            seq_embeddings: (L, D)
+            seq_timestamps: (L, )
+            seq_payloads: str-keyed tensors. Implementation specific.
+
+        Returns:
+            postprocessed seq_embeddings, (L, D)
+        """
+        pass
+
+
+class L2NormPostprocessor(OutputPostprocessor):
+    """Postprocesses user embeddings with l2 norm."""
+
+    def __init__(self, is_inference: bool = False) -> None:
+        super().__init__(is_inference=is_inference)
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        return seq_embeddings / torch.linalg.norm(
+            seq_embeddings, ord=2, dim=-1, keepdim=True
+        ).clamp(min=1e-6)
+
+
+class LayerNormPostprocessor(OutputPostprocessor):
+    """Postprocesses user embeddings with layer norm."""
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+
+        self._layer_norm: torch.nn.Module = torch.nn.LayerNorm(
+            normalized_shape=[embedding_dim], eps=eps
+        )
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        # pyre-fixme[6]: For 1st argument expected `dtype` but got `Union[dtype,
+        #  Tensor, Module]`.
+        return self._layer_norm(seq_embeddings.to(self._layer_norm.weight.dtype))
+
+
+@torch.fx.wrap
+def _unsqueeze_if_needed(t: torch.Tensor, embedding: torch.Tensor) -> torch.Tensor:
+    if embedding.dim() == 3:
+        return t.unsqueeze(0)
+    return t
+
+
+class TimestampLayerNormPostprocessor(OutputPostprocessor):
+    """Postprocesses user embeddings with timestamp-based MLP -> layer norm."""
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        time_duration_features: List[Tuple[int, int]],
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+
+        self._layer_norm: torch.nn.Module = torch.nn.LayerNorm(
+            normalized_shape=[embedding_dim], eps=eps
+        )
+        self.register_buffer(
+            "_period_units",
+            torch.Tensor([f[0] for f in time_duration_features]).view(1, -1),
+        )
+        self.register_buffer(
+            "_units_per_period",
+            torch.Tensor([f[1] for f in time_duration_features]).view(1, -1),
+        )
+        self._time_feature_combiner: torch.nn.Module = torch.nn.Linear(
+            embedding_dim + 2 * len(time_duration_features),
+            embedding_dim,
+        ).apply(init_mlp_weights_optional_bias)
+
+    def _concat_time_features(
+        self,
+        combined_embeddings: torch.Tensor,
+        timestamps: torch.Tensor,  # [B] or [B, D]
+    ) -> torch.Tensor:
+        # concat time representation to combined embeddings
+        period_units = self._period_units
+        units_per_period = self._units_per_period
+
+        timestamps = timestamps.unsqueeze(-1)
+        period_units = _unsqueeze_if_needed(period_units, combined_embeddings)
+        units_per_period = _unsqueeze_if_needed(
+            units_per_period, combined_embeddings
+        ).float()
+        # Compute time features in float32 to avoid bf16 precision loss through
+        # discontinuous floor/remainder ops, matching Inductor fusion behavior.
+        _units_elapsed_type: torch.dtype = combined_embeddings.dtype
+        _units_since_epoch = torch.div(
+            timestamps.float(), period_units.float(), rounding_mode="floor"
+        )  # [sum(N_i), num_time_features] or [B, N, num_time_features]
+        _units_elapsed = (
+            (torch.remainder(_units_since_epoch, units_per_period) / units_per_period)
+            * 2
+            * 3.14
+        )
+        _units_elapsed = torch.view_as_real(
+            torch.polar(
+                _cast_dtype(torch.ones_like(_units_elapsed), torch.float32),
+                _cast_dtype(_units_elapsed, torch.float32),
+            )
+        ).flatten(
+            -2, -1
+        )  # [sum(N_i), num_time_features * 2] or [B, N, num_time_features * 2]
+        _units_elapsed = _cast_dtype(_units_elapsed, _units_elapsed_type)
+        combined_embeddings = torch.cat([combined_embeddings, _units_elapsed], dim=-1)
+        return combined_embeddings
+
+    def forward(
+        self,
+        seq_embeddings: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        user_embeddings = self._time_feature_combiner(
+            self._concat_time_features(seq_embeddings, timestamps=seq_timestamps).to(
+                self._time_feature_combiner.weight.dtype  # pyre-fixme[6]: For 1st argument expected `dtype` but got `Union[dtype,
+                #  Tensor, Module]`.
+            )
+        )
+        return self._layer_norm(user_embeddings)
diff --git a/recommendation_v4/generative_recommenders/modules/preprocessors.py b/recommendation_v4/generative_recommenders/modules/preprocessors.py
new file mode 100644
index 000000000..dc7806bb4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/preprocessors.py
@@ -0,0 +1,334 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import abc
+from math import sqrt
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from generative_recommenders.common import (
+    fx_unwrap_optional_tensor,
+    HammerModule,
+    init_mlp_weights_optional_bias,
+    jagged_to_padded_dense,
+)
+from generative_recommenders.modules.action_encoder import ActionEncoder
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+from generative_recommenders.ops.layer_norm import LayerNorm, SwishLayerNorm
+
+
+class InputPreprocessor(HammerModule):
+    """An abstract class for pre-processing sequence embeddings before HSTU layers."""
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+    ]:
+        """
+        Args:
+            max_uih_len: int
+            max_targets: int
+            total_uih_len: int
+            total_targets: int
+            seq_lengths: (B,)
+            seq_embeddings: (L, D)
+            seq_timestamps: (B, N)
+            num_targets: (B,) Optional.
+            seq_payloads: str-keyed tensors. Implementation specific.
+
+        Returns:
+            (max_seq_len, total_uih_len, total_targets, lengths, offsets, timestamps, embeddings, num_targets, payloads) updated based on input preprocessor.
+        """
+        pass
+
+    def interleave_targets(self) -> bool:
+        return False
+
+
+def get_contextual_input_embeddings(
+    seq_lengths: torch.Tensor,
+    seq_payloads: Dict[str, torch.Tensor],
+    contextual_feature_to_max_length: Dict[str, int],
+    contextual_feature_to_min_uih_length: Dict[str, int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    padded_values: List[torch.Tensor] = []
+    for key, max_len in contextual_feature_to_max_length.items():
+        v = torch.flatten(
+            jagged_to_padded_dense(
+                values=seq_payloads[key].to(dtype),
+                offsets=[seq_payloads[key + "_offsets"]],
+                max_lengths=[max_len],
+                padding_value=0.0,
+            ),
+            1,
+            2,
+        )
+        min_uih_length = contextual_feature_to_min_uih_length.get(key, 0)
+        if min_uih_length > 0:
+            v = v * (seq_lengths.view(-1, 1) >= min_uih_length)
+        padded_values.append(v)
+    return torch.cat(padded_values, dim=1)
+
+
+class ContextualPreprocessor(InputPreprocessor):
+    def __init__(
+        self,
+        input_embedding_dim: int,
+        hidden_dim: int,
+        output_embedding_dim: int,
+        contextual_feature_to_max_length: Dict[str, int],
+        contextual_feature_to_min_uih_length: Dict[str, int],
+        action_embedding_dim: int = 8,
+        action_feature_name: str = "",
+        action_weights: Optional[List[int]] = None,
+        additional_embedding_features: List[str] = [],
+        action_embedding_init_std: float = 0.1,
+        is_inference: bool = True,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._output_embedding_dim: int = output_embedding_dim
+        self._input_embedding_dim: int = input_embedding_dim
+        self._hidden_dim: int = hidden_dim
+        self._contextual_feature_to_max_length: Dict[str, int] = (
+            contextual_feature_to_max_length
+        )
+        self._max_contextual_seq_len: int = sum(
+            contextual_feature_to_max_length.values()
+        )
+        self._contextual_feature_to_min_uih_length: Dict[str, int] = (
+            contextual_feature_to_min_uih_length
+        )
+        if self._max_contextual_seq_len > 0:
+            std = 1.0 * sqrt(
+                2.0 / float(input_embedding_dim + self._output_embedding_dim)
+            )
+            self._batched_contextual_linear_weights: torch.nn.Parameter = (
+                torch.nn.Parameter(
+                    torch.empty(
+                        (
+                            self._max_contextual_seq_len,
+                            input_embedding_dim,
+                            self._output_embedding_dim,
+                        )
+                    ).normal_(0.0, std)
+                )
+            )
+            self._batched_contextual_linear_bias: torch.nn.Parameter = (
+                torch.nn.Parameter(
+                    torch.empty(
+                        (self._max_contextual_seq_len, self._output_embedding_dim)
+                    ).fill_(0.0)
+                )
+            )
+        self._content_embedding_mlp: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=self._input_embedding_dim,
+                out_features=self._hidden_dim,
+            ),
+            SwishLayerNorm(self._hidden_dim),
+            torch.nn.Linear(
+                in_features=self._hidden_dim,
+                out_features=self._output_embedding_dim,
+            ),
+            LayerNorm(self._output_embedding_dim),
+        ).apply(init_mlp_weights_optional_bias)
+        self._additional_embedding_features: List[str] = additional_embedding_features
+        self._additional_embedding_mlp: torch.nn.Module = torch.nn.Sequential(
+            torch.nn.Linear(
+                in_features=self._input_embedding_dim
+                * len(additional_embedding_features),
+                out_features=self._hidden_dim,
+            ),
+            SwishLayerNorm(self._hidden_dim),
+            torch.nn.Linear(
+                in_features=self._hidden_dim,
+                out_features=self._output_embedding_dim,
+            ),
+            LayerNorm(self._output_embedding_dim),
+        ).apply(init_mlp_weights_optional_bias)
+        self._action_feature_name: str = action_feature_name
+        self._action_weights: Optional[List[int]] = action_weights
+        if self._action_weights is not None:
+            self._action_encoder: ActionEncoder = ActionEncoder(
+                action_feature_name=action_feature_name,
+                action_weights=self._action_weights,
+                action_embedding_dim=action_embedding_dim,
+                embedding_init_std=action_embedding_init_std,
+                is_inference=is_inference,
+            )
+            self._action_embedding_mlp: torch.nn.Module = torch.nn.Sequential(
+                torch.nn.Linear(
+                    in_features=self._action_encoder.output_embedding_dim,
+                    out_features=self._hidden_dim,
+                ),
+                SwishLayerNorm(self._hidden_dim),
+                torch.nn.Linear(
+                    in_features=self._hidden_dim,
+                    out_features=self._output_embedding_dim,
+                ),
+                LayerNorm(self._output_embedding_dim),
+            ).apply(init_mlp_weights_optional_bias)
+
+    def forward(  # noqa C901
+        self,
+        max_uih_len: int,
+        max_targets: int,
+        total_uih_len: int,
+        total_targets: int,
+        seq_lengths: torch.Tensor,
+        seq_timestamps: torch.Tensor,
+        seq_embeddings: torch.Tensor,
+        num_targets: torch.Tensor,
+        seq_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[
+        int,
+        int,
+        int,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        Dict[str, torch.Tensor],
+    ]:
+        output_seq_embeddings = self._content_embedding_mlp(seq_embeddings)
+        if len(self._additional_embedding_features) > 0:
+            additional_embeddings = torch.cat(
+                [
+                    seq_payloads[feature]
+                    for feature in self._additional_embedding_features
+                ],
+                dim=1,
+            )
+            output_seq_embeddings = (
+                output_seq_embeddings
+                + self._additional_embedding_mlp(additional_embeddings)
+            )
+        max_seq_len = max_uih_len + max_targets
+        target_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(num_targets)
+        seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(seq_lengths)
+        uih_offsets = seq_offsets - target_offsets
+        if self._action_weights is not None:
+            action_embeddings = self._action_encoder(
+                max_uih_len=max_uih_len,
+                max_targets=max_targets,
+                uih_offsets=uih_offsets,
+                target_offsets=target_offsets,
+                seq_embeddings=seq_embeddings,
+                seq_payloads=seq_payloads,
+            )
+            output_seq_embeddings = output_seq_embeddings + self._action_embedding_mlp(
+                action_embeddings
+            )
+
+        output_max_seq_len = max_seq_len
+        output_total_uih_len = total_uih_len
+        output_total_targets = total_targets
+        output_seq_lengths = seq_lengths
+        output_num_targets = num_targets
+        output_seq_timestamps = seq_timestamps
+        output_seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            output_seq_lengths
+        )
+        # concat contextual embeddings
+        if self._max_contextual_seq_len > 0:
+            contextual_input_embeddings = get_contextual_input_embeddings(
+                seq_lengths=seq_lengths,
+                seq_payloads=seq_payloads,
+                contextual_feature_to_max_length=self._contextual_feature_to_max_length,
+                contextual_feature_to_min_uih_length=self._contextual_feature_to_min_uih_length,
+                dtype=seq_embeddings.dtype,
+            )
+            contextual_embeddings = torch.baddbmm(
+                self._batched_contextual_linear_bias.view(
+                    -1, 1, self._output_embedding_dim
+                ).to(contextual_input_embeddings.dtype),
+                contextual_input_embeddings.view(
+                    -1, self._max_contextual_seq_len, self._input_embedding_dim
+                ).transpose(0, 1),
+                self._batched_contextual_linear_weights.to(
+                    contextual_input_embeddings.dtype
+                ),
+            ).transpose(0, 1)
+            output_seq_embeddings = concat_2D_jagged(
+                max_seq_len=self._max_contextual_seq_len + output_max_seq_len,
+                values_left=fx_unwrap_optional_tensor(contextual_embeddings).reshape(
+                    -1, self._output_embedding_dim
+                ),
+                values_right=output_seq_embeddings,
+                max_len_left=self._max_contextual_seq_len,
+                max_len_right=output_max_seq_len,
+                offsets_left=None,
+                offsets_right=output_seq_offsets,
+                kernel=self.hammer_kernel(),
+            )
+            output_seq_timestamps = concat_2D_jagged(
+                max_seq_len=self._max_contextual_seq_len + output_max_seq_len,
+                values_left=torch.zeros(
+                    (output_seq_lengths.size(0) * self._max_contextual_seq_len, 1),
+                    dtype=output_seq_timestamps.dtype,
+                    device=output_seq_timestamps.device,
+                ),
+                values_right=output_seq_timestamps.unsqueeze(-1),
+                max_len_left=self._max_contextual_seq_len,
+                max_len_right=output_max_seq_len,
+                offsets_left=None,
+                offsets_right=output_seq_offsets,
+                kernel=self.hammer_kernel(),
+            ).squeeze(-1)
+            output_max_seq_len = output_max_seq_len + self._max_contextual_seq_len
+            output_seq_lengths = output_seq_lengths + self._max_contextual_seq_len
+            output_seq_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+                output_seq_lengths
+            )
+            output_total_uih_len = (
+                output_total_uih_len
+                + self._max_contextual_seq_len * output_seq_lengths.size(0)
+            )
+
+        return (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+            seq_payloads,
+        )
diff --git a/recommendation_v4/generative_recommenders/modules/stu.py b/recommendation_v4/generative_recommenders/modules/stu.py
new file mode 100644
index 000000000..45c6ea5f3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/stu.py
@@ -0,0 +1,471 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+import abc
+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_unwrap_optional_tensor, HammerModule
+from generative_recommenders.ops.hstu_attention import delta_hstu_mha
+from generative_recommenders.ops.hstu_compute import (
+    hstu_compute_output,
+    hstu_compute_uqvk,
+    hstu_preprocess_and_attention,
+)
+from generative_recommenders.ops.jagged_tensors import concat_2D_jagged, split_2D_jagged
+from torch.autograd.profiler import record_function
+
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+class STU(HammerModule, abc.ABC):
+    def cached_forward(
+        self,
+        delta_x: torch.Tensor,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        pass
+
+
+@dataclass
+class STULayerConfig:
+    embedding_dim: int
+    num_heads: int
+    hidden_dim: int
+    attention_dim: int
+    output_dropout_ratio: float = 0.3
+    causal: bool = True
+    target_aware: bool = True
+    max_attn_len: Optional[int] = None
+    attn_alpha: Optional[float] = None
+    use_group_norm: bool = False
+    recompute_normed_x: bool = True
+    recompute_uvqk: bool = True
+    recompute_y: bool = True
+    sort_by_length: bool = True
+    contextual_seq_len: int = 0
+
+
+@torch.fx.wrap
+def _update_kv_cache(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    k: Optional[torch.Tensor],
+    v: Optional[torch.Tensor],
+    max_kv_caching_len: int,
+    kv_caching_lengths: Optional[torch.Tensor],
+    orig_k_cache: Optional[torch.Tensor],
+    orig_v_cache: Optional[torch.Tensor],
+    orig_max_kv_caching_len: int,
+    orig_kv_caching_offsets: Optional[torch.Tensor],
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], int, Optional[torch.Tensor]]:
+    if kv_caching_lengths is not None:
+        kv_caching_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            kv_caching_lengths
+        )
+        delta_offsets = seq_offsets - kv_caching_offsets
+        k_cache, _ = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=fx_unwrap_optional_tensor(k).flatten(1, 2),
+            max_len_left=None,
+            max_len_right=None,
+            offsets_left=kv_caching_offsets,
+            offsets_right=delta_offsets,
+        )
+        v_cache, _ = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=fx_unwrap_optional_tensor(v).flatten(1, 2),
+            max_len_left=None,
+            max_len_right=None,
+            offsets_left=kv_caching_offsets,
+            offsets_right=delta_offsets,
+        )
+        if max_kv_caching_len == 0:
+            max_kv_caching_len = int(kv_caching_lengths.max().item())
+        return (
+            k_cache,
+            v_cache,
+            max_kv_caching_len,
+            kv_caching_offsets,
+        )
+    else:
+        return (
+            orig_k_cache,
+            orig_v_cache,
+            orig_max_kv_caching_len,
+            orig_kv_caching_offsets,
+        )
+
+
+@torch.fx.wrap
+def _construct_full_kv(
+    delta_k: torch.Tensor,
+    delta_v: torch.Tensor,
+    k_cache: torch.Tensor,
+    v_cache: torch.Tensor,
+    max_kv_caching_len: int,
+    kv_caching_offsets: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor]:
+    L, _ = delta_k.shape
+    B = kv_caching_offsets.shape[0] - 1
+    delta_size = L // B
+    full_k = concat_2D_jagged(
+        max_seq_len=max_kv_caching_len + delta_size,
+        values_left=k_cache,
+        values_right=delta_k,
+        max_len_left=max_kv_caching_len,
+        max_len_right=delta_size,
+        offsets_left=kv_caching_offsets,
+        offsets_right=None,
+    )
+    full_v = concat_2D_jagged(
+        max_seq_len=max_kv_caching_len + delta_size,
+        values_left=v_cache,
+        values_right=delta_v,
+        max_len_left=max_kv_caching_len,
+        max_len_right=delta_size,
+        offsets_left=kv_caching_offsets,
+        offsets_right=None,
+    )
+    full_kv_caching_offsets = kv_caching_offsets + delta_size * torch.arange(
+        B + 1, device=delta_k.device
+    )
+    return (
+        full_k,
+        full_v,
+        max_kv_caching_len + delta_size,
+        full_kv_caching_offsets,
+    )
+
+
+class STULayer(STU):
+    max_kv_caching_len: int
+    k_cache: Optional[torch.Tensor]
+    v_cache: Optional[torch.Tensor]
+    kv_caching_offsets: Optional[torch.Tensor]
+
+    def __init__(
+        self,
+        config: STULayerConfig,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(
+            is_inference=is_inference,
+        )
+        self.reset_kv_cache()
+        self._num_heads: int = config.num_heads
+        self._embedding_dim: int = config.embedding_dim
+        self._hidden_dim: int = config.hidden_dim
+        self._attention_dim: int = config.attention_dim
+        self._output_dropout_ratio: float = config.output_dropout_ratio
+        self._target_aware: bool = config.target_aware
+        self._causal: bool = config.causal
+        self._max_attn_len: int = config.max_attn_len or 0
+        self._attn_alpha: float = config.attn_alpha or 1.0 / (self._attention_dim**0.5)
+        self._use_group_norm: bool = config.use_group_norm
+        self._recompute_normed_x: bool = config.recompute_normed_x
+        self._recompute_uvqk: bool = config.recompute_uvqk
+        self._recompute_y: bool = config.recompute_y
+        self._sort_by_length: bool = config.sort_by_length
+        self._contextual_seq_len: int = config.contextual_seq_len
+
+        self._uvqk_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.empty(
+                (
+                    self._embedding_dim,
+                    (self._hidden_dim * 2 + self._attention_dim * 2) * self._num_heads,
+                )
+            ),
+        )
+        torch.nn.init.xavier_uniform_(self._uvqk_weight)
+        self._uvqk_beta: torch.nn.Parameter = torch.nn.Parameter(
+            torch.zeros(
+                (self._hidden_dim * 2 + self._attention_dim * 2) * self._num_heads,
+            ),
+        )
+        self._input_norm_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.ones((self._embedding_dim,)),
+        )
+        self._input_norm_bias: torch.nn.Parameter = torch.nn.Parameter(
+            torch.zeros((self._embedding_dim,)),
+        )
+        self._output_weight = torch.nn.Parameter(
+            torch.empty(
+                (
+                    self._hidden_dim * self._num_heads * 3,
+                    self._embedding_dim,
+                )
+            ),
+        )
+        torch.nn.init.xavier_uniform_(self._output_weight)
+        output_norm_shape: int = (
+            self._hidden_dim * self._num_heads
+            if not self._use_group_norm
+            else self._num_heads
+        )
+        self._output_norm_weight: torch.nn.Parameter = torch.nn.Parameter(
+            torch.ones((output_norm_shape,)),
+        )
+        self._output_norm_bias: torch.nn.Parameter = torch.nn.Parameter(
+            torch.zeros((output_norm_shape,)),
+        )
+
+    def reset_kv_cache(self) -> None:
+        self.k_cache = None
+        self.v_cache = None
+        self.kv_caching_offsets = None
+        self.max_kv_caching_len = 0
+
+    def update_kv_cache(
+        self,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        k: Optional[torch.Tensor],
+        v: Optional[torch.Tensor],
+        max_kv_caching_len: int,
+        kv_caching_lengths: Optional[torch.Tensor],
+    ) -> None:
+        self.k_cache, self.v_cache, self.max_kv_caching_len, self.kv_caching_offsets = (
+            _update_kv_cache(
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                k=k,
+                v=v,
+                max_kv_caching_len=max_kv_caching_len,
+                kv_caching_lengths=kv_caching_lengths,
+                orig_k_cache=self.k_cache,
+                orig_v_cache=self.v_cache,
+                orig_max_kv_caching_len=self.max_kv_caching_len,
+                orig_kv_caching_offsets=self.kv_caching_offsets,
+            )
+        )
+
+    def construct_full_kv(
+        self,
+        delta_k: torch.Tensor,
+        delta_v: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor]:
+        return _construct_full_kv(
+            delta_k=delta_k,
+            delta_v=delta_v,
+            k_cache=fx_unwrap_optional_tensor(self.k_cache),
+            v_cache=fx_unwrap_optional_tensor(self.v_cache),
+            max_kv_caching_len=self.max_kv_caching_len,
+            kv_caching_offsets=self.kv_caching_offsets,
+        )
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        with record_function("## stu_preprocess_and_attention ##"):
+            u, attn_output, k, v = hstu_preprocess_and_attention(
+                x=x,
+                norm_weight=self._input_norm_weight.to(x.dtype),
+                norm_bias=self._input_norm_bias.to(x.dtype),
+                norm_eps=1e-6,
+                num_heads=self._num_heads,
+                attn_dim=self._attention_dim,
+                hidden_dim=self._hidden_dim,
+                uvqk_weight=self._uvqk_weight.to(x.dtype),
+                uvqk_bias=self._uvqk_beta.to(x.dtype),
+                max_seq_len=max_seq_len,
+                seq_offsets=x_offsets,
+                attn_alpha=self._attn_alpha,
+                causal=self._causal,
+                num_targets=num_targets if self._target_aware else None,
+                max_attn_len=self._max_attn_len,
+                contextual_seq_len=self._contextual_seq_len,
+                recompute_uvqk_in_backward=self._recompute_uvqk,
+                recompute_normed_x_in_backward=self._recompute_normed_x,
+                sort_by_length=self._sort_by_length,
+                prefill=kv_caching_lengths is not None,
+                kernel=self.hammer_kernel(),
+            )
+
+        self.update_kv_cache(
+            max_seq_len=max_seq_len,
+            seq_offsets=x_offsets,
+            k=k,
+            v=v,
+            max_kv_caching_len=max_kv_caching_len,
+            kv_caching_lengths=kv_caching_lengths,
+        )
+
+        with record_function("## stu_compute_output ##"):
+            return hstu_compute_output(
+                attn=attn_output,
+                u=u,
+                x=x,
+                norm_weight=self._output_norm_weight.to(x.dtype),
+                norm_bias=self._output_norm_bias.to(x.dtype),
+                norm_eps=1e-6,
+                dropout_ratio=self._output_dropout_ratio,
+                output_weight=self._output_weight.to(x.dtype),
+                group_norm=self._use_group_norm,
+                num_heads=self._num_heads,
+                linear_dim=self._hidden_dim,
+                concat_u=True,
+                concat_x=True,
+                mul_u_activation_type="none",
+                training=self.training,
+                kernel=self.hammer_kernel(),
+                recompute_y_in_backward=self._recompute_y,
+            )
+
+    def cached_forward(
+        self,
+        delta_x: torch.Tensor,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        with record_function("## stu_compute_uqvk ##"):
+            delta_u, delta_q, delta_k, delta_v = hstu_compute_uqvk(
+                x=delta_x,
+                norm_weight=self._input_norm_weight.to(delta_x.dtype),
+                norm_bias=self._input_norm_bias.to(delta_x.dtype),
+                norm_eps=1e-6,
+                num_heads=self._num_heads,
+                attn_dim=self._attention_dim,
+                hidden_dim=self._hidden_dim,
+                uvqk_weight=self._uvqk_weight.to(delta_x.dtype),
+                uvqk_bias=self._uvqk_beta.to(delta_x.dtype),
+                kernel=self.hammer_kernel(),
+            )
+        k, v, max_seq_len, seq_offsets = self.construct_full_kv(
+            delta_k=delta_k.flatten(1, 2),
+            delta_v=delta_v.flatten(1, 2),
+        )
+        self.update_kv_cache(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            k=k,
+            v=v,
+            max_kv_caching_len=max_kv_caching_len,
+            kv_caching_lengths=kv_caching_lengths,
+        )
+        k = k.view(-1, self._num_heads, self._attention_dim)
+        v = v.view(-1, self._num_heads, self._hidden_dim)
+        with record_function("## delta_hstu_mha ##"):
+            delta_attn_output = delta_hstu_mha(
+                max_seq_len=max_seq_len,
+                alpha=self._attn_alpha,
+                delta_q=delta_q,
+                k=k,
+                v=v,
+                seq_offsets=seq_offsets,
+                num_targets=num_targets if self._target_aware else None,
+                max_attn_len=self._max_attn_len,
+                contextual_seq_len=self._contextual_seq_len,
+                kernel=self.hammer_kernel(),
+            ).view(-1, self._hidden_dim * self._num_heads)
+        with record_function("## stu_compute_output ##"):
+            return hstu_compute_output(
+                attn=delta_attn_output,
+                u=delta_u,
+                x=delta_x,
+                norm_weight=self._output_norm_weight.to(delta_x.dtype),
+                norm_bias=self._output_norm_bias.to(delta_x.dtype),
+                norm_eps=1e-6,
+                dropout_ratio=self._output_dropout_ratio,
+                output_weight=self._output_weight.to(delta_x.dtype),
+                group_norm=self._use_group_norm,
+                num_heads=self._num_heads,
+                linear_dim=self._hidden_dim,
+                concat_u=True,
+                concat_x=True,
+                mul_u_activation_type="none",
+                training=self.training,
+                kernel=self.hammer_kernel(),
+                recompute_y_in_backward=self._recompute_y,
+            )
+
+
+class STUStack(STU):
+    def __init__(
+        self,
+        stu_list: List[STU],
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._stu_layers: torch.nn.ModuleList = torch.nn.ModuleList(modules=stu_list)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_lengths: torch.Tensor,
+        x_offsets: torch.Tensor,
+        max_seq_len: int,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for layer in self._stu_layers:
+            x = layer(
+                x=x,
+                x_lengths=x_lengths,
+                x_offsets=x_offsets,
+                max_seq_len=max_seq_len,
+                num_targets=num_targets,
+                max_kv_caching_len=max_kv_caching_len,
+                kv_caching_lengths=kv_caching_lengths,
+            )
+        return x
+
+    def cached_forward(
+        self,
+        delta_x: torch.Tensor,
+        num_targets: torch.Tensor,
+        max_kv_caching_len: int = 0,
+        kv_caching_lengths: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        for layer in self._stu_layers:
+            delta_x = layer.cached_forward(  # pyre-ignore [29]
+                delta_x=delta_x,
+                num_targets=num_targets,
+                max_kv_caching_len=max_kv_caching_len,
+                kv_caching_lengths=kv_caching_lengths,
+            )
+        return delta_x
diff --git a/recommendation_v4/generative_recommenders/modules/tests/action_encoder_test.py b/recommendation_v4/generative_recommenders/modules/tests/action_encoder_test.py
new file mode 100644
index 000000000..184b314ea
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/action_encoder_test.py
@@ -0,0 +1,113 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.modules.action_encoder import ActionEncoder
+
+
+class ActionEncoderTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_forward(self) -> None:
+        device = torch.device("cuda")
+        action_embedding_dim = 32
+        action_weights = [1, 2, 4, 8, 16]
+        watchtime_to_action_thresholds_and_weights = [
+            (30, 32),
+            (60, 64),
+            (100, 128),
+        ]
+        num_action_types = len(action_weights) + len(
+            watchtime_to_action_thresholds_and_weights
+        )
+        combined_action_weights = action_weights + [
+            x[1] for x in watchtime_to_action_thresholds_and_weights
+        ]
+        enabled_actions = [
+            [0],
+            [0, 1],
+            [1, 3, 4],
+            [1, 2, 3, 4],
+            [1, 2],
+            [2],
+        ]
+        watchtimes = [40, 20, 110, 31, 26, 55]
+        for i, wt in enumerate(watchtimes):
+            for j, w in enumerate(watchtime_to_action_thresholds_and_weights):
+                if wt > w[0]:
+                    enabled_actions[i].append(j + len(action_weights))
+        actions = [
+            sum([combined_action_weights[t] for t in x]) for x in enabled_actions
+        ]
+
+        encoder = ActionEncoder(
+            watchtime_feature_name="watchtimes",
+            action_feature_name="actions",
+            action_weights=action_weights,
+            watchtime_to_action_thresholds_and_weights=watchtime_to_action_thresholds_and_weights,
+            action_embedding_dim=action_embedding_dim,
+            is_inference=False,
+        ).to(device)
+
+        seq_lengths = [6, 3]
+        seq_offsets = [0, 6, 9]
+        num_targets = [2, 1]
+        uih_offsets = [0, 4, 6]
+        target_offsets = [0, 2, 3]
+        seq_embeddings = torch.rand(9, 128, device=device)
+        action_embeddings = encoder(
+            max_uih_len=4,
+            max_targets=2,
+            uih_offsets=torch.tensor(uih_offsets, device=device),
+            target_offsets=torch.tensor(target_offsets, device=device),
+            seq_embeddings=seq_embeddings,
+            seq_payloads={
+                "watchtimes": torch.tensor(watchtimes, device=device),
+                "actions": torch.tensor(actions, device=device),
+            },
+        )
+        self.assertEqual(
+            action_embeddings.shape, (9, action_embedding_dim * num_action_types)
+        )
+        for b in range(len(seq_lengths)):
+            b_start = seq_offsets[b]
+            b_end = seq_offsets[b + 1]
+            u_start = uih_offsets[b]
+            for j in range(b_start, b_end):
+                embedding = action_embeddings[j].view(num_action_types, -1)
+                for atype in range(num_action_types):
+                    if b_end - j <= num_targets[b]:
+                        torch.testing.assert_allclose(
+                            embedding[atype],
+                            encoder._target_action_embedding_table.view(
+                                num_action_types, -1
+                            )[atype],
+                        )
+                    else:
+                        if atype in enabled_actions[j - b_start + u_start]:
+                            torch.testing.assert_allclose(
+                                embedding[atype],
+                                encoder._action_embedding_table[atype],
+                            )
+                        else:
+                            torch.testing.assert_allclose(
+                                embedding[atype], torch.zeros_like(embedding[atype])
+                            )
+        action_embeddings.sum().backward()
diff --git a/recommendation_v4/generative_recommenders/modules/tests/content_encoder_test.py b/recommendation_v4/generative_recommenders/modules/tests/content_encoder_test.py
new file mode 100644
index 000000000..e67656388
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/content_encoder_test.py
@@ -0,0 +1,74 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.modules.content_encoder import ContentEncoder
+
+
+class ContentEncoderTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_forward(self) -> None:
+        device = torch.device("cuda")
+        input_embedding_dim = 32
+        additional_embedding_dim = 64
+        enrich_embedding_dim = 16
+        encoder = ContentEncoder(
+            input_embedding_dim=input_embedding_dim,
+            additional_content_features={
+                "a0": additional_embedding_dim,
+                "a1": additional_embedding_dim,
+            },
+            target_enrich_features={
+                "t0": enrich_embedding_dim,
+                "t1": enrich_embedding_dim,
+            },
+            is_inference=False,
+        ).to(device)
+        seq_lengths = [6, 3]
+        num_targets = [2, 1]
+        uih_offsets = [0, 4, 6]
+        target_offsets = [0, 2, 3]
+        seq_embeddings = torch.rand(
+            sum(seq_lengths), input_embedding_dim, device=device
+        ).requires_grad_(True)
+        seq_payloads = {
+            "a0": torch.rand(
+                sum(seq_lengths), additional_embedding_dim, device=device
+            ).requires_grad_(True),
+            "a1": torch.rand(
+                sum(seq_lengths), additional_embedding_dim, device=device
+            ).requires_grad_(True),
+            "t0": torch.rand(
+                sum(num_targets), enrich_embedding_dim, device=device
+            ).requires_grad_(True),
+            "t1": torch.rand(
+                sum(num_targets), enrich_embedding_dim, device=device
+            ).requires_grad_(True),
+        }
+        content_embeddings = encoder(
+            max_uih_len=4,
+            max_targets=2,
+            uih_offsets=torch.tensor(uih_offsets, device=device),
+            target_offsets=torch.tensor(target_offsets, device=device),
+            seq_embeddings=seq_embeddings,
+            seq_payloads=seq_payloads,
+        )
+        content_embeddings.sum().backward()
diff --git a/recommendation_v4/generative_recommenders/modules/tests/contextual_interleave_preprocessor_test.py b/recommendation_v4/generative_recommenders/modules/tests/contextual_interleave_preprocessor_test.py
new file mode 100644
index 000000000..c3202072c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/contextual_interleave_preprocessor_test.py
@@ -0,0 +1,499 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from generative_recommenders.modules.action_encoder import ActionEncoder
+from generative_recommenders.modules.content_encoder import ContentEncoder
+from generative_recommenders.modules.contextual_interleave_preprocessor import (
+    ContextualInterleavePreprocessor,
+)
+from generative_recommenders.modules.contextualize_mlps import (
+    ParameterizedContextualizedMLP,
+    SimpleContextualizedMLP,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class ContextualInterleavePreprocessorTest(unittest.TestCase):
+    # pyre-ignore
+    @given(
+        enable_interleaving=st.sampled_from([True, False]),
+        enable_pmlp=st.sampled_from([True, False]),
+        is_train=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=50, deadline=None)
+    def test_forward(
+        self,
+        enable_interleaving: bool,
+        enable_pmlp: bool,
+        is_train: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        device = torch.device("cuda")
+
+        input_embedding_dim = 64
+        output_embedding_dim = 32
+        action_embedding_dim = 16
+        action_encoder_hidden_dim = 256
+        content_encoder_hidden_dim = 128
+        contextual_len = 3
+
+        content_encoder = ContentEncoder(
+            input_embedding_dim=input_embedding_dim,
+            additional_content_features={
+                "a0": input_embedding_dim,
+                "a1": input_embedding_dim,
+            },
+            target_enrich_features={
+                "t0": input_embedding_dim,
+                "t1": input_embedding_dim,
+            },
+            is_inference=False,
+        ).to(device)
+        action_embedding_dim = 32
+        action_weights = [1, 2, 4, 8, 16]
+        watchtime_to_action_thresholds_and_weights = [
+            (30, 32),
+            (60, 64),
+            (100, 128),
+        ]
+        action_encoder = ActionEncoder(
+            watchtime_feature_name="watchtimes",
+            action_feature_name="actions",
+            action_weights=action_weights,
+            watchtime_to_action_thresholds_and_weights=watchtime_to_action_thresholds_and_weights,
+            action_embedding_dim=action_embedding_dim,
+            is_inference=False,
+        ).to(device)
+
+        preprocessor = ContextualInterleavePreprocessor(
+            input_embedding_dim=input_embedding_dim,
+            output_embedding_dim=output_embedding_dim,
+            contextual_feature_to_max_length={"c_0": 1, "c_1": 2},
+            contextual_feature_to_min_uih_length={"c_1": 4},
+            pmlp_contextual_dropout_ratio=0.2,
+            content_encoder=content_encoder,
+            content_contextualize_mlp_fn=lambda in_dim,
+            out_dim,
+            contextual_dim,
+            is_inference: ParameterizedContextualizedMLP(
+                contextual_embedding_dim=contextual_dim,
+                sequential_input_dim=in_dim,
+                sequential_output_dim=out_dim,
+                hidden_dim=content_encoder_hidden_dim,
+                is_inference=is_inference,
+            )
+            if enable_pmlp
+            else SimpleContextualizedMLP(
+                sequential_input_dim=in_dim,
+                sequential_output_dim=out_dim,
+                hidden_dim=content_encoder_hidden_dim,
+                is_inference=is_inference,
+            ),
+            action_encoder=action_encoder,
+            action_contextualize_mlp_fn=lambda in_dim,
+            out_dim,
+            contextual_dim,
+            is_inference: ParameterizedContextualizedMLP(
+                contextual_embedding_dim=contextual_dim,
+                sequential_input_dim=in_dim,
+                sequential_output_dim=out_dim,
+                hidden_dim=action_encoder_hidden_dim,
+                is_inference=is_inference,
+            )
+            if enable_pmlp
+            else SimpleContextualizedMLP(
+                sequential_input_dim=in_dim,
+                sequential_output_dim=out_dim,
+                hidden_dim=action_encoder_hidden_dim,
+                is_inference=is_inference,
+            ),
+            enable_interleaving=enable_interleaving,
+            is_inference=False,
+        ).to(device)
+        preprocessor.set_training_dtype(dtype)
+        if not is_train:
+            preprocessor.eval()
+
+        # inputs
+        seq_lengths = [6, 3]
+        num_targets = [2, 1]
+        seq_embeddings = torch.rand(
+            (sum(seq_lengths), input_embedding_dim),
+            device=device,
+            dtype=dtype,
+        )
+        seq_timestamps = torch.tensor(
+            [1, 2, 3, 4, 5, 6, 10, 20, 30],
+            device=device,
+        )
+        watchtimes = [40, 20, 110, 31, 26, 55]
+        actions = [1, 3, 26, 30, 6, 4]
+        (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+            _,
+        ) = preprocessor(
+            max_uih_len=4,
+            max_targets=2,
+            total_uih_len=sum(seq_lengths) - sum(num_targets),
+            total_targets=sum(num_targets),
+            seq_lengths=torch.tensor(seq_lengths, device=device),
+            seq_timestamps=seq_timestamps,
+            seq_embeddings=seq_embeddings,
+            seq_payloads={
+                # contextual
+                "c_0": torch.rand((2, input_embedding_dim), device=device, dtype=dtype),
+                "c_0_offsets": torch.tensor([0, 1, 1], device=device),
+                "c_1": torch.rand((4, input_embedding_dim), device=device, dtype=dtype),
+                "c_1_offsets": torch.tensor([0, 2, 3], device=device),
+                # action
+                "watchtimes": torch.tensor(watchtimes, device=device),
+                "actions": torch.tensor(actions, device=device),
+                # content
+                "a0": torch.rand_like(seq_embeddings).requires_grad_(True),
+                "a1": torch.rand_like(seq_embeddings).requires_grad_(True),
+                "t0": torch.rand(
+                    sum(num_targets), input_embedding_dim, device=device, dtype=dtype
+                ).requires_grad_(True),
+                "t1": torch.rand(
+                    sum(num_targets), input_embedding_dim, device=device, dtype=dtype
+                ).requires_grad_(True),
+            },
+            num_targets=torch.tensor(num_targets, device=device),
+        )
+        if enable_interleaving:
+            if is_train:
+                expected_output_seq_lengths = [
+                    2 * s + contextual_len for s in seq_lengths
+                ]
+                expected_max_seq_len = max(expected_output_seq_lengths)
+                expected_output_num_targets = [2 * s for s in num_targets]
+                expected_seq_embedding_size = (
+                    sum(expected_output_seq_lengths),
+                    output_embedding_dim,
+                )
+                expected_seq_timestamps_size = (sum(expected_output_seq_lengths),)
+                expected_output_seq_timestamps = [
+                    0,
+                    0,
+                    0,
+                    1,
+                    1,
+                    2,
+                    2,
+                    3,
+                    3,
+                    4,
+                    4,
+                    5,
+                    5,
+                    6,
+                    6,
+                    0,
+                    0,
+                    0,
+                    10,
+                    10,
+                    20,
+                    20,
+                    30,
+                    30,
+                ]
+            else:
+                expected_output_seq_lengths = [
+                    2 * s - n + contextual_len for s, n in zip(seq_lengths, num_targets)
+                ]
+                expected_max_seq_len = max(expected_output_seq_lengths)
+                expected_output_num_targets = num_targets
+                expected_seq_embedding_size = (
+                    sum(expected_output_seq_lengths),
+                    output_embedding_dim,
+                )
+                expected_seq_timestamps_size = (sum(expected_output_seq_lengths),)
+                expected_output_seq_timestamps = [
+                    0,
+                    0,
+                    0,
+                    1,
+                    1,
+                    2,
+                    2,
+                    3,
+                    3,
+                    4,
+                    4,
+                    5,
+                    6,
+                    0,
+                    0,
+                    0,
+                    10,
+                    10,
+                    20,
+                    20,
+                    30,
+                ]
+        else:
+            expected_output_seq_lengths = [s + contextual_len for s in seq_lengths]
+            expected_max_seq_len = max(expected_output_seq_lengths)
+            expected_output_num_targets = num_targets
+            expected_seq_embedding_size = (
+                sum(expected_output_seq_lengths),
+                output_embedding_dim,
+            )
+            expected_seq_timestamps_size = (sum(expected_output_seq_lengths),)
+            expected_output_seq_timestamps = [
+                0,
+                0,
+                0,
+                1,
+                2,
+                3,
+                4,
+                5,
+                6,
+                0,
+                0,
+                0,
+                10,
+                20,
+                30,
+            ]
+
+        self.assertEqual(output_max_seq_len, expected_max_seq_len)
+        self.assertEqual(output_seq_lengths.tolist(), expected_output_seq_lengths)
+        torch.testing.assert_allclose(
+            torch.ops.fbgemm.asynchronous_complete_cumsum(output_seq_lengths),
+            output_seq_offsets,
+        )
+        self.assertEqual(output_num_targets.tolist(), expected_output_num_targets)
+        self.assertEqual(
+            output_seq_embeddings.size(),
+            expected_seq_embedding_size,
+        )
+        self.assertEqual(
+            output_seq_timestamps.size(),
+            expected_seq_timestamps_size,
+        )
+        self.assertEqual(
+            output_seq_timestamps.tolist(),
+            expected_output_seq_timestamps,
+        )
+
+        # test combine embeddings
+        batch_size = 10
+        max_uih_len = 100
+        max_targets = 20
+        max_seq_len = max_uih_len + max_targets
+        seq_lengths = torch.randint(0, max_uih_len, (batch_size,), device=device)
+        total_uih_len = int(seq_lengths.sum().item())
+        num_targets = torch.randint(1, max_targets, (batch_size,), device=device)
+        total_targets = int(num_targets.sum().item())
+        seq_lengths = seq_lengths + num_targets
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(seq_lengths, dim=0)
+        total_seq_len = int(torch.sum(seq_lengths).item())
+        seq_timestamps = torch.randint(0, 1000000, (total_seq_len,), device=device)
+        content_embeddings = torch.rand(
+            (total_seq_len, output_embedding_dim),
+            device=device,
+        ).requires_grad_(True)
+        action_embeddings = torch.rand(
+            (total_seq_len, output_embedding_dim),
+            device=device,
+        ).requires_grad_(True)
+        contextual_embeddings = torch.rand(
+            (total_seq_len, 3 * output_embedding_dim), device=device
+        ).requires_grad_(True)
+        (
+            output_max_seq_len,
+            output_total_uih_len,
+            output_total_targets,
+            output_seq_lengths,
+            output_seq_offsets,
+            output_seq_timestamps,
+            output_seq_embeddings,
+            output_num_targets,
+        ) = preprocessor.combine_embeddings(
+            max_uih_len=max_uih_len,
+            max_targets=max_targets,
+            total_uih_len=total_uih_len,
+            total_targets=total_targets,
+            seq_lengths=seq_lengths,
+            seq_timestamps=seq_timestamps,
+            content_embeddings=content_embeddings,
+            action_embeddings=action_embeddings,
+            contextual_embeddings=contextual_embeddings,
+            num_targets=num_targets,
+        )
+        seq_embeddings = action_embeddings + content_embeddings
+        if enable_interleaving:
+            if is_train:
+                self.assertEqual(
+                    output_max_seq_len,
+                    max_seq_len * 2 + contextual_len,
+                )
+                self.assertEqual(
+                    output_total_uih_len,
+                    total_uih_len * 2 + contextual_len * batch_size,
+                )
+                self.assertEqual(
+                    output_total_targets,
+                    total_targets * 2,
+                )
+                torch.testing.assert_allclose(
+                    output_seq_lengths, seq_lengths * 2 + contextual_len
+                )
+                torch.testing.assert_allclose(output_num_targets, num_targets * 2)
+            else:
+                self.assertEqual(
+                    output_max_seq_len,
+                    max_uih_len * 2 + max_targets + contextual_len,
+                )
+                self.assertEqual(
+                    output_total_uih_len,
+                    total_uih_len * 2 + contextual_len * batch_size,
+                )
+                self.assertEqual(
+                    output_total_targets,
+                    total_targets,
+                )
+                torch.testing.assert_allclose(
+                    output_seq_lengths, seq_lengths * 2 - num_targets + contextual_len
+                )
+                torch.testing.assert_allclose(output_num_targets, num_targets)
+        else:
+            self.assertEqual(
+                output_max_seq_len,
+                max_seq_len + contextual_len,
+            )
+            self.assertEqual(
+                output_total_uih_len,
+                total_uih_len + contextual_len * batch_size,
+            )
+            self.assertEqual(
+                output_total_targets,
+                total_targets,
+            )
+            torch.testing.assert_allclose(
+                output_seq_lengths, seq_lengths + contextual_len
+            )
+            torch.testing.assert_allclose(output_num_targets, num_targets)
+        for b in range(batch_size):
+            input_start = int(seq_offsets[b].item())
+            input_end = int(seq_offsets[b + 1].item())
+            output_start = int(output_seq_offsets[b].item())
+            output_end = int(output_seq_offsets[b + 1].item())
+            input_targets = int(num_targets[b].item())
+            output_targets = int(output_num_targets[b].item())
+            torch.testing.assert_allclose(
+                output_seq_timestamps[output_start : output_start + contextual_len],
+                torch.zeros(3, device=device),
+            )
+            torch.testing.assert_allclose(
+                output_seq_embeddings[
+                    output_start : output_start + contextual_len
+                ].view(-1),
+                contextual_embeddings[b],
+            )
+            if enable_interleaving:
+                torch.testing.assert_allclose(
+                    output_seq_timestamps[
+                        output_start + contextual_len : output_end - output_targets
+                    ].view(-1, 2)[:, 0],
+                    seq_timestamps[input_start : input_end - input_targets],
+                )
+                torch.testing.assert_allclose(
+                    output_seq_timestamps[
+                        output_start + contextual_len : output_end - output_targets
+                    ].view(-1, 2)[:, 1],
+                    seq_timestamps[input_start : input_end - input_targets],
+                )
+                torch.testing.assert_allclose(
+                    output_seq_embeddings[
+                        output_start + contextual_len : output_end - output_targets
+                    ].view(-1, 2, output_embedding_dim)[:, 0, :],
+                    content_embeddings[input_start : input_end - input_targets],
+                )
+                torch.testing.assert_allclose(
+                    output_seq_embeddings[
+                        output_start + contextual_len : output_end - output_targets
+                    ].view(-1, 2, output_embedding_dim)[:, 1, :],
+                    action_embeddings[input_start : input_end - input_targets],
+                )
+                if is_train:
+                    torch.testing.assert_allclose(
+                        output_seq_timestamps[
+                            output_end - output_targets : output_end
+                        ].view(-1, 2)[:, 0],
+                        seq_timestamps[input_end - input_targets : input_end],
+                    )
+                    torch.testing.assert_allclose(
+                        output_seq_timestamps[
+                            output_end - output_targets : output_end
+                        ].view(-1, 2)[:, 1],
+                        seq_timestamps[input_end - input_targets : input_end],
+                    )
+                    torch.testing.assert_allclose(
+                        output_seq_embeddings[
+                            output_end - output_targets : output_end
+                        ].view(-1, 2, output_embedding_dim)[:, 0, :],
+                        content_embeddings[input_end - input_targets : input_end],
+                    )
+                    torch.testing.assert_allclose(
+                        output_seq_embeddings[
+                            output_end - output_targets : output_end
+                        ].view(-1, 2, output_embedding_dim)[:, 1, :],
+                        action_embeddings[input_end - input_targets : input_end],
+                    )
+                else:
+                    torch.testing.assert_allclose(
+                        output_seq_timestamps[output_end - output_targets : output_end],
+                        seq_timestamps[input_end - input_targets : input_end],
+                    )
+                    torch.testing.assert_allclose(
+                        output_seq_embeddings[output_end - output_targets : output_end],
+                        content_embeddings[input_end - input_targets : input_end],
+                    )
+            else:
+                torch.testing.assert_allclose(
+                    output_seq_timestamps[output_start + contextual_len : output_end],
+                    seq_timestamps[input_start:input_end],
+                )
+                torch.testing.assert_allclose(
+                    output_seq_embeddings[output_start + contextual_len : output_end],
+                    seq_embeddings[input_start:input_end],
+                )
diff --git a/recommendation_v4/generative_recommenders/modules/tests/dynamic_stu_test.py b/recommendation_v4/generative_recommenders/modules/tests/dynamic_stu_test.py
new file mode 100644
index 000000000..c1c598f1f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/dynamic_stu_test.py
@@ -0,0 +1,279 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import copy
+import unittest
+from typing import List
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel, set_dev_mode
+from generative_recommenders.modules.dynamic_stu import L2STU, SDSTU
+from generative_recommenders.modules.stu import STU, STULayer, STULayerConfig, STUStack
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class DynamicStuTest(unittest.TestCase):
+    # pyre-ignore
+    @given(
+        causal=st.sampled_from([True]),
+        num_layers=st.sampled_from([2]),
+        num_heads=st.sampled_from([2]),
+        max_uih_len=st.sampled_from([300]),
+        batch_size=st.sampled_from([8]),
+        embedding_dim=st.sampled_from([16]),
+        attention_dim=st.sampled_from([32]),
+        linear_hidden_dim=st.sampled_from([64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
+    def test_l2_stu(
+        self,
+        causal: bool,
+        num_layers: int,
+        num_heads: int,
+        max_uih_len: int,
+        batch_size: int,
+        embedding_dim: int,
+        attention_dim: int,
+        linear_hidden_dim: int,
+        has_multiple_targets: bool,
+        contextual_seq_len: int,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+        l3_stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ]
+        l3_stu: List[STU] = [
+            L2STU(
+                stu=STUStack(
+                    stu_list=l3_stu_layers,
+                    is_inference=False,
+                ),
+                max_l2_len=100,
+                contextual_seq_len=contextual_seq_len,
+                is_inference=False,
+            )
+        ]
+        l2_stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ] + l3_stu
+        l2_stu: List[STU] = [
+            L2STU(
+                stu=STUStack(
+                    stu_list=l2_stu_layers,
+                    is_inference=False,
+                ),
+                max_l2_len=200,
+                contextual_seq_len=contextual_seq_len,
+                is_inference=False,
+            )
+        ]
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ] + l2_stu
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=False,
+        ).to(device)
+        stu.recursive_setattr("_hammer_kernel", HammerKernel.TRITON)
+
+        x_lengths = torch.randint(max_uih_len + 1, (batch_size,), device=device)
+        x_lengths = x_lengths + contextual_seq_len
+        max_seq_len = max_uih_len + contextual_seq_len
+        max_targets = 20
+        num_targets = torch.randint(1, max_targets, size=(batch_size,), device=device)
+        if has_multiple_targets:
+            x_lengths = x_lengths + num_targets
+            max_seq_len = max_seq_len + max_targets
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu().item())
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+            dtype=dtype,
+        ).requires_grad_(True)
+        stu_output = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        dout = torch.randn_like(stu_output)
+        stu_output.backward(dout)
+        self.assertTrue(stu_output.shape == x.shape)
+
+    # pyre-ignore
+    @given(
+        causal=st.sampled_from([True]),
+        num_layers=st.sampled_from([2]),
+        num_heads=st.sampled_from([2]),
+        max_uih_len=st.sampled_from([300]),
+        batch_size=st.sampled_from([8]),
+        embedding_dim=st.sampled_from([16]),
+        attention_dim=st.sampled_from([32]),
+        linear_hidden_dim=st.sampled_from([64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+        dropout_ratio=st.sampled_from([0.0, 0.3, 1.0]),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
+    def test_sd_stu(
+        self,
+        causal: bool,
+        num_layers: int,
+        num_heads: int,
+        max_uih_len: int,
+        batch_size: int,
+        embedding_dim: int,
+        attention_dim: int,
+        linear_hidden_dim: int,
+        has_multiple_targets: bool,
+        contextual_seq_len: int,
+        dropout_ratio: float,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ]
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=False,
+        ).to(device)
+        sd_stu = SDSTU(
+            stu=copy.deepcopy(stu),
+            dropout_ratio=dropout_ratio,
+            is_inference=False,
+        ).to(device)
+        stu.recursive_setattr("_hammer_kernel", HammerKernel.PYTORCH)
+        sd_stu.recursive_setattr("_hammer_kernel", HammerKernel.PYTORCH)
+        x_lengths = torch.randint(max_uih_len + 1, (batch_size,), device=device)
+        x_lengths = x_lengths + contextual_seq_len
+        max_seq_len = max_uih_len + contextual_seq_len
+        max_targets = 20
+        num_targets = torch.randint(1, max_targets, size=(batch_size,), device=device)
+        if has_multiple_targets:
+            x_lengths = x_lengths + num_targets
+            max_seq_len = max_seq_len + max_targets
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu().item())
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+            dtype=dtype,
+        ).requires_grad_(True)
+        stu_output = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        dout = torch.randn_like(stu_output)
+        stu_output.backward(dout)
+        assert x.grad is not None
+        d_x, x.grad = x.grad.detach().clone(), None
+        x = x.detach().clone().requires_grad_(True)
+        sd_stu_output = sd_stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        dout = dout.detach().clone()
+        sd_stu_output.backward(dout)
+        d_sd_x = x.grad.detach().clone()
+
+        self.assertTrue(sd_stu_output.shape == x.shape)
+        if dropout_ratio == 0.0:
+            torch.testing.assert_close(stu_output, sd_stu_output)
+            torch.testing.assert_close(d_x, d_sd_x)
+        if dropout_ratio == 1.0:
+            torch.testing.assert_close(x, sd_stu_output)
diff --git a/recommendation_v4/generative_recommenders/modules/tests/multitask_module_test.py b/recommendation_v4/generative_recommenders/modules/tests/multitask_module_test.py
new file mode 100644
index 000000000..66f2db185
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/multitask_module_test.py
@@ -0,0 +1,233 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+from typing import Dict, List, Tuple
+
+import torch
+from generative_recommenders.common import gpu_unavailable, set_dev_mode
+from generative_recommenders.modules.multitask_module import (
+    DefaultMultitaskModule,
+    MultitaskTaskType,
+    TaskConfig,
+)
+from generative_recommenders.ops.layer_norm import SwishLayerNorm
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+_task_configs: List[List[TaskConfig]] = [
+    [
+        TaskConfig(
+            task_name="is_click",
+            task_weight=1,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="vvp",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="is_click",
+            task_weight=1,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_like",
+            task_weight=2,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_follow",
+            task_weight=4,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="rating",
+            task_weight=1,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+        TaskConfig(
+            task_name="vvp",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="type_1",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="is_click",
+            task_weight=1,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_like",
+            task_weight=2,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_follow",
+            task_weight=4,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="rating",
+            task_weight=1,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+        TaskConfig(
+            task_name="vvp",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+    [
+        TaskConfig(
+            task_name="is_click",
+            task_weight=1,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_like",
+            task_weight=2,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="is_follow",
+            task_weight=4,
+            task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
+        ),
+        TaskConfig(
+            task_name="rating",
+            task_weight=1,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+        TaskConfig(
+            task_name="vvp",
+            task_weight=2,
+            task_type=MultitaskTaskType.REGRESSION,
+        ),
+    ],
+]
+
+
+def _get_random_supervision_labels_and_weights(
+    num_examples: int,
+    task_configs: List[TaskConfig],
+    device: torch.device,
+) -> Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:
+    supervision_labels: Dict[str, torch.Tensor] = {}
+    supervision_weights: Dict[str, torch.Tensor] = {}
+    for task in task_configs:
+        if task.task_type == MultitaskTaskType.REGRESSION:
+            supervision_labels[task.task_name] = torch.randn(
+                num_examples, device=device
+            ).to(torch.float32)
+        else:
+            supervision_labels[task.task_name] = torch.randint(
+                0,
+                11,
+                (num_examples,),
+                device=device,
+            ).to(torch.float32)
+
+    return supervision_labels, supervision_weights
+
+
+class MultiTaskModuleTest(unittest.TestCase):
+    # pyre-ignore
+    @given(
+        task_config_idx=st.sampled_from(range(len(_task_configs))),
+        training=st.booleans(),
+        is_inference=st.booleans(),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=50, deadline=None)
+    def test_default_multitask_module(
+        self,
+        task_config_idx: int,
+        training: bool,
+        is_inference: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+
+        L = 200
+        embedding_dim = 64
+        causal_multitask_weights = 0.3
+
+        task_configs: List[TaskConfig] = _task_configs[task_config_idx]
+        task_configs.sort(key=lambda x: x.task_type)
+        multitask_module = DefaultMultitaskModule(
+            task_configs=task_configs,
+            embedding_dim=embedding_dim,
+            prediction_fn=lambda in_dim, num_tasks: torch.nn.Sequential(
+                torch.nn.Linear(in_features=in_dim, out_features=512),
+                SwishLayerNorm(512),
+                torch.nn.Linear(in_features=512, out_features=num_tasks),
+            ),
+            causal_multitask_weights=causal_multitask_weights,
+            is_inference=is_inference,
+        ).to(device)
+        multitask_module.set_training_dtype(dtype)
+        supervision_labels, supervision_weights = (
+            _get_random_supervision_labels_and_weights(
+                num_examples=L,
+                task_configs=task_configs,
+                device=device,
+            )
+        )
+        encoded_user_embeddings = torch.rand(L, embedding_dim, device=device)
+        item_embeddings = torch.rand(L, embedding_dim, device=device)
+
+        (
+            mt_preds,
+            mt_labels,
+            mt_weights,
+            mt_losses,
+        ) = multitask_module(
+            encoded_user_embeddings=encoded_user_embeddings,
+            item_embeddings=item_embeddings,
+            supervision_labels=supervision_labels,
+            supervision_weights=supervision_weights,
+        )
+
+        self.assertEqual(mt_preds.size(), (len(task_configs), L))
+        if not is_inference:
+            self.assertEqual(mt_labels.size(), (len(task_configs), L))
+            self.assertEqual(mt_weights.size(), (len(task_configs), L))
+            if training:
+                self.assertEqual(mt_losses.size(), (len(task_configs),))
diff --git a/recommendation_v4/generative_recommenders/modules/tests/stu_test.py b/recommendation_v4/generative_recommenders/modules/tests/stu_test.py
new file mode 100644
index 000000000..f6440e55e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/modules/tests/stu_test.py
@@ -0,0 +1,453 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import copy
+import unittest
+from typing import List
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel, set_dev_mode
+from generative_recommenders.modules.stu import STU, STULayer, STULayerConfig, STUStack
+from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+def _inplace_swap(
+    batch_size: int,
+    x: torch.Tensor,
+    swap_from: torch.Tensor,
+    swap_to: torch.Tensor,
+) -> torch.Tensor:
+    for i in range(batch_size):
+        tmp = x[i, swap_from[i], :].detach().clone()
+        x[i, swap_from[i], :] = x[i, swap_to[i], :]
+        x[i, swap_to[i], :] = tmp
+    return x
+
+
+class StuTest(unittest.TestCase):
+    # pyre-ignore
+    @given(
+        causal=st.sampled_from([True]),
+        num_layers=st.sampled_from([2]),
+        num_heads=st.sampled_from([1, 2]),
+        max_uih_len=st.sampled_from([20, 64]),
+        batch_size=st.sampled_from([8]),
+        embedding_dim=st.sampled_from([16]),
+        attention_dim=st.sampled_from([32]),
+        linear_hidden_dim=st.sampled_from([64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+        use_group_norm=st.sampled_from([True, False]),
+        recompute_uvqk_in_backward=st.sampled_from([True, False]),
+        recompute_normed_x_in_backward=st.sampled_from([True, False]),
+        recompute_y_in_backward=st.sampled_from([True, False]),
+        empty_inputs=st.sampled_from([False]),
+        dtype=st.sampled_from(
+            [torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=100, deadline=None)
+    def test_triton(
+        self,
+        causal: bool,
+        num_layers: int,
+        num_heads: int,
+        max_uih_len: int,
+        batch_size: int,
+        embedding_dim: int,
+        attention_dim: int,
+        linear_hidden_dim: int,
+        has_multiple_targets: bool,
+        contextual_seq_len: int,
+        use_group_norm: bool,
+        recompute_uvqk_in_backward: bool,
+        recompute_normed_x_in_backward: bool,
+        recompute_y_in_backward: bool,
+        empty_inputs: bool,  # test the case where all the seqlen in the batch are 0
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=has_multiple_targets,
+                    max_attn_len=None,
+                    attn_alpha=None,
+                    use_group_norm=use_group_norm,
+                    recompute_normed_x=recompute_normed_x_in_backward,
+                    recompute_uvqk=recompute_uvqk_in_backward,
+                    recompute_y=recompute_y_in_backward,
+                    sort_by_length=True,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ]
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=False,
+        ).to(device)
+        stu.recursive_setattr("_hammer_kernel", HammerKernel.PYTORCH)
+        stu_triton = copy.deepcopy(stu)
+        stu_triton.recursive_setattr("_hammer_kernel", HammerKernel.TRITON)
+
+        if empty_inputs:
+            x_lengths = torch.zeros(batch_size, dtype=torch.int32, device=device)
+            num_targets = torch.zeros(batch_size, dtype=torch.int32, device=device)
+            contextual_seq_len = 0
+            max_seq_len = 16
+        else:
+            x_lengths = torch.randint(max_uih_len + 1, (batch_size,), device=device)
+            x_lengths = x_lengths + contextual_seq_len
+            max_seq_len = max_uih_len + contextual_seq_len
+            max_targets = 20
+            num_targets = torch.randint(
+                1, max_targets, size=(batch_size,), device=device
+            )
+            if has_multiple_targets:
+                x_lengths = x_lengths + num_targets
+                max_seq_len = max_seq_len + max_targets
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu().item())
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+            dtype=dtype,
+        ).requires_grad_(True)
+        x_triton = x.clone().detach().requires_grad_()
+        stu_output = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        stu_triton_output = stu_triton(
+            x=x_triton,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        atol = 5e-3 if dtype == torch.bfloat16 else None
+        rtol = 1e-2 if dtype == torch.bfloat16 else None
+        torch.testing.assert_close(stu_triton_output, stu_output, atol=atol, rtol=rtol)
+        dout = torch.randn_like(stu_output)
+        stu_output.backward(dout)
+        dout = dout.detach().clone()
+        stu_triton_output.backward(dout)
+        torch.testing.assert_close(x.grad, x_triton.grad, atol=atol, rtol=rtol)
+
+    # pyre-ignore
+    @given(
+        dtype=st.sampled_from(
+            [torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @unittest.skipIf(*gpu_unavailable)
+    @settings(verbosity=Verbosity.verbose, max_examples=8, deadline=None)
+    def test_target_invariance(
+        self,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+        num_layers = 2
+        num_heads = 2
+        max_seq_len = 32
+        batch_size = 8
+        embedding_dim = 16
+        attention_dim = 32
+        linear_hidden_dim = 32
+        causal = True
+        use_group_norm = False
+        recompute_normed_x_in_backward = False
+        recompute_uvqk_in_backward = False
+        recompute_y_in_backward = False
+        max_attn_len = None
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=causal,
+                    target_aware=True,
+                    max_attn_len=max_attn_len,
+                    attn_alpha=None,
+                    use_group_norm=use_group_norm,
+                    recompute_normed_x=recompute_normed_x_in_backward,
+                    recompute_uvqk=recompute_uvqk_in_backward,
+                    recompute_y=recompute_y_in_backward,
+                    sort_by_length=True,
+                    contextual_seq_len=0,
+                ),
+                is_inference=False,
+            )
+            for _ in range(num_layers)
+        ]
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=False,
+        ).to(device)
+
+        x_lengths = torch.randint(
+            low=2, high=max_seq_len + 1, size=(batch_size,), device=device
+        )
+        num_targets = torch.randint(low=2, high=10, size=(batch_size,), device=device)
+        x_lengths = x_lengths + num_targets
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu())
+
+        swap_from = torch.remainder(
+            torch.randint(20, (batch_size,), device=device), num_targets
+        )
+        swap_to = torch.remainder(
+            torch.randint(20, (batch_size,), device=device), num_targets
+        )
+        swap_from = x_lengths - 1 - swap_from
+        swap_to = x_lengths - 1 - swap_to
+        max_seq_len = int(x_lengths.max().item())
+
+        # forward()
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+            dtype=dtype,
+        ).requires_grad_(True)
+        stu_output = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        stu_output_dense = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=stu_output,
+            offsets=[x_offsets],
+            max_lengths=[max_seq_len],
+            padding_value=0.0,
+        )
+
+        # swapped forward().
+        dense_x = torch.ops.fbgemm.jagged_to_padded_dense(
+            x.detach(),
+            [x_offsets],
+            [max_seq_len],
+        )
+        swapped_dense_x = _inplace_swap(batch_size, dense_x, swap_from, swap_to)
+        swapped_x = torch.ops.fbgemm.dense_to_jagged(
+            swapped_dense_x,
+            [x_offsets],
+        )[0].requires_grad_(True)
+        swapped_stu_output = stu(
+            x=swapped_x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        swapped_stu_output_dense = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=swapped_stu_output,
+            offsets=[x_offsets],
+            max_lengths=[max_seq_len],
+            padding_value=0.0,
+        )
+
+        # backward
+        dout = torch.randn_like(stu_output_dense)
+        stu_output_dense.backward(dout)
+        dout = dout.detach().clone()
+        swapped_stu_output_dense.backward(
+            _inplace_swap(batch_size, dout, swap_from, swap_to)
+        )
+
+        swapped_swapped_stu_output_dense = _inplace_swap(
+            batch_size, swapped_stu_output_dense, swap_from, swap_to
+        )
+        torch.testing.assert_close(stu_output_dense, swapped_swapped_stu_output_dense)
+
+        # backward
+        torch.testing.assert_close(
+            torch.ops.fbgemm.jagged_to_padded_dense(
+                swapped_x.grad,
+                [x_offsets],
+                [max_seq_len],
+            ),
+            _inplace_swap(
+                batch_size,
+                torch.ops.fbgemm.jagged_to_padded_dense(
+                    x.grad,
+                    [x_offsets],
+                    [max_seq_len],
+                ),
+                swap_from,
+                swap_to,
+            ),
+        )
+
+    # pyre-ignore[56]
+    @given(
+        num_layers=st.sampled_from([1, 2, 4]),
+        num_heads=st.sampled_from([1, 4]),
+        max_uih_len=st.sampled_from([20, 128]),
+        batch_size=st.sampled_from([4, 8]),
+        embedding_dim=st.sampled_from([32]),
+        attention_dim=st.sampled_from([16]),
+        linear_hidden_dim=st.sampled_from([64]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(verbosity=Verbosity.verbose, max_examples=20, deadline=None)
+    @unittest.skipIf(*gpu_unavailable)
+    @torch.inference_mode()
+    def test_cached_forward(
+        self,
+        num_layers: int,
+        num_heads: int,
+        max_uih_len: int,
+        batch_size: int,
+        embedding_dim: int,
+        attention_dim: int,
+        linear_hidden_dim: int,
+        contextual_seq_len: int,
+    ) -> None:
+        set_dev_mode(True)
+        device = torch.device("cuda")
+
+        torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+
+        use_group_norm = False
+        recompute_normed_x_in_backward = False
+        recompute_uvqk_in_backward = False
+        recompute_y_in_backward = False
+        max_attn_len = None
+        stu_layers: List[STU] = [
+            STULayer(
+                config=STULayerConfig(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    hidden_dim=linear_hidden_dim,
+                    attention_dim=attention_dim,
+                    output_dropout_ratio=0.0,
+                    causal=True,
+                    target_aware=True,
+                    max_attn_len=max_attn_len,
+                    attn_alpha=None,
+                    use_group_norm=use_group_norm,
+                    recompute_normed_x=recompute_normed_x_in_backward,
+                    recompute_uvqk=recompute_uvqk_in_backward,
+                    recompute_y=recompute_y_in_backward,
+                    sort_by_length=True,
+                    contextual_seq_len=contextual_seq_len,
+                ),
+                is_inference=True,
+            )
+            for _ in range(num_layers)
+        ]
+        stu = STUStack(
+            stu_list=stu_layers,
+            is_inference=True,
+        ).to(device)
+        stu.recursive_setattr("_hammer_kernel", HammerKernel.TRITON)
+        stu.eval()
+
+        x_lengths = torch.randint(
+            max_uih_len, max_uih_len + 1, (batch_size,), device=device
+        )
+        x_lengths = x_lengths + contextual_seq_len
+        max_seq_len = max_uih_len + contextual_seq_len
+        delta_size = 20
+        max_targets = delta_size * 2
+        num_targets = torch.randint(
+            delta_size, max_targets + 1, size=(batch_size,), device=device
+        )
+        x_lengths = x_lengths + num_targets + contextual_seq_len
+        max_seq_len = max_seq_len + max_targets + contextual_seq_len
+        x_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(x_lengths)
+        total_seq_len = int(x_offsets[-1].cpu().item())
+        x = torch.randn(
+            int(total_seq_len),
+            embedding_dim,
+            device=device,
+        ).requires_grad_(True)
+
+        # default forward().
+        ref_y = stu(
+            x=x,
+            x_lengths=x_lengths,
+            x_offsets=x_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets,
+        )
+        prime_lengths = x_lengths - delta_size
+        prime_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(prime_lengths)
+        _, ref_delta_y = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=ref_y,
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=prime_offsets,
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+
+        # cached forward().
+        prime_x, delta_x = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=x,
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=prime_offsets,
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        _ = stu(
+            x=prime_x,
+            x_lengths=prime_lengths,
+            x_offsets=prime_offsets,
+            max_seq_len=max_seq_len,
+            num_targets=num_targets - delta_size,
+            max_kv_caching_len=max_seq_len - delta_size,
+            kv_caching_lengths=x_lengths - delta_size,
+        )
+        delta_y = stu.cached_forward(
+            delta_x=delta_x,
+            num_targets=num_targets,
+        )
+
+        torch.testing.assert_close(ref_delta_y, delta_y)
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/addmm_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/addmm_bench.py
new file mode 100644
index 000000000..b1be3a803
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/addmm_bench.py
@@ -0,0 +1,174 @@
+# pyre-unsafe
+import time
+from typing import List, Optional, Tuple
+
+import click
+import pandas as pd
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_addmm import (
+    triton_addmm_fwd,
+    triton_addmm_fwd_tma_persistent,
+    triton_addmm_fwd_tma_ws_persistent_tlx,
+    triton_addmm_fwd_tma_ws_tlx,
+)
+from generative_recommenders.ops.utils import is_sm100
+
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx as tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    tlx = None
+    HAS_TLX = False
+
+
+def get_kernel(provider: str) -> HammerKernel:
+    if provider == "triton":
+        return HammerKernel.TRITON
+    elif provider == "pytorch":
+        return HammerKernel.PYTORCH
+    else:
+        raise ValueError(f"Unknown provider {provider}")
+
+
+def get_dtype(dtype: str) -> torch.dtype:
+    if dtype == "bfloat16":
+        return torch.bfloat16
+    elif dtype == "float32":
+        return torch.float32
+    elif dtype == "float16":
+        return torch.float16
+    else:
+        raise ValueError(f"Not supported dtype {dtype}")
+
+
+@click.command()
+@click.option("--m", type=int, default=0)
+@click.option("--k", type=int, default=4096)
+@click.option("--n", type=int, default=4096)
+@click.option("--dtype", type=str, default="bfloat16")
+@click.option("--return-result", type=bool, default=False)
+@click.option("--broadcast-y", type=bool, is_flag=True, default=False)
+def main(
+    m: int,
+    k: int,
+    n: int,
+    dtype: str,
+    return_result: bool,
+    broadcast_y: bool,
+) -> Optional[Tuple[List[triton.testing.Benchmark], List[pd.DataFrame]]]:
+    if m == 0:
+        batch_sizes = [64, 128, 256, 512]
+    else:
+        batch_sizes = [m]
+    line_vals = [
+        "pytorch",
+        "triton",
+        "triton_tma_persistent",
+        "triton_tma_persistent_ws",
+    ]
+    line_names = [
+        "PyTorch",
+        "Triton",
+        "Triton TMA Persistent",
+        "Triton TMA Persistent WS",
+    ]
+    styles = [
+        ("red", "-"),
+        ("green", "-"),
+        ("orange", "-"),
+        ("purple", "-"),
+    ]
+    if is_sm100() and HAS_TLX:  # tmem is only supported on Blackwell
+        line_vals.append("triton_tma_ws_tlx")
+        line_names.append("Triton TMA WS TLX")
+        styles.append(("cyan", "-"))
+        line_vals.append("triton_tma_persistent_ws_tlx")
+        line_names.append("Triton TMA Persistent WS TLX")
+        styles.append(("magenta", "-"))
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["batch_size"],
+            x_vals=batch_sizes,
+            line_arg="provider",
+            line_vals=line_vals,
+            line_names=line_names,
+            styles=styles,
+            ylabel="ms",
+            plot_name=f"addmm-K-{k}-N-{n}-mode-{mode}-dtype-{dtype}-broadcast_y-{broadcast_y}",
+            args={
+                "K": k,
+                "N": n,
+                "dtype": dtype,
+                "broadcast_y": broadcast_y,
+            },
+        )
+        for mode in ["fwd"]
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_addmm(
+        batch_size: int,
+        K: int,
+        N: int,
+        dtype: str,
+        provider: str,
+        broadcast_y: bool,
+    ) -> float:
+        warmup = 20
+        rep = 2000
+        x = torch.randn(
+            (batch_size, K), dtype=get_dtype(dtype), device=torch.device("cuda")
+        ).requires_grad_(True)
+        weight = torch.randn(
+            (N, K), dtype=get_dtype(dtype), device=torch.device("cuda")
+        ).requires_grad_(True)
+        if broadcast_y:
+            y = torch.randn(
+                (N), dtype=get_dtype(dtype), device=torch.device("cuda")
+            ).requires_grad_(True)
+        else:
+            y = torch.randn(
+                (batch_size, N), dtype=get_dtype(dtype), device=torch.device("cuda")
+            ).requires_grad_(True)
+
+        # Make sure tensors are contiguous for TMA kernels
+        weight_t_contiguous = weight.T.contiguous()
+
+        if provider == "pytorch":
+            fn = lambda: torch.addmm(y, x, weight.T)  # noqa E731
+        elif provider == "triton_tma_persistent":
+            fn = lambda: triton_addmm_fwd_tma_persistent(
+                x, weight_t_contiguous, y, warp_specialize=False
+            )  # noqa E731
+        elif provider == "triton_tma_persistent_ws":
+            fn = lambda: triton_addmm_fwd_tma_persistent(
+                x, weight_t_contiguous, y, warp_specialize=True
+            )  # noqa E731
+        elif provider == "triton_tma_persistent_ws_tlx":
+            fn = lambda: triton_addmm_fwd_tma_ws_persistent_tlx(
+                x, weight_t_contiguous, y
+            )  # noqa E731
+        elif provider == "triton_tma_ws_tlx":
+            fn = lambda: triton_addmm_fwd_tma_ws_tlx(x, weight_t_contiguous, y)  # noqa E731
+        elif provider == "triton":
+            fn = lambda: triton_addmm_fwd(x, weight_t_contiguous, y)  # noqa E731
+        else:
+            raise ValueError(f"Unknown provider: {provider}")
+        time.sleep(2)
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    df = bench_addmm.run(print_data=True, return_df=return_result)
+
+    if return_result:
+        return configs, df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py
new file mode 100644
index 000000000..cc7fbede7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py
@@ -0,0 +1,406 @@
+# pyre-strict
+import os
+from typing import List, Optional, Tuple
+
+import click
+import pandas as pd
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import (
+    apply_sampling,
+    blackwell_tlx_unavailable,
+    generate_sparse_seq_len,
+    HammerKernel,
+)
+from generative_recommenders.ops.cpp.cuda_hstu_attention import cuda_hstu_mha
+from generative_recommenders.ops.hstu_attention import delta_hstu_mha, hstu_mha
+
+try:
+    from hammer.ops.ragged_hstu_attention import ragged_hstu_mha
+    from hammer.utils import HammerKernel as HammerKernel2
+except ImportError:
+    pass
+
+
+def _get_kernel(provider: str) -> HammerKernel:
+    if provider == "triton":
+        return HammerKernel.TRITON
+    elif provider == "tlx":
+        return HammerKernel.TLX
+    elif provider == "pytorch":
+        return HammerKernel.PYTORCH
+    else:
+        raise ValueError(f"Unknown provider {provider}")
+
+
+def _flops(
+    batch_size: int,
+    max_seqlen: int,
+    attn_dim: int,
+    hidden_dim: int,
+    nheads: int,
+    seq_offsets: torch.Tensor,
+    mode: str = "fwd",
+) -> float:
+    assert mode in ["fwd", "bwd", "fwd_bwd"]
+    ratio = 2.0  # triangular masking
+    f1 = 0.0
+    f2 = 0.0
+    for i in range(batch_size):
+        seq_len = int((seq_offsets[i + 1] - seq_offsets[i]).item())
+        # (QK^T), dQ = d(QK^T)K, dK^T = Q^Td(QK^T)
+        f1 += 2 * nheads * attn_dim * seq_len**2 // ratio
+        # (QK^T)V, d(QK^T) = dOV^T, dV = (QK^T)^TdO,
+        f2 += 2 * nheads * hidden_dim * seq_len**2 // ratio
+    if mode == "fwd":
+        return f1 + f2  # computes (QK^T) and (QK^T)V
+    elif mode == "bwd":
+        return 3 * f1 + 2 * f2  # computes (QK^T), dQ, dK, dV, d(QK^T)
+    else:
+        return 4 * f1 + 3 * f2
+
+
+@click.command()
+@click.option(
+    "--batch-size",
+    type=int,
+    default=512,
+)
+@click.option("--heads", type=int, default=4)
+@click.option("--attn-dim", type=int, default=128)
+@click.option("--hidden-dim", type=int, default=128)
+@click.option("--max-seq-len-log2", type=int, default=13)
+@click.option("--data-type", type=str, default="bf16")
+@click.option("--seq-sparsity", type=float, default=0.95)
+@click.option("--has-delta-q", type=bool, default=False)
+@click.option("--delta-size", type=int, default=256)
+@click.option("--target-size", type=int, default=20)
+@click.option("--bench-backward", type=bool, default=True)
+@click.option("--bench-forward", type=bool, default=True)
+@click.option("--bench-tlx", type=bool, default=False)
+@click.option("--bench-pytorch", type=bool, default=False)
+@click.option("--bench-ragged", type=bool, default=True)
+@click.option("--report-flops", type=bool, default=False)
+@click.option("--return-result", type=bool, default=False)
+@click.option("--max-attn-len", type=int, default=0)
+@click.option("--min-full-attn-seq-len", type=int, default=0)
+@click.option("--contextual-seq-len", type=int, default=0)
+@click.option("--sampling-alpha", type=float, default=2.0)
+@click.option("--triton-enable-tma", type=bool, default=False)
+@click.option("--dynamic-attn-scale", type=bool, default=False)
+@click.option("--num-softmax-heads", type=int, default=0)
+def main(  # noqa: C901
+    batch_size: int,
+    heads: int,
+    attn_dim: int,
+    hidden_dim: int,
+    max_seq_len_log2: int,
+    data_type: str,
+    seq_sparsity: float,
+    has_delta_q: bool,
+    delta_size: int,
+    target_size: int,
+    bench_backward: bool,
+    bench_forward: bool,
+    bench_tlx: bool,
+    bench_pytorch: bool,
+    bench_ragged: bool,
+    report_flops: bool,
+    return_result: bool,
+    max_attn_len: int,
+    min_full_attn_seq_len: int,
+    contextual_seq_len: int,
+    sampling_alpha: float,
+    triton_enable_tma: bool,
+    dynamic_attn_scale: bool,
+    num_softmax_heads: int,
+) -> Optional[Tuple[List[triton.testing.Benchmark], List[pd.DataFrame]]]:
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    if data_type == "fp32":
+        dtype = torch.float32
+    elif data_type == "fp16":
+        dtype = torch.float16
+    elif data_type == "bf16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    line_vals = ["triton", "flash_cuda_jagged"]
+    line_names = ["triton", "flash_cuda_jagged"]
+    styles = [("blue", "-"), ("green", "-")]
+    if bench_pytorch:
+        line_vals.append("pytorch")
+        line_names.append("PyTorch")
+        styles.append(("green", "-"))
+    if bench_ragged:
+        line_vals.append("ragged")
+        line_names.append("ragged")
+        styles.append(("red", "-"))
+    if bench_tlx and not blackwell_tlx_unavailable[0]:
+        line_vals.append("tlx")
+        line_names.append("tlx")
+        styles.append(("cyan", "-"))
+
+    bench_backward = False if has_delta_q else bench_backward
+    modes = []
+    if bench_forward:
+        modes.append("fwd")
+    if bench_backward:
+        modes.append("bwd")
+    assert len(modes) > 0
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["seq_len"],
+            x_vals=[2**i for i in range(8, max_seq_len_log2)],
+            line_arg="provider",
+            line_vals=line_vals,
+            line_names=line_names,
+            styles=styles,
+            ylabel="ms",
+            plot_name=f"hstu-attn-b{batch_size}-h{heads}-d{attn_dim}-v{hidden_dim}--sparsity{seq_sparsity}-{mode}-{dtype}-target{target_size}-mattn{max_attn_len}-full{min_full_attn_seq_len}-c{contextual_seq_len}-sl_alpha{sampling_alpha}-triton_tma{triton_enable_tma}-dynamic_scale{dynamic_attn_scale}-num_softmax_heads{num_softmax_heads}",
+            args={
+                "batch_size": batch_size,
+                "heads": heads,
+                "attn_dim": attn_dim,
+                "hidden_dim": hidden_dim,
+                "dtype": dtype,
+                "mode": mode,
+                "seq_sparsity": seq_sparsity,
+                "has_delta_q": has_delta_q,
+                "delta_size": delta_size,
+                "target_size": target_size,
+                "bench_backward": bench_backward,
+                "report_flops": report_flops,
+                "max_attn_len": max_attn_len,
+                "min_full_attn_seq_len": min_full_attn_seq_len,
+                "contextual_seq_len": contextual_seq_len,
+                "sampling_alpha": sampling_alpha,
+                "triton_enable_tma": triton_enable_tma,
+                "dynamic_attn_scale": dynamic_attn_scale,
+                "num_softmax_heads": num_softmax_heads,
+            },
+        )
+        for mode in modes
+    ]
+
+    @triton.testing.perf_report(configs)
+    def _bench_hstu_attention(
+        batch_size: int,
+        heads: int,
+        seq_len: int,
+        attn_dim: int,
+        hidden_dim: int,
+        mode: str,
+        provider: str,
+        dtype: torch.dtype,
+        seq_sparsity: float,
+        has_delta_q: bool,
+        delta_size: int,
+        target_size: int,
+        bench_backward: bool,
+        report_flops: bool,
+        max_attn_len: int,
+        min_full_attn_seq_len: int,
+        contextual_seq_len: int,
+        sampling_alpha: float,
+        triton_enable_tma: bool,
+        dynamic_attn_scale: bool,
+        num_softmax_heads: int,
+    ) -> float:
+        assert mode in ["fwd", "bwd"]
+        warmup = 25
+        rep = 1000
+        torch.manual_seed(1001)  # for reproducibility
+        alpha = 1.0 / attn_dim
+        causal = True
+        lengths = generate_sparse_seq_len(
+            size=batch_size,
+            max_seq_len=seq_len,
+            sparsity=seq_sparsity,
+            device=torch.device("cuda"),
+        )
+        lengths = apply_sampling(lengths, sampling_alpha, max_seq_len=seq_len)
+        if has_delta_q:
+            lengths = lengths + delta_size
+            num_targets = torch.ones_like(lengths) * delta_size
+            seq_len = seq_len + delta_size
+        else:
+            delta_size = 0
+            num_targets = None
+            if target_size != 0:
+                num_targets = torch.randint(
+                    1,
+                    target_size + 1,
+                    (batch_size,),
+                    device=lengths.device,
+                    dtype=lengths.dtype,
+                )
+                num_targets = torch.where(
+                    num_targets > lengths, lengths, num_targets
+                ).to(torch.int32)
+        max_attn_len = max_attn_len if max_attn_len < seq_len else seq_len
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        L = int(seq_offsets[-1].item())
+        x = torch.empty(
+            (L, heads, attn_dim * 2 + hidden_dim),
+            dtype=dtype,
+            device=torch.device("cuda"),
+        ).uniform_(-0.01, 0.01)
+        q, k, v = torch.split(x, [attn_dim, attn_dim, hidden_dim], dim=-1)
+        delta_q = torch.empty(
+            (batch_size * delta_size, heads, attn_dim),
+            dtype=dtype,
+            device=torch.device("cuda"),
+        ).uniform_(-0.1, 0.1)
+        delta_x_offsets = torch.arange(0, delta_size, device=torch.device("cuda"))
+        delta_x_offsets = (seq_offsets[1:] - delta_size).view(
+            batch_size, 1
+        ) + delta_x_offsets.view(1, delta_size)
+        delta_x_offsets = delta_x_offsets.view(-1)
+        attn_scale = torch.empty(
+            (L,),
+            dtype=torch.float32,
+            device=torch.device("cuda"),
+        ).uniform_(0.5, 1.0)
+
+        if bench_backward:
+            q = q.requires_grad_(True)
+            k = k.requires_grad_(True)
+            v = v.requires_grad_(True)
+        assert provider in [
+            "triton",
+            "pytorch",
+            "flash_cuda_jagged",
+            "flash_cuda",
+            "tlx",
+            "ragged",
+        ]
+        if has_delta_q:
+            fn = lambda: delta_hstu_mha(  # noqa E731
+                max_seq_len=seq_len,
+                alpha=alpha,
+                delta_q=delta_q,
+                k=k,
+                v=v,
+                seq_offsets=seq_offsets,
+                num_targets=num_targets,
+                kernel=_get_kernel(provider),
+            )
+        else:
+            if provider == "flash_cuda_jagged":
+                fn = lambda: cuda_hstu_mha(  # noqa E731
+                    q=q,
+                    k=k,
+                    v=v,
+                    alpha=alpha,
+                    causal=True,
+                    seq_offsets=seq_offsets.to(torch.int32),
+                    attn_scale=attn_scale if dynamic_attn_scale else None,
+                    max_seq_len=seq_len,
+                    max_attn_len=max_attn_len,
+                    min_full_attn_seq_len=min_full_attn_seq_len,
+                    contextual_seq_len=contextual_seq_len,
+                    num_targets=num_targets,
+                    sort_by_length=False,
+                    num_softmax_heads=num_softmax_heads,
+                )
+            elif provider == "flash_cuda":
+                q, k, v = [
+                    torch.randn(
+                        batch_size,
+                        seq_len,
+                        heads,
+                        attn_dim,
+                        device="cuda",
+                        dtype=dtype,
+                        requires_grad=True,
+                    )
+                    for _ in range(3)
+                ]
+                fn = lambda: cuda_hstu_mha(  # noqa E731
+                    q=q,
+                    k=k,
+                    v=v,
+                    alpha=alpha,
+                    causal=True,
+                    max_seq_len=seq_len,
+                    max_attn_len=max_attn_len,
+                    min_full_attn_seq_len=min_full_attn_seq_len,
+                    contextual_seq_len=contextual_seq_len,
+                    num_targets=num_targets,
+                    sort_by_length=False,
+                    num_softmax_heads=num_softmax_heads,
+                )
+            elif provider == "ragged":
+                fn = lambda: ragged_hstu_mha(  # noqa E731
+                    max_seq_len=seq_len,
+                    alpha=alpha,
+                    q=q,
+                    k=k,
+                    v=v,
+                    seq_offsets=seq_offsets,
+                    dropout_pr=0.0,
+                    training=True,
+                    invalid_attn_mask_type="lower_triangular",
+                    num_targets=num_targets,
+                    attn_scale=attn_scale if dynamic_attn_scale else None,
+                    max_attn_len=max_attn_len,
+                    contextual_seq_len=contextual_seq_len,
+                    full_attn_size=min_full_attn_seq_len,
+                    sort_by_length=True,
+                    kernel=HammerKernel2.TRITON,
+                    num_softmax_heads=num_softmax_heads,
+                )
+            else:
+                fn = lambda: hstu_mha(  # noqa E731
+                    max_seq_len=seq_len,
+                    alpha=alpha,
+                    q=q,
+                    k=k,
+                    v=v,
+                    seq_offsets=seq_offsets,
+                    causal=causal,
+                    dropout_pr=0.0,
+                    training=True,
+                    num_targets=num_targets,
+                    max_attn_len=max_attn_len,
+                    contextual_seq_len=contextual_seq_len,
+                    sort_by_length=True,
+                    kernel=_get_kernel(provider),
+                    enable_tma=triton_enable_tma,
+                )
+        if mode == "bwd":
+            o = fn()
+            do = torch.randn_like(o)
+            fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        all_flops = _flops(
+            batch_size, seq_len, attn_dim, hidden_dim, heads, seq_offsets, mode
+        )
+        if has_delta_q:
+            all_flops = all_flops / seq_len * delta_size
+        if report_flops:
+            return all_flops / ms / 1e9
+        else:
+            return ms
+
+    df = _bench_hstu_attention.run(
+        print_data=True,
+        show_plots=False,
+        save_path="/tmp/" + os.environ["USER"],
+        return_df=return_result,
+    )
+
+    if return_result:
+        return configs, df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_bench.py
new file mode 100644
index 000000000..dcfb9819e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_bench.py
@@ -0,0 +1,199 @@
+# pyre-strict
+import math
+from typing import List, Optional, Tuple
+
+import click
+import pandas as pd
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_jagged import triton_jagged_dense_bmm
+
+# buck2 run @mode/{opt,inplace} //generative_recommenders/ops/benchmarks:jagged_dense_bmm_bench -- --fwd-only
+
+
+def jagged_dense_bmm(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Wrapper function for jagged_dense_bmm with kernel selection.
+    Computing out = jagged x dense
+    jagged has shape (sum_B(M_i), K), dense has shape (B, K, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_bmm(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+        )
+    elif kernel == HammerKernel.PYTORCH:
+        # PyTorch implementation - manual implementation using standard operations
+        B, K, N = dense.shape
+        outputs = []
+        for i in range(B):
+            start_idx = seq_offsets[i]
+            end_idx = seq_offsets[i + 1]
+            if start_idx < end_idx:
+                jagged_seq = jagged[start_idx:end_idx]  # (seq_len, K)
+                dense_seq = dense[i]  # (K, N)
+                output_seq = torch.mm(jagged_seq, dense_seq)  # (seq_len, N)
+                outputs.append(output_seq)
+        return (
+            torch.cat(outputs, dim=0)
+            if outputs
+            else torch.empty(0, N, device=jagged.device, dtype=jagged.dtype)
+        )
+    else:
+        raise ValueError(f"Unsupported kernel: {kernel}")
+
+
+@click.command()
+@click.option(
+    "--batch-size",
+    type=int,
+    default=512,
+)
+@click.option(
+    "--max-seq-len",
+    type=int,
+    default=8192,
+    show_default=True,
+)
+@click.option(
+    "-d",
+    type=int,
+    default=64,
+    show_default=True,
+)
+@click.option(
+    "-k",
+    type=int,
+    default=64,
+    show_default=True,
+)
+@click.option("--dtype", type=str, default="bf16")
+@click.option("--fwd-only", is_flag=True)
+@click.option("--return-result", type=bool, default=False)
+def main(
+    batch_size: int,
+    max_seq_len: int,
+    d: int,
+    k: int,
+    dtype: str,
+    fwd_only: bool,
+    return_result: bool,
+) -> Optional[Tuple[List[triton.testing.Benchmark], List[pd.DataFrame]]]:
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    max_seq_len_log2 = int(round(math.log2(max_seq_len)))
+    if dtype == "fp32":
+        pt_dtype = torch.float32
+    elif dtype == "fp16":
+        pt_dtype = torch.float16
+    elif dtype == "bf16":
+        pt_dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {dtype}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["seq_len"],
+            x_vals=[2**i for i in range(5, max_seq_len_log2 + 1)],
+            line_arg="provider",
+            line_vals=["triton", "pytorch"],
+            line_names=["Triton", "Pytorch"],
+            styles=[("red", "-"), ("blue", "-")],
+            ylabel="ms",
+            plot_name=f"jagged_dense_bmm-b{batch_size}-D{d}-K{k}-{dtype}",
+            args={
+                "batch_size": batch_size,
+                "D": d,
+                "K": k,
+                "dtype": pt_dtype,
+                "mode": mode,
+            },
+        )
+        for mode in (["fwd"] if fwd_only else ["fwd", "bwd"])
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_jagged_dense_bmm(
+        batch_size: int,
+        seq_len: int,
+        D: int,
+        K: int,
+        mode: str,
+        provider: str,
+        dtype: torch.dtype,
+    ) -> float:
+        assert mode in ["fwd", "bwd"]
+        warmup = 25
+        rep = 100
+
+        max_seq_len = seq_len
+        lengths = torch.randint(
+            max_seq_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        jagged_size = int(seq_offsets[-1].item())
+        jagged = (
+            torch.empty((jagged_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        dense = (
+            torch.empty((batch_size, D, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if provider == "triton":
+            fn = lambda: jagged_dense_bmm(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                kernel=HammerKernel.TRITON,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        elif provider == "pytorch":
+            fn = lambda: jagged_dense_bmm(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                kernel=HammerKernel.PYTORCH,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        else:
+            raise ValueError(f"unsupported provider: {provider}")
+
+    df = bench_jagged_dense_bmm.run(print_data=True, return_df=return_result)
+
+    if return_result:
+        return configs, df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_broadcast_add_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_broadcast_add_bench.py
new file mode 100644
index 000000000..193704bf9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_bmm_broadcast_add_bench.py
@@ -0,0 +1,270 @@
+# pyre-strict
+import math
+import pickle
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.jagged_tensors import jagged_dense_bmm_broadcast_add
+from generative_recommenders.ops.triton.triton_jagged import (
+    jagged_dense_bmm_broadcast_add_kernel,
+    triton_jagged_dense_bmm,
+    triton_jagged_dense_broadcast_add,
+)
+
+# buck2 run @mode/{opt,inplace} //generative_recommenders/ops/benchmarks:jagged_dense_bmm_broadcast_add_bench -- --fwd-only
+
+# To dump the jagged_dense_bmm_broadcast_add_kernel cache
+# buck2 run @mode/opt //generative_recommenders/ops/benchmarks:jagged_dense_bmm_broadcast_add_bench -- --fwd-only --dump-cache-dir=/home/${USER}/fbsource/fbcode/generative_recommenders/ops/triton/jagged_dense_bmm_broadcast_add_kernel_cache.pkl
+
+
+def get_kernel(provider: str) -> HammerKernel:
+    if provider == "triton":
+        return HammerKernel.TRITON
+    elif provider == "pytorch":
+        return HammerKernel.PYTORCH
+    else:
+        raise ValueError(f"Unknown provider {provider}")
+
+
+def jagged_dense_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Wrapper function for jagged_dense_broadcast_add with kernel selection.
+    Computing out = jagged + dense (broadcasted)
+    jagged has shape (sum_B(M_i), N), dense has shape (B, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+        )
+    elif kernel == HammerKernel.PYTORCH:
+        # PyTorch implementation - manual implementation using standard operations
+        B, N = dense.shape
+        outputs = []
+        for i in range(B):
+            start_idx = seq_offsets[i]
+            end_idx = seq_offsets[i + 1]
+            if start_idx < end_idx:
+                jagged_seq = jagged[start_idx:end_idx]  # (seq_len, N)
+                dense_seq = dense[i]  # (N,)
+                output_seq = jagged_seq + dense_seq  # (seq_len, N)
+                outputs.append(output_seq)
+        return (
+            torch.cat(outputs, dim=0)
+            if outputs
+            else torch.empty(0, N, device=jagged.device, dtype=jagged.dtype)
+        )
+    else:
+        raise ValueError(f"Unsupported kernel: {kernel}")
+
+
+def jagged_dense_bmm(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Wrapper function for jagged_dense_bmm with kernel selection.
+    Computing out = jagged x dense
+    jagged has shape (sum_B(M_i), K), dense has shape (B, K, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_bmm(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+        )
+    elif kernel == HammerKernel.PYTORCH:
+        # PyTorch implementation - manual implementation using standard operations
+        B, K, N = dense.shape
+        outputs = []
+        for i in range(B):
+            start_idx = seq_offsets[i]
+            end_idx = seq_offsets[i + 1]
+            if start_idx < end_idx:
+                jagged_seq = jagged[start_idx:end_idx]  # (seq_len, K)
+                dense_seq = dense[i]  # (K, N)
+                output_seq = torch.mm(jagged_seq, dense_seq)  # (seq_len, N)
+                outputs.append(output_seq)
+        return (
+            torch.cat(outputs, dim=0)
+            if outputs
+            else torch.empty(0, N, device=jagged.device, dtype=jagged.dtype)
+        )
+    else:
+        raise ValueError(f"Unsupported kernel: {kernel}")
+
+
+@click.command()
+@click.option(
+    "--batch-size",
+    type=int,
+    default=384,
+)
+@click.option(
+    "--max-seq-len",
+    type=int,
+    default=4096,
+    show_default=True,
+)
+@click.option(
+    "-d",
+    type=int,
+    default=512,
+    show_default=True,
+)
+@click.option(
+    "-k",
+    type=int,
+    default=512,
+    show_default=True,
+)
+@click.option("--dtype", type=str, default="bf16")
+@click.option("--fwd-only", is_flag=True)
+@click.option("--dump-cache-dir", type=str, default="")
+def main(
+    batch_size: int,
+    max_seq_len: int,
+    d: int,
+    k: int,
+    dtype: str,
+    fwd_only: bool,
+    dump_cache_dir: str,
+) -> None:
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    max_seq_len_log2 = int(round(math.log2(max_seq_len)))
+    if dtype == "fp32":
+        pt_dtype = torch.float32
+    elif dtype == "fp16":
+        pt_dtype = torch.float16
+    elif dtype == "bf16":
+        pt_dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {dtype}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["seq_len"],
+            x_vals=[2**i for i in range(8, max_seq_len_log2 + 1)],
+            line_arg="provider",
+            line_vals=["triton", "pytorch", "triton_nonfused"],
+            line_names=["Triton", "Pytorch", "Triton_Nonfused"],
+            styles=[("red", "-"), ("blue", "-"), ("green", "-")],
+            ylabel="ms",
+            plot_name=f"jagged_dense_bmm_broadcast_add-{mode}-b{batch_size}-D{d}-K{k}-{dtype}",
+            args={
+                "batch_size": batch_size,
+                "D": d,
+                "K": k,
+                "dtype": pt_dtype,
+                "mode": mode,
+            },
+        )
+        for mode in (["fwd"] if fwd_only else ["fwd", "bwd"])
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_jagged_dense_bmm_broadcast_add(
+        batch_size: int,
+        seq_len: int,
+        D: int,
+        K: int,
+        mode: str,
+        provider: str,
+        dtype: torch.dtype,
+    ) -> float:
+        assert mode in ["fwd", "bwd"]
+        warmup = 25
+        rep = 100
+
+        max_seq_len = seq_len
+        lengths = torch.randint(
+            max_seq_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        jagged_size = int(seq_offsets[-1].item())
+        jagged = (
+            torch.empty((jagged_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        dense = (
+            torch.empty((batch_size, D, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        bias = (
+            torch.empty((batch_size, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if provider in ["triton", "pytorch"]:
+            fn = lambda: jagged_dense_bmm_broadcast_add(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                bias=bias,
+                kernel=get_kernel(provider),
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        elif provider == "triton_nonfused":
+            fn = lambda: jagged_dense_broadcast_add(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged_dense_bmm(
+                    max_seq_len=max_seq_len,
+                    seq_offsets=seq_offsets,
+                    jagged=jagged,
+                    dense=dense,
+                    kernel=HammerKernel.TRITON,
+                ),
+                dense=bias,
+                kernel=HammerKernel.TRITON,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        else:
+            raise ValueError(f"unsupported provider: {provider}")
+
+    bench_jagged_dense_bmm_broadcast_add.run(print_data=True)
+    if dump_cache_dir:
+        with open(dump_cache_dir, "wb") as data:
+            # @lint-ignore PYTHONPICKLEISBAD
+            pickle.dump(jagged_dense_bmm_broadcast_add_kernel.cache, data)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_broadcast_add_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_broadcast_add_bench.py
new file mode 100644
index 000000000..049258e3d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/benchmarks/jagged_dense_broadcast_add_bench.py
@@ -0,0 +1,205 @@
+# pyre-strict
+import math
+import pickle
+from typing import List, Optional, Tuple
+
+import click
+import pandas as pd
+import torch
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_jagged import (
+    jagged_dense_broadcast_add_kernel,
+    triton_jagged_dense_broadcast_add,
+)
+
+# buck2 run @mode/{opt,inplace} //generative_recommenders/ops/benchmarks:jagged_dense_broadcast_add_bench
+
+
+# To dump the jagged_dense_broadcast_add_kernel cache, run:
+# buck2 run @mode/{opt,inplace} //generative_recommenders/ops/benchmarks:jagged_dense_broadcast_add_bench -- --dump-ragged-tuner-cache-dir=/home/${USER}/fbsource/fbcode/generative_recommenders/ops/triton/jagged_dense_broadcast_add_kernel_cache.pkl
+
+
+def jagged_dense_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Wrapper function for jagged_dense_broadcast_add with kernel selection.
+    Computing out = jagged + dense (broadcasted)
+    jagged has shape (sum_B(M_i), N), dense has shape (B, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+        )
+    elif kernel == HammerKernel.PYTORCH:
+        # PyTorch implementation - manual implementation using standard operations
+        B, N = dense.shape
+        outputs = []
+        for i in range(B):
+            start_idx = seq_offsets[i]
+            end_idx = seq_offsets[i + 1]
+            if start_idx < end_idx:
+                jagged_seq = jagged[start_idx:end_idx]  # (seq_len, N)
+                dense_seq = dense[i]  # (N,)
+                output_seq = jagged_seq + dense_seq  # (seq_len, N)
+                outputs.append(output_seq)
+        return (
+            torch.cat(outputs, dim=0)
+            if outputs
+            else torch.empty(0, N, device=jagged.device, dtype=jagged.dtype)
+        )
+    else:
+        raise ValueError(f"Unsupported kernel: {kernel}")
+
+
+@click.command()
+@click.option(
+    "--batch-size",
+    type=int,
+    default=512,
+)
+@click.option(
+    "--max-seq-len",
+    type=int,
+    default=8192,
+    show_default=True,
+)
+@click.option(
+    "-d",
+    type=int,
+    default=64,
+    show_default=True,
+)
+@click.option("--dtype", type=str, default="fp32")
+@click.option("--fwd-only", is_flag=True)
+@click.option("--dump-ragged-tuner-cache-dir", type=str, default="")
+@click.option("--return-result", type=bool, default=False)
+def main(
+    batch_size: int,
+    max_seq_len: int,
+    d: int,
+    dtype: str,
+    fwd_only: bool,
+    dump_ragged_tuner_cache_dir: str,
+    return_result: bool,
+) -> Optional[Tuple[List[triton.testing.Benchmark], List[pd.DataFrame]]]:
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    max_seq_len_log2 = int(round(math.log2(max_seq_len)))
+    if dtype == "fp32":
+        pt_dtype = torch.float32
+    elif dtype == "fp16":
+        pt_dtype = torch.float16
+    elif dtype == "bf16":
+        pt_dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {dtype}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["seq_len"],
+            x_vals=[2**i for i in range(5, max_seq_len_log2 + 1)],
+            line_arg="provider",
+            line_vals=["triton", "pytorch"],
+            line_names=["Triton", "Pytorch"],
+            styles=[("red", "-"), ("blue", "-")],
+            ylabel="ms",
+            plot_name=f"jagged_dense_broadcast_add-b{batch_size}-D{d}-{dtype}-{mode}",
+            args={
+                "batch_size": batch_size,
+                "D": d,
+                "dtype": pt_dtype,
+                "mode": mode,
+            },
+        )
+        for mode in (["fwd"] if fwd_only else ["fwd", "fwd+bwd"])
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_jagged_dense_broadcast_add(
+        batch_size: int,
+        seq_len: int,
+        D: int,
+        mode: str,
+        provider: str,
+        dtype: torch.dtype,
+    ) -> float:
+        assert mode in ["fwd", "bwd", "fwd+bwd"]
+        warmup = 25
+        rep = 100
+
+        max_seq_len = seq_len
+        lengths = torch.randint(
+            max_seq_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        jagged_size = int(seq_offsets[-1].item())
+        jagged = (
+            torch.empty((jagged_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        dense = (
+            torch.empty((batch_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if provider == "triton":
+            fn = lambda: jagged_dense_broadcast_add(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                kernel=HammerKernel.TRITON,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        elif provider == "pytorch":
+            fn = lambda: jagged_dense_broadcast_add(  # noqa E731
+                max_seq_len=max_seq_len,
+                seq_offsets=seq_offsets,
+                jagged=jagged,
+                dense=dense,
+                kernel=HammerKernel.PYTORCH,
+            )
+            if mode == "bwd":
+                o = fn()
+                do = torch.randn_like(o)
+                fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
+            ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+            return ms
+        else:
+            raise ValueError(f"unsupported provider: {provider}")
+
+    df = bench_jagged_dense_broadcast_add.run(print_data=True, return_df=return_result)
+
+    if dump_ragged_tuner_cache_dir:
+        with open(dump_ragged_tuner_cache_dir, "wb") as data:
+            # @lint-ignore PYTHONPICKLEISBAD
+            pickle.dump(jagged_dense_broadcast_add_kernel.cache, data)
+
+    if return_result:
+        return configs, df
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py
new file mode 100644
index 000000000..95c43853f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py
@@ -0,0 +1,125 @@
+# pyre-strict
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from hammer.ops.jagged import concat_1D_jagged_jagged
+
+# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:concat_1d_jagged_jagged_bench
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+@click.command()
+@click.option("--data-type", type=str, default="float32")
+@click.option("--batch-size", type=int, default=512)
+@click.option("--max-seq-len-log2", type=int, default=20)
+@click.option("--seq-sparsity", type=float, default=0.8)
+def main(
+    data_type: str,
+    batch_size: int,
+    max_seq_len_log2: int,
+    seq_sparsity: float,
+) -> None:
+    if data_type == "float32":
+        dtype = torch.float32
+    elif data_type == "float16":
+        dtype = torch.float16
+    elif data_type == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["max_seq_len"],
+            x_vals=[2**i for i in range(6, max_seq_len_log2)],
+            line_arg="method",
+            line_vals=[
+                "custom_cuda",
+                "hammer_pytorch",
+            ],
+            line_names=["Custom CUDA", "Hammer PyTorch"],
+            styles=[("green", "-"), ("orange", "--")],
+            ylabel="ms",
+            plot_name=f"concat_1d_jagged_jagged_batch{batch_size}_sparsity{seq_sparsity}_{data_type}",
+            args={
+                "dtype": dtype,
+                "batch_size": batch_size,
+                "seq_sparsity": seq_sparsity,
+            },
+        )
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_concat_1d_jagged_jagged(
+        max_seq_len: int,
+        batch_size: int,
+        method: str,
+        dtype: torch.dtype,
+        seq_sparsity: float,
+    ) -> float:
+        warmup = 50
+        rep = 500
+        torch.manual_seed(1001)
+
+        lengths_left = torch.randint(
+            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
+        )
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = torch.randn(total_left, dtype=dtype)
+        values_right = torch.randn(total_right, dtype=dtype)
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+        max_seq_len_left = int(lengths_left.max().item())
+        max_seq_len_right = int(lengths_right.max().item())
+
+        lengths_left = lengths_left.cuda()
+        lengths_right = lengths_right.cuda()
+        values_left = values_left.cuda()
+        values_right = values_right.cuda()
+        offsets_left = offsets_left.cuda()
+        offsets_right = offsets_right.cuda()
+
+        if method == "custom_cuda":
+            fn = lambda: torch.ops.hstu.concat_1d_jagged_jagged(  # noqa E731
+                lengths_left, values_left, lengths_right, values_right
+            )
+        elif method == "hammer_pytorch":
+            fn = lambda: concat_1D_jagged_jagged(  # noqa E731
+                max_seq_len_left,
+                offsets_left,
+                values_left,
+                max_seq_len_right,
+                offsets_right,
+                values_right,
+            )
+        else:
+            raise ValueError(f"unknown method: {method}")
+
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    bench_concat_1d_jagged_jagged.run(print_data=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py
new file mode 100644
index 000000000..7806d6970
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py
@@ -0,0 +1,117 @@
+# pyre-strict
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from hammer.ops.jagged import jagged_transpose_1D
+
+# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:jagged_transpose_1d_bench
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+@click.command()
+@click.option("--data-type", type=str, default="float32")
+@click.option("--size1", type=int, default=32)
+@click.option("--size2", type=int, default=16)
+@click.option("--max-len-log2", type=int, default=19)
+@click.option("--seq-sparsity", type=float, default=0.8)
+def main(
+    data_type: str,
+    size1: int,
+    size2: int,
+    max_len_log2: int,
+    seq_sparsity: float,
+) -> None:
+    if data_type == "float32":
+        dtype = torch.float32
+    elif data_type == "float16":
+        dtype = torch.float16
+    elif data_type == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["max_len"],
+            x_vals=[2**i for i in range(4, max_len_log2)],
+            line_arg="method",
+            line_vals=[
+                "custom_cuda",
+                "hammer_pytorch",
+            ],
+            line_names=["Custom CUDA", "Hammer PyTorch"],
+            styles=[("green", "-"), ("orange", "--")],
+            ylabel="ms",
+            plot_name=f"jagged_transpose_1d_size1_{size1}_size2_{size2}_sparsity{seq_sparsity}_{data_type}",
+            args={
+                "dtype": dtype,
+                "size1": size1,
+                "size2": size2,
+                "seq_sparsity": seq_sparsity,
+            },
+        )
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_jagged_transpose_1d(
+        max_len: int,
+        size1: int,
+        size2: int,
+        method: str,
+        dtype: torch.dtype,
+        seq_sparsity: float,
+    ) -> float:
+        warmup = 50
+        rep = 500
+        torch.manual_seed(1001)
+
+        lengths = torch.randint(
+            1, int(max_len * seq_sparsity) + 1, (size1 * size2,), dtype=torch.int32
+        )
+        offsets = torch.zeros(
+            (size1 * size2 + 1,), dtype=lengths.dtype, device=lengths.device
+        )
+        offsets[1:] = torch.cumsum(lengths.view(-1), dim=0)
+
+        values = torch.randn(int(offsets[-1].item()), dtype=dtype)
+
+        lengths = lengths.cuda()
+        offsets = offsets.cuda()
+        values = values.cuda()
+
+        if method == "custom_cuda":
+            fn = lambda: torch.ops.hstu.jagged_transpose_1d(  # noqa E731
+                values=values,
+                offsets=offsets,
+                lengths=lengths,
+                max_len=max_len,
+                size1=size1,
+                size2=size2,
+            )
+        elif method == "hammer_pytorch":
+            fn = lambda: jagged_transpose_1D(  # noqa E731
+                values=values,
+                offsets=offsets,
+                lengths=lengths,
+                max_len=max_len,
+                size1=size1,
+                size2=size2,
+            )
+        else:
+            raise ValueError(f"unknown method: {method}")
+
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    bench_jagged_transpose_1d.run(print_data=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py
new file mode 100644
index 000000000..a3f2483fa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py
@@ -0,0 +1,150 @@
+# pyre-strict
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from hammer.ops.jagged import replace_last_n_with_jagged
+from hammer.utils import HammerKernel
+
+# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:replace_last_n_with_jagged_bench
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+@click.command()
+@click.option("--data-type", type=str, default="float32")
+@click.option("--batch-size", type=int, default=512)
+@click.option("--embedding-dim", type=int, default=64)
+@click.option("--max-seq-len-log2", type=int, default=16)
+@click.option("--seq-sparsity", type=float, default=0.8)
+def main(
+    data_type: str,
+    batch_size: int,
+    embedding_dim: int,
+    max_seq_len_log2: int,
+    seq_sparsity: float,
+) -> None:
+    if data_type == "float32":
+        dtype = torch.float32
+    elif data_type == "float16":
+        dtype = torch.float16
+    elif data_type == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["max_seq_len"],
+            x_vals=[2**i for i in range(6, max_seq_len_log2)],
+            line_arg="method",
+            line_vals=[
+                "custom_cuda",
+                "hammer_pytorch",
+                "hammer_triton",
+            ],
+            line_names=[
+                "Custom CUDA",
+                "Hammer PyTorch",
+                "Hammer Triton",
+            ],
+            styles=[
+                ("green", "-"),
+                ("orange", "--"),
+                ("purple", "-."),
+            ],
+            ylabel="ms",
+            plot_name=f"replace_last_n_with_jagged_batch{batch_size}_dim{embedding_dim}_sparsity{seq_sparsity}_{data_type}",
+            args={
+                "dtype": dtype,
+                "batch_size": batch_size,
+                "embedding_dim": embedding_dim,
+                "seq_sparsity": seq_sparsity,
+            },
+        )
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_replace_last_n_with_jagged(
+        max_seq_len: int,
+        batch_size: int,
+        method: str,
+        dtype: torch.dtype,
+        embedding_dim: int,
+        seq_sparsity: float,
+    ) -> float:
+        warmup = 50
+        rep = 500
+        torch.manual_seed(1001)
+
+        min_left_len = max(1, int(max_seq_len * seq_sparsity * 0.3))
+        max_left_len = int(max_seq_len * seq_sparsity)
+
+        lengths_left = torch.randint(
+            min_left_len, max_left_len + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, min_left_len + 1, (batch_size,), dtype=torch.int32
+        )
+
+        lengths_right = torch.min(lengths_right, lengths_left)
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = torch.randn(total_left, embedding_dim, dtype=dtype)
+        values_right = torch.randn(total_right, embedding_dim, dtype=dtype)
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        lengths_left = lengths_left.cuda()
+        lengths_right = lengths_right.cuda()
+        values_left = values_left.cuda()
+        values_right = values_right.cuda()
+        offsets_left = offsets_left.cuda()
+        offsets_right = offsets_right.cuda()
+
+        if method == "custom_cuda":
+            fn = lambda: torch.ops.hstu.replace_last_n_with_jagged(  # noqa E731
+                lengths_left, values_left, lengths_right, values_right
+            )
+        elif method == "hammer_pytorch":
+            fn = lambda: replace_last_n_with_jagged(  # noqa E731
+                max_seq_len_left=max_seq_len,
+                offsets_left=offsets_left,
+                values_left=values_left,
+                offsets_right=offsets_right,
+                values_right=values_right,
+            )
+        elif method == "hammer_triton":
+            fn = lambda: replace_last_n_with_jagged(  # noqa E731
+                max_seq_len_left=max_seq_len,
+                offsets_left=offsets_left,
+                values_left=values_left,
+                offsets_right=offsets_right,
+                values_right=values_right,
+                kernel=HammerKernel.TRITON,
+            )
+        else:
+            raise ValueError(f"unknown method: {method}")
+
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    bench_replace_last_n_with_jagged.run(print_data=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py
new file mode 100644
index 000000000..4aaa9d77c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py
@@ -0,0 +1,116 @@
+# pyre-strict
+from typing import List
+
+import click
+import torch
+
+# @manual=//triton:triton
+import triton
+from hammer.ops.jagged import split_1D_jagged_jagged
+
+# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:split_1d_jagged_jagged_bench
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+@click.command()
+@click.option("--data-type", type=str, default="float32")
+@click.option("--batch-size", type=int, default=512)
+@click.option("--max-seq-len-log2", type=int, default=20)
+@click.option("--seq-sparsity", type=float, default=0.8)
+def main(
+    data_type: str,
+    batch_size: int,
+    max_seq_len_log2: int,
+    seq_sparsity: float,
+) -> None:
+    if data_type == "float32":
+        dtype = torch.float32
+    elif data_type == "float16":
+        dtype = torch.float16
+    elif data_type == "bfloat16":
+        dtype = torch.bfloat16
+    else:
+        raise ValueError(f"Unsupported data type: {data_type}.")
+
+    configs: List[triton.testing.Benchmark] = [
+        triton.testing.Benchmark(
+            x_names=["max_seq_len"],
+            x_vals=[2**i for i in range(6, max_seq_len_log2)],
+            line_arg="method",
+            line_vals=[
+                "custom_cuda",
+                "hammer_pytorch",
+            ],
+            line_names=["Custom CUDA", "Hammer PyTorch"],
+            styles=[("green", "-"), ("orange", "--")],
+            ylabel="ms",
+            plot_name=f"split_1d_jagged_jagged_batch{batch_size}_sparsity{seq_sparsity}_{data_type}",
+            args={
+                "dtype": dtype,
+                "batch_size": batch_size,
+                "seq_sparsity": seq_sparsity,
+            },
+        )
+    ]
+
+    @triton.testing.perf_report(configs)
+    def bench_split_1d_jagged_jagged(
+        max_seq_len: int,
+        batch_size: int,
+        method: str,
+        dtype: torch.dtype,
+        seq_sparsity: float,
+    ) -> float:
+        warmup = 50
+        rep = 500
+        torch.manual_seed(1001)
+
+        lengths_left = torch.randint(
+            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
+        )
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        combined_offsets = offsets_left + offsets_right
+        combined_values = torch.randn(int(combined_offsets[-1].item()), dtype=dtype)
+
+        max_seq_len_combined = int((lengths_left + lengths_right).max().item())
+
+        lengths_left = lengths_left.cuda()
+        lengths_right = lengths_right.cuda()
+        combined_values = combined_values.cuda()
+        offsets_left = offsets_left.cuda()
+        offsets_right = offsets_right.cuda()
+
+        if method == "custom_cuda":
+            fn = lambda: torch.ops.hstu.split_1d_jagged_jagged(  # noqa E731
+                lengths_left, lengths_right, combined_values
+            )
+        elif method == "hammer_pytorch":
+            fn = lambda: split_1D_jagged_jagged(  # noqa E731
+                max_seq_len_combined, combined_values, offsets_left, offsets_right
+            )
+        else:
+            raise ValueError(f"unknown method: {method}")
+
+        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
+        return ms
+
+    bench_split_1d_jagged_jagged.run(print_data=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/common.h b/recommendation_v4/generative_recommenders/ops/cpp/common.h
new file mode 100644
index 000000000..1c4b43768
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/common.h
@@ -0,0 +1,60 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#define AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
+  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)               \
+  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                 \
+  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
+
+#define AT_DISPATCH_FLOATING_TYPES_AND4(                                 \
+    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
+  AT_DISPATCH_SWITCH(                                                    \
+      TYPE,                                                              \
+      NAME,                                                              \
+      AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                              \
+          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
+
+inline __attribute__((always_inline)) uint32_t
+div_round_up(uint32_t a, uint32_t b) {
+  return (a + b - 1) / b;
+};
+
+inline __attribute__((always_inline)) uint32_t next_power_of_2(uint32_t n) {
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  n++;
+  return n;
+}
+
+/*
+ * Because different .SO may include the same CUDA CUB kernels, this results in
+ * confusion, where libA may end up calling libB's cub kernel and causing
+ * failures when we static link libcudart_static.a. To avoid this, we annotate
+ * only the public functions and hide the rest.
+ */
+#define DLL_PUBLIC __attribute__((visibility("default")))
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp
new file mode 100644
index 000000000..4ebd426d7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp
@@ -0,0 +1,44 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+at::Tensor complete_cumsum_cpu(const at::Tensor& values) {
+  TORCH_CHECK(values.dim() == 1);
+  auto len = values.size(0);
+  const torch::Tensor index = at::range(0, len, at::kLong).cpu();
+  auto output = fbgemm_gpu::asynchronous_complete_cumsum_cpu(values);
+  return output;
+}
+
+at::Tensor complete_cumsum_meta(const at::Tensor& values) {
+  auto len = values.sym_size(0);
+  auto output = at::native::empty_meta_symint(
+      {len + 1},
+      /*dtype=*/::std::make_optional(values.scalar_type()),
+      /*layout=*/::std::make_optional(values.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+  return output;
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu
new file mode 100644
index 000000000..06d1abdd6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu
@@ -0,0 +1,51 @@
+#include "common.h"
+
+#include <cub/device/device_scan.cuh>
+
+namespace hstu {
+
+DLL_PUBLIC at::Tensor complete_cumsum_cuda(const at::Tensor& values) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values.get_device());
+
+  TORCH_CHECK(values.numel() < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(values.dim() == 1);
+  const auto values_contig = values.contiguous();
+
+  auto cumsum = at::empty({values_contig.numel() + 1}, values_contig.options());
+  cumsum[0].zero_();
+
+  AT_DISPATCH_FLOATING_TYPES_AND4(
+      at::ScalarType::Int,
+      at::ScalarType::Long,
+      at::ScalarType::Half,
+      at::ScalarType::BFloat16,
+      values_contig.scalar_type(),
+      "complete_cumsum_cuda",
+      [&] {
+        size_t temp_storage_bytes = 0;
+        AT_CUDA_CHECK(
+            cub::DeviceScan::InclusiveSum(
+                nullptr,
+                temp_storage_bytes,
+                values_contig.data_ptr<scalar_t>(),
+                cumsum.data_ptr<scalar_t>() + 1,
+                values_contig.numel(),
+                at::cuda::getCurrentCUDAStream()));
+        auto temp_storage = at::empty(
+            {static_cast<int64_t>(temp_storage_bytes)},
+            values_contig.options().dtype(at::kByte));
+        AT_CUDA_CHECK(
+            cub::DeviceScan::InclusiveSum(
+                temp_storage.data_ptr(),
+                temp_storage_bytes,
+                values_contig.data_ptr<scalar_t>(),
+                cumsum.data_ptr<scalar_t>() + 1,
+                values_contig.numel(),
+                at::cuda::getCurrentCUDAStream()));
+      });
+
+  return cumsum;
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp
new file mode 100644
index 000000000..51b313443
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp
@@ -0,0 +1,111 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void _concat_1d_jagged_jagged_cpu_kernel(
+    int32_t B,
+    const at::TensorAccessor<index_t, 1>& offsets_left,
+    const at::TensorAccessor<val_t, 1>& values_left,
+    const at::TensorAccessor<index_t, 1>& offsets_right,
+    const at::TensorAccessor<val_t, 1>& values_right,
+    at::TensorAccessor<val_t, 1> combined_values) {
+  for (auto b : c10::irange(B)) {
+    auto left_start = offsets_left[b];
+    auto left_len = offsets_left[b + 1] - left_start;
+    auto right_start = offsets_right[b];
+    auto right_len = offsets_right[b + 1] - right_start;
+    auto combined_start = left_start + right_start;
+    for (auto i = 0; i < left_len; ++i) {
+      combined_values[combined_start + i] = values_left[left_start + i];
+    }
+    for (auto i = 0; i < right_len; ++i) {
+      combined_values[left_len + combined_start + i] =
+          values_right[right_start + i];
+    }
+  }
+}
+
+at::Tensor concat_1d_jagged_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CPU);
+  auto L = values_left.numel() + values_right.numel();
+  TORCH_CHECK(L < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+  auto B = lengths_left.size(0);
+  auto combined_values = at::empty({L}, values_left.options());
+  if (L == 0) {
+    return combined_values;
+  }
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_right.view({-1}));
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "concat_1d_jagged_jagged_values_cpu_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values_left.scalar_type(),
+            "concat_1d_jagged_jagged_values_cpu_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _concat_1d_jagged_jagged_cpu_kernel<index_t, val_t>(
+                  B,
+                  offsets_left.accessor<index_t, 1>(),
+                  values_left.accessor<val_t, 1>(),
+                  offsets_right.accessor<index_t, 1>(),
+                  values_right.accessor<val_t, 1>(),
+                  combined_values.accessor<val_t, 1>());
+            });
+      });
+  return combined_values;
+}
+
+at::Tensor concat_1d_jagged_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  auto L = values_left.numel() + values_right.numel();
+  return at::native::empty_meta_symint(
+      {L},
+      /*dtype=*/::std::make_optional(values_left.scalar_type()),
+      /*layout=*/::std::make_optional(values_left.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu
new file mode 100644
index 000000000..8eeae9d59
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu
@@ -0,0 +1,130 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
+
+namespace hstu {
+
+static constexpr int32_t kMaxThreads = 1024;
+
+template <typename index_t, typename val_t>
+__global__
+__launch_bounds__(kMaxThreads) void _concat_1d_jagged_jagged_cuda_kernel(
+    int32_t B,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_left,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
+        values_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_right,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
+        values_right,
+    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
+        combined_values) {
+  for (auto b = blockIdx.x * blockDim.y + threadIdx.y;
+       b < static_cast<uint32_t>(B);
+       b += gridDim.x * blockDim.y) {
+    auto left_start = offsets_left[b];
+    auto left_len = offsets_left[b + 1] - left_start;
+    auto right_start = offsets_right[b];
+    auto right_len = offsets_right[b + 1] - right_start;
+    auto combined_start = left_start + right_start;
+    for (auto i = threadIdx.x; i < static_cast<uint32_t>(left_len + right_len);
+         i += blockDim.x) {
+      if (i < static_cast<uint32_t>(left_len)) {
+        combined_values[combined_start + i] = values_left[left_start + i];
+      } else {
+        combined_values[combined_start + i] =
+            values_right[right_start + i - left_len];
+      }
+    }
+  }
+}
+
+at::Tensor concat_1d_jagged_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values_left.get_device());
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CUDA);
+  auto L = values_left.numel() + values_right.numel();
+  TORCH_CHECK(L < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(values_left.get_device() == lengths_left.get_device());
+  TORCH_CHECK(values_left.get_device() == lengths_right.get_device());
+  TORCH_CHECK(values_left.get_device() == values_right.get_device());
+  auto B = lengths_left.size(0);
+  auto combined_values = at::empty({L}, values_left.options());
+  if (L == 0) {
+    return combined_values;
+  }
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_right.view({-1}));
+  // Optimized thread block configuration based on benchmark results
+  uint32_t B_blocks = 4;
+  dim3 threads(256, B_blocks);
+  auto blocks = div_round_up(B, B_blocks);
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "concat_1d_jagged_jagged_values_cuda_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values_left.scalar_type(),
+            "concat_1d_jagged_jagged_values_cuda_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _concat_1d_jagged_jagged_cuda_kernel<index_t, val_t>
+                  <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
+                      B,
+                      offsets_left.packed_accessor32<
+                          index_t,
+                          1,
+                          at::RestrictPtrTraits>(),
+                      values_left
+                          .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                      offsets_right.packed_accessor32<
+                          index_t,
+                          1,
+                          at::RestrictPtrTraits>(),
+                      values_right
+                          .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                      combined_values.packed_accessor32<
+                          val_t,
+                          1,
+                          at::RestrictPtrTraits>());
+            });
+      });
+  return combined_values;
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp b/recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp
new file mode 100644
index 000000000..155cc7572
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp
@@ -0,0 +1,207 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+/*
+ * Because different .SO may include the same CUDA CUB kernels, this results in
+ * confusion, where libA may end up calling libB's cub kernel and causing
+ * failures when we static link libcudart_static.a. To avoid this, we annotate
+ * only the public functions and hide the rest.
+ */
+#define DLL_PUBLIC __attribute__((visibility("default")))
+
+namespace hstu {
+at::Tensor expand_1d_jagged_to_dense_cpu(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const int64_t max_len);
+
+at::Tensor expand_1d_jagged_to_dense_meta(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const c10::SymInt max_len);
+
+at::Tensor expand_1d_jagged_to_dense_cuda(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const int64_t max_len);
+
+at::Tensor complete_cumsum_cpu(const at::Tensor& values);
+
+at::Tensor complete_cumsum_cuda(const at::Tensor& values);
+
+at::Tensor complete_cumsum_meta(const at::Tensor& values);
+
+at::Tensor concat_1d_jagged_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+at::Tensor concat_1d_jagged_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+at::Tensor concat_1d_jagged_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values);
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values);
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values);
+
+at::Tensor replace_last_n_with_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+at::Tensor replace_last_n_with_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+at::Tensor replace_last_n_with_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right);
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cpu(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2);
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cuda(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2);
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_meta(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2);
+
+DLL_PUBLIC std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_meta(
+    const at::Tensor& keys,
+    const at::Tensor& values,
+    const std::optional<int64_t>& end_bit,
+    const bool descending = false) {
+  TORCH_CHECK(
+      keys.dtype() == at::kInt || keys.dtype() == at::kLong ||
+      keys.dtype() == at::kByte || keys.dtype() == at::kShort);
+  TORCH_CHECK(keys.dim() == 1);
+  TORCH_CHECK(values.dim() == 1);
+  return {at::empty_like(keys), at::empty_like(values)};
+}
+
+std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_cuda(
+    const at::Tensor& keys,
+    const at::Tensor& values,
+    const std::optional<int64_t>& end_bit,
+    const bool descending = false);
+
+} // namespace hstu
+
+TORCH_LIBRARY_FRAGMENT(hstu, m) {
+  m.def(
+      "expand_1d_jagged_to_dense(Tensor values, Tensor offsets, SymInt max_len) -> Tensor");
+  m.def(
+      "concat_1d_jagged_jagged(Tensor lengths_left, Tensor values_left, Tensor lengths_right, Tensor values_right) -> Tensor");
+  m.def(
+      "split_1d_jagged_jagged(Tensor lengths_left, Tensor lengths_right, Tensor combined_values) -> (Tensor, Tensor)");
+  m.def(
+      "replace_last_n_with_jagged(Tensor lengths_left, Tensor values_left, Tensor lengths_right, Tensor values_right) -> Tensor");
+  m.def(
+      "jagged_transpose_1d(Tensor values, Tensor offsets, Tensor lengths, int max_len, int size1, int size2) -> (Tensor, Tensor, Tensor)");
+  m.def("complete_cumsum(Tensor values) -> Tensor");
+  m.def(
+      "sort_kv_pairs(Tensor keys, Tensor values, int? end_bit=None, bool descending=False) -> (Tensor, Tensor)");
+}
+
+TORCH_LIBRARY_IMPL(hstu, CPU, m) {
+  m.impl("expand_1d_jagged_to_dense", hstu::expand_1d_jagged_to_dense_cpu);
+  m.impl("concat_1d_jagged_jagged", hstu::concat_1d_jagged_jagged_cpu);
+  m.impl("split_1d_jagged_jagged", hstu::split_1d_jagged_jagged_cpu);
+  m.impl("replace_last_n_with_jagged", hstu::replace_last_n_with_jagged_cpu);
+  m.impl("jagged_transpose_1d", hstu::jagged_transpose_1d_cpu);
+  m.impl("complete_cumsum", hstu::complete_cumsum_cpu);
+}
+
+TORCH_LIBRARY_IMPL(hstu, CUDA, m) {
+  m.impl("expand_1d_jagged_to_dense", hstu::expand_1d_jagged_to_dense_cuda);
+  m.impl("concat_1d_jagged_jagged", hstu::concat_1d_jagged_jagged_cuda);
+  m.impl("split_1d_jagged_jagged", hstu::split_1d_jagged_jagged_cuda);
+  m.impl("replace_last_n_with_jagged", hstu::replace_last_n_with_jagged_cuda);
+  m.impl("jagged_transpose_1d", hstu::jagged_transpose_1d_cuda);
+  m.impl("complete_cumsum", hstu::complete_cumsum_cuda);
+  m.impl(
+      "sort_kv_pairs",
+      torch::dispatch(
+          c10::DispatchKey::CUDA, TORCH_FN(hstu::sort_kv_pairs_cuda)));
+}
+
+TORCH_LIBRARY_IMPL(hstu, Meta, m) {
+  m.impl("expand_1d_jagged_to_dense", hstu::expand_1d_jagged_to_dense_meta);
+  m.impl("concat_1d_jagged_jagged", hstu::concat_1d_jagged_jagged_meta);
+  m.impl("split_1d_jagged_jagged", hstu::split_1d_jagged_jagged_meta);
+  m.impl("replace_last_n_with_jagged", hstu::replace_last_n_with_jagged_meta);
+  m.impl("jagged_transpose_1d", hstu::jagged_transpose_1d_meta);
+  m.impl("complete_cumsum", hstu::complete_cumsum_meta);
+  m.impl(
+      "sort_kv_pairs",
+      torch::dispatch(
+          c10::DispatchKey::Meta, TORCH_FN(hstu::sort_kv_pairs_meta)));
+}
+
+TORCH_LIBRARY_IMPL(hstu, Autograd, m) {
+  m.impl(
+      "expand_1d_jagged_to_dense",
+      torch::autograd::autogradNotImplementedFallback());
+  m.impl("complete_cumsum", torch::autograd::autogradNotImplementedFallback());
+}
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py
new file mode 100644
index 000000000..0f9458c8b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py
@@ -0,0 +1,193 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.ops.utils import is_sm100_plus
+
+try:
+    # We need to import the CUDA kernels after importing torch
+    import hstu._C  # pyre-ignore [21]
+except:
+    pass
+try:
+    torch.ops.load_library(
+        "//generative_recommenders/fb/ultra/ops/blackwell/hstu_attention:hstu_flash_attention"
+    )
+    torch.ops.load_library(
+        "//generative_recommenders/ops/cpp/hstu_attention:hstu_flash_attention"
+    )
+except:
+    pass
+
+
+def cuda_hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: Optional[torch.Tensor] = None,
+    causal: bool = False,
+    num_targets: Optional[torch.Tensor] = None,
+    attn_scale: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+    contextual_seq_len: int = 0,
+    q_descale: Optional[torch.Tensor] = None,
+    k_descale: Optional[torch.Tensor] = None,
+    v_descale: Optional[torch.Tensor] = None,
+    sort_by_length: bool = False,
+    deterministic: bool = False,
+    sm_margin: int = 0,
+    max_q_len: int = 0,
+    seq_offsets_q: Optional[torch.Tensor] = None,
+    num_softmax_heads: int = 0,
+    training: bool = True,
+    max_seq_len_tensor: Optional[torch.Tensor] = None,
+    contextual_seq_len_tensor: Optional[torch.Tensor] = None,
+    max_attn_len_tensor: Optional[torch.Tensor] = None,
+    min_full_attn_seq_len_tensor: Optional[torch.Tensor] = None,
+    num_groups: int = 1,
+    is_inference: bool = False,
+) -> torch.Tensor:
+    """
+    Arguments:
+        q, k, v: (batch_size, seqlen, nheads, headdim) or (total_seqlen, nheads, headdim)
+    """
+    if is_sm100_plus() and not is_inference:
+        return torch.ops.bw_hstu.bw_hstu_mha(
+            max_seq_len,
+            alpha,
+            q,
+            k,
+            v,
+            seq_offsets,
+            causal,
+            num_targets,
+            attn_scale,
+            max_attn_len,
+            min_full_attn_seq_len,
+            contextual_seq_len,
+            q_descale,
+            k_descale,
+            v_descale,
+            sort_by_length,
+            deterministic,
+            sm_margin,
+            max_q_len,
+            seq_offsets_q,
+            max_seq_len_tensor,
+            contextual_seq_len_tensor,
+            max_attn_len_tensor,
+            min_full_attn_seq_len_tensor,
+            num_groups,
+            num_softmax_heads,
+        )
+    else:
+        return cuda_hstu_mha_inference_wrapper(
+            max_seq_len,
+            alpha,
+            q,
+            k,
+            v,
+            seq_offsets,
+            causal,
+            num_targets,
+            attn_scale,
+            max_attn_len,
+            min_full_attn_seq_len,
+            contextual_seq_len,
+            q_descale,
+            k_descale,
+            v_descale,
+            sort_by_length,
+            deterministic,
+            sm_margin,
+            max_q_len,
+            seq_offsets_q,
+            num_softmax_heads,
+            training,
+            max_seq_len_tensor,
+            contextual_seq_len_tensor,
+            max_attn_len_tensor,
+            min_full_attn_seq_len_tensor,
+            num_groups,
+        )
+
+
+@torch.fx.wrap
+def cuda_hstu_mha_inference_wrapper(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: Optional[torch.Tensor] = None,
+    causal: bool = False,
+    num_targets: Optional[torch.Tensor] = None,
+    attn_scale: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+    contextual_seq_len: int = 0,
+    q_descale: Optional[torch.Tensor] = None,
+    k_descale: Optional[torch.Tensor] = None,
+    v_descale: Optional[torch.Tensor] = None,
+    sort_by_length: bool = False,
+    deterministic: bool = False,
+    sm_margin: int = 0,
+    max_q_len: int = 0,
+    seq_offsets_q: Optional[torch.Tensor] = None,
+    num_softmax_heads: int = 0,
+    training: bool = True,
+    max_seq_len_tensor: Optional[torch.Tensor] = None,
+    contextual_seq_len_tensor: Optional[torch.Tensor] = None,
+    max_attn_len_tensor: Optional[torch.Tensor] = None,
+    min_full_attn_seq_len_tensor: Optional[torch.Tensor] = None,
+    num_groups: int = 1,
+) -> torch.Tensor:
+    attn_scale = attn_scale.to(torch.float32) if attn_scale is not None else attn_scale
+
+    return torch.ops.hstu.hstu_mha(
+        max_seq_len,
+        alpha,
+        q,
+        k,
+        v,
+        seq_offsets,
+        causal,
+        num_targets,
+        attn_scale,
+        max_attn_len,
+        min_full_attn_seq_len,
+        contextual_seq_len,
+        q_descale,
+        k_descale,
+        v_descale,
+        sort_by_length,
+        deterministic,
+        sm_margin,
+        max_q_len,
+        seq_offsets_q,
+        num_softmax_heads,
+        training,
+        max_seq_len_tensor,
+        contextual_seq_len_tensor,
+        max_attn_len_tensor,
+        min_full_attn_seq_len_tensor,
+        num_groups,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py
new file mode 100644
index 000000000..2184ef2a5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py
@@ -0,0 +1,668 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.ops.triton.triton_addmm import (
+    maybe_triton_addmm_fwd,
+    triton_addmm_bwd,
+)
+from generative_recommenders.ops.triton.triton_layer_norm import (
+    triton_weighted_layer_norm_bwd,
+)
+from generative_recommenders.ops.utils import copy_if_different_ptr, is_sm100_plus
+from torch.nn import functional as F
+
+try:
+    from generative_recommenders.fb.ultra.ops.fp8.fp8_addmm import (
+        fp8_rowwise_quantize_addmm,
+    )
+    from generative_recommenders.fb.ultra.ops.fp8.layer_norm_quantization import (
+        triton_weighted_layer_norm_quantization_fwd,
+    )
+    from hammer.ops.triton.triton_apply_rope import (
+        triton_apply_rope_bwd,
+        triton_apply_rope_fwd,
+    )
+
+    if is_sm100_plus():
+        print("is sm100_plus architecture, loading hstu flash attention for blackwell")
+        torch.ops.load_library(
+            "//generative_recommenders/fb/ultra/ops/blackwell/hstu_attention:hstu_flash_attention"
+        )
+    print("loading hstu flash attention for general architecture")
+    torch.ops.load_library(
+        "//generative_recommenders/ops/cpp/hstu_attention:hstu_flash_attention"
+    )
+except Exception as ex:
+    print(f"Library importing error when importing library: {ex}")
+
+
+class _HSTUPreprocessAndAttentionFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore [14]
+    def forward(
+        ctx,  # pyre-ignore [2]
+        x: torch.Tensor,
+        norm_weight: torch.Tensor,
+        norm_bias: torch.Tensor,
+        norm_eps: float,
+        num_heads: int,
+        attn_dim: int,
+        hidden_dim: int,
+        uvqk_weight: torch.Tensor,
+        uvqk_bias: Optional[torch.Tensor],
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        alpha: float,
+        invalid_attn_mask_type: str,
+        num_targets: Optional[torch.Tensor],
+        rotary_weights: Optional[
+            Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+        ] = None,
+        attn_scale: Optional[torch.Tensor] = None,
+        recompute_uvqk_in_backward: bool = False,
+        recompute_normed_x_in_backward: bool = False,
+        contextual_seq_len: int = 0,
+        sort_by_length: bool = False,
+        max_attn_len: Optional[int] = None,
+        full_attn_size: Optional[int] = None,
+        silu_u: bool = True,
+        fp8_in_addmm_fwd: bool = False,
+        num_softmax_heads: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        max_attn_len = max_attn_len or 0
+        full_attn_size = full_attn_size or 0
+        normed_x, x_mean, x_rstd, BLOCK_D, x_scale, normed_x_fp8 = (
+            triton_weighted_layer_norm_quantization_fwd(
+                x=x,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                quantize_output=fp8_in_addmm_fwd,
+            )
+        )
+        # When silu_u is False and we want to recompute in backward, we split the weight
+        # for u and vqk separately during training to compute them independently.
+        # This avoids needing to clone u (which would otherwise keep the whole uvqk alive).
+        if not silu_u and recompute_uvqk_in_backward:
+            # Split the weights/biases to compute u and vqk separately
+            u_weight, vqk_weight = uvqk_weight.split(
+                [
+                    hidden_dim * num_heads,
+                    hidden_dim * num_heads
+                    + attn_dim * num_heads
+                    + attn_dim * num_heads,
+                ],
+                dim=1,
+            )
+            if uvqk_bias is not None:
+                u_bias, vqk_bias = uvqk_bias.split(
+                    [
+                        hidden_dim * num_heads,
+                        hidden_dim * num_heads
+                        + attn_dim * num_heads
+                        + attn_dim * num_heads,
+                    ],
+                    dim=0,
+                )
+            else:
+                u_bias, vqk_bias = None, None
+            if fp8_in_addmm_fwd:
+                assert x_scale is not None and normed_x_fp8 is not None
+                u = fp8_rowwise_quantize_addmm(
+                    x=normed_x,
+                    x_fp8=normed_x_fp8,
+                    w=u_weight,
+                    y=u_bias,
+                    x_scale=x_scale,
+                    custom_kernel=False,
+                    is_inference=False,
+                ).contiguous()
+                vqk = fp8_rowwise_quantize_addmm(
+                    x=normed_x,
+                    x_fp8=normed_x_fp8,
+                    w=vqk_weight,
+                    y=vqk_bias,
+                    x_scale=x_scale,
+                    custom_kernel=False,
+                    is_inference=False,
+                ).contiguous()
+            else:
+                u = maybe_triton_addmm_fwd(normed_x, u_weight, u_bias).contiguous()
+                vqk = maybe_triton_addmm_fwd(
+                    normed_x, vqk_weight, vqk_bias
+                ).contiguous()
+            v, q, k = vqk.split(
+                [
+                    hidden_dim * num_heads,
+                    attn_dim * num_heads,
+                    attn_dim * num_heads,
+                ],
+                dim=1,
+            )
+            # uvqk is not used since we split the computation, but we need it
+            # for saving in case recompute_uvqk_in_backward is False in a
+            # different code path. Set to None to satisfy type checker.
+            uvqk = None
+        else:
+            if fp8_in_addmm_fwd:
+                assert (
+                    x_scale is not None
+                    and normed_x_fp8 is not None
+                    and uvqk_bias is not None
+                )
+                uvqk = fp8_rowwise_quantize_addmm(
+                    x=normed_x,
+                    x_fp8=normed_x_fp8,
+                    w=uvqk_weight,
+                    y=uvqk_bias,
+                    x_scale=x_scale,
+                    custom_kernel=False,
+                    is_inference=False,
+                ).contiguous()
+            else:
+                uvqk = maybe_triton_addmm_fwd(
+                    normed_x, uvqk_weight, uvqk_bias
+                ).contiguous()
+            u, v, q, k = uvqk.split(
+                [
+                    hidden_dim * num_heads,
+                    hidden_dim * num_heads,
+                    attn_dim * num_heads,
+                    attn_dim * num_heads,
+                ],
+                dim=1,
+            )
+            if silu_u:
+                u = F.silu(u)
+        if rotary_weights is not None:
+            q_cos_weights = rotary_weights[0]
+            q_sin_weights = rotary_weights[1]
+            k_cos_weights = rotary_weights[2]
+            k_sin_weights = rotary_weights[3]
+            _q = triton_apply_rope_fwd(
+                x=q.view(-1, num_heads, attn_dim),
+                N=max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=q_cos_weights,
+                sin_rope=q_sin_weights,
+            ).view(-1, num_heads * attn_dim)
+            _k = triton_apply_rope_fwd(
+                x=k.view(-1, num_heads, attn_dim),
+                N=max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=k_cos_weights,
+                sin_rope=k_sin_weights,
+            ).view(-1, num_heads * attn_dim)
+            copy_if_different_ptr(q, _q)
+            copy_if_different_ptr(k, _k)
+        q = q.view(-1, num_heads, attn_dim)
+        k = k.view(-1, num_heads, attn_dim)
+        v = v.view(-1, num_heads, hidden_dim)
+        if is_sm100_plus():
+            out, softmax_lse = torch.ops.bw_hstu.bw_hstu_mha_fwd(
+                max_seq_len,
+                alpha,
+                q,
+                k,
+                v,
+                seq_offsets,
+                True,  # causal
+                num_targets,
+                attn_scale,
+                max_attn_len,
+                full_attn_size,
+                contextual_seq_len,
+                None,  # q_descale
+                None,  # k_descale
+                None,  # v_descale
+                0,  # sm_margin
+                max_seq_len,  # max_q_len,
+                None,  # seq_offsets_q,
+                None,  # max_seq_len_tensor,
+                None,  # contextual_seq_len_tensor,
+                None,  # max_attn_len_tensor,
+                None,  # min_full_attn_seq_len_tensor,
+                1,  # num_groups
+                num_softmax_heads,  # num_softmax_heads
+            )
+        else:
+            out, softmax_lse = torch.ops.hstu.hstu_mha_fwd(
+                max_seq_len,
+                alpha,
+                q,
+                k,
+                v,
+                seq_offsets,
+                True,  # causal
+                num_targets,
+                attn_scale,
+                max_attn_len,
+                full_attn_size,
+                contextual_seq_len,
+                None,  # q_descale
+                None,  # k_descale
+                None,  # v_descale
+                0,  # sm_margin
+                0,  # max_q_len,
+                None,  # seq_offsets_q,
+                num_softmax_heads,  # num_softmax_heads,
+            )
+        # update ctx
+        saved_tensors = [
+            x,
+            norm_weight,
+            norm_bias,
+            x_mean,
+            x_rstd,
+            uvqk_weight,
+            seq_offsets,
+            out,
+        ]
+        if num_softmax_heads > 0:
+            saved_tensors.append(softmax_lse)
+        if num_targets is not None:
+            saved_tensors.append(num_targets)
+        if attn_scale is not None:
+            saved_tensors.append(attn_scale)
+        if not recompute_normed_x_in_backward:
+            saved_tensors.append(normed_x)
+        if recompute_uvqk_in_backward:
+            if uvqk_bias is not None:
+                saved_tensors.append(uvqk_bias)
+            if fp8_in_addmm_fwd:
+                saved_tensors.append(x_scale)  # pyre-ignore
+                saved_tensors.append(normed_x_fp8)  # pyre-ignore
+        else:
+            saved_tensors.append(uvqk)
+        if rotary_weights is not None:
+            saved_tensors.append(rotary_weights[0])
+            saved_tensors.append(rotary_weights[1])
+            saved_tensors.append(rotary_weights[2])
+            saved_tensors.append(rotary_weights[3])
+        ctx.save_for_backward(*saved_tensors)
+        ctx.alpha = alpha
+        ctx.invalid_attn_mask_type = invalid_attn_mask_type
+        ctx.has_multiple_targets = num_targets is not None
+        ctx.has_rotary_weights = rotary_weights is not None
+        ctx.has_attn_scale = attn_scale is not None
+        ctx.max_seq_len = max_seq_len
+        ctx.max_attn_len = max_attn_len
+        ctx.full_attn_size = full_attn_size
+        ctx.recompute_normed_x_in_backward = recompute_normed_x_in_backward
+        ctx.recompute_uvqk_in_backward = recompute_uvqk_in_backward
+        ctx.hidden_dim = hidden_dim
+        ctx.attn_dim = attn_dim
+        ctx.num_heads = num_heads
+        ctx.has_uvqk_bias = uvqk_bias is not None
+        ctx.uvqk_bias_1d = uvqk_bias.dim() == 1 if uvqk_bias is not None else False
+        ctx.norm_eps = norm_eps
+        ctx.norm_BLOCK_D = BLOCK_D
+        ctx.contextual_seq_len = contextual_seq_len
+        ctx.sort_by_length = sort_by_length
+        ctx.silu_u = silu_u
+        ctx.fp8_in_addmm_fwd = fp8_in_addmm_fwd
+        ctx.num_softmax_heads = num_softmax_heads
+        return u, out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx,  # pyre-ignore[2]
+        _du: torch.Tensor,
+        dout: torch.Tensor,
+    ) -> Tuple[
+        torch.Tensor,  # d_x
+        torch.Tensor,  # d_norm_weight
+        torch.Tensor,  # d_norm_bias
+        None,
+        None,
+        None,
+        None,
+        torch.Tensor,  # d_uvqk_weight
+        torch.Tensor,  # d_uvqk_bias
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        x, norm_weight, norm_bias, x_mean, x_rstd, uvqk_weight, seq_offsets, out = (
+            ctx.saved_tensors[:8]
+        )
+        idx = 8
+        if ctx.num_softmax_heads > 0:
+            softmax_lse = ctx.saved_tensors[idx]
+            idx += 1
+        else:
+            softmax_lse = None
+        if ctx.has_multiple_targets:
+            num_targets = ctx.saved_tensors[idx]
+            idx += 1
+        else:
+            num_targets = None
+        if ctx.has_attn_scale:
+            attn_scale = ctx.saved_tensors[idx]
+            idx += 1
+        else:
+            attn_scale = None
+        if ctx.recompute_normed_x_in_backward:
+            normed_x, _, _, _, _, _ = triton_weighted_layer_norm_quantization_fwd(
+                x=x,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=ctx.norm_eps,
+                mean=x_mean,
+                rstd=x_rstd,
+                quantize_output=ctx.fp8_in_addmm_fwd,
+            )
+        else:
+            normed_x = ctx.saved_tensors[idx]
+            idx += 1
+        if ctx.recompute_uvqk_in_backward:
+            if ctx.has_uvqk_bias:
+                uvqk_bias = ctx.saved_tensors[idx]
+                idx += 1
+            else:
+                uvqk_bias = None
+            if not ctx.silu_u:
+                # When silu_u is False, we only recompute vqk (not u)
+                # Split the weights/biases to extract vqk portion
+                _, vqk_weight = uvqk_weight.split(
+                    [
+                        ctx.hidden_dim * ctx.num_heads,
+                        ctx.hidden_dim * ctx.num_heads
+                        + ctx.attn_dim * ctx.num_heads
+                        + ctx.attn_dim * ctx.num_heads,
+                    ],
+                    dim=1,
+                )
+                vqk_bias = None
+                if ctx.has_uvqk_bias:
+                    _, vqk_bias = uvqk_bias.split(
+                        [
+                            ctx.hidden_dim * ctx.num_heads,
+                            ctx.hidden_dim * ctx.num_heads
+                            + ctx.attn_dim * ctx.num_heads
+                            + ctx.attn_dim * ctx.num_heads,
+                        ],
+                        dim=0,
+                    )
+                if ctx.fp8_in_addmm_fwd:
+                    x_scale, normed_x_fp8 = ctx.saved_tensors[idx : idx + 2]
+                    vqk = fp8_rowwise_quantize_addmm(
+                        x=normed_x,
+                        x_fp8=normed_x_fp8,
+                        w=vqk_weight,
+                        y=vqk_bias,
+                        x_scale=x_scale,
+                        custom_kernel=False,
+                        is_inference=False,
+                    )
+                    idx += 2
+                else:
+                    vqk = maybe_triton_addmm_fwd(
+                        normed_x, vqk_weight, vqk_bias
+                    ).contiguous()
+                # Split vqk into v, q, k components
+                v, q, k = vqk.split(
+                    [
+                        ctx.hidden_dim * ctx.num_heads,
+                        ctx.attn_dim * ctx.num_heads,
+                        ctx.attn_dim * ctx.num_heads,
+                    ],
+                    dim=1,
+                )
+                u = None
+            else:
+                # When silu_u is True, we recompute uvqk (all components)
+                if ctx.fp8_in_addmm_fwd:
+                    x_scale, normed_x_fp8 = ctx.saved_tensors[idx : idx + 2]
+                    uvqk = fp8_rowwise_quantize_addmm(
+                        x=normed_x,
+                        x_fp8=normed_x_fp8,
+                        w=uvqk_weight,
+                        y=uvqk_bias,
+                        x_scale=x_scale,
+                        custom_kernel=False,
+                        is_inference=False,
+                    )
+                    idx += 2
+                else:
+                    uvqk = maybe_triton_addmm_fwd(
+                        normed_x, uvqk_weight, uvqk_bias
+                    ).contiguous()
+                # Split uvqk into u, v, q, k components
+                u, v, q, k = uvqk.split(
+                    [
+                        ctx.hidden_dim * ctx.num_heads,
+                        ctx.hidden_dim * ctx.num_heads,
+                        ctx.attn_dim * ctx.num_heads,
+                        ctx.attn_dim * ctx.num_heads,
+                    ],
+                    dim=1,
+                )
+        else:
+            uvqk = ctx.saved_tensors[idx]
+            idx += 1
+            # Split saved uvqk into u, v, q, k components
+            u, v, q, k = uvqk.split(
+                [
+                    ctx.hidden_dim * ctx.num_heads,
+                    ctx.hidden_dim * ctx.num_heads,
+                    ctx.attn_dim * ctx.num_heads,
+                    ctx.attn_dim * ctx.num_heads,
+                ],
+                dim=1,
+            )
+        if ctx.has_rotary_weights:
+            q_cos_weights, q_sin_weights, k_cos_weights, k_sin_weights = (
+                ctx.saved_tensors[idx : idx + 4]
+            )
+            idx += 4
+        else:
+            q_cos_weights, q_sin_weights, k_cos_weights, k_sin_weights = (
+                None,
+                None,
+                None,
+                None,
+            )
+
+        duvqk = torch.empty(
+            [
+                x.size(0),
+                ctx.hidden_dim * ctx.num_heads * 2 + ctx.attn_dim * ctx.num_heads * 2,
+            ],
+            device=x.device,
+            dtype=x.dtype,
+        )
+        du, dv, dq, dk = duvqk.split(
+            [
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+            ],
+            dim=1,
+        )
+        q = q.view(-1, ctx.num_heads, ctx.attn_dim)
+        k = k.view(-1, ctx.num_heads, ctx.attn_dim)
+        v = v.view(-1, ctx.num_heads, ctx.hidden_dim)
+        dq = dq.view(-1, ctx.num_heads, ctx.attn_dim)
+        dk = dk.view(-1, ctx.num_heads, ctx.attn_dim)
+        dv = dv.view(-1, ctx.num_heads, ctx.hidden_dim)
+        if (
+            ctx.recompute_uvqk_in_backward and ctx.has_rotary_weights
+        ):  # recompute ROPE on qk
+            q = triton_apply_rope_fwd(
+                x=q,
+                N=ctx.max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=q_cos_weights,
+                sin_rope=q_sin_weights,
+            )
+            k = triton_apply_rope_fwd(
+                x=k,
+                N=ctx.max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=k_cos_weights,
+                sin_rope=k_sin_weights,
+            )
+        dq = dq.view(-1, ctx.num_heads, ctx.attn_dim)
+        dk = dk.view(-1, ctx.num_heads, ctx.attn_dim)
+        dv = dv.view(-1, ctx.num_heads, ctx.hidden_dim)
+        # Note: the two operations below update duvqk in place
+        if is_sm100_plus():
+            _dq, _dk, _dv = torch.ops.bw_hstu.bw_hstu_mha_bwd(
+                ctx.max_seq_len,
+                ctx.alpha,
+                dout,
+                q,
+                k,
+                v,
+                dq,
+                dk,
+                dv,
+                seq_offsets,
+                True,  # causal
+                num_targets,
+                attn_scale,
+                ctx.max_attn_len,
+                ctx.full_attn_size,
+                ctx.contextual_seq_len,
+                ctx.sort_by_length,
+                False,  # deterministic
+                0,  # sm_margin
+                ctx.max_seq_len,  # max_q_len,
+                None,  # seq_offsets_q,
+                None,  # max_seq_len_tensor,
+                None,  # contextual_seq_len_tensor,
+                None,  # max_attn_len_tensor,
+                None,  # min_full_attn_seq_len_tensor,
+                1,  # num_groups
+                ctx.num_softmax_heads,  # num_softmax_heads
+                out,  # out
+                softmax_lse,  # lse
+            )
+        else:
+            _dq, _dk, _dv = torch.ops.hstu.hstu_mha_bwd(
+                ctx.max_seq_len,
+                ctx.alpha,
+                dout,
+                q,
+                k,
+                v,
+                dq,
+                dk,
+                dv,
+                out,
+                seq_offsets,
+                True,  # causal
+                num_targets,
+                attn_scale,
+                ctx.max_attn_len,
+                ctx.full_attn_size,
+                ctx.contextual_seq_len,
+                ctx.sort_by_length,
+                False,  # deterministic
+                0,  # sm_margin
+                0,  # max_q_len,
+                None,  # seq_offsets_q,
+                ctx.num_softmax_heads,  # num_softmax_heads,
+                softmax_lse,
+            )
+        if ctx.has_rotary_weights:
+            _dq = triton_apply_rope_bwd(
+                grad=_dq,
+                N=ctx.max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=q_cos_weights,
+                sin_rope=q_sin_weights,
+            )
+            _dk = triton_apply_rope_bwd(
+                grad=_dk,
+                N=ctx.max_seq_len,
+                seq_offsets=seq_offsets,
+                cos_rope=k_cos_weights,
+                sin_rope=k_sin_weights,
+            )
+        copy_if_different_ptr(dq, _dq)
+        copy_if_different_ptr(dk, _dk)
+        copy_if_different_ptr(dv, _dv)
+        if ctx.silu_u:
+            torch.ops.aten.silu_backward(_du, u, grad_input=du)
+        else:
+            copy_if_different_ptr(du, _du)
+        d_normed_x, d_uvqk_weight, d_uvqk_bias = triton_addmm_bwd(
+            x=normed_x,
+            w=uvqk_weight,
+            dz=duvqk,
+            is_y_1d=ctx.uvqk_bias_1d and ctx.has_uvqk_bias,
+        )
+        d_x, d_norm_weight, d_norm_bias = triton_weighted_layer_norm_bwd(
+            dy=d_normed_x,
+            x=x,
+            weight=norm_weight,
+            bias=norm_bias,
+            mean=x_mean,
+            rstd=x_rstd,
+            learnable=True,
+            eps=ctx.norm_eps,
+            BLOCK_D=ctx.norm_BLOCK_D,
+        )
+        # pyre-ignore[7]
+        return (
+            d_x,
+            d_norm_weight,
+            d_norm_bias,
+            None,
+            None,
+            None,
+            None,
+            d_uvqk_weight,
+            d_uvqk_bias if ctx.has_uvqk_bias else None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp
new file mode 100644
index 000000000..4730078e6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp
@@ -0,0 +1,97 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void expand_1d_jagged_to_dense_cpu_kernel_(
+    int64_t B,
+    int64_t max_len,
+    const at::TensorAccessor<val_t, 1>& values,
+    const at::TensorAccessor<index_t, 1>& offsets,
+    at::TensorAccessor<val_t, 2> output) {
+  for (auto i : c10::irange(B)) {
+    int64_t begin = offsets[i];
+    int64_t end = offsets[i + 1];
+    if (end - begin == 0) {
+      for (int64_t j : c10::irange(max_len)) {
+        output[i][j] = 0;
+        continue;
+      }
+    } else {
+      int64_t j = 0;
+      for (; j < std::min(end - begin, max_len); ++j) {
+        output[i][j] = values[begin + j];
+      }
+      for (; j < max_len; ++j) {
+        output[i][j] = values[end - 1];
+      }
+    }
+  } // for each i
+}
+
+at::Tensor expand_1d_jagged_to_dense_cpu(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const int64_t max_len) {
+  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CPU);
+  TORCH_CHECK(values.numel() < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(max_len >= 0);
+  auto B = offsets.size(0) - 1;
+  auto output = at::empty({B, max_len}, values.options());
+  if (values.numel() == 0 || max_len == 0) {
+    return output;
+  }
+  AT_DISPATCH_ALL_TYPES_AND2(
+      at::ScalarType::BFloat16,
+      at::ScalarType::Half,
+      values.scalar_type(),
+      "expand_1d_jagged_to_dense_cpu_input1",
+      [&] {
+        using val_t = scalar_t;
+        AT_DISPATCH_INTEGRAL_TYPES(
+            offsets.scalar_type(), "expand_1d_jagged_to_dense_cpu_input2", [&] {
+              using index_t = scalar_t;
+              expand_1d_jagged_to_dense_cpu_kernel_<index_t, val_t>(
+                  B,
+                  max_len,
+                  values.accessor<val_t, 1>(),
+                  offsets.accessor<index_t, 1>(),
+                  output.accessor<val_t, 2>());
+            });
+      });
+  return output;
+}
+
+at::Tensor expand_1d_jagged_to_dense_meta(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const c10::SymInt max_len) {
+  auto B = offsets.sym_size(0) - 1;
+  auto output = at::empty_symint({B, max_len}, values.options());
+  return output;
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu
new file mode 100644
index 000000000..aa3678d2b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu
@@ -0,0 +1,103 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+
+static constexpr int32_t kMaxThreads = 1024;
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+__global__
+__launch_bounds__(kMaxThreads) void expand_1d_jagged_to_dense_cuda_kernel_(
+    int64_t B,
+    int64_t max_len,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> offsets,
+    at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits> output) {
+  int64_t b = blockIdx.y;
+  int64_t begin = offsets[b];
+  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  int64_t end = offsets[b + 1];
+  if (end - begin == 0) {
+    if (i < max_len) {
+      output[b][i] = 0;
+    }
+  } else {
+    if (i < std::min(end - begin, max_len)) {
+      output[b][i] = values[i + begin];
+    } else if (i < max_len) {
+      output[b][i] = values[end - 1];
+    }
+  }
+}
+
+at::Tensor expand_1d_jagged_to_dense_cuda(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const int64_t max_len) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values.get_device());
+  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CUDA);
+  TORCH_CHECK(values.numel() < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(values.get_device() == offsets.get_device());
+  TORCH_CHECK(max_len >= 0);
+  auto B = offsets.size(0) - 1;
+  auto output = at::empty({B, max_len}, values.options());
+  if (values.numel() == 0 || max_len == 0) {
+    return output;
+  }
+  uint32_t nthreads_per_block = max_len > 64 ? 64 : max_len;
+  dim3 grid_size = dim3(div_round_up(max_len, nthreads_per_block), B);
+  AT_DISPATCH_ALL_TYPES_AND2(
+      at::ScalarType::BFloat16,
+      at::ScalarType::Half,
+      values.scalar_type(),
+      "expand_1d_jagged_to_dense_cuda_input1",
+      [&] {
+        using val_t = scalar_t;
+        AT_DISPATCH_INTEGRAL_TYPES(
+            offsets.scalar_type(),
+            "expand_1d_jagged_to_dense_cuda_input2",
+            [&] {
+              using index_t = scalar_t;
+              expand_1d_jagged_to_dense_cuda_kernel_<index_t, val_t><<<
+                  grid_size,
+                  nthreads_per_block,
+                  0,
+                  at::cuda::getCurrentCUDAStream()>>>(
+                  B,
+                  max_len,
+                  values.packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                  offsets
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  output.packed_accessor32<val_t, 2, at::RestrictPtrTraits>());
+              C10_CUDA_KERNEL_LAUNCH_CHECK();
+            });
+      });
+
+  return output;
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h
new file mode 100644
index 000000000..a22ae7745
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h
@@ -0,0 +1,66 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/arch/copy_sm90_tma.hpp>
+
+namespace cute {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct SM90_BULK_REDUCE_ADD {
+  CUTE_HOST_DEVICE static void
+  copy(float const* smem_ptr, float* gmem_ptr, int32_t store_bytes) {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2;\n"
+        :
+        : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(store_bytes)
+        : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH(
+        "Trying to use BULK_REDUCE_ADD without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+
+  CUTE_HOST_DEVICE static void copy(
+      float const* smem_ptr,
+      float* gmem_ptr,
+      int32_t store_bytes,
+      uint64_t cache_hint) {
+#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
+    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr);
+    asm volatile(
+        "cp.reduce.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.add.f32 [%0], [%1], %2, %3;\n"
+        :
+        : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(store_bytes), "l"(cache_hint)
+        : "memory");
+#else
+    CUTE_INVALID_CONTROL_PATH(
+        "Trying to use BULK_REDUCE_ADD without CUTE_ARCH_TMA_SM90_ENABLED.");
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cute
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h
new file mode 100644
index 000000000..833f3ae28
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h
@@ -0,0 +1,481 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/barrier.h>
+#include <cutlass/cutlass.h>
+
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+
+#include "copy_sm90_bulk_reduce.h"
+#include "named_barrier.h"
+#include "seqlen.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    class TileShape_MNK_,
+    class Element_,
+    class ArchTag_,
+    int NumEpilogueThreads_,
+    bool Jagged,
+    bool dKV_swapAB_,
+    int AtomLayoutKdKV = 1>
+struct CollectiveEpilogueBwd {
+  using TileShape_MNK = TileShape_MNK_;
+  using Element = Element_;
+  using ArchTag = ArchTag_;
+  static constexpr int NumEpilogueThreads = NumEpilogueThreads_;
+  static constexpr bool dKV_swapAB = dKV_swapAB_;
+  static constexpr bool Use_TMA =
+      !Jagged && ArchTag::kMinComputeCapability >= 90;
+
+  static_assert(ArchTag::kMinComputeCapability >= 80);
+
+  using GmemTiledCopydKVTMA = cute::SM90_TMA_STORE;
+
+  // These are for storing the output tensor without TMA (e.g., for setting
+  // output to zero)
+  static constexpr int kGmemElemsPerLoad =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      get<2>(TileShape_MNK{}) % kGmemElemsPerLoad == 0,
+      "Headdim must be a multiple of kGmemElemsPerLoad");
+  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
+  static constexpr int kGmemThreadsPerRow =
+      cutlass::gcd(kHeadDim / kGmemElemsPerLoad, NumEpilogueThreads);
+  static_assert(
+      NumEpilogueThreads % kGmemThreadsPerRow == 0,
+      "NumEpilogueThreads must be a multiple of kGmemThreadsPerRow");
+  using GmemLayoutAtom = Layout<
+      Shape<
+          Int<NumEpilogueThreads / kGmemThreadsPerRow>,
+          Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  using GmemTiledCopydKV = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
+      GmemLayoutAtom{},
+      Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{})); // Val layout, 8 or 16 vals
+                                                     // per store
+
+  using SmemLayoutAtomdKVTMA =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               // TODO: do we have to change this if dKV_swapAB is true?
+               decltype(cute::get<1>(TileShape_MNK{})),
+               Int<CUTE_STATIC_V(cute::get<2>(TileShape_MNK{})) /
+                   AtomLayoutKdKV>>());
+  using SmemLayoutdKVTMA = decltype(tile_to_shape(
+      SmemLayoutAtomdKVTMA{},
+      select<1, 2>(TileShape_MNK{})));
+  using SmemLayoutdKVtTMA = decltype(cute::composition(
+      SmemLayoutdKVTMA{},
+      make_layout(
+          make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})),
+          make_stride(decltype(get<1>(TileShape_MNK{})){}, _1{}))));
+
+  // If we don't use TMA
+  static constexpr int kBlockKSmem =
+      kHeadDim % 64 == 0 ? 64 : (kHeadDim % 32 == 0 ? 32 : 16);
+  static constexpr int kSwizzle =
+      kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 2 : 1);
+  using SmemLayoutAtomdKVSTG = decltype(composition(
+      Swizzle<kSwizzle, 3, 3>{},
+      Layout<Shape<Int<8>, Int<kBlockKSmem>>, Stride<Int<kBlockKSmem>, _1>>{}));
+
+  using SmemLayoutAtomdKV =
+      std::conditional_t<Use_TMA, SmemLayoutAtomdKVTMA, SmemLayoutAtomdKVSTG>;
+  using SmemLayoutdKV = decltype(tile_to_shape(
+      SmemLayoutAtomdKV{},
+      select<1, 2>(TileShape_MNK{})));
+  using SmemLayoutdKVt = decltype(cute::composition(
+      SmemLayoutdKV{},
+      make_layout(
+          make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})),
+          make_stride(decltype(get<1>(TileShape_MNK{})){}, _1{}))));
+
+  using SmemCopyAtomdKV = Copy_Atom<
+      std::conditional_t<
+          ArchTag::kMinComputeCapability >= 90,
+          std::conditional_t<
+              !dKV_swapAB,
+              cute::SM90_U32x4_STSM_N,
+              cute::SM90_U16x8_STSM_T>,
+          AutoVectorizingCopyWithAssumedAlignment<128>>,
+      Element>;
+
+  static constexpr size_t SmemAlignmentdKV =
+      ArchTag::kMinComputeCapability >= 90
+      ? cutlass::detail::alignment_for_swizzle(SmemLayoutdKV{})
+      : 128;
+  static_assert(SmemAlignmentdKV >= 128, "Require at least 128B alignment");
+
+  struct TensorStorage : cute::aligned_struct<SmemAlignmentdKV> {
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutdKV>, SmemAlignmentdKV>
+            smem_dk;
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutdKV>, SmemAlignmentdKV>
+            smem_dv;
+  };
+
+  using ShapedKV =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen_k, d, head,
+                                                       // batch)
+  using StridedKV = cute::Stride<int64_t, _1, int64_t, int64_t>;
+
+  using TMA_dKV = std::conditional_t<
+      Use_TMA,
+      decltype(make_tma_copy(
+          GmemTiledCopydKVTMA{},
+          make_tensor(
+              make_gmem_ptr(static_cast<Element*>(nullptr)),
+              ShapedKV{},
+              StridedKV{}),
+          SmemLayoutdKVTMA{},
+          select<1, 2>(TileShape_MNK{}),
+          _1{})), // no mcast for dKV
+      std::nullptr_t>;
+
+  // Host side kernel arguments
+  struct Arguments {
+    Element* ptr_dK;
+    ShapedKV const shape_dK;
+    StridedKV const stride_dK;
+    Element* ptr_dV;
+    StridedKV const stride_dV;
+    int const num_heads_q;
+    int const* seq_offsets;
+  };
+
+  // Device side kernel params
+  struct Params {
+    Element* ptr_dK;
+    ShapedKV const shape_dK;
+    StridedKV const stride_dK;
+    Element* ptr_dV;
+    StridedKV const stride_dV;
+    TMA_dKV tma_store_dK, tma_store_dV;
+    int const* seq_offsets = nullptr;
+  };
+
+  static Params to_underlying_arguments(Arguments const& args) {
+    Tensor mdK =
+        make_tensor(make_gmem_ptr(args.ptr_dK), args.shape_dK, args.stride_dK);
+    Tensor mdV =
+        make_tensor(make_gmem_ptr(args.ptr_dV), args.shape_dK, args.stride_dV);
+    TMA_dKV tma_store_dK = [&] {
+      if constexpr (Use_TMA) {
+        return make_tma_copy(
+            GmemTiledCopydKVTMA{},
+            mdK,
+            SmemLayoutdKVTMA{},
+            select<1, 2>(TileShape_MNK{}),
+            _1{}); // no mcast for dKV
+      } else {
+        return nullptr;
+      }
+    }();
+    TMA_dKV tma_store_dV = [&] {
+      if constexpr (Use_TMA) {
+        return make_tma_copy(
+            GmemTiledCopydKVTMA{},
+            mdV,
+            SmemLayoutdKVTMA{},
+            select<1, 2>(TileShape_MNK{}),
+            _1{}); // no mcast for dKV
+      } else {
+        return nullptr;
+      }
+    }();
+    return {
+        args.ptr_dK,
+        args.shape_dK,
+        args.stride_dK,
+        args.ptr_dV,
+        args.stride_dV,
+        tma_store_dK,
+        tma_store_dV,
+        args.seq_offsets};
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
+  /// performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& params) {
+    if constexpr (Use_TMA) {
+      cute::prefetch_tma_descriptor(params.tma_store_dK.get_tma_descriptor());
+      cute::prefetch_tma_descriptor(params.tma_store_dV.get_tma_descriptor());
+    }
+  }
+
+  template <typename SharedStorage, typename FrgTensorO, typename TiledMma>
+  CUTLASS_DEVICE void store(
+      Params const& params,
+      FrgTensorO const& tdKrdK,
+      FrgTensorO const& tdVrdV,
+      SharedStorage& shared_storage,
+      TiledMma tiled_mma,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t> const& block_coord) {
+    auto [n_block, bidh, bidb] = block_coord;
+    Tensor sdK = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_dk.data()),
+        SmemLayoutdKV{}));
+    Tensor sdV = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_dv.data()),
+        SmemLayoutdKV{}));
+    Tensor sdKt = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_dk.data()),
+        SmemLayoutdKVt{}));
+    Tensor sdVt = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_dv.data()),
+        SmemLayoutdKVt{}));
+    auto smem_tiled_copy_dKV = make_tiled_copy_C(SmemCopyAtomdKV{}, tiled_mma);
+    auto smem_thr_copy_dKV = smem_tiled_copy_dKV.get_thread_slice(thread_idx);
+
+    Tensor tdVrdV_out = make_tensor_like<Element>(tdVrdV);
+    hstu::convert_type_out(tdVrdV, tdVrdV_out);
+    Tensor tdKrdK_out = make_tensor_like<Element>(tdKrdK);
+    hstu::convert_type_out(tdKrdK, tdKrdK_out);
+    Tensor taccdKrdK = smem_thr_copy_dKV.retile_S(
+        tdKrdK_out); // ((Atom,AtomNum), MMA_M, MMA_N)
+    Tensor taccdVrdV = smem_thr_copy_dKV.retile_S(
+        tdVrdV_out); // ((Atom,AtomNum), MMA_M, MMA_N)
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_dKV);
+    // print(sdK); printf("\n"); print(sdKt); printf("\n"); }
+    Tensor taccdKsdK = smem_thr_copy_dKV.partition_D(
+        cute::conditional_return<!dKV_swapAB>(
+            sdK, sdKt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    Tensor taccdVsdV = smem_thr_copy_dKV.partition_D(
+        cute::conditional_return<!dKV_swapAB>(
+            sdV, sdVt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+
+    // Make sure all WGs have finished reading K and V
+    hstu::named_barrier_sync(
+        NumEpilogueThreads,
+        cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+    cute::copy(smem_tiled_copy_dKV, taccdVrdV, taccdVsdV);
+    cute::copy(smem_tiled_copy_dKV, taccdKrdK, taccdKsdK);
+    if constexpr (Use_TMA) {
+      cutlass::arch::fence_view_async_shared(); // ensure smem writes are
+                                                // visible to TMA
+      cutlass::arch::NamedBarrier::arrive(
+          NumEpilogueThreads + cutlass::NumThreadsPerWarp,
+          cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+
+      Tensor mdK = params.tma_store_dK.get_tma_tensor(params.shape_dK);
+      Tensor mdV = params.tma_store_dV.get_tma_tensor(params.shape_dK);
+      Tensor gdK = local_tile(
+          mdK(_, _, bidh, bidb),
+          select<1, 2>(TileShape_MNK{}),
+          make_coord(n_block, _0{})); // (M, K)
+      Tensor gdV = local_tile(
+          mdV(_, _, bidh, bidb),
+          select<1, 2>(TileShape_MNK{}),
+          make_coord(n_block, _0{})); // (M, K)
+      auto block_tma_dK = params.tma_store_dK.get_slice(_0{});
+      auto block_tma_dV = params.tma_store_dV.get_slice(_0{});
+      Tensor tdKgdK = block_tma_dK.partition_D(gdK); // (TMA, TMA_M, TMA_K)
+      Tensor tdKsdK = block_tma_dK.partition_S(sdK); // (TMA, TMA_M, TMA_K)
+      Tensor tdVgdV = block_tma_dV.partition_D(gdV); // (TMA, TMA_M, TMA_K)
+      Tensor tdVsdV = block_tma_dV.partition_S(sdV); // (TMA, TMA_M, TMA_K)
+      int warp_idx_sync =
+          __shfl_sync(0xffffffff, thread_idx / cutlass::NumThreadsPerWarp, 0);
+      if (warp_idx_sync ==
+          NumEpilogueThreads / cutlass::NumThreadsPerWarp - 1) {
+        cutlass::arch::NamedBarrier::sync(
+            NumEpilogueThreads + cutlass::NumThreadsPerWarp,
+            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+        if (cute::elect_one_sync()) {
+          cute::copy(params.tma_store_dV, tdVsdV, tdVgdV);
+          cute::copy(params.tma_store_dK, tdKsdK, tdKgdK);
+          tma_store_arrive();
+        }
+      }
+      tma_store_wait<0>();
+      // // Tell warp 0 that smem_k and smem_v are ready
+      // cutlass::arch::NamedBarrier::arrive(NumEpilogueThreads +
+      // cutlass::NumThreadsPerWarp,
+      // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/);
+
+    } else {
+      hstu::named_barrier_sync(
+          NumEpilogueThreads,
+          cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      hstu::SeqlenInfo<Jagged, kBlockN> seqlen_info{
+          bidb, size<0>(params.shape_dK), params.seq_offsets};
+      Tensor mdK = make_tensor(
+          make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(
+          _, _, bidh, !Jagged ? bidb : 0);
+      Tensor gdK = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdK),
+          select<1, 2>(TileShape_MNK{}),
+          make_coord(n_block, _0{})); // (M, K)
+      Tensor mdV = make_tensor(
+          make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(
+          _, _, bidh, !Jagged ? bidb : 0);
+      Tensor gdV = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdV),
+          select<1, 2>(TileShape_MNK{}),
+          make_coord(n_block, _0{})); // (M, K)
+
+      GmemTiledCopydKV gmem_tiled_copy_dKV;
+      auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx);
+      Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV);
+      Tensor tdKVsdV =
+          gmem_thr_copy_dKV.partition_S(sdV); // (TMA, TMA_M, TMA_K)
+      Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK);
+      Tensor tdKVsdK =
+          gmem_thr_copy_dKV.partition_S(sdK); // (TMA, TMA_M, TMA_K)
+      Tensor tdKVrdV = make_fragment_like(tdKVgdV);
+      Tensor tdKVrdK = make_fragment_like(tdKVgdK);
+      Tensor cdKV = cute::make_identity_tensor(
+          select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k)
+      // Repeat the partitioning with identity layouts
+      Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV);
+      Tensor tdKVpdKV = make_tensor<bool>(make_shape(size<2>(tdKVgdV)));
+#pragma unroll
+      for (int k = 0; k < size(tdKVpdKV); ++k) {
+        tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK);
+      }
+      // Need to check OOB when reading from smem if kBlockN isn't evenly tiled
+      static constexpr bool EvenN =
+          kBlockN % CUTE_STATIC_V(size<0>(GmemLayoutAtom{})) == 0;
+      hstu::copy<
+          /*Is_even_MN=*/EvenN,
+          /*Is_even_K=*/true,
+          /*Clear_OOB_MN=*/false>(
+          gmem_tiled_copy_dKV, tdKVsdV, tdKVrdV, tdKVcdKV, tdKVpdKV, kBlockN);
+      hstu::copy<
+          /*Is_even_MN=*/EvenN,
+          /*Is_even_K=*/true,
+          /*Clear_OOB_MN=*/false>(
+          gmem_tiled_copy_dKV, tdKVsdK, tdKVrdK, tdKVcdKV, tdKVpdKV, kBlockN);
+      // // Tell warp 0 that smem_k and smem_v are ready
+      // cutlass::arch::fence_view_async_shared(); // ensure smem reads are done
+      // before next TMA to smem_k/v
+      // hstu::named_barrier_arrive(NumEpilogueThreads +
+      // cutlass::NumThreadsPerWarp,
+      // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/); Construct
+      // identity layout for gdKV Clear_OOB_K must be false since we don't want
+      // to write zeros to gmem
+      hstu::copy<
+          /*Is_even_MN=*/false,
+          /*Is_even_K=*/false,
+          /*Clear_OOB_MN=*/false,
+          /*Clear_OOB_K=*/false>(
+          gmem_tiled_copy_dKV,
+          tdKVrdV,
+          tdKVgdV,
+          tdKVcdKV,
+          tdKVpdKV,
+          std::min(seqlen_info.seqlen - n_block * kBlockN, kBlockN));
+      hstu::copy<
+          /*Is_even_MN=*/false,
+          /*Is_even_K=*/false,
+          /*Clear_OOB_MN=*/false,
+          /*Clear_OOB_K=*/false>(
+          gmem_tiled_copy_dKV,
+          tdKVrdK,
+          tdKVgdK,
+          tdKVcdKV,
+          tdKVpdKV,
+          std::min(seqlen_info.seqlen - n_block * kBlockN, kBlockN));
+    }
+  }
+
+  CUTLASS_DEVICE void store_tail() {
+    // if constexpr (Use_TMA) { tma_store_wait<0>(); }
+  }
+
+  // Write 0 to dK and dV
+  CUTLASS_DEVICE void store_zero(
+      Params const& params,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t> const& block_coord) {
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    auto [n_block, bidh, bidb] = block_coord;
+    hstu::SeqlenInfo<Jagged, kBlockN> seqlen_info{
+        bidb, size<0>(params.shape_dK), params.seq_offsets};
+    Tensor mdK = make_tensor(
+        make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor gdK = local_tile(
+        cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdK),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(n_block, _0{})); // (M, K)
+    Tensor mdV = make_tensor(
+        make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor gdV = local_tile(
+        cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdV),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(n_block, _0{})); // (M, K)
+
+    GmemTiledCopydKV gmem_tiled_copy_dKV;
+    auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx);
+    Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK);
+    Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV);
+    Tensor tdKVrdKV = make_fragment_like(tdKVgdK);
+    clear(tdKVrdKV);
+    // Construct identity layout for gdKV
+    Tensor cdKV = cute::make_identity_tensor(
+        select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k)
+    // Repeat the partitioning with identity layouts
+    Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV);
+    Tensor tdKVpdKV = make_tensor<bool>(make_shape(size<2>(tdKVgdK)));
+#pragma unroll
+    for (int k = 0; k < size(tdKVpdKV); ++k) {
+      tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK);
+    }
+    // Clear_OOB_K must be false since we don't want to write zeros to gmem
+    hstu::copy<
+        /*Is_even_MN=*/false,
+        /*Is_even_K=*/false,
+        /*Clear_OOB_MN=*/false,
+        /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_dKV,
+        tdKVrdKV,
+        tdKVgdK,
+        tdKVcdKV,
+        tdKVpdKV,
+        seqlen_info.seqlen - n_block * kBlockN);
+    hstu::copy<
+        /*Is_even_MN=*/false,
+        /*Is_even_K=*/false,
+        /*Clear_OOB_MN=*/false,
+        /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_dKV,
+        tdKVrdKV,
+        tdKVgdV,
+        tdKVcdKV,
+        tdKVpdKV,
+        seqlen_info.seqlen - n_block * kBlockN);
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h
new file mode 100644
index 000000000..c794a114c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h
@@ -0,0 +1,550 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/cutlass.h>
+#include <cutlass/fast_math.h> // For FastDivMod
+#include "cute/tensor.hpp"
+
+#include "cutlass/epilogue/collective/builders/sm90_common.inl"
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+
+#include "named_barrier.h"
+#include "seqlen.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    class TileShape_MNK_,
+    class ClusterShape_,
+    class Element_,
+    class ArchTag_,
+    int NumEpilogueThreads_,
+    bool Jagged,
+    bool FP8PermuteCol = false>
+struct CollectiveEpilogueFwd {
+  using TileShape_MNK = TileShape_MNK_;
+  using ClusterShape = ClusterShape_;
+  using Element = Element_;
+  using ArchTag = ArchTag_;
+  static constexpr int NumEpilogueThreads = NumEpilogueThreads_;
+  static constexpr bool Use_smem = sizeof(Element) <= 2;
+  static constexpr bool Use_TMA_O =
+      ArchTag::kMinComputeCapability >= 90 && !Jagged && Use_smem;
+
+  static_assert(ArchTag::kMinComputeCapability >= 80);
+  static_assert(
+      ArchTag::kMinComputeCapability >= 90 ||
+      CUTE_STATIC_V(size(ClusterShape{})) == 1);
+
+  static constexpr int kBlockM = get<0>(TileShape_MNK{});
+  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
+
+  using GmemTiledCopyOTMA = cute::SM90_TMA_STORE;
+
+  // These are for storing the output tensor without TMA (e.g., for setting
+  // output to zero)
+  static constexpr int kGmemElemsPerStore =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      kHeadDim % kGmemElemsPerStore == 0,
+      "Headdim must be a multiple of kGmemElemsPerStore");
+  // We want each "row" to have 64 elements (128 bytes, i.e. 1 cache line). We
+  // want each thread to have 4 elements in the M direction and 2 elements in
+  // the K direction. In the case of PackGQA, this reduces the number of times
+  // we need to call divmod.
+  static constexpr int kBytePerRow = kHeadDim * sizeof(Element);
+  static constexpr int kBlockKGmem =
+      (kBytePerRow % 128 == 0 ? 128 : (kBytePerRow % 64 == 0 ? 64 : 32)) /
+      sizeof(Element);
+  // static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim %
+  // 64 == 0 ? 64 : 32); static constexpr int kGmemThreadsPerRow =
+  // cutlass::gcd(kHeadDim / kGmemElemsPerStore, NumEpilogueThreads);
+  static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerStore;
+  // If PackGQA, we split the work of compute O_ptr among threads in the same
+  // row, so we need this to within a warp
+  static_assert(cutlass::NumThreadsPerWarp % kGmemThreadsPerRow == 0);
+  static_assert(
+      NumEpilogueThreads % kGmemThreadsPerRow == 0,
+      "NumEpilogueThreads must be a multiple of kGmemThreadsPerRow");
+  using GmemLayoutAtom = Layout<
+      Shape<
+          Int<NumEpilogueThreads / kGmemThreadsPerRow>,
+          Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  static_assert(
+      kBlockM % CUTE_STATIC_V(shape<0>(GmemLayoutAtom{})) == 0,
+      "kBlockM must be a multiple of NumEpilogueThreads / kGmemThreadsPerRow");
+  using GmemTiledCopyO = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
+      GmemLayoutAtom{},
+      Layout<Shape<_1, Int<kGmemElemsPerStore>>>{})); // Val layout, 8 or 16
+                                                      // vals per store
+
+  using SmemLayoutAtomOTMA =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<0>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutOTMA = decltype(tile_to_shape(
+      SmemLayoutAtomOTMA{},
+      select<0, 2>(TileShape_MNK{})));
+  static constexpr int kSwizzle = kBlockKGmem == 128
+      ? 4
+      : (kBlockKGmem == 64 ? 3 : (kBlockKGmem == 32 ? 2 : 1));
+  static constexpr int kSwizzleBase =
+      sizeof(Element) == 4 ? 2 : (sizeof(Element) == 2 ? 3 : 4);
+  using SmemLayoutAtomO = decltype(composition(
+      Swizzle<kSwizzle, kSwizzleBase, kSwizzleBase>{},
+      Layout<Shape<_8, Int<kBlockKGmem>>, Stride<Int<kBlockKGmem>, _1>>{}));
+  using SmemLayoutOSTS =
+      decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{})));
+  using SmemLayoutO = std::conditional_t<
+      ArchTag::kMinComputeCapability >= 90,
+      SmemLayoutOTMA,
+      SmemLayoutOSTS>;
+
+  using ShapeO =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t, int32_t>; // (seqlen_q, d,
+                                                                // head, batch,
+                                                                // num_splits)
+  using StrideO = cute::Stride<int64_t, _1, int64_t, int64_t, int64_t>;
+  // ((qhead_per_khead, seqlen_q), d, nheads, batch, num_splits)
+  using ShapeOPacked = ShapeO;
+  using StrideOPacked = StrideO;
+  // ((qhead_per_khead, seqlen_q), nheads, batch, num_splits)
+  using StrideLSE =
+      cute::Stride<_1, int64_t, int64_t, int64_t>; // (seqlen_q, head, batch,
+  // num_splits)
+  using ShapeLSEPacked = cute::Shape<int32_t, int32_t, int32_t, int32_t>;
+  using StrideLSEPacked = StrideLSE;
+  using EpilogueTile_MN = decltype(select<0, 1>(TileShape_MNK{}));
+  using CopyOpR2S = std::conditional_t<
+      ArchTag::kMinComputeCapability >= 90,
+      // cute::SM90_U32x4_STSM_N if Element size is 2 bytes (fp16, bf16)
+      decltype(cutlass::epilogue::collective::detail::
+                   sm90_get_smem_store_op_for_accumulator<
+                       StrideO,
+                       Element,
+                       EpilogueTile_MN>()),
+      AutoVectorizingCopyWithAssumedAlignment<128>>;
+  using SmemCopyAtomO = Copy_Atom<CopyOpR2S, Element>;
+
+  // static constexpr size_t SmemAlignmentO =
+  // cutlass::detail::alignment_for_swizzle(SmemLayoutO{});
+  // static_assert(SmemAlignmentO >= 128, "Require at least 128B alignment");
+  // struct TensorStorage : cute::aligned_struct<SmemAlignmentO> {
+  //     cute::array_aligned<Element, Use_smem ? cute::cosize_v<SmemLayoutO> :
+  //     0, SmemAlignmentO> smem_o;
+  // };
+  struct TensorStorage : cute::aligned_struct<128> {
+    cute::array_aligned<Element, Use_smem ? cute::cosize_v<SmemLayoutO> : 0>
+        smem_o;
+  };
+
+  using TMA_O = std::conditional_t<
+      Use_TMA_O,
+      decltype(make_tma_copy(
+          GmemTiledCopyOTMA{},
+          make_tensor(
+              make_gmem_ptr(static_cast<Element*>(nullptr)),
+              ShapeO{},
+              StrideO{}),
+          SmemLayoutOTMA{},
+          select<0, 2>(TileShape_MNK{}),
+          _1{})), // no mcast for O
+      std::nullptr_t>;
+
+  // Host side kernel arguments
+  struct Arguments {
+    Element* ptr_O;
+    ShapeO const shape_O;
+    StrideO const stride_O;
+    int32_t const nheads;
+    int32_t const num_softmax_heads;
+    StrideLSE const stride_lse;
+    float* ptr_lse = nullptr;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Device side kernel params
+  struct Params {
+    Element* ptr_O;
+    ShapeO const shape_O;
+    StrideO const stride_O;
+    ShapeOPacked const shape_O_packed;
+    StrideOPacked const stride_O_packed;
+    float* ptr_lse;
+    StrideLSE const stride_lse;
+    ShapeLSEPacked const shape_lse_packed;
+    StrideLSEPacked const stride_lse_packed;
+    TMA_O tma_store_O;
+    int const* seq_offsets = nullptr;
+  };
+
+  static Params to_underlying_arguments(Arguments const& args) {
+    Tensor mO =
+        make_tensor(make_gmem_ptr(args.ptr_O), args.shape_O, args.stride_O);
+    TMA_O tma_store_O = [&] {
+      if constexpr (Use_TMA_O) {
+        return make_tma_copy(
+            GmemTiledCopyOTMA{},
+            mO,
+            SmemLayoutO{},
+            select<0, 2>(TileShape_MNK{}),
+            _1{}); // no mcast
+      } else {
+        return nullptr;
+      }
+    }();
+    // If PackGQA, reshape O to be ((qhead_per_khead, seqlen_q), head_size,
+    // nhead_k, batch_size, num_splits)
+    int const qhead_per_khead = 1;
+    auto const shape_O_packed = cute::conditional_return<true>(
+        args.shape_O,
+        make_shape(
+            make_shape(qhead_per_khead, get<0>(args.shape_O)),
+            get<1>(args.shape_O),
+            args.nheads,
+            get<3>(args.shape_O),
+            get<4>(args.shape_O)));
+    auto const stride_O_packed = cute::conditional_return<true>(
+        args.stride_O,
+        make_stride(
+            make_stride(get<2>(args.stride_O), get<0>(args.stride_O)),
+            get<1>(args.stride_O),
+            get<2>(args.stride_O) * qhead_per_khead,
+            get<3>(args.stride_O),
+            get<4>(args.stride_O)));
+    auto const shape_lse_packed = select<0, 2, 3, 4>(args.shape_O);
+    auto const stride_lse_packed = args.stride_lse;
+    return {
+        args.ptr_O,
+        args.shape_O,
+        args.stride_O,
+        shape_O_packed,
+        stride_O_packed,
+        args.ptr_lse,
+        args.stride_lse,
+        shape_lse_packed,
+        stride_lse_packed,
+        tma_store_O,
+        args.seq_offsets};
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
+  /// performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& params) {
+    if constexpr (Use_TMA_O) {
+      cute::prefetch_tma_descriptor(params.tma_store_O.get_tma_descriptor());
+    }
+  }
+
+  template <typename SharedStorage, typename FrgTensorO, typename TiledMma>
+  CUTLASS_DEVICE void store(
+      Params const& params,
+      FrgTensorO const& tOrO,
+      SharedStorage& shared_storage,
+      TiledMma tiled_mma,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> const& block_coord) {
+    auto [m_block, bidh, bidb, split_idx] = block_coord;
+    Tensor sO = make_tensor(
+        make_smem_ptr(shared_storage.tensors.epilogue.smem_o.data()),
+        SmemLayoutO{});
+    // Tensor sO_pi = cute::as_position_independent_swizzle_tensor(sO);
+
+    Tensor tOrO_out = make_tensor_like<Element>(tOrO);
+    hstu::convert_type_out(tOrO, tOrO_out);
+    if constexpr (
+        FP8PermuteCol && (sizeof(Element) == 2 || sizeof(Element) == 4)) {
+      hstu::permute_output_fp8_Vcolmajor(tOrO_out);
+    }
+
+    // Make sure all WGs have finished reading V
+    // Technically we don't need this if we're not using smem, but the mainloop
+    // makes the assumption that all epilogue threads sync at least once during
+    // the epilogue (so that we can start loading Q with cp.async if we need).
+    hstu::named_barrier_sync(
+        NumEpilogueThreads,
+        cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+
+    // Step 1: Write O from rmem -> smem
+    if constexpr (Use_smem) {
+      auto smem_tiled_copy_O = make_tiled_copy_C(SmemCopyAtomO{}, tiled_mma);
+      auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(thread_idx);
+      Tensor taccOrO =
+          smem_thr_copy_O.retile_S(tOrO_out); // ((Atom,AtomNum), MMA_M, MMA_N)
+      Tensor taccOsO =
+          smem_thr_copy_O.partition_D(sO); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+      // Tensor taccOsO = smem_thr_copy_O.partition_D(sO_pi);     //
+      // ((Atom,AtomNum),PIPE_M,PIPE_N)
+      cute::copy(smem_tiled_copy_O, taccOrO, taccOsO);
+      if constexpr (Use_TMA_O) {
+        cutlass::arch::fence_view_async_shared(); // ensure smem writes are
+                                                  // visible to TMA
+        cutlass::arch::NamedBarrier::arrive(
+            NumEpilogueThreads + cutlass::NumThreadsPerWarp,
+            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+      } else {
+        hstu::named_barrier_sync(
+            NumEpilogueThreads,
+            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+      }
+    } else {
+      if constexpr (ArchTag::kMinComputeCapability >= 90) {
+#pragma unroll
+        for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) {
+          shared_storage.pipelines.barrier_O.arrive(cta_id);
+        }
+      }
+    }
+
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info{
+        bidb, size<0>(params.shape_O), params.seq_offsets};
+    int offset_o = seqlen_info.offset;
+    int seqlen_o = seqlen_info.seqlen;
+
+    // Step 2: Write LSE from rmem -> gmem
+    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
+    // (MMA,MMA_M,MMA_K)
+    Tensor taccOcO = thread_mma.partition_C(
+        cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
+    static_assert(decltype(size<0, 0>(taccOcO))::value == 2);
+    static_assert(decltype(size<0, 1>(taccOcO))::value == 2);
+    Tensor taccOcO_rowcol = make_tensor(
+        taccOcO.data(), hstu::convert_layout_acc_rowcol(taccOcO.layout()));
+    Tensor taccOcO_row = taccOcO_rowcol(_, _0{});
+    // Step 3: Write O from smem -> gmem
+    if constexpr (Use_TMA_O) {
+      Tensor mO = params.tma_store_O.get_tma_tensor(params.shape_O)(
+          _, _, bidh, bidb, split_idx);
+      Tensor gO = local_tile(
+          mO,
+          select<0, 2>(TileShape_MNK{}),
+          make_coord(m_block, _0{})); // (M, K)
+      auto block_tma_O = params.tma_store_O.get_slice(_0{});
+      Tensor tOgO = block_tma_O.partition_D(gO); // (TMA, TMA_M, TMA_K)
+      Tensor tOsO = block_tma_O.partition_S(sO); // (TMA, TMA_M, TMA_K)
+      int warp_idx_sync =
+          __shfl_sync(0xffffffff, thread_idx / cutlass::NumThreadsPerWarp, 0);
+      if (warp_idx_sync ==
+          NumEpilogueThreads / cutlass::NumThreadsPerWarp - 1) {
+        cutlass::arch::NamedBarrier::sync(
+            NumEpilogueThreads + cutlass::NumThreadsPerWarp,
+            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
+        if (cute::elect_one_sync()) {
+          cute::copy(params.tma_store_O, tOsO, tOgO);
+          tma_store_arrive();
+          tma_store_wait<0>();
+#pragma unroll
+          for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) {
+            shared_storage.pipelines.barrier_O.arrive(cta_id);
+          }
+        }
+      }
+    } else { // Don't use TMA in Jagged case since we don't want to overwrite
+             // the output of another sequence
+      Tensor mO = make_tensor(
+          make_gmem_ptr(params.ptr_O + offset_o * get<0>(params.stride_O)),
+          params.shape_O_packed,
+          params.stride_O_packed)(_, _, bidh, !Jagged ? bidb : 0, split_idx);
+      Tensor gO = local_tile(
+          mO,
+          select<0, 2>(TileShape_MNK{}),
+          make_coord(m_block, _0{})); // (M, K)
+      // if (thread_idx == 0) { printf("Before O write, m_block: %d, bidh: %d,
+      // bidb: %d, split_idx: %d, offset_o: %d, seqlen_o: %d, mO_addr = %p, addr
+      // diff = %d\n", m_block, bidh, bidb, split_idx, offset_o, seqlen_o,
+      // mO.data(), reinterpret_cast<int>(&mO(0)) -
+      // reinterpret_cast<int>(params.ptr_O)); }
+      if constexpr (Use_smem) {
+        GmemTiledCopyO gmem_tiled_copy_O;
+        auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
+        Tensor tOsO =
+            gmem_thr_copy_O.partition_S(sO); // ((Atom,AtomNum),ATOM_M,ATOM_N)
+        // Tensor tOsO = gmem_thr_copy_O.partition_S(sO_pi);        //
+        // ((Atom,AtomNum),ATOM_M,ATOM_N)
+        Tensor tOrO = make_fragment_like(tOsO);
+        cute::copy(gmem_tiled_copy_O, tOsO, tOrO);
+        if constexpr (ArchTag::kMinComputeCapability >= 90) {
+          cutlass::arch::fence_view_async_shared(); // ensure smem reads are
+                                                    // done before next TMA to
+                                                    // smem_v
+#pragma unroll
+          for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) {
+            shared_storage.pipelines.barrier_O.arrive(cta_id);
+          }
+        }
+        // (BLK_M,BLK_K) -> (blk_m,blk_k)
+        Tensor tOcO = gmem_thr_copy_O.partition_D(
+            cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
+        Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOsO)));
+#pragma unroll
+        for (int k = 0; k < size(tOpO); ++k) {
+          tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O);
+        }
+        Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
+        // Clear_OOB_K must be false since we don't want to write zeros to
+        // gmem
+        hstu::copy<
+            /*Is_even_MN=*/false,
+            /*Is_even_K=*/false,
+            /*Clear_OOB_MN=*/false,
+            /*Clear_OOB_K=*/false>(
+            gmem_tiled_copy_O,
+            tOrO,
+            tOgO,
+            tOcO,
+            tOpO,
+            seqlen_o - m_block * kBlockM);
+      } else {
+        // We already arrived on barrier_O earlier
+        static constexpr int kGmemElemsPerStoreDirect = 2;
+        cute::Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>
+            gmem_copy_direct;
+        // Reshape acc from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M),
+        // ncol=(2, V, MMA_N))
+        Tensor tOrO_rowcol = make_tensor(
+            tOrO_out.data(), hstu::convert_layout_acc_rowcol(tOrO.layout()));
+        Tensor tOrO_copy = cute::tiled_divide(
+            tOrO_rowcol, Shape<_1, Int<kGmemElemsPerStoreDirect>>{});
+        Tensor tOgO = thread_mma.partition_C(gO);
+        Tensor tOgO_rowcol = make_tensor(
+            tOgO.data(), hstu::convert_layout_acc_rowcol(tOgO.layout()));
+        Tensor tOgO_copy = cute::tiled_divide(
+            tOgO_rowcol, Shape<_1, Int<kGmemElemsPerStoreDirect>>{});
+        Tensor taccOcO_col = taccOcO_rowcol(_0{}, _);
+#pragma unroll
+        for (int m = 0; m < size(taccOcO_row); ++m) {
+          if (get<0>(taccOcO_row(m)) < seqlen_o - m_block * kBlockM) {
+#pragma unroll
+            for (int k = 0; k < size(taccOcO_col) / kGmemElemsPerStoreDirect;
+                 ++k) {
+              if (get<1>(taccOcO_col(k * kGmemElemsPerStoreDirect)) <
+                  get<1>(params.shape_O)) {
+                cute::copy(
+                    gmem_copy_direct, tOrO_copy(_, m, k), tOgO_copy(_, m, k));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  template <typename FrgTensorLSE, typename TiledMma>
+  CUTLASS_DEVICE void store_softmax(
+      Params const& params,
+      FrgTensorLSE const& lse,
+      TiledMma tiled_mma,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> const& block_coord) {
+    auto [m_block, bidh, bidb, split_idx] = block_coord;
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info{
+        bidb, size<0>(params.shape_O), params.seq_offsets};
+    int offset_o = seqlen_info.offset;
+    int seqlen_o = seqlen_info.seqlen;
+    // Step 2: Write LSE from rmem -> gmem
+    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
+    // (MMA,MMA_M,MMA_K)
+    Tensor taccOcO = thread_mma.partition_C(
+        cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
+    static_assert(decltype(size<0, 0>(taccOcO))::value == 2);
+    static_assert(decltype(size<0, 1>(taccOcO))::value == 2);
+    Tensor taccOcO_rowcol = make_tensor(
+        taccOcO.data(), hstu::convert_layout_acc_rowcol(taccOcO.layout()));
+    Tensor taccOcO_row = taccOcO_rowcol(_, _0{});
+    CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // MMA_M
+    Tensor mLSE = make_tensor(
+        make_gmem_ptr(params.ptr_lse + offset_o * get<0>(params.stride_lse)),
+        params.shape_lse_packed,
+        params.stride_lse_packed)(_, bidh, !Jagged ? bidb : 0, 0);
+#pragma unroll
+    for (int mi = 0; mi < size(lse); ++mi) {
+      int const row = m_block * kBlockM + get<0>(taccOcO_row(mi));
+      if (get<1>(taccOcO_row(_0{})) == 0 && row < seqlen_o) {
+        mLSE(row) = lse(mi);
+      }
+    }
+  }
+
+  CUTLASS_DEVICE void store_tail() {
+    // Don't need to do tma_store_wait<0>() here since we already did in @store
+  }
+
+  // Write 0 to output and -inf to LSE
+  template <bool Clear_O = true>
+  CUTLASS_DEVICE void store_zero(
+      Params const& params,
+      int thread_idx,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> const& block_coord) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    auto [m_block, bidh, bidb, split_idx] = block_coord;
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info{
+        bidb, size<0>(params.shape_O), params.seq_offsets};
+    int offset_o = seqlen_info.offset;
+    int seqlen_o = seqlen_info.seqlen;
+    Tensor mO = make_tensor(
+        make_gmem_ptr(params.ptr_O + offset_o * get<0>(params.stride_O)),
+        params.shape_O_packed,
+        params.stride_O_packed)(_, _, bidh, !Jagged ? bidb : 0, split_idx);
+
+    static_assert(kBlockM <= NumEpilogueThreads);
+    if constexpr (!Clear_O) {
+      return;
+    }
+
+    GmemTiledCopyO gmem_tiled_copy_O;
+    auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
+    Tensor tOcO = gmem_thr_copy_O.partition_D(
+        cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
+    Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOcO)));
+#pragma unroll
+    for (int k = 0; k < size(tOpO); ++k) {
+      tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O);
+    }
+    Tensor gO = local_tile(
+        mO, select<0, 2>(TileShape_MNK{}), make_coord(m_block, _0{})); // (M, K)
+    Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
+    Tensor tOrO = make_fragment_like(tOgO);
+    cute::clear(tOrO);
+    // Clear_OOB_K must be false since we don't want to write zeros to gmem
+    hstu::copy<
+        /*Is_even_MN=*/false,
+        /*Is_even_K=*/false,
+        /*Clear_OOB_MN=*/false,
+        /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_O,
+        tOrO,
+        tOgO,
+        tOcO,
+        tOpO,
+        seqlen_o - m_block * kBlockM);
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h
new file mode 100644
index 000000000..ef37e3408
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h
@@ -0,0 +1,157 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda.h>
+#include <vector>
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+namespace hstu {
+
+struct Qkv_params {
+  using index_t = int64_t;
+  // The QKV matrices.
+  void* __restrict__ q_ptr;
+  void* __restrict__ k_ptr;
+  void* __restrict__ v_ptr;
+
+  // The stride between rows of the Q, K and V matrices.
+  index_t q_batch_stride;
+  index_t k_batch_stride;
+  index_t v_batch_stride;
+  index_t q_row_stride;
+  index_t k_row_stride;
+  index_t v_row_stride;
+  index_t q_head_stride;
+  index_t k_head_stride;
+  index_t v_head_stride;
+  index_t v_dim_stride;
+
+  // The number of heads.
+  int h;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Flash_fwd_params : public Qkv_params {
+  using index_t = int64_t;
+
+  // The O matrix (output).
+  void* __restrict__ o_ptr;
+
+  // The stride between rows of O.
+  index_t o_batch_stride;
+  index_t o_row_stride;
+  index_t o_head_stride;
+
+  // For FP8 scaling
+  float* __restrict__ q_descale_ptr;
+  float* __restrict__ k_descale_ptr;
+  float* __restrict__ v_descale_ptr;
+  index_t q_descale_batch_stride;
+  index_t q_descale_head_stride;
+  index_t k_descale_batch_stride;
+  index_t k_descale_head_stride;
+  index_t v_descale_batch_stride;
+  index_t v_descale_head_stride;
+
+  // The dimensions.
+  int b, max_kv_len, max_q_len, qk_d, v_d, total_seq_len_q, total_seq_len_kv;
+
+  // groups
+  int num_groups, batch_size_per_group;
+  int* __restrict__ max_seq_len_tensor;
+  int* __restrict__ contextual_seq_len_tensor;
+  int* __restrict__ max_attn_len_tensor;
+  int* __restrict__ min_full_attn_seq_len_tensor;
+
+  // The scaling factors for the kernel.
+  float alpha;
+
+  int* __restrict__ seq_offsets;
+  int* __restrict__ seq_offsets_q;
+  float* __restrict__ softmax_lse;
+  int* __restrict__ num_targets;
+  float* __restrict__ attn_scale;
+
+  // Local window size
+  int max_attn_len, contextual_seq_len, min_full_attn_seq_len,
+      num_softmax_heads;
+
+  // Pointer to the RNG seed (idx 0) and offset (idx 1).
+  uint64_t* rng_state;
+
+  bool is_bf16;
+  bool is_fp32;
+  bool is_e4m3;
+  bool is_causal;
+  bool is_local;
+  bool has_contexual_mask;
+  bool scalar_scale;
+  bool training;
+
+  int* __restrict__ tile_count_semaphore;
+
+  int arch;
+  int num_sm;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+struct Flash_bwd_params : public Flash_fwd_params {
+  using index_t = int64_t;
+
+  // The dO and dQKV matrices.
+  void* __restrict__ do_ptr;
+  void* __restrict__ dq_ptr;
+  void* __restrict__ dk_ptr;
+  void* __restrict__ dv_ptr;
+  float* __restrict__ softmax_lse_log2;
+  float* __restrict__ softmax_d;
+
+  // To accumulate dQ
+  void* __restrict__ dq_accum_ptr;
+  int* __restrict__ dq_semaphore;
+
+  // The stride between rows of the dO, dQ, dK and dV matrices.
+  index_t do_batch_stride;
+  index_t do_row_stride;
+  index_t do_head_stride;
+  index_t dq_batch_stride;
+  index_t dk_batch_stride;
+  index_t dv_batch_stride;
+  index_t dq_row_stride;
+  index_t dk_row_stride;
+  index_t dv_row_stride;
+  index_t dq_head_stride;
+  index_t dk_head_stride;
+  index_t dv_head_stride;
+
+  int* __restrict__ sort_by_length_indices;
+
+  int max_q_len_rounded, qk_d_rounded, v_d_rounded;
+
+  bool deterministic;
+  index_t dq_accum_split_stride;
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int Arch, typename T, int Headdim, bool Softmax>
+void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream);
+template <int Arch, typename T, int Headdim, bool Softmax>
+void run_mha_bwd_(Flash_bwd_params& params, cudaStream_t stream);
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp
new file mode 100644
index 000000000..389e6620f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp
@@ -0,0 +1,322 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#include <ATen/cuda/CUDAContext.h>
+#include <Python.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <cutlass/numeric_types.h>
+#include <torch/csrc/autograd/custom_function.h>
+#include <torch/csrc/autograd/function.h>
+#include <torch/library.h> // @manual
+#include <torch/nn/functional.h>
+#include "flash_common.h"
+
+extern "C" {
+/* Creates a dummy empty _C module that can be imported from Python.
+    The import from Python will load the .so consisting of this file
+    in this extension, so that the TORCH_LIBRARY static initializers
+    below are run. */
+PyObject* PyInit__C(void) {
+  static struct PyModuleDef module_def = {
+      PyModuleDef_HEAD_INIT,
+      "_C", /* name of module */
+      NULL, /* module documentation, may be NULL */
+      -1, /* size of per-interpreter state of the module,
+              or -1 if the module keeps state in global variables. */
+      NULL, /* methods */
+  };
+  return PyModule_Create(&module_def);
+}
+}
+
+namespace hstu {
+
+class HSTUFlashAttentionFunctionGPU
+    : public torch::autograd::Function<HSTUFlashAttentionFunctionGPU> {
+ public:
+  static at::Tensor forward(
+      torch::autograd::AutogradContext* ctx,
+      int64_t max_seq_len,
+      double alpha,
+      at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+      at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+      at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+      const std::optional<at::Tensor>& seq_offsets,
+      bool causal,
+      const std::optional<at::Tensor>& num_targets,
+      const std::optional<at::Tensor>& attn_scale,
+      int64_t max_attn_len,
+      int64_t min_full_attn_seq_len,
+      int64_t contextual_seq_len,
+      const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+      const std::optional<at::Tensor>& k_descale, // (b, h_k)
+      const std::optional<at::Tensor>& v_descale, // (b, h_k)
+      bool sort_by_length,
+      bool deterministic,
+      const int64_t sm_margin,
+      int64_t max_q_len,
+      const std::optional<at::Tensor>& seq_offsets_q,
+      int64_t num_softmax_heads,
+      bool training,
+      const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+      const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+      const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+      const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+          std::nullopt,
+      int64_t num_groups = 1) {
+    ctx->saved_data["max_seq_len"] = max_seq_len;
+    ctx->saved_data["alpha"] = alpha;
+    ctx->saved_data["causal"] = causal;
+    ctx->saved_data["max_attn_len"] = max_attn_len;
+    ctx->saved_data["min_full_attn_seq_len"] = min_full_attn_seq_len;
+    ctx->saved_data["contextual_seq_len"] = contextual_seq_len;
+    ctx->saved_data["deterministic"] = deterministic;
+    ctx->saved_data["sort_by_length"] = sort_by_length;
+    ctx->saved_data["sm_margin"] = sm_margin;
+    ctx->saved_data["max_q_len"] = max_q_len;
+    ctx->saved_data["num_softmax_heads"] = num_softmax_heads;
+    ctx->saved_data["num_groups"] = num_groups;
+    auto fwd_out = hstu::hstu_mha_fwd(
+        max_seq_len, // max_seq_len
+        alpha, // alpha
+        q, // q
+        k, // k
+        v, // v
+        seq_offsets, // seq_offsets
+        causal, // causal
+        num_targets, // num_targets
+        attn_scale, // attn_scale
+        max_attn_len, // max_attn_len
+        min_full_attn_seq_len, // min_full_attn_seq_len
+        contextual_seq_len, // contextual_seq_len
+        q_descale, // q_descale
+        k_descale, // k_descale
+        v_descale, // v_descale
+        sm_margin, // sm_margin
+        max_q_len, // max_q_len
+        seq_offsets_q, // seq_offsets_q
+        num_softmax_heads, // num_softmax_heads
+        training,
+        max_seq_len_tensor,
+        contextual_seq_len_tensor,
+        max_attn_len_tensor,
+        min_full_attn_seq_len_tensor,
+        num_groups);
+    auto out = get<0>(fwd_out);
+    auto softmax_lse = get<1>(fwd_out);
+    ctx->save_for_backward(
+        {q,
+         k,
+         v,
+         out,
+         seq_offsets.value_or(at::Tensor()),
+         num_targets.value_or(at::Tensor()),
+         attn_scale.value_or(at::Tensor()),
+         seq_offsets_q.value_or(at::Tensor()),
+         softmax_lse.value_or(at::Tensor()),
+         max_seq_len_tensor.value_or(at::Tensor()),
+         contextual_seq_len_tensor.value_or(at::Tensor()),
+         max_attn_len_tensor.value_or(at::Tensor()),
+         min_full_attn_seq_len_tensor.value_or(at::Tensor())});
+    return out;
+  }
+
+  static torch::autograd::variable_list backward(
+      torch::autograd::AutogradContext* ctx,
+      torch::autograd::variable_list grad_outputs) {
+    auto saved_tensors = ctx->get_saved_variables();
+    auto saved_data = ctx->saved_data;
+    auto q = saved_tensors[0];
+    auto k = saved_tensors[1];
+    auto v = saved_tensors[2];
+    auto out = saved_tensors[3];
+    auto seq_offsets = saved_tensors[4];
+    auto num_targets = saved_tensors[5];
+    auto attn_scale = saved_tensors[6];
+    auto seq_offsets_q = saved_tensors[7];
+    auto softmax_lse = saved_tensors[8];
+    auto max_seq_len_tensor = saved_tensors[9];
+    auto contextual_seq_len_tensor = saved_tensors[10];
+    auto max_attn_len_tensor = saved_tensors[11];
+    auto min_full_attn_seq_len_tensor = saved_tensors[12];
+    auto seq_offsets_opt =
+        seq_offsets.defined() ? std::optional(seq_offsets) : std::nullopt;
+    auto num_targets_opt =
+        num_targets.defined() ? std::optional(num_targets) : std::nullopt;
+    auto attn_scale_opt =
+        attn_scale.defined() ? std::optional(attn_scale) : std::nullopt;
+    auto seq_offsets_q_opt =
+        seq_offsets_q.defined() ? std::optional(seq_offsets_q) : std::nullopt;
+    auto softmax_lse_opt =
+        softmax_lse.defined() ? std::optional(softmax_lse) : std::nullopt;
+    auto max_seq_len_tensor_opt = max_seq_len_tensor.defined()
+        ? std::optional(max_seq_len_tensor)
+        : std::nullopt;
+    auto contextual_seq_len_tensor_opt = contextual_seq_len_tensor.defined()
+        ? std::optional(contextual_seq_len_tensor)
+        : std::nullopt;
+    auto max_attn_len_tensor_opt = max_attn_len_tensor.defined()
+        ? std::optional(max_attn_len_tensor)
+        : std::nullopt;
+    auto min_full_attn_seq_len_tensor_opt =
+        min_full_attn_seq_len_tensor.defined()
+        ? std::optional(min_full_attn_seq_len_tensor)
+        : std::nullopt;
+
+    auto dq = at::empty_like(q);
+    auto dk = at::empty_like(k);
+    auto dv = at::empty_like(v);
+
+    auto bwd_res = hstu::hstu_mha_bwd(
+        saved_data["max_seq_len"].toInt(), // max_seq_len
+        saved_data["alpha"].toDouble(), // alpha
+        grad_outputs[0], // dout
+        q, // q
+        k, // k
+        v, // v
+        dq, // dq
+        dk, // dk
+        dv, // dv
+        out, // out
+        seq_offsets_opt, // seq_offsets
+        saved_data["causal"].toBool(), // causal
+        num_targets_opt, // num_targets
+        attn_scale_opt, // attn_scale
+        saved_data["max_attn_len"].toInt(), // max_attn_len
+        saved_data["min_full_attn_seq_len"].toInt(), // min_full_attn_seq_len
+        saved_data["contextual_seq_len"].toInt(), // contextual_seq_len
+        saved_data["sort_by_length"].toBool(), // sort_by_length
+        saved_data["deterministic"].toBool(), // deterministic
+        saved_data["sm_margin"].toInt(), // sm_margin
+        saved_data["max_q_len"].toInt(), // max_q_len
+        seq_offsets_q_opt, // seq_offsets_q
+        saved_data["num_softmax_heads"].toInt(), // num_softmax_heads
+        softmax_lse_opt,
+        max_seq_len_tensor_opt,
+        contextual_seq_len_tensor_opt,
+        max_attn_len_tensor_opt,
+        min_full_attn_seq_len_tensor_opt,
+        saved_data["num_groups"].toInt());
+
+    return {
+        torch::autograd::Variable(), // max_seq_len
+        torch::autograd::Variable(), // alpha
+        bwd_res[0], // dq
+        bwd_res[1], // dk
+        bwd_res[2], // dv
+        torch::autograd::Variable(), // seq_offsets
+        torch::autograd::Variable(), // causal
+        torch::autograd::Variable(), // num_targets
+        torch::autograd::Variable(), // attn_scale
+        torch::autograd::Variable(), // max_attn_len
+        torch::autograd::Variable(), // min_full_attn_seq_len
+        torch::autograd::Variable(), // contextual_seq_len
+        torch::autograd::Variable(), // q_descale
+        torch::autograd::Variable(), // k_descale
+        torch::autograd::Variable(), // v_descale
+        torch::autograd::Variable(), // sort_by_length
+        torch::autograd::Variable(), // deterministic
+        torch::autograd::Variable(), // sm_margin
+        torch::autograd::Variable(), // max_q_len
+        torch::autograd::Variable(), // seq_offsets_q
+        torch::autograd::Variable(), // num_softmax_heads
+        torch::autograd::Variable(), // training
+        torch::autograd::Variable(), // max_seq_len_tensor
+        torch::autograd::Variable(), // contextual_seq_len_tensor
+        torch::autograd::Variable(), // max_attn_len_tensor
+        torch::autograd::Variable(), // min_full_attn_seq_len_tensor
+        torch::autograd::Variable(), // num_groups
+    };
+  }
+};
+
+at::Tensor cuda_hstu_mha(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    bool sort_by_length,
+    bool deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1) {
+  return hstu::HSTUFlashAttentionFunctionGPU::apply(
+      max_seq_len,
+      alpha,
+      q,
+      k,
+      v,
+      seq_offsets,
+      causal,
+      num_targets,
+      attn_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      q_descale,
+      k_descale,
+      v_descale,
+      sort_by_length,
+      deterministic,
+      sm_margin,
+      max_q_len,
+      seq_offsets_q,
+      num_softmax_heads,
+      training,
+      max_seq_len_tensor,
+      contextual_seq_len_tensor,
+      max_attn_len_tensor,
+      min_full_attn_seq_len_tensor,
+      num_groups);
+}
+
+TORCH_LIBRARY_FRAGMENT(hstu, m) {
+  m.impl(
+      "hstu_mha",
+      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(cuda_hstu_mha)));
+
+  m.impl(
+      "hstu_mha_fwd",
+      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(hstu::hstu_mha_fwd)));
+
+  m.impl(
+      "hstu_mha_bwd",
+      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(hstu::hstu_mha_bwd)));
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp
new file mode 100644
index 000000000..c02424efe
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp
@@ -0,0 +1,256 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#include <torch/library.h> // @manual
+#include <torch/nn/functional.h>
+#include "flash_common_cpu.h"
+
+namespace hstu {
+
+at::Tensor hstu_mha_cpu(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    bool sort_by_length,
+    bool deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1) {
+  auto fwd_out = hstu::hstu_mha_fwd_dummy(
+      max_seq_len,
+      alpha,
+      q,
+      k,
+      v,
+      seq_offsets,
+      causal,
+      num_targets,
+      attn_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      q_descale,
+      k_descale,
+      v_descale,
+      sm_margin,
+      max_q_len,
+      seq_offsets_q,
+      num_softmax_heads,
+      training);
+  return get<0>(fwd_out);
+}
+
+at::Tensor hstu_mha_meta(
+    const at::SymInt max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    bool sort_by_length,
+    bool deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1) {
+  auto fwd_out = hstu::hstu_mha_fwd_meta(
+      max_seq_len,
+      alpha,
+      q,
+      k,
+      v,
+      seq_offsets,
+      causal,
+      num_targets,
+      attn_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      q_descale,
+      k_descale,
+      v_descale,
+      sm_margin,
+      max_q_len,
+      seq_offsets_q,
+      num_softmax_heads,
+      training);
+  return get<0>(fwd_out);
+}
+
+// CPU-only implementation that registers under main hstu namespace
+// This provides fallback implementations when GPU code is not compiled
+TORCH_LIBRARY_FRAGMENT(hstu, m) {
+  // Only register operators if they haven't been registered by GPU code
+  // This allows CPU-only builds to work while GPU builds use GPU
+  // implementations
+
+  m.def(
+      "hstu_mha_fwd("
+      "SymInt max_seq_len, "
+      "float alpha, "
+      "Tensor q, "
+      "Tensor k, "
+      "Tensor v, "
+      "Tensor? seq_offsets, "
+      "bool causal, "
+      "Tensor? num_targets, "
+      "Tensor? attn_scale, "
+      "int max_attn_len, "
+      "int min_full_attn_seq_len, "
+      "int contextual_seq_len, "
+      "Tensor? q_descale, "
+      "Tensor? k_descale, "
+      "Tensor? v_descale, "
+      "int sm_margin = 0,"
+      "int max_q_len = 0,"
+      "Tensor? seq_offsets_q = None,"
+      "int num_softmax_heads = 0,"
+      "bool training = True,"
+      "Tensor? max_seq_len_tensor = None,"
+      "Tensor? contextual_seq_len_tensor = None,"
+      "Tensor? max_attn_len_tensor = None,"
+      "Tensor? min_full_attn_seq_len_tensor = None,"
+      "int num_groups = 1"
+      ") -> (Tensor, Tensor?)");
+
+  m.def(
+      "hstu_mha_bwd("
+      "int max_seq_len, "
+      "float alpha, "
+      "Tensor dout, "
+      "Tensor q, "
+      "Tensor k, "
+      "Tensor v, "
+      "Tensor dq, "
+      "Tensor dk, "
+      "Tensor dv, "
+      "Tensor out, "
+      "Tensor? seq_offsets, "
+      "bool causal, "
+      "Tensor? num_targets, "
+      "Tensor? attn_scale, "
+      "int max_attn_len, "
+      "int min_full_attn_seq_len, "
+      "int contextual_seq_len, "
+      "bool sort_by_length,"
+      "bool deterministic,"
+      "int sm_margin = 0,"
+      "int max_q_len = 0,"
+      "Tensor? seq_offsets_q = None,"
+      "int num_softmax_heads = 0,"
+      "Tensor? softmax_lse = None,"
+      "Tensor? max_seq_len_tensor = None,"
+      "Tensor? contextual_seq_len_tensor = None,"
+      "Tensor? max_attn_len_tensor = None,"
+      "Tensor? min_full_attn_seq_len_tensor = None,"
+      "int num_groups = 1"
+      ") -> Tensor[]");
+
+  m.def(
+      "hstu_mha("
+      "SymInt max_seq_len, "
+      "float alpha, "
+      "Tensor q, "
+      "Tensor k, "
+      "Tensor v, "
+      "Tensor? seq_offsets, "
+      "bool causal, "
+      "Tensor? num_targets, "
+      "Tensor? attn_scale, "
+      "int max_attn_len, "
+      "int min_full_attn_seq_len, "
+      "int contextual_seq_len, "
+      "Tensor? q_descale, "
+      "Tensor? k_descale, "
+      "Tensor? v_descale, "
+      "bool sort_by_length, "
+      "bool deterministic, "
+      "int sm_margin = 0,"
+      "int max_q_len = 0,"
+      "Tensor? seq_offsets_q = None,"
+      "int num_softmax_heads = 0,"
+      "bool training = True,"
+      "Tensor? max_seq_len_tensor = None,"
+      "Tensor? contextual_seq_len_tensor = None,"
+      "Tensor? max_attn_len_tensor = None,"
+      "Tensor? min_full_attn_seq_len_tensor = None,"
+      "int num_groups = 1"
+      ") -> Tensor");
+
+  // Register CPU implementations
+  m.impl(
+      "hstu_mha",
+      torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(hstu_mha_cpu)));
+  m.impl(
+      "hstu_mha",
+      torch::dispatch(c10::DispatchKey::Meta, TORCH_FN(hstu_mha_meta)));
+
+  m.impl(
+      "hstu_mha_fwd",
+      torch::dispatch(
+          c10::DispatchKey::CPU, TORCH_FN(hstu::hstu_mha_fwd_dummy)));
+  m.impl(
+      "hstu_mha_fwd",
+      torch::dispatch(
+          c10::DispatchKey::Meta, TORCH_FN(hstu::hstu_mha_fwd_meta)));
+
+  m.impl(
+      "hstu_mha_bwd",
+      torch::dispatch(
+          c10::DispatchKey::CPU, TORCH_FN(hstu::hstu_mha_bwd_dummy)));
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h
new file mode 100644
index 000000000..051d5141b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h
@@ -0,0 +1,402 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include <cutlass/arch/reg_reconfig.h>
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/kernel_hardware_info.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include "cutlass/pipeline/pipeline.hpp"
+
+#include "tile_scheduler.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    bool Softmax,
+    class CollectiveMainloop_,
+    class CollectiveEpilogue_,
+    class TileScheduler_>
+class FlashAttnBwdSm90 {
+ public:
+  // Mainloop derived types
+  using CollectiveMainloop = CollectiveMainloop_;
+  using TileShape_MNK = typename CollectiveMainloop::TileShape_MNK;
+  using TiledMmaSdP = typename CollectiveMainloop::TiledMmaSdP;
+  using TiledMmadKV = typename CollectiveMainloop::TiledMmadKV;
+  using ArchTag = typename CollectiveMainloop::ArchTag;
+  using ClusterShape = typename CollectiveMainloop::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  static constexpr bool dKV_swapAB = CollectiveMainloop::dKV_swapAB;
+
+  // Epilogue derived types
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  using TileScheduler = TileScheduler_;
+  using TileSchedulerArguments = typename hstu::TileSchedulerArguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  static constexpr uint32_t NumLoadWarpGroups = 1;
+  static constexpr uint32_t NumMmaWarpGroups =
+      CUTE_STATIC_V(size(TiledMmaSdP{})) / cutlass::NumThreadsPerWarpGroup;
+  static constexpr uint32_t MaxThreadsPerBlock =
+      CUTE_STATIC_V(size(TiledMmaSdP{})) +
+      (NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup);
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+  static_assert(NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+
+  /// Register requirement for Load and Math WGs
+  static constexpr uint32_t LoadRegisterRequirement =
+      NumMmaWarpGroups == 2 ? 24 : 32;
+  static constexpr uint32_t MmaRegisterRequirement =
+      NumMmaWarpGroups == 2 ? 240 : 160;
+  // If you want to print from the producer warp, you'd need to increase the
+  // number of registers Otherwise you'll get CUDA error. static constexpr
+  // uint32_t LoadRegisterRequirement = 40; static constexpr uint32_t
+  // MmaRegisterRequirement = NumMmaWarpGroups == 2 ? 232 : 152;
+
+  // Kernel level shared memory storage
+  struct SharedStorage {
+    struct TensorStorage : cute::aligned_struct<128> {
+      union {
+        typename CollectiveMainloop::TensorStorage mainloop;
+        typename CollectiveEpilogue::TensorStorage epilogue;
+      };
+    } tensors;
+
+    struct PipelineStorage : cute::aligned_struct<16> {
+      alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_KV;
+      alignas(16) typename CollectiveMainloop::MainloopPipeline::SharedStorage
+          pipeline_q;
+      alignas(16)
+          typename CollectiveMainloop::MainloopPipeline_dO::SharedStorage
+          pipeline_do;
+      alignas(16) typename TileScheduler::SharedStorage smem_scheduler;
+    } pipelines;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  // Device side arguments
+  struct Arguments {
+    MainloopArguments mainloop{};
+    EpilogueArguments epilogue{};
+    cutlass::KernelHardwareInfo hw_info{};
+    TileSchedulerArguments scheduler{};
+  };
+
+  // Kernel entry point API
+  struct Params {
+    MainloopParams mainloop{};
+    EpilogueParams epilogue{};
+    cutlass::KernelHardwareInfo hw_info{};
+    TileSchedulerParams scheduler{};
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert to underlying arguments. In this case, a simple copy for the
+  // aliased type.
+  static Params to_underlying_arguments(Arguments const& args) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST(
+          "  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+          args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST(
+        "to_underlying_arguments(): Setting persistent grid SM count to "
+        << sm_count);
+
+    cutlass::KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    return {
+        CollectiveMainloop::to_underlying_arguments(args.mainloop),
+        CollectiveEpilogue::to_underlying_arguments(args.epilogue),
+        hw_info,
+        TileScheduler::to_underlying_arguments(args.scheduler)};
+  }
+
+  // Computes the kernel launch grid shape based on runtime parameters
+  static dim3 get_grid_shape(Params const& params) {
+    return TileScheduler::get_grid_shape(
+        params.scheduler, params.hw_info.sm_count);
+  }
+
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  CUTLASS_DEVICE
+  void operator()(Params const& params, char* smem_buf) {
+    static constexpr int NumMmaThreads =
+        NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup;
+    static constexpr int NumCopyThreads =
+        NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup;
+
+    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
+    using PipelineParams = typename MainloopPipeline::Params;
+    using PipelineState = typename MainloopPipeline::PipelineState;
+    using MainloopPipeline_dO =
+        typename CollectiveMainloop::MainloopPipeline_dO;
+    using PipelineParams_dO = typename MainloopPipeline_dO::Params;
+    using PipelineState_dO = typename MainloopPipeline_dO::PipelineState;
+    static constexpr bool Q_dO_same_stages =
+        std::is_same_v<MainloopPipeline, MainloopPipeline_dO>;
+
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int const lane_predicate = cute::elect_one_sync();
+    int const warp_idx = cutlass::canonical_warp_idx_sync();
+
+    // Issue Tma Descriptor Prefetch from a single thread
+    if (warp_idx == 0 && lane_predicate) {
+      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
+      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
+    }
+
+    // Obtain warp index
+    int const warp_group_thread_idx =
+        threadIdx.x % cutlass::NumThreadsPerWarpGroup;
+
+    PipelineParams pipeline_params;
+    if constexpr (Softmax) {
+      pipeline_params.transaction_bytes =
+          CollectiveMainloop::TmaTransactionBytesQ +
+          CollectiveMainloop::TmaTransactionBytesLSE;
+    } else {
+      pipeline_params.transaction_bytes =
+          CollectiveMainloop::TmaTransactionBytesQ;
+    }
+    int warp_group_idx = cutlass::canonical_warp_group_idx();
+    pipeline_params.role = warp_group_idx == 0
+        ? MainloopPipeline::ThreadCategory::Producer
+        : MainloopPipeline::ThreadCategory::Consumer;
+    pipeline_params.is_leader = warp_group_thread_idx == 0;
+    pipeline_params.num_consumers = NumMmaThreads;
+
+    if (warp_idx == 0 && lane_predicate) {
+      shared_storage.pipelines.barrier_KV.init(1 /*numThreads*/);
+    }
+    // We're counting on pipeline_q to call cutlass::arch::fence_barrier_init();
+    MainloopPipeline pipeline_q(
+        shared_storage.pipelines.pipeline_q, pipeline_params, ClusterShape{});
+    auto role_dO = warp_group_idx == 0
+        ? MainloopPipeline_dO::ThreadCategory::Producer
+        : MainloopPipeline_dO::ThreadCategory::Consumer;
+    PipelineParams_dO pipeline_params_dO{
+        pipeline_params.transaction_bytes,
+        role_dO,
+        pipeline_params.is_leader,
+        pipeline_params.num_consumers};
+    MainloopPipeline_dO pipeline_do(
+        shared_storage.pipelines.pipeline_do,
+        cute::conditional_return<Q_dO_same_stages>(
+            pipeline_params, pipeline_params_dO),
+        ClusterShape{});
+
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue;
+
+    // We need this to guarantee that the Pipeline init is visible to all
+    // producers and consumer blocks in the Cluster
+    if constexpr (size(ClusterShape{}) > 1) {
+      cute::cluster_arrive_relaxed();
+      cute::cluster_wait();
+    } else {
+      __syncthreads();
+    }
+
+    if (warp_group_idx == 0) { // Producer
+      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
+
+      int warp_idx_in_warpgroup =
+          __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+      if (warp_idx_in_warpgroup == 0) { // Load K, V, and do TMA on Q and dO
+        PipelineState smem_pipe_write =
+            cutlass::make_producer_start_state<MainloopPipeline>();
+        PipelineState_dO smem_pipe_write_do =
+            cutlass::make_producer_start_state<MainloopPipeline_dO>();
+
+        TileScheduler scheduler(
+            reinterpret_cast<typename TileScheduler::SharedStorage*>(
+                &shared_storage.pipelines.smem_scheduler));
+        for (auto work_tile_info =
+                 scheduler.template get_initial_work</*IsProducerWarp=*/true>(
+                     params.scheduler);
+             work_tile_info.is_valid(params.scheduler);
+             work_tile_info =
+                 scheduler.template get_next_work</*IsProducerWarp=*/true>(
+                     params.scheduler, work_tile_info)) {
+          auto block_coord_ = work_tile_info.get_block_coord(params.scheduler);
+          auto [n_block, bidh, bidb, _ /*split_idx*/] = block_coord_;
+          cute::tuple<int32_t, int32_t, int32_t> block_coord = {
+              n_block, bidh, bidb};
+          auto scheduler_prefetch = [&scheduler, &params, &work_tile_info]() {
+            scheduler.prefetch_next_work(params.scheduler, work_tile_info);
+          };
+          collective_mainloop.load(
+              params.mainloop,
+              pipeline_q,
+              pipeline_do,
+              smem_pipe_write,
+              smem_pipe_write_do,
+              shared_storage,
+              scheduler_prefetch,
+              block_coord);
+        }
+        collective_mainloop.load_tail(
+            pipeline_q, pipeline_do, smem_pipe_write, smem_pipe_write_do);
+      } else if (warp_idx_in_warpgroup == 1) {
+        TileScheduler scheduler(
+            reinterpret_cast<typename TileScheduler::SharedStorage*>(
+                &shared_storage.pipelines.smem_scheduler));
+        for (auto work_tile_info =
+                 scheduler.template get_initial_work</*IsProducerWarp=*/false>(
+                     params.scheduler);
+             work_tile_info.is_valid(params.scheduler);
+             work_tile_info =
+                 scheduler.template get_next_work</*IsProducerWarp=*/false>(
+                     params.scheduler, work_tile_info)) {
+          auto block_coord_ = work_tile_info.get_block_coord(params.scheduler);
+          auto [n_block, bidh, bidb, _ /*split_idx*/] = block_coord_;
+          cute::tuple<int32_t, int32_t, int32_t> block_coord = {
+              n_block, bidh, bidb};
+          collective_mainloop.store_dq(
+              params.mainloop, shared_storage, block_coord);
+        }
+      }
+    } else { // Consumer
+      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
+
+      TileScheduler scheduler(
+          reinterpret_cast<typename TileScheduler::SharedStorage*>(
+              &shared_storage.pipelines.smem_scheduler));
+      // Initialize matmul objects.
+      TiledMmadKV tiled_mma_dKV;
+
+      PipelineState smem_pipe_read;
+      PipelineState_dO smem_pipe_read_do;
+
+      collective_mainloop.mma_init();
+      scheduler.init_consumer();
+
+      int work_idx = 0;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (auto work_tile_info =
+               scheduler.template get_initial_work</*IsProducerWarp=*/false>(
+                   params.scheduler);
+           work_tile_info.is_valid(params.scheduler);
+           work_tile_info =
+               scheduler.template get_next_work</*IsProducerWarp=*/false>(
+                   params.scheduler, work_tile_info)) {
+        auto block_coord_ = work_tile_info.get_block_coord(params.scheduler);
+        auto [n_block, bidh, bidb, _ /*split_idx*/] = block_coord_;
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+        if (threadIdx.x == 0 || threadIdx.x == 128) {
+          std::printf(
+              "n_block: (%d), bidh: (%d), bidb: (%d), blockIdx.x: (%d), blockIdx.y: (%d), blockIdx.z: (%d)\n",
+              n_block,
+              bidh,
+              bidb,
+              blockIdx.x,
+              blockIdx.y,
+              blockIdx.z);
+        }
+#endif
+        cute::tuple<int32_t, int32_t, int32_t> block_coord = {
+            n_block, bidh, bidb};
+
+        // dK and dV output accumulator.
+        Tensor tdKrdK = partition_fragment_C(
+            tiled_mma_dKV,
+            select<!dKV_swapAB ? 1 : 2, !dKV_swapAB ? 2 : 1>(TileShape_MNK{}));
+        Tensor tdVrdV = partition_fragment_C(
+            tiled_mma_dKV,
+            select<!dKV_swapAB ? 1 : 2, !dKV_swapAB ? 2 : 1>(TileShape_MNK{}));
+
+        bool tile_valid;
+        if constexpr (Softmax) {
+          tile_valid = collective_mainloop.mma_softmax(
+              params.mainloop,
+              pipeline_q,
+              pipeline_do,
+              smem_pipe_read,
+              smem_pipe_read_do,
+              tdKrdK,
+              tdVrdV,
+              threadIdx.x - NumCopyThreads,
+              work_idx,
+              block_coord,
+              shared_storage);
+        } else {
+          tile_valid = collective_mainloop.mma(
+              params.mainloop,
+              pipeline_q,
+              pipeline_do,
+              smem_pipe_read,
+              smem_pipe_read_do,
+              tdKrdK,
+              tdVrdV,
+              threadIdx.x - NumCopyThreads,
+              work_idx,
+              block_coord,
+              shared_storage);
+        }
+        if (tile_valid) {
+          collective_epilogue.store(
+              params.epilogue,
+              tdKrdK,
+              tdVrdV,
+              shared_storage,
+              tiled_mma_dKV,
+              threadIdx.x - NumCopyThreads,
+              block_coord);
+        } else {
+          collective_epilogue.store_zero(
+              params.epilogue, threadIdx.x - NumCopyThreads, block_coord);
+        }
+      }
+      collective_epilogue.store_tail();
+    }
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h
new file mode 100644
index 000000000..6900852df
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h
@@ -0,0 +1,492 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/cluster_launch.hpp" // For ClusterLauncher
+#include "cutlass/device_kernel.h" // For device_kernel
+#include "cutlass/kernel_launch.h" // For kernel_launch
+
+#include "epilogue_bwd.h"
+#include "flash.h"
+#include "flash_bwd_kernel_sm90.h"
+#include "flash_bwd_postprocess_kernel.h"
+#include "flash_bwd_preprocess_kernel.h"
+#include "mainloop_bwd_sm90_tma_gmma_ws.h"
+#include "static_switch.h"
+#include "tile_scheduler.h"
+#include "tile_size.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    int Arch,
+    int kHeadDim,
+    int kBlockM,
+    int kBlockN,
+    typename Element,
+    bool Causal,
+    bool Local,
+    bool Contexual_mask,
+    bool Jagged,
+    bool Has_targets,
+    bool Deterministic,
+    int Stages_dO = 2,
+    int Stages_dS_or_QSm80 = 2,
+    bool SdP_swapAB = true,
+    bool dKV_swapAB = false,
+    bool dQ_swapAB = false,
+    int NumMmaWarpGroups = 2,
+    int AtomLayoutMSdP = 1,
+    int AtomLayoutNdKV = 2,
+    int AtomLayoutMdQ = 1,
+    bool V_in_regs = false,
+    bool Cross = false,
+    bool Softmax = false>
+void run_flash_bwd(hstu::Flash_bwd_params& params, cudaStream_t stream) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+  std::printf(
+      "[flash_bwd_launch_template] Local: (%d), Jagged: (%d), Has_targets: (%d), Causal: (%d), max_kv_len: (%d), kHeadDim: (%d), kBlockM: (%d), kBlockN: (%d)\n",
+      Local,
+      Jagged,
+      Has_targets,
+      Causal,
+      params.max_kv_len,
+      kHeadDim,
+      kBlockM,
+      kBlockN);
+#endif
+  static_assert(
+      !(Causal && Local), "Causal and Local cannot be true at the same time.");
+  using ElementAccum = float;
+  using ArchTag =
+      std::conditional_t<Arch >= 90, cutlass::arch::Sm90, cutlass::arch::Sm80>;
+
+  int const total_q_padded_rounded =
+      cute::round_up(params.total_seq_len_q + params.b * kBlockM, kBlockM);
+  int seqlen_q = !Jagged ? params.max_q_len : params.total_seq_len_q;
+  int seqlen_kv = !Jagged ? params.max_kv_len : params.total_seq_len_kv;
+  int seqlen_q_rounded =
+      !Jagged ? params.max_q_len_rounded : total_q_padded_rounded;
+  int batch = !Jagged ? params.b : 1;
+
+  using TileShape_MK = cute::Shape<Int<kBlockM>, Int<kHeadDim>>;
+  using PreprocessKernel = hstu::FlashAttnBwdPreprocess<
+      TileShape_MK,
+      Element,
+      ElementAccum,
+      ArchTag,
+      /*Clear_dQaccum=*/true,
+      Jagged,
+      Softmax>;
+  typename PreprocessKernel::Arguments preprocess_args{
+      static_cast<Element const*>(params.o_ptr),
+      {seqlen_q, params.v_d, params.h, batch}, // shape_O
+      {params.o_row_stride,
+       _1{},
+       params.o_head_stride,
+       !Jagged ? params.o_batch_stride : 0}, // stride_O
+      static_cast<Element const*>(params.do_ptr),
+      {params.do_row_stride,
+       _1{},
+       params.do_head_stride,
+       !Jagged ? params.do_batch_stride : 0}, // stride_dO
+      static_cast<float*>(params.softmax_d),
+      {seqlen_q_rounded, params.num_softmax_heads, batch}, // shape_dPsum
+      {_1{},
+       seqlen_q_rounded,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_dPsum
+      static_cast<float*>(params.softmax_lse),
+      {_1{},
+       seqlen_q,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_LSE
+      static_cast<float*>(params.softmax_lse_log2),
+      {_1{},
+       seqlen_q_rounded,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_LSE_log2
+      static_cast<ElementAccum*>(params.dq_accum_ptr),
+      {seqlen_q_rounded * params.qk_d_rounded,
+       params.h,
+       batch}, // shape_dQaccum
+      {_1{},
+       seqlen_q_rounded * params.qk_d_rounded,
+       !Jagged ? params.qk_d_rounded * params.max_q_len_rounded * params.h
+               : 0}, // stride_dQaccum
+      params.b,
+      params.h,
+      params.num_softmax_heads,
+      params.max_q_len,
+      params.dq_semaphore,
+      Cross ? params.seq_offsets_q : params.seq_offsets};
+  typename PreprocessKernel::Params preprocess_params =
+      PreprocessKernel::to_underlying_arguments(preprocess_args);
+  int num_m_block = cute::ceil_div(params.max_q_len, kBlockM);
+  dim3 grid_m(num_m_block, params.h, params.b);
+  cutlass::kernel_launch<PreprocessKernel>(
+      grid_m,
+      PreprocessKernel::MaxThreadsPerBlock,
+      PreprocessKernel::SharedStorageSize,
+      stream,
+      preprocess_params,
+      false /*launch_with_pdl*/);
+  CHECK_CUDA_KERNEL_LAUNCH();
+
+  using TileShape_MNK = cute::Shape<Int<kBlockM>, Int<kBlockN>, Int<kHeadDim>>;
+  using ClusterShape =
+      cute::Shape<_1, Int<1>, _1>; // Currently doesn't not support cluster
+  // Stages_dS_or_QSm80 is Stages_dS if Sm90 and Stages if Sm80
+  static constexpr int Stages = Arch >= 90 ? 2 : Stages_dS_or_QSm80;
+  static constexpr int Stages_dS = Arch >= 90 ? Stages_dS_or_QSm80 : 1;
+  using CollectiveMainloop = hstu::CollectiveMainloopBwdSm90<
+      Stages,
+      Stages_dO,
+      Stages_dS,
+      ClusterShape,
+      TileShape_MNK,
+      Element,
+      ElementAccum,
+      cutlass::arch::Sm90,
+      Causal,
+      Local,
+      Contexual_mask,
+      Jagged,
+      Has_targets,
+      Deterministic,
+      SdP_swapAB,
+      dKV_swapAB,
+      dQ_swapAB,
+      NumMmaWarpGroups,
+      AtomLayoutMSdP,
+      AtomLayoutNdKV,
+      AtomLayoutMdQ,
+      V_in_regs,
+      Cross,
+      Softmax>;
+  using CollectiveEpilogue = hstu::CollectiveEpilogueBwd<
+      TileShape_MNK,
+      Element,
+      ArchTag,
+      CollectiveMainloop::NumMmaThreads,
+      Jagged,
+      dKV_swapAB,
+      NumMmaWarpGroups*(Arch >= 90 ? 1 : cutlass::NumWarpsPerWarpGroup) /
+          AtomLayoutNdKV>;
+  using Scheduler =
+      hstu::SingleTileScheduler<Jagged, kBlockN, false /*Sort_by_length*/>;
+  using AttnKernel = hstu::enable_sm90_or_later<hstu::FlashAttnBwdSm90<
+      Softmax,
+      CollectiveMainloop,
+      CollectiveEpilogue,
+      Scheduler>>;
+
+  typename CollectiveMainloop::Arguments mainloop_args{
+      static_cast<Element const*>(params.q_ptr),
+      {seqlen_q, params.qk_d, params.h, batch}, // shape_Q
+      {params.q_row_stride,
+       _1{},
+       params.q_head_stride,
+       !Jagged ? params.q_batch_stride : 0}, // stride_Q
+      static_cast<Element const*>(params.k_ptr),
+      {seqlen_kv, params.qk_d, params.h, batch}, // shape_K
+      {params.k_row_stride,
+       _1{},
+       params.k_head_stride,
+       !Jagged ? params.k_batch_stride : 0}, // stride_K
+      static_cast<Element const*>(params.v_ptr),
+      {seqlen_kv, params.v_d, params.h, batch}, // shape_V
+      {params.v_row_stride,
+       _1{},
+       params.v_head_stride,
+       !Jagged ? params.v_batch_stride : 0}, // stride_V
+      static_cast<Element const*>(params.do_ptr),
+      {seqlen_q, params.v_d, params.h, batch}, // shape_dO
+      {params.do_row_stride,
+       _1{},
+       params.do_head_stride,
+       !Jagged ? params.do_batch_stride : 0}, // stride_dO
+      static_cast<ElementAccum*>(params.dq_accum_ptr),
+      {seqlen_q_rounded * params.qk_d_rounded,
+       params.h,
+       batch}, // shape_dQaccum
+      {_1{},
+       seqlen_q_rounded * params.qk_d_rounded,
+       !Jagged ? params.qk_d_rounded * params.max_q_len_rounded * params.h
+               : 0}, // stride_dQaccum
+      static_cast<float*>(params.softmax_lse_log2),
+      {seqlen_q_rounded, params.num_softmax_heads, batch}, // shape_LSE
+      {_1{},
+       seqlen_q_rounded,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_LSE_log2
+      static_cast<float*>(params.softmax_d),
+      {_1{},
+       seqlen_q_rounded,
+       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
+               : 0}, // stride_dPsum
+      params.max_attn_len,
+      params.min_full_attn_seq_len,
+      params.contextual_seq_len,
+      1.0f / params.max_kv_len,
+      params.alpha,
+      params.b,
+      params.num_softmax_heads,
+      params.num_groups,
+      params.batch_size_per_group,
+      params.dq_semaphore,
+      params.seq_offsets,
+      params.seq_offsets_q,
+      params.num_targets,
+      params.max_seq_len_tensor,
+      params.contextual_seq_len_tensor,
+      params.max_attn_len_tensor,
+      params.min_full_attn_seq_len_tensor,
+      params.attn_scale,
+      params.scalar_scale};
+  typename CollectiveEpilogue::Arguments epilogue_args{
+      static_cast<typename CollectiveEpilogue::Element*>(params.dk_ptr),
+      [&] {
+        return typename CollectiveEpilogue::ShapedKV{
+            seqlen_kv, params.qk_d, params.h, batch}; // shape_dK
+      }(),
+      [&] {
+        return typename CollectiveEpilogue::StridedKV{
+            params.dk_row_stride,
+            _1{},
+            params.dk_head_stride,
+            !Jagged ? params.dk_batch_stride : 0}; // stride_dK
+      }(),
+      static_cast<typename CollectiveEpilogue::Element*>(params.dv_ptr),
+      [&] {
+        return typename CollectiveEpilogue::StridedKV{
+            params.dv_row_stride,
+            _1{},
+            params.dv_head_stride,
+            !Jagged ? params.dv_batch_stride : 0}; // stride_dV
+      }(),
+      params.h,
+      params.seq_offsets};
+
+  int num_blocks_n =
+      cutlass::ceil_div(params.max_kv_len, get<1>(TileShape_MNK{}));
+  num_blocks_n = cutlass::round_up(num_blocks_n, size<1>(ClusterShape{}));
+  typename hstu::TileSchedulerArguments scheduler_args{
+      num_blocks_n,
+      params.h,
+      params.b,
+      params.max_kv_len,
+      params.qk_d,
+      sizeof(Element),
+      params.tile_count_semaphore,
+      params.seq_offsets,
+      params.sort_by_length_indices};
+
+  int device;
+  cudaGetDevice(&device);
+  typename AttnKernel::Params kernel_params =
+      AttnKernel::to_underlying_arguments(
+          {mainloop_args,
+           epilogue_args,
+           {device, params.num_sm},
+           scheduler_args});
+
+  dim3 grid_dims = AttnKernel::get_grid_shape(kernel_params);
+  dim3 block_dims = AttnKernel::get_block_shape();
+  int smem_size = AttnKernel::SharedStorageSize;
+  if constexpr (size(ClusterShape{}) > 1) {
+    void const* kernel = (void const*)cutlass::device_kernel<AttnKernel>;
+    if (smem_size >= 48 * 1024) {
+      CHECK_CUDA(cudaFuncSetAttribute(
+          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    }
+    dim3 cluster_dims(
+        size<0>(ClusterShape{}),
+        size<1>(ClusterShape{}),
+        size<2>(ClusterShape{}));
+    cutlass::ClusterLauncher::launch(
+        grid_dims,
+        cluster_dims,
+        block_dims,
+        smem_size,
+        stream,
+        kernel,
+        kernel_params,
+        false /*launch_with_pdl*/);
+  } else {
+    if (smem_size >= 48 * 1024) {
+      CHECK_CUDA(cudaFuncSetAttribute(
+          cutlass::device_kernel<AttnKernel>,
+          cudaFuncAttributeMaxDynamicSharedMemorySize,
+          smem_size));
+    }
+    cutlass::kernel_launch<AttnKernel>(
+        grid_dims,
+        block_dims,
+        smem_size,
+        stream,
+        kernel_params,
+        false /*launch_with_pdl*/);
+  }
+  CHECK_CUDA_KERNEL_LAUNCH();
+
+  using PostprocessKernel = hstu::FlashAttnBwdPostprocessConvertdQ<
+      TileShape_MK,
+      Element,
+      ElementAccum,
+      ArchTag,
+      AttnKernel::CollectiveMainloop::NumMmaThreads,
+      typename AttnKernel::CollectiveMainloop::TiledMmadQ,
+      AttnKernel::CollectiveMainloop::dQ_swapAB,
+      Jagged,
+      Softmax>;
+  typename PostprocessKernel::Arguments postprocess_args{
+      static_cast<ElementAccum const*>(params.dq_accum_ptr),
+      {seqlen_q_rounded * params.qk_d_rounded,
+       params.h,
+       batch}, // shape_dQaccum
+      {_1{},
+       seqlen_q_rounded * params.qk_d_rounded,
+       !Jagged ? params.qk_d_rounded * params.max_q_len_rounded * params.h
+               : 0}, // stride_dQaccum
+      static_cast<Element*>(params.dq_ptr),
+      {seqlen_q, params.qk_d, params.h, batch}, // shape_dQ
+      {params.dq_row_stride,
+       _1{},
+       params.dq_head_stride,
+       params.dq_batch_stride}, // stride_dQ
+      Cross ? params.seq_offsets_q : params.seq_offsets};
+  typename PostprocessKernel::Params postprocess_params =
+      PostprocessKernel::to_underlying_arguments(postprocess_args);
+  int num_m_block_postprocess =
+      cute::ceil_div(params.max_q_len, get<0>(TileShape_MK{}));
+  dim3 grid_m_postprocess(num_m_block_postprocess, params.h, params.b);
+  int smem_size_postprocess = PostprocessKernel::SharedStorageSize;
+  if (smem_size_postprocess >= 48 * 1024) {
+    CHECK_CUDA(cudaFuncSetAttribute(
+        cutlass::device_kernel<PostprocessKernel>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize,
+        smem_size_postprocess));
+  }
+  cutlass::kernel_launch<PostprocessKernel>(
+      grid_m_postprocess,
+      PostprocessKernel::MaxThreadsPerBlock,
+      smem_size_postprocess,
+      stream,
+      postprocess_params,
+      false /*launch_with_pdl*/);
+  CHECK_CUDA_KERNEL_LAUNCH();
+}
+
+template <
+    int Arch,
+    typename T,
+    int kBlockM,
+    int kBlockN,
+    int kHeadDim,
+    bool Causal,
+    bool Local,
+    int Stages_dO = 2,
+    int Stages_dS_or_QSm80 = 2,
+    bool SdP_swapAB = true,
+    bool dKV_swapAB = false,
+    bool dQ_swapAB = false,
+    int NumMmaWarpGroups = 2,
+    int AtomLayoutMSdP = 1,
+    int AtomLayoutNdKV = 2,
+    int AtomLayoutMdQ = 1,
+    bool V_in_regs = false,
+    bool Softmax = false>
+void run_mha_bwd_dispatch(hstu::Flash_bwd_params& params, cudaStream_t stream) {
+  BOOL_SWITCH(params.seq_offsets != nullptr, Jagged, [&] {
+    BOOL_SWITCH(params.num_targets != nullptr, Has_targets, [&] {
+      BOOL_SWITCH(params.has_contexual_mask, Contexual_mask, [&] {
+        BOOL_SWITCH(params.seq_offsets_q, Cross, [&] {
+          run_flash_bwd<
+              Arch,
+              kHeadDim,
+              kBlockM,
+              kBlockN,
+              T,
+              Causal,
+              Local,
+              Contexual_mask,
+              Jagged,
+              Has_targets,
+              false /*Deterministic*/,
+              Stages_dO,
+              Stages_dS_or_QSm80,
+              SdP_swapAB,
+              dKV_swapAB,
+              dQ_swapAB,
+              NumMmaWarpGroups,
+              AtomLayoutMSdP,
+              AtomLayoutNdKV,
+              AtomLayoutMdQ,
+              V_in_regs,
+              Cross,
+              Softmax>(params, stream);
+        });
+      });
+    });
+  });
+}
+
+template <int Arch, typename T, int kHeadDim, bool Softmax>
+void run_mha_bwd_(hstu::Flash_bwd_params& params, cudaStream_t stream) {
+  CAUSAL_LOCAL_SWITCH(params.is_causal, params.is_local, Causal, Local, [&] {
+    int const kBlockM = hstu::kBlockM_bwd(Arch, kHeadDim, Causal, Local);
+    int const kBlockN = hstu::kBlockN_bwd(Arch, kHeadDim);
+    bool const V_in_regs = hstu::V_in_regs_bwd(Arch, kHeadDim);
+    static constexpr std::tuple<int, int> Stages =
+        hstu::Stages_bwd(Arch, kHeadDim);
+    static constexpr std::tuple<bool, bool, bool> swapAB =
+        hstu::swapAB_bwd(Arch, kHeadDim, Causal, Local);
+    int const NumMmaWarpGroups = hstu::NumMmaWarpGroups_bwd(Arch, kHeadDim);
+    static constexpr std::tuple<int, int, int> AtomLayout =
+        hstu::AtomLayout_bwd(Arch, kHeadDim);
+    run_mha_bwd_dispatch<
+        Arch,
+        T,
+        kBlockM,
+        kBlockN,
+        kHeadDim,
+        Causal,
+        Local,
+        std::get<0>(Stages), /*Stages_dO*/
+        std::get<1>(Stages), /*Stages_dS_or_QSm80*/
+        std::get<0>(swapAB), /*SdP_swapAB*/
+        std::get<1>(swapAB), /*dKV_swapAB*/
+        std::get<2>(swapAB), /*dQ_swapAB*/
+        NumMmaWarpGroups,
+        std::get<0>(AtomLayout), /*AtomLayoutMSdP*/
+        std::get<1>(AtomLayout), /*AtomLayoutNdKV*/
+        std::get<2>(AtomLayout), /*AtomLayoutMdQ*/
+        V_in_regs,
+        Softmax>(params, stream);
+  });
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h
new file mode 100644
index 000000000..ca04a1456
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h
@@ -0,0 +1,348 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include "cutlass/arch/barrier.h"
+
+#include "seqlen.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    class TileShape_MK_,
+    class Element,
+    class ElementAccum,
+    class ArchTag_,
+    int kNThreads,
+    class TiledMma,
+    bool dQ_swapAB,
+    bool Jagged,
+    bool Softmax>
+class FlashAttnBwdPostprocessConvertdQ {
+ public:
+  // Type Aliases
+  using TileShape_MK = TileShape_MK_;
+  using ArchTag = ArchTag_;
+
+  static_assert(ArchTag::kMinComputeCapability >= 75);
+  static constexpr bool IsSm90 = ArchTag::kMinComputeCapability >= 90;
+
+  static constexpr uint32_t MaxThreadsPerBlock = kNThreads;
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 2;
+
+  static constexpr int kBlockM = get<0>(TileShape_MK{});
+  static constexpr int kHeadDim = get<1>(TileShape_MK{});
+  static_assert(
+      !IsSm90 || kNThreads % cutlass::NumThreadsPerWarpGroup == 0,
+      "kNThreads must be a multiple of NumThreadsPerWarpGroup");
+  static constexpr int NumdQWarpGgroups =
+      kNThreads / cutlass::NumThreadsPerWarpGroup;
+  using R2SLayoutAtomdQaccum = std::conditional_t<
+      IsSm90,
+      Layout<
+          Shape<Int<cutlass::NumThreadsPerWarpGroup>, Int<NumdQWarpGgroups>>>,
+      Layout<Shape<Int<kNThreads>>>>;
+  using R2STiledCopydQaccum = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, ElementAccum>{},
+      R2SLayoutAtomdQaccum{},
+      Layout<Shape<Int<IsSm90 ? 4 : 1>>>{})); // Val layout, 1 or 4 vals per
+                                              // read
+  using G2SLayoutAtomdQaccum = Layout<Shape<Int<kNThreads>>>;
+  // UniversalCopy instead of AutoVectorizingCopyWithAssumedAlignment as the
+  // latter generates cp.async instructions
+  using G2STiledCopydQaccum = decltype(make_tiled_copy(
+      Copy_Atom<UniversalCopy<uint128_t>, ElementAccum>{},
+      G2SLayoutAtomdQaccum{},
+      Layout<Shape<_4>>{})); // Val layout, 4 vals per read
+  // We don't do bound checking for the gmem -> smem load so we just assert
+  // here.
+  static_assert(IsSm90 || (kBlockM * kHeadDim) % (kNThreads * 4) == 0);
+  static constexpr int SmemdQaccumSize = size(TileShape_MK{});
+  using SmemLayoutdQaccumFlat = Layout<Shape<Int<SmemdQaccumSize>>>;
+  using SmemLayoutdQaccum = std::conditional_t<
+      IsSm90,
+      Layout<Shape<
+          Int<kBlockM * kHeadDim / NumdQWarpGgroups>,
+          Int<NumdQWarpGgroups>>>,
+      Layout<Shape<Int<kBlockM * kHeadDim>>>>;
+
+  // We can't just use kHeadDim here. E.g. if MMA shape is 64 x 96 but split
+  // across 2 WGs, then setting kBlockKSmem to 32 will cause "Static shape_div
+  // failure". We want to treat it as 64 x 48, so kBlockKSmem should be 16.
+  static constexpr int MmaShapeN = get<1>(typename TiledMma::AtomShape_MNK{});
+  static constexpr int kBlockKSmem =
+      MmaShapeN % 64 == 0 ? 64 : (MmaShapeN % 32 == 0 ? 32 : 16);
+  static constexpr int kSwizzle =
+      kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 2 : 1);
+  using SmemLayoutAtomdQ = decltype(composition(
+      Swizzle<kSwizzle, 3, 3>{},
+      Layout<Shape<Int<8>, Int<kBlockKSmem>>, Stride<Int<kBlockKSmem>, _1>>{}));
+  using SmemLayoutdQ =
+      decltype(tile_to_shape(SmemLayoutAtomdQ{}, TileShape_MK{}));
+  using SmemLayoutdQt = decltype(cute::composition(
+      SmemLayoutdQ{},
+      make_layout(
+          make_shape(get<1>(TileShape_MK{}), get<0>(TileShape_MK{})),
+          make_stride(Int<get<0>(TileShape_MK{})>{}, _1{}))));
+
+  using SmemCopyAtomdQ = Copy_Atom<
+      std::conditional_t<
+          IsSm90,
+          std::conditional_t<
+              !dQ_swapAB,
+              cute::SM90_U32x4_STSM_N,
+              cute::SM90_U16x8_STSM_T>,
+          AutoVectorizingCopyWithAssumedAlignment<128>>,
+      Element>;
+
+  static constexpr int kGmemElemsPerLoad =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      kHeadDim % kGmemElemsPerLoad == 0,
+      "Headdim must be a multiple of kGmemElemsPerLoad");
+  static constexpr int kGmemThreadsPerRow =
+      cutlass::gcd(kHeadDim / kGmemElemsPerLoad, int(MaxThreadsPerBlock));
+  static_assert(
+      MaxThreadsPerBlock % kGmemThreadsPerRow == 0,
+      "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow");
+  using GmemLayoutAtom = Layout<
+      Shape<
+          Int<MaxThreadsPerBlock / kGmemThreadsPerRow>,
+          Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  using GmemTiledCopy = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
+      GmemLayoutAtom{},
+      Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{})); // Val layout, 8 or 16 vals
+                                                     // per load
+
+  struct SharedStorage : cute::aligned_struct<128> {
+    cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutdQaccum>>
+        smem_dqacc;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutdQ>> smem_dq;
+    alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_dQaccum;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  using ShapedQ =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen_q, d, head,
+                                                       // batch)
+  using StridedQ = cute::Stride<int64_t, _1, int64_t, int64_t>;
+  using ShapedQaccum =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen_q * d, head, batch)
+  using StridedQaccum = cute::Stride<_1, int64_t, int64_t>;
+
+  // Device side arguments
+  struct Arguments {
+    ElementAccum const* ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    Element* ptr_dQ;
+    ShapedQ const shape_dQ;
+    StridedQ const stride_dQ;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Kernel entry point API
+  struct Params {
+    ElementAccum const* ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    Element* ptr_dQ;
+    ShapedQ const shape_dQ;
+    StridedQ const stride_dQ;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Convert to underlying arguments. In this case, a simple copy for the
+  // aliased type.
+  static Params to_underlying_arguments(Arguments const& args) {
+    return {
+        args.ptr_dQaccum,
+        args.shape_dQaccum,
+        args.stride_dQaccum,
+        args.ptr_dQ,
+        args.shape_dQ,
+        args.stride_dQ,
+        args.seq_offsets};
+  }
+
+  CUTLASS_DEVICE
+  void operator()(Params const& params, char* smem_buf) {
+    static constexpr int kBlockM = get<0>(TileShape_MK{});
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    Tensor sdQaccum = make_tensor(
+        make_smem_ptr(shared_storage.smem_dqacc.data()), SmemLayoutdQaccum{});
+    Tensor sdQaccum_flat = make_tensor(
+        make_smem_ptr(shared_storage.smem_dqacc.data()),
+        SmemLayoutdQaccumFlat{});
+    Tensor sdQ = make_tensor(
+        make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQ{});
+    Tensor sdQt = make_tensor(
+        make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQt{});
+
+    int const thread_idx = threadIdx.x;
+    int const m_block = blockIdx.x;
+    int const bidh = blockIdx.y;
+    int const bidb = blockIdx.z;
+
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info(
+        bidb, size<0>(params.shape_dQ), params.seq_offsets);
+    if (Jagged && m_block * kBlockM >= seqlen_info.seqlen) {
+      return;
+    }
+
+    // Step 1: load dQaccum from gmem to smem
+    Tensor mdQaccum = make_tensor(
+        make_gmem_ptr(
+            reinterpret_cast<ElementAccum const*>(params.ptr_dQaccum)),
+        params.shape_dQaccum,
+        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+    Tensor gdQaccum = local_tile(
+        domain_offset(
+            make_coord(seqlen_info.offset_padded * kHeadDim), mdQaccum),
+        Shape<Int<kBlockM * kHeadDim>>{},
+        make_coord(m_block)); // (M * K)
+    if constexpr (IsSm90) { // Use BulkCopy
+      static constexpr uint32_t TmaTransactionBytesdQaccum =
+          static_cast<uint32_t>(
+              size(SmemLayoutdQaccumFlat{}) *
+              cute::sizeof_bits_v<ElementAccum> / 8);
+      auto bulk_copy = Copy_Traits<SM90_BULK_COPY_AUTO>{};
+      // if (thread0()) { print(gdQaccum); printf("\n"); print(sdQaccum_flat);
+      // printf("\n"); }
+      if (thread_idx == 0) {
+        shared_storage.barrier_dQaccum.init(1 /*numThreads*/);
+        shared_storage.barrier_dQaccum.arrive_and_expect_tx(
+            TmaTransactionBytesdQaccum);
+        copy(
+            bulk_copy.with(
+                *reinterpret_cast<uint64_t*>(&shared_storage.barrier_dQaccum)),
+            gdQaccum,
+            sdQaccum_flat);
+      }
+      __syncthreads();
+      shared_storage.barrier_dQaccum.wait(0);
+    } else {
+      G2STiledCopydQaccum g2s_tiled_copy_dQaccum;
+      auto g2s_thr_copy_dQaccum =
+          g2s_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+      Tensor tdQgdQaccumg2s = g2s_thr_copy_dQaccum.partition_S(gdQaccum);
+      Tensor tdQsdQaccumg2s = g2s_thr_copy_dQaccum.partition_D(sdQaccum);
+      cute::copy(g2s_tiled_copy_dQaccum, tdQgdQaccumg2s, tdQsdQaccumg2s);
+      __syncthreads();
+    }
+
+    // __syncthreads(); if (cute::thread0()) { print_tensor(sdQaccum); }
+
+    // Step 2: Load dQaccum from smem to register, then convert fp32 ->
+    // fp16/bf16
+    R2STiledCopydQaccum s2r_tiled_copy_dQaccum;
+    auto s2r_thr_copy_dQaccum =
+        s2r_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+    Tensor tdQsdQaccum = s2r_thr_copy_dQaccum.partition_S(sdQaccum);
+    TiledMma tiled_mma_dQ;
+    Tensor taccdQrdQaccum = partition_fragment_C(
+        tiled_mma_dQ,
+        select<!dQ_swapAB ? 0 : 1, !dQ_swapAB ? 1 : 0>(TileShape_MK{}));
+    // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1) {
+    // print(tiled_mma_dQ); printf("\n"); } if (blockIdx.x == 0 && blockIdx.y ==
+    // 0 && threadIdx.x == 1) { print(tdQsdQaccum); } if (blockIdx.x == 0 &&
+    // blockIdx.y == 0 && threadIdx.x == 1) { print(taccdQrdQaccum); }
+    CUTE_STATIC_ASSERT_V(size(taccdQrdQaccum) == size(tdQsdQaccum));
+    Tensor tdQrdQaccum = s2r_thr_copy_dQaccum.retile_D(taccdQrdQaccum);
+    cute::copy(s2r_tiled_copy_dQaccum, tdQsdQaccum, tdQrdQaccum);
+    // Convert tdQrdQ from fp32 to fp16
+    Tensor rdQ = make_tensor_like<Element>(taccdQrdQaccum);
+    hstu::convert_type_out(taccdQrdQaccum, rdQ);
+
+    // Step 3: Copy dQ from register to smem
+    auto smem_tiled_copy_dQ = make_tiled_copy_C(SmemCopyAtomdQ{}, tiled_mma_dQ);
+    auto smem_thr_copy_dQ = smem_tiled_copy_dQ.get_thread_slice(thread_idx);
+    Tensor taccdQrdQ =
+        smem_thr_copy_dQ.retile_S(rdQ); // ((Atom,AtomNum), MMA_N, MMA_N)
+    // if (cute::thread0()) { print(smem_tiled_copy_dQ); }
+    // if (cute::thread0()) { print(smem_thr_copy_dQ); }
+    // if (cute::thread0()) { print(sdQ); }
+    Tensor taccdQsdQ = smem_thr_copy_dQ.partition_D(
+        cute::conditional_return<!dQ_swapAB>(
+            sdQ, sdQt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    cute::copy(smem_tiled_copy_dQ, taccdQrdQ, taccdQsdQ);
+    __syncthreads();
+
+    // Step 4: Copy dQ from smem to register to prepare for coalesced write to
+    // gmem
+    Tensor mdQ = make_tensor(
+        make_gmem_ptr(params.ptr_dQ), params.shape_dQ, params.stride_dQ)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor gdQ = local_tile(
+        domain_offset(make_coord(seqlen_info.offset, _0{}), mdQ),
+        TileShape_MK{},
+        make_coord(m_block, _0{})); // (M, K)
+    GmemTiledCopy gmem_tiled_copy_dQ;
+    auto gmem_thr_copy_dQ = gmem_tiled_copy_dQ.get_thread_slice(thread_idx);
+    Tensor tdQsdQ =
+        gmem_thr_copy_dQ.partition_S(sdQ); // ((Atom,AtomNum),ATOM_M,ATOM_N)
+    Tensor tdQgdQ = gmem_thr_copy_dQ.partition_D(gdQ);
+
+    Tensor tdQrdQ = make_fragment_like(tdQsdQ);
+    Tensor tdQcdQ = gmem_thr_copy_dQ.partition_D(
+        cute::make_identity_tensor(TileShape_MK{}));
+    Tensor tdQpdQ = make_tensor<bool>(make_shape(size<2>(tdQgdQ)));
+#pragma unroll
+    for (int k = 0; k < size(tdQpdQ); ++k) {
+      tdQpdQ(k) = get<1>(tdQcdQ(_0{}, _0{}, k)) < get<1>(params.shape_dQ);
+    }
+    // Need to check OOB when reading from smem if kBlockM isn't evenly tiled
+    static constexpr bool EvenM =
+        kBlockM % CUTE_STATIC_V(size<0>(GmemLayoutAtom{})) == 0;
+    hstu::
+        copy</*Is_even_MN=*/EvenM, /*Is_even_K=*/true, /*Clear_OOB_MN=*/false>(
+            gmem_tiled_copy_dQ, tdQsdQ, tdQrdQ, tdQcdQ, tdQpdQ, kBlockM);
+
+    // Step 5: Copy dQ from register to gmem
+    // Clear_OOB_K must be false since we don't want to write zeros to gmem
+    hstu::copy<
+        /*Is_even_MN=*/false,
+        /*Is_even_K=*/false,
+        /*Clear_OOB_MN=*/false,
+        /*Clear_OOB_K=*/false>(
+        gmem_tiled_copy_dQ,
+        tdQrdQ,
+        tdQgdQ,
+        tdQcdQ,
+        tdQpdQ,
+        std::min(seqlen_info.seqlen - m_block * kBlockM, kBlockM));
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h
new file mode 100644
index 000000000..8d29778af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h
@@ -0,0 +1,349 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+
+#include "seqlen.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    class TileShape_MK_,
+    class Element,
+    class ElementAccum,
+    class ArchTag_,
+    bool Clear_dQaccum,
+    bool Jagged,
+    bool Softmax>
+class FlashAttnBwdPreprocess {
+ public:
+  // Type Aliases
+  using TileShape_MK = TileShape_MK_;
+  using ArchTag = ArchTag_;
+
+  static_assert(
+      std::is_same_v<Element, cutlass::half_t> &&
+          ArchTag::kMinComputeCapability >= 75 ||
+      std::is_same_v<Element, cutlass::bfloat16_t> &&
+          ArchTag::kMinComputeCapability >= 80 ||
+      std::is_same_v<Element, cutlass::float_e4m3_t> &&
+          ArchTag::kMinComputeCapability >= 89);
+
+  static constexpr uint32_t MaxThreadsPerBlock = 256;
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 2;
+  static constexpr int SharedStorageSize = 0;
+
+  static constexpr int kGmemElemsPerLoad =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      get<1>(TileShape_MK{}) % kGmemElemsPerLoad == 0,
+      "Headdim must be a multiple of kGmemElemsPerLoad");
+  static constexpr int kBlockM = get<0>(TileShape_MK{});
+  static constexpr int kHeadDim = get<1>(TileShape_MK{});
+  // We want kBlockKGmem to be a power of 2 so that when we do the summing,
+  // it's just between threads in the same warp
+  static constexpr int kBlockKGmem =
+      kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32);
+  static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerLoad;
+  static_assert(
+      MaxThreadsPerBlock % kGmemThreadsPerRow == 0,
+      "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow");
+  using GmemLayoutAtom = Layout<
+      Shape<
+          Int<MaxThreadsPerBlock / kGmemThreadsPerRow>,
+          Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  using GmemTiledCopy = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
+      GmemLayoutAtom{},
+      Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{})); // Val layout, 8 or 16 vals
+                                                     // per load
+
+  static constexpr int kGmemElemsPerLoadAccum =
+      sizeof(cute::uint128_t) / sizeof(ElementAccum);
+  static_assert(
+      (kBlockM * kHeadDim / kGmemElemsPerLoadAccum) % MaxThreadsPerBlock == 0,
+      "MaxThreadsPerBlock must divide kBlockM * kHeadDim / kGmemElemsPerLoadAccum");
+  using GmemLayoutAtomAccum = Layout<Shape<Int<MaxThreadsPerBlock>>>;
+  using GmemTiledCopyAccum = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, ElementAccum>{},
+      GmemLayoutAtomAccum{},
+      Layout<Shape<Int<kGmemElemsPerLoadAccum>>>{})); // Val layout, 4 vals per
+                                                      // store
+
+  using ShapeO =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen_q, d, head,
+                                                       // batch)
+  using StrideO = cute::Stride<int64_t, _1, int64_t, int64_t>;
+  using ShapedPsum =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen_q, head, batch)
+  using StridedPsum = cute::Stride<_1, int64_t, int64_t>;
+  using ShapedQaccum =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen_q * d, head, batch)
+  using StridedQaccum = cute::Stride<_1, int64_t, int64_t>;
+
+  // Device side arguments
+  struct Arguments {
+    Element const* ptr_O;
+    ShapeO const shape_O;
+    StrideO const stride_O;
+    Element const* ptr_dO;
+    StrideO const stride_dO;
+    float* ptr_dPsum;
+    ShapedPsum const shape_dPsum;
+    StridedPsum const stride_dPsum;
+    float const* ptr_LSE;
+    StridedPsum const stride_LSE;
+    float* ptr_LSE_log2;
+    StridedPsum const stride_LSE_log2;
+    ElementAccum* ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    int num_batch; // We need this to know the size of dq_semaphore in case of
+                   // jagged
+    int num_heads;
+    int num_softmax_heads;
+    int max_seq_len;
+    int* dq_semaphore;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Kernel entry point API
+  struct Params {
+    Element const* ptr_O;
+    ShapeO const shape_O;
+    StrideO const stride_O;
+    Element const* ptr_dO;
+    StrideO const stride_dO;
+    float* ptr_dPsum;
+    ShapedPsum const shape_dPsum;
+    StridedPsum const stride_dPsum;
+    float const* ptr_LSE;
+    StridedPsum const stride_LSE;
+    float* ptr_LSE_log2;
+    StridedPsum const stride_LSE_log2;
+    ElementAccum* ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    int num_batch;
+    int num_heads;
+    int num_softmax_heads;
+    int max_seq_len;
+    int* dq_semaphore;
+    int const* seq_offsets = nullptr;
+  };
+
+  // Convert to underlying arguments. In this case, a simple copy for the
+  // aliased type.
+  static Params to_underlying_arguments(Arguments const& args) {
+    return {args.ptr_O,       args.shape_O,       args.stride_O,
+            args.ptr_dO,      args.stride_dO,     args.ptr_dPsum,
+            args.shape_dPsum, args.stride_dPsum,  args.ptr_LSE,
+            args.stride_LSE,  args.ptr_LSE_log2,  args.stride_LSE_log2,
+            args.ptr_dQaccum, args.shape_dQaccum, args.stride_dQaccum,
+            args.num_batch,   args.num_heads,     args.num_softmax_heads,
+            args.max_seq_len, args.dq_semaphore,  args.seq_offsets};
+  }
+
+  CUTLASS_DEVICE
+  void operator()(Params const& params, [[maybe_unused]] char* smem_buf) {
+    static constexpr int kBlockM = get<0>(TileShape_MK{});
+
+    int const thread_idx = threadIdx.x;
+    int const m_block = blockIdx.x;
+    int const bidh = blockIdx.y;
+    int const bidb = blockIdx.z;
+
+    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info(
+        bidb, params.max_seq_len, params.seq_offsets);
+    int const seqlen_o = seqlen_info.seqlen;
+    if (Jagged && m_block * kBlockM >= seqlen_o) {
+      return;
+    }
+
+    if constexpr (Softmax) {
+      Tensor mO = make_tensor(
+          make_gmem_ptr(params.ptr_O), params.shape_O, params.stride_O)(
+          _, _, bidh, !Jagged ? bidb : 0);
+      Tensor gO = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mO),
+          TileShape_MK{},
+          make_coord(m_block, _0{})); // (M, K)
+      Tensor mdO = make_tensor(
+          make_gmem_ptr(params.ptr_dO), params.shape_O, params.stride_dO)(
+          _, _, bidh, !Jagged ? bidb : 0);
+      Tensor gdO = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdO),
+          TileShape_MK{},
+          make_coord(m_block, _0{})); // (M, K)
+
+      auto shape_LSE = select<0, 2, 3>(params.shape_O);
+      Tensor mLSE = make_tensor(
+          make_gmem_ptr(params.ptr_LSE), shape_LSE, params.stride_LSE)(
+          _, bidh, !Jagged ? bidb : 0);
+      Tensor gLSE = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset), mLSE),
+          Shape<Int<kBlockM>>{},
+          make_coord(m_block));
+      static_assert(kBlockM <= MaxThreadsPerBlock);
+      float lse =
+          thread_idx < seqlen_o - m_block * kBlockM && thread_idx < kBlockM
+          ? gLSE(thread_idx)
+          : 0.0f;
+
+      GmemTiledCopy gmem_tiled_copy_O;
+      auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
+
+      Tensor tOgO = gmem_thr_copy_O.partition_S(gO);
+      Tensor tOgdO = gmem_thr_copy_O.partition_S(gdO);
+      // Construct identity layout for gO
+      Tensor cO = cute::make_identity_tensor(
+          TileShape_MK{}); // (BLK_M,BLK_K) -> (blk_m,blk_k)
+      // Repeat the partitioning with identity layouts
+      Tensor tOcO = gmem_thr_copy_O.partition_D(cO);
+      Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgO)));
+#pragma unroll
+      for (int k = 0; k < size(tOpO); ++k) {
+        tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O);
+      }
+
+      // (8, kBlockM / 32, kHeadDim / 64) or (8, kBlockM / 16, kHeadDim / 128)
+      Tensor tOrO = make_fragment_like(tOgO);
+      Tensor tOrdO = make_fragment_like(tOgdO);
+      hstu::copy<
+          /*Is_even_MN=*/false,
+          /*Is_even_K=*/false,
+          /*Clear_OOB_MN=*/true,
+          /*Clearn_OOB_K=*/true>(
+          gmem_tiled_copy_O,
+          tOgO,
+          tOrO,
+          tOcO,
+          tOpO,
+          seqlen_o - m_block * kBlockM);
+      hstu::copy<
+          /*Is_even_MN=*/false,
+          /*Is_even_K=*/false,
+          /*Clear_OOB_MN=*/true,
+          /*Clearn_OOB_K=*/true>(
+          gmem_tiled_copy_O,
+          tOgdO,
+          tOrdO,
+          tOcO,
+          tOpO,
+          seqlen_o - m_block * kBlockM);
+      // Reshape from e.g. (8, kBlockM / 32, kHeadDim / 64) to (kBlockM / 32,
+      // (8, kHeadDim / 64))
+      Layout l = make_layout(
+          get<1>(tOrO.layout()),
+          make_layout(get<0>(tOrO.layout()), get<2>(tOrO.layout())));
+      Tensor tOrO_l = make_tensor(tOrO.data(), l);
+      Tensor o_fp32 = make_tensor_like<float>(tOrO_l);
+      hstu::convert_type_out(tOrO_l, o_fp32);
+      Tensor tOrdO_l = make_tensor(tOrdO.data(), l);
+      Tensor do_fp32 = make_tensor_like<float>(tOrdO_l);
+      hstu::convert_type_out(tOrdO_l, do_fp32);
+      // Sum across the last dimension
+      Tensor dP_sum = make_tensor<float>(make_shape(size<0>(o_fp32)));
+#pragma unroll
+      for (int mi = 0; mi < size<0>(o_fp32); ++mi) {
+        float dP_sum_cur = do_fp32(mi, 0) * o_fp32(mi, 0);
+#pragma unroll
+        for (int ni = 1; ni < size<1>(o_fp32); ni++) {
+          dP_sum_cur += do_fp32(mi, ni) * o_fp32(mi, ni);
+        }
+        hstu::SumOp<float> sum_op;
+        dP_sum(mi) =
+            hstu::Allreduce<kGmemThreadsPerRow>::run(dP_sum_cur, sum_op);
+      }
+
+      Tensor mdPsum = make_tensor(
+          make_gmem_ptr(params.ptr_dPsum),
+          params.shape_dPsum,
+          params.stride_dPsum)(_, bidh, !Jagged ? bidb : 0);
+      Tensor gdPsum = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset_padded), mdPsum),
+          Shape<Int<kBlockM>>{},
+          make_coord(m_block));
+      if (get<1>(tOcO(_0{}, _0{}, _0{})) == 0) {
+#pragma unroll
+        for (int mi = 0; mi < size(dP_sum); ++mi) {
+          int const row = get<0>(tOcO(_0{}, mi, _0{}));
+          gdPsum(row) = row < seqlen_o - m_block * kBlockM ? dP_sum(mi) : 0;
+        }
+      }
+
+      int const seqlen_rounded = cute::round_up(seqlen_o, kBlockM);
+      Tensor mLSElog2 = make_tensor(
+          make_gmem_ptr(params.ptr_LSE_log2),
+          params.shape_dPsum,
+          params.stride_LSE_log2)(_, bidh, !Jagged ? bidb : 0);
+      Tensor gLSElog2 = local_tile(
+          cute::domain_offset(make_coord(seqlen_info.offset_padded), mLSElog2),
+          Shape<Int<kBlockM>>{},
+          make_coord(m_block));
+      if (thread_idx < seqlen_rounded - m_block * kBlockM &&
+          thread_idx < kBlockM) {
+        gLSElog2(thread_idx) = lse * float(M_LOG2E);
+      }
+    }
+    if constexpr (Clear_dQaccum) {
+      Tensor mdQaccum = make_tensor(
+          make_gmem_ptr(params.ptr_dQaccum),
+          params.shape_dQaccum,
+          params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+      Tensor gdQaccum = local_tile(
+          cute::domain_offset(
+              make_coord(seqlen_info.offset_padded * kHeadDim), mdQaccum),
+          Shape<Int<kBlockM * kHeadDim>>{},
+          make_coord(m_block));
+      GmemTiledCopyAccum gmem_tiled_copy_dQaccum;
+      auto gmem_thr_copy_dQaccum =
+          gmem_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+      Tensor tdQgdQaccum = gmem_thr_copy_dQaccum.partition_D(gdQaccum);
+      Tensor zero = make_fragment_like(tdQgdQaccum);
+      clear(zero);
+      cute::copy(
+          Copy_Atom<
+              AutoVectorizingCopyWithAssumedAlignment<128>,
+              ElementAccum>{},
+          zero,
+          tdQgdQaccum);
+    }
+
+    if (params.dq_semaphore != nullptr && thread_idx == 0) {
+      int const num_batch = params.num_batch;
+      int const num_head = params.num_heads;
+      params.dq_semaphore
+          [bidh + bidb * num_head + m_block * num_head * num_batch] = 0;
+    }
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp
new file mode 100644
index 000000000..66ec445be
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp
@@ -0,0 +1,1165 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+// Include these 2 headers instead of torch/extension.h since we don't need all
+// of the torch headers.
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/util/Optional.h>
+#include <torch/nn/functional.h>
+#include <torch/version.h> // For TORCH_VERSION* macros
+
+#include <cutlass/numeric_types.h>
+
+#include "flash.h"
+#include "flash_common.h"
+#include "static_switch.h"
+#include "tile_size.h"
+
+#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
+#define CHECK_SHAPE(x, ...)                           \
+  TORCH_CHECK(                                        \
+      x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \
+      #x " must have shape (" #__VA_ARGS__ ")")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+at::Tensor switch_to_contiguous_if_needed(const at::Tensor& x) {
+  if (x.stride(x.dim() - 1) == 1) {
+    return x;
+  }
+  return x.contiguous();
+}
+
+namespace hstu {
+
+void set_params_fprop(
+    hstu::Flash_fwd_params& params,
+    // sizes
+    const size_t b,
+    const size_t total_seq_len_kv,
+    const size_t total_seq_len_q,
+    const size_t max_seq_len,
+    const size_t max_q_len,
+    const size_t h,
+    const size_t qk_d,
+    const size_t v_d,
+    // device pointers
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& out,
+    void* seq_offsets,
+    void* num_targets,
+    void* attn_scale,
+    void* seq_offsets_q,
+    void* softmax_lse,
+    void* max_seq_len_tensor,
+    void* contextual_seq_len_tensor,
+    void* max_attn_len_tensor,
+    void* min_full_attn_seq_len_tensor,
+    const int num_groups,
+    bool causal,
+    float alpha,
+    const bool scalar_scale,
+    const int max_attn_len,
+    const int min_full_attn_seq_len,
+    const int contextual_seq_len,
+    const int num_softmax_heads,
+    const bool training,
+    const int sm_margin = 0) {
+  // Reset the parameters
+  params = {};
+
+  params.is_bf16 = q.dtype() == torch::kBFloat16;
+  params.is_e4m3 = q.dtype() == torch::kFloat8_e4m3fn;
+
+  // Set the pointers and strides.
+  params.q_ptr = q.data_ptr();
+  params.k_ptr = k.data_ptr();
+  params.v_ptr = v.data_ptr();
+  params.o_ptr = out.data_ptr();
+  // All stride are in elements, not bytes.
+  params.q_row_stride = q.stride(-3);
+  params.k_row_stride = k.stride(-3);
+  params.v_row_stride = v.stride(-3);
+  params.o_row_stride = out.stride(-3);
+  params.q_head_stride = q.stride(-2);
+  params.k_head_stride = k.stride(-2);
+  params.v_head_stride = v.stride(-2);
+  params.o_head_stride = out.stride(-2);
+  params.v_dim_stride = v.stride(-1);
+
+  if (seq_offsets == nullptr) {
+    params.q_batch_stride = q.stride(0);
+    params.k_batch_stride = k.stride(0);
+    params.v_batch_stride = v.stride(0);
+    params.o_batch_stride = out.stride(0);
+  }
+
+  params.seq_offsets = static_cast<int*>(seq_offsets);
+  params.seq_offsets_q = static_cast<int*>(seq_offsets_q);
+  params.num_targets = static_cast<int*>(num_targets);
+  params.attn_scale = static_cast<float*>(attn_scale);
+  params.softmax_lse = static_cast<float*>(softmax_lse);
+  params.max_seq_len_tensor = static_cast<int*>(max_seq_len_tensor);
+  params.contextual_seq_len_tensor =
+      static_cast<int*>(contextual_seq_len_tensor);
+  params.max_attn_len_tensor = static_cast<int*>(max_attn_len_tensor);
+  params.min_full_attn_seq_len_tensor =
+      static_cast<int*>(min_full_attn_seq_len_tensor);
+  params.num_groups = num_groups;
+  params.batch_size_per_group = b / num_groups;
+
+  // Set the dimensions.
+  params.b = b;
+  params.h = h;
+  params.total_seq_len_q = total_seq_len_q;
+  params.total_seq_len_kv = total_seq_len_kv;
+  params.max_kv_len = max_seq_len;
+  params.max_q_len = max_q_len;
+  params.qk_d = qk_d;
+  params.v_d = v_d;
+
+  params.alpha = alpha;
+
+  // Note: when num_groups > 1, max_attn_len, contextual_seq_len,
+  // min_full_attn_seq_len represent the max value in the tensor.
+  params.is_local = max_attn_len > 0;
+  params.is_causal = causal && (!params.is_local);
+  params.has_contexual_mask = contextual_seq_len > 0;
+  params.scalar_scale = scalar_scale;
+  params.num_softmax_heads = num_softmax_heads;
+  params.training = training;
+
+  params.max_attn_len = max_attn_len;
+  params.min_full_attn_seq_len = min_full_attn_seq_len;
+  params.contextual_seq_len = contextual_seq_len;
+
+  params.arch = at::cuda::getCurrentDeviceProperties()->major * 10 +
+      at::cuda::getCurrentDeviceProperties()->minor;
+  params.num_sm =
+      at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin;
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+  TORCH_CHECK(
+      !params.is_local,
+      "This flash attention build does not support local attention.");
+#endif
+}
+
+void set_params_dgrad(
+    hstu::Flash_bwd_params& params,
+    // sizes
+    const size_t b,
+    const size_t total_seq_len_kv,
+    const size_t total_seq_len_q,
+    const size_t max_seq_len,
+    const size_t max_q_len,
+    const size_t max_q_len_rounded,
+    const size_t h,
+    const size_t qk_d,
+    const size_t v_d,
+    const size_t qk_d_rounded,
+    const size_t v_d_rounded,
+    // device pointers
+    const at::Tensor& q,
+    const at::Tensor& k,
+    const at::Tensor& v,
+    const at::Tensor& out,
+    const at::Tensor& dout,
+    const at::Tensor& dq,
+    const at::Tensor& dk,
+    const at::Tensor& dv,
+    void* dq_accum_d,
+    void* seq_offsets,
+    void* num_targets,
+    void* attn_scale,
+    void* sort_by_length_indices,
+    void* seq_offsets_q,
+    void* softmax_lse,
+    void* softmax_d,
+    void* softmax_lse_log2,
+    void* max_seq_len_tensor,
+    void* contextual_seq_len_tensor,
+    void* max_attn_len_tensor,
+    void* min_full_attn_seq_len_tensor,
+    const int num_groups,
+    const bool scalar_scale,
+    const bool causal,
+    const float alpha,
+    const int max_attn_len,
+    const int min_full_attn_seq_len,
+    const int contextual_seq_len,
+    const int num_softmax_heads,
+    bool deterministic = false,
+    int const sm_margin = 0) {
+  hstu::set_params_fprop(
+      params,
+      b,
+      total_seq_len_kv,
+      total_seq_len_q,
+      max_seq_len,
+      max_q_len,
+      h,
+      qk_d,
+      v_d,
+      q,
+      k,
+      v,
+      out,
+      seq_offsets,
+      num_targets,
+      attn_scale,
+      seq_offsets_q,
+      softmax_lse,
+      max_seq_len_tensor,
+      contextual_seq_len_tensor,
+      max_attn_len_tensor,
+      min_full_attn_seq_len_tensor,
+      num_groups,
+      causal,
+      alpha,
+      scalar_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      num_softmax_heads,
+      false /* training */,
+      sm_margin);
+
+  // Set the pointers and strides.
+  params.do_ptr = dout.data_ptr();
+  params.do_row_stride = dout.stride(-3);
+  params.do_head_stride = dout.stride(-2);
+  params.dq_ptr = dq.data_ptr();
+  params.dk_ptr = dk.data_ptr();
+  params.dv_ptr = dv.data_ptr();
+  params.dq_row_stride = dq.stride(-3);
+  params.dk_row_stride = dk.stride(-3);
+  params.dv_row_stride = dv.stride(-3);
+  params.dq_head_stride = dq.stride(-2);
+  params.dk_head_stride = dk.stride(-2);
+  params.dv_head_stride = dv.stride(-2);
+
+  params.qk_d_rounded = qk_d_rounded;
+  params.v_d_rounded = v_d_rounded;
+  params.max_q_len_rounded = max_q_len_rounded;
+
+  params.sort_by_length_indices = static_cast<int*>(sort_by_length_indices);
+
+  if (seq_offsets == nullptr) {
+    params.do_batch_stride = dout.stride(0);
+    params.dq_batch_stride = dq.stride(0);
+    params.dk_batch_stride = dk.stride(0);
+    params.dv_batch_stride = dv.stride(0);
+  }
+  params.dq_accum_ptr = dq_accum_d;
+  params.softmax_lse_log2 = static_cast<float*>(softmax_lse_log2);
+  params.softmax_d = static_cast<float*>(softmax_d);
+  params.deterministic = deterministic;
+}
+
+void run_mha_fwd(hstu::Flash_fwd_params& params, cudaStream_t stream) {
+  // HEADDIM_SWITCH(params.d, [&] {
+  //     hstu::run_mha_fwd_<cutlass::half_t, kHeadSize>(params, stream);
+  // });
+  ARCH_SWITCH(params.arch, Arch, [&] {
+    BOOL_SWITCH(params.num_softmax_heads == params.h, Softmax, [&] {
+      if (!params.is_e4m3) {
+        if (params.is_bf16) {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+          if (params.qk_d <= 64) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 64, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+          if (params.qk_d <= 96) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 96, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+          if (params.qk_d <= 128) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 128, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+          if (params.qk_d <= 192) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 192, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+          if (params.qk_d <= 256) {
+            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 256, Softmax>(
+                params, stream);
+          }
+#endif
+        } else {
+#ifndef FLASHATTENTION_DISABLE_FP16
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+          if (params.qk_d <= 64) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 64, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+          if (params.qk_d <= 96) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 96, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+          if (params.qk_d <= 128) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 128, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+          if (params.qk_d <= 192) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 192, Softmax>(
+                params, stream);
+          }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+          if (params.qk_d <= 256) {
+            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 256, Softmax>(
+                params, stream);
+          }
+#endif
+#else
+                                TORCH_CHECK(false, "This flash attention build does not support FP16.");
+#endif
+        }
+      } else {
+#ifndef FLASHATTENTION_DISABLE_FP8
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+        if (params.qk_d <= 64) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 64, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+        if (params.qk_d <= 96) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 96, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+        if (params.qk_d <= 128) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 128, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+        if (params.qk_d <= 192) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 192, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+        if (params.qk_d <= 256) {
+          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 256, Softmax>(
+              params, stream);
+        }
+#endif
+#else
+                            TORCH_CHECK(false, "This flash attention build does not support FP8.");
+#endif
+      }
+    });
+  });
+}
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin,
+    int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    bool training,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  bool is_sm9x = dprops->major >= 9;
+  TORCH_CHECK(is_sm9x, "HSTU Attention only supports Hopper GPUs or newer.");
+
+  q = switch_to_contiguous_if_needed(q);
+  k = switch_to_contiguous_if_needed(k);
+  v = switch_to_contiguous_if_needed(v);
+
+  auto q_type = q.scalar_type();
+  TORCH_CHECK(
+      q_type == at::ScalarType::Half || q_type == at::ScalarType::BFloat16 ||
+          q_type == at::ScalarType::Float8_e4m3fn,
+      "FlashAttention only supports fp16, bf16, and fp8_e4m3 data type");
+  if (dprops->major < 9) {
+    TORCH_CHECK(
+        q_type == at::ScalarType::Half || q_type == at::ScalarType::BFloat16,
+        "FlashAttention on Ampere/Ada cards only supports fp16 and bf16 data type");
+  }
+  TORCH_CHECK(
+      k.scalar_type() == q_type, "query and key must have the same dtype");
+  TORCH_CHECK(
+      v.scalar_type() == q_type, "query and value must have the same dtype");
+
+  CHECK_DEVICE(q);
+  CHECK_DEVICE(k);
+  CHECK_DEVICE(v);
+
+  TORCH_CHECK(
+      q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+
+  at::Tensor seq_offsets_;
+  bool const is_jagged = seq_offsets.has_value();
+  if (is_jagged) {
+    seq_offsets_ = seq_offsets.value();
+    CHECK_DEVICE(seq_offsets_);
+    CHECK_CONTIGUOUS(seq_offsets_);
+    TORCH_CHECK(
+        seq_offsets_.dtype() == torch::kInt32,
+        "seq_offsets_ must have dtype torch.int32");
+  }
+  at::Tensor num_targets_;
+  bool const has_multiple_targets = num_targets.has_value();
+  if (has_multiple_targets) {
+    num_targets_ = num_targets.value();
+    CHECK_DEVICE(num_targets_);
+    CHECK_CONTIGUOUS(num_targets_);
+    TORCH_CHECK(
+        num_targets_.dtype() == torch::kInt32,
+        "num_targets_ must have dtype torch.int32");
+  }
+  at::Tensor seq_offsets_q_;
+  bool const is_cross_attn = seq_offsets_q.has_value();
+  if (is_cross_attn) {
+    seq_offsets_q_ = seq_offsets_q.value();
+    CHECK_DEVICE(seq_offsets_q_);
+    CHECK_CONTIGUOUS(seq_offsets_q_);
+    TORCH_CHECK(
+        seq_offsets_q_.dtype() == torch::kInt32,
+        "seq_offsets_q_ must have dtype torch.int32");
+  } else {
+    max_q_len = max_seq_len;
+  }
+  at::Tensor attn_scale_;
+  bool scalar_scale = true;
+  bool const has_attn_scale = attn_scale.has_value();
+  if (has_attn_scale) {
+    attn_scale_ = attn_scale.value();
+    scalar_scale = attn_scale_.numel() == num_groups;
+    CHECK_DEVICE(attn_scale_);
+    TORCH_CHECK(
+        attn_scale_.dtype() == torch::kFloat32,
+        "attn_scale_ must have dtype torch.float32");
+  }
+  at::Tensor max_seq_len_tensor_;
+  at::Tensor contextual_seq_len_tensor_;
+  at::Tensor max_attn_len_tensor_;
+  at::Tensor min_full_attn_seq_len_tensor_;
+  if (num_groups > 1) {
+    TORCH_CHECK(
+        max_seq_len_tensor.has_value(),
+        "max_seq_len_tensor cannot be empty for num_groups > 1.");
+    max_seq_len_tensor_ = max_seq_len_tensor.value();
+    CHECK_DEVICE(max_seq_len_tensor_);
+    TORCH_CHECK(max_seq_len_tensor_.dtype() == torch::kInt32);
+    if (!is_cross_attn) {
+      TORCH_CHECK(
+          contextual_seq_len_tensor.has_value(),
+          "contextual_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      TORCH_CHECK(
+          max_attn_len_tensor.has_value(),
+          "max_attn_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      TORCH_CHECK(
+          min_full_attn_seq_len_tensor.has_value(),
+          "min_full_attn_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      contextual_seq_len_tensor_ = contextual_seq_len_tensor.value();
+      max_attn_len_tensor_ = max_attn_len_tensor.value();
+      min_full_attn_seq_len_tensor_ = min_full_attn_seq_len_tensor.value();
+      CHECK_DEVICE(contextual_seq_len_tensor_);
+      CHECK_DEVICE(max_attn_len_tensor_);
+      CHECK_DEVICE(min_full_attn_seq_len_tensor_);
+      TORCH_CHECK(contextual_seq_len_tensor_.dtype() == torch::kInt32);
+      TORCH_CHECK(max_attn_len_tensor_.dtype() == torch::kInt32);
+      TORCH_CHECK(min_full_attn_seq_len_tensor_.dtype() == torch::kInt32);
+    }
+  }
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+  if (is_jagged && has_multiple_targets) {
+    auto uih_lengths = seq_offsets_.slice(0, 1)
+                           .sub(seq_offsets_.slice(0, 0, -1))
+                           .sub(num_targets_);
+    TORCH_CHECK(
+        (uih_lengths.gt(0)).sum().item<int64_t>() == num_targets_.size(0),
+        "some uih seqlen is 0");
+    TORCH_CHECK(
+        (uih_lengths.greater_equal(contextual_seq_len)).sum().item<int64_t>() ==
+            num_targets_.size(0),
+        "some uih seqlen is less than contextual_seq_len");
+  }
+#endif
+  TORCH_CHECK(
+      q.size(-1) == k.size(-1) && k.size(-1) == v.size(-1),
+      "only attndim == hidden_dim is supported");
+
+  auto const sizes_q = q.sizes();
+  auto const sizes_k = k.sizes();
+  const int batch_size = !is_jagged ? sizes_q[0] : seq_offsets_.size(0) - 1;
+  TORCH_CHECK(
+      batch_size % num_groups == 0, "batch_size not divisible by num_groups");
+  int total_seq_len_q = !is_jagged ? batch_size * max_q_len : sizes_q[0];
+  int total_seq_len_kv = !is_jagged ? batch_size * max_seq_len : sizes_k[0];
+  int num_heads = q.size(-2);
+  int const qk_head_size = q.size(-1);
+  int const v_head_size = v.size(-1);
+  int const max_headdim = get_max_headdim();
+  TORCH_CHECK(
+      qk_head_size <= max_headdim && v_head_size <= max_headdim,
+      "FlashAttention forward only supports head dimension at most " +
+          std::to_string(max_headdim));
+  TORCH_CHECK(max_attn_len >= 0, "max_attn_len must be at least 0");
+  TORCH_CHECK(
+      min_full_attn_seq_len >= 0, "min_full_attn_seq_len must be at least 0");
+  TORCH_CHECK(contextual_seq_len >= 0, "contextual_seq_len must be at least 0");
+  if (max_attn_len > 0) {
+    TORCH_CHECK(
+        min_full_attn_seq_len > 0,
+        "min_full_attn_seq_len=0 not supported when max_attn_len > 0");
+  }
+  TORCH_CHECK(
+      0 == num_softmax_heads || num_softmax_heads == num_heads,
+      "num_softmax_heads must be either 0 or num_heads");
+  if (!is_jagged) {
+    CHECK_SHAPE(q, batch_size, max_q_len, num_heads, qk_head_size);
+    CHECK_SHAPE(k, batch_size, max_seq_len, num_heads, qk_head_size);
+    CHECK_SHAPE(v, batch_size, max_seq_len, num_heads, v_head_size);
+  } else {
+    CHECK_SHAPE(q, total_seq_len_q, num_heads, qk_head_size);
+    CHECK_SHAPE(k, total_seq_len_kv, num_heads, qk_head_size);
+    CHECK_SHAPE(v, total_seq_len_kv, num_heads, v_head_size);
+    CHECK_SHAPE(seq_offsets_, batch_size + 1);
+  }
+  if (has_multiple_targets) {
+    CHECK_SHAPE(num_targets_, batch_size);
+  }
+  if (is_cross_attn) {
+    CHECK_SHAPE(seq_offsets_q_, batch_size + 1);
+  }
+
+  int const alignment = q_type == torch::kFloat8_e4m3fn ? 16 : 8;
+  TORCH_CHECK(
+      qk_head_size % alignment == 0 && v_head_size % alignment == 0,
+      "head_size should be a multiple of " + std::to_string(alignment));
+
+  auto opts = q.options();
+  auto out_type = q_type == at::ScalarType::Float8_e4m3fn
+      ? at::ScalarType::BFloat16
+      : q_type;
+  at::Tensor out;
+  if (!is_jagged) {
+    out = torch::empty(
+        {batch_size, max_q_len, num_heads, v_head_size}, opts.dtype(out_type));
+  } else {
+    out = torch::empty(
+        {total_seq_len_q, num_heads, v_head_size}, opts.dtype(out_type));
+  }
+  std::optional<at::Tensor> softmax_lse = std::nullopt;
+
+  // Early return for empty sequences to avoid TMA descriptor
+  // initialization failure
+  if (total_seq_len_kv == 0 || total_seq_len_q == 0) {
+    return {out, std::nullopt};
+  }
+
+  if (num_softmax_heads > 0) {
+    if (!is_jagged) {
+      softmax_lse = torch::empty(
+          {batch_size, num_softmax_heads, max_q_len}, opts.dtype(at::kFloat));
+    } else {
+      softmax_lse = torch::empty(
+          {num_softmax_heads, total_seq_len_q}, opts.dtype(at::kFloat));
+    }
+  }
+
+  // Otherwise the kernel will be launched from cuda:0 device
+  // Cast to char to avoid compiler warning about narrowing
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+  hstu::Flash_fwd_params params;
+  hstu::set_params_fprop(
+      params,
+      batch_size,
+      total_seq_len_kv,
+      total_seq_len_q,
+      max_seq_len,
+      max_q_len,
+      num_heads,
+      qk_head_size,
+      v_head_size,
+      q,
+      k,
+      v,
+      out,
+      !is_jagged ? nullptr : seq_offsets_.data_ptr(),
+      !has_multiple_targets ? nullptr : num_targets_.data_ptr(),
+      !has_attn_scale ? nullptr : attn_scale_.data_ptr(),
+      !is_cross_attn ? nullptr : seq_offsets_q_.data_ptr(),
+      (num_softmax_heads == 0) ? nullptr : softmax_lse.value().data_ptr(),
+      num_groups > 1 ? max_seq_len_tensor_.data_ptr() : nullptr,
+      ((num_groups > 1) && (!is_cross_attn))
+          ? contextual_seq_len_tensor_.data_ptr()
+          : nullptr,
+      ((num_groups > 1) && (!is_cross_attn)) ? max_attn_len_tensor_.data_ptr()
+                                             : nullptr,
+      ((num_groups > 1) && (!is_cross_attn))
+          ? min_full_attn_seq_len_tensor_.data_ptr()
+          : nullptr,
+      num_groups,
+      causal,
+      alpha,
+      scalar_scale,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      num_softmax_heads,
+      training,
+      sm_margin);
+  at::Tensor tile_count_semaphore;
+  // We don't use the persistent scheduler if not jagged
+  bool const persistent_scheduler = params.arch >= 90
+      ? (params.is_causal || params.is_local || is_jagged)
+      : (params.is_causal || is_jagged);
+  if (persistent_scheduler) {
+    tile_count_semaphore = torch::zeros({1}, opts.dtype(torch::kInt32));
+    params.tile_count_semaphore = tile_count_semaphore.data_ptr<int>();
+  } else {
+    params.tile_count_semaphore = nullptr;
+  }
+
+  if (q_type == at::ScalarType::Float8_e4m3fn) {
+    if (q_descale.has_value()) {
+      auto q_descale_ = q_descale.value();
+      CHECK_DEVICE(q_descale_);
+      CHECK_SHAPE(q_descale_, batch_size, num_heads);
+      params.q_descale_ptr = q_descale_.data_ptr<float>();
+      params.q_descale_batch_stride = q_descale_.stride(0);
+      params.q_descale_head_stride = q_descale_.stride(1);
+    } else {
+      params.q_descale_ptr = nullptr;
+    }
+    if (k_descale.has_value()) {
+      auto k_descale_ = k_descale.value();
+      CHECK_DEVICE(k_descale_);
+      CHECK_SHAPE(k_descale_, batch_size, num_heads);
+      params.k_descale_ptr = k_descale_.data_ptr<float>();
+      params.k_descale_batch_stride = k_descale_.stride(0);
+      params.k_descale_head_stride = k_descale_.stride(1);
+    } else {
+      params.k_descale_ptr = nullptr;
+    }
+    if (v_descale.has_value()) {
+      auto v_descale_ = v_descale.value();
+      CHECK_DEVICE(v_descale_);
+      CHECK_SHAPE(v_descale_, batch_size, num_heads);
+      params.v_descale_ptr = v_descale_.data_ptr<float>();
+      params.v_descale_batch_stride = v_descale_.stride(0);
+      params.v_descale_head_stride = v_descale_.stride(1);
+    } else {
+      params.v_descale_ptr = nullptr;
+    }
+  }
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+  TORCH_CHECK(
+      !params.is_local,
+      "This flash attention build does not support local attention.");
+#endif
+
+  if (total_seq_len_q > 0 && num_heads > 0) {
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    run_mha_fwd(params, stream);
+  }
+  return {out, softmax_lse};
+}
+
+void run_mha_bwd(hstu::Flash_bwd_params& params, cudaStream_t stream) {
+#ifndef FLASHATTENTION_DISABLE_BACKWARD
+  // FP16_SWITCH(!params.is_bf16, [&] {
+  //     HEADDIM_SWITCH(params.d, [&] {
+  //         hstu::run_mha_bwd_<elem_type, kHeadDim>(params, stream);
+  //     });
+  // });
+  ARCH_SWITCH(params.arch, Arch, [&] {
+    BOOL_SWITCH(params.num_softmax_heads == params.h, Softmax, [&] {
+      if (!params.is_bf16) {
+#ifndef FLASHATTENTION_DISABLE_FP16
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+        if (params.qk_d <= 64) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 64, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+        if (params.qk_d <= 96) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 96, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+        if (params.qk_d <= 128) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 128, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+        if (params.qk_d <= 192) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 192, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+        if (params.qk_d <= 256) {
+          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 256, Softmax>(
+              params, stream);
+        }
+#endif
+#else
+                TORCH_CHECK(false, "This flash attention build does not support FP16.");
+#endif
+      } else {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+        if (params.qk_d <= 64) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 64, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+        if (params.qk_d <= 96) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 96, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+        if (params.qk_d <= 128) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 128, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+        if (params.qk_d <= 192) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 192, Softmax>(
+              params, stream);
+        }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+        if (params.qk_d <= 256) {
+          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 256, Softmax>(
+              params, stream);
+        }
+#endif
+      }
+    });
+  });
+#endif
+}
+
+std::vector<at::Tensor> hstu_mha_bwd(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& dout,
+    at::Tensor& q,
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& dq,
+    at::Tensor& dk,
+    at::Tensor& dv,
+    at::Tensor& out,
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    bool sort_by_length,
+    bool const deterministic,
+    const int64_t sm_margin,
+    int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    const std::optional<at::Tensor>& softmax_lse,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+#ifdef FLASHATTENTION_DISABLE_BACKWARD
+  TORCH_CHECK(false, "This flash attention build does not support backward.");
+#endif
+
+  auto dprops = at::cuda::getCurrentDeviceProperties();
+  bool is_sm9x = dprops->major >= 9;
+  TORCH_CHECK(is_sm9x, "HSTU Attention only supports Hopper GPUs or newer.");
+
+  q = switch_to_contiguous_if_needed(q);
+  k = switch_to_contiguous_if_needed(k);
+  v = switch_to_contiguous_if_needed(v);
+  out = switch_to_contiguous_if_needed(out);
+  dout = switch_to_contiguous_if_needed(dout);
+
+  auto q_type = q.dtype();
+  TORCH_CHECK(
+      q_type == torch::kFloat16 || q_type == torch::kBFloat16,
+      "FlashAttention only support fp16 and bf16 data type");
+  TORCH_CHECK(k.dtype() == q_type, "query and key must have the same dtype");
+  TORCH_CHECK(v.dtype() == q_type, "query and value must have the same dtype");
+  TORCH_CHECK(
+      dout.dtype() == q_type, "query and dout must have the same dtype");
+
+  CHECK_DEVICE(q);
+  CHECK_DEVICE(k);
+  CHECK_DEVICE(v);
+  CHECK_DEVICE(dout);
+
+  TORCH_CHECK(
+      q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
+  TORCH_CHECK(
+      dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
+
+  at::Tensor seq_offsets_;
+  bool const is_jagged = seq_offsets.has_value();
+  if (is_jagged) {
+    seq_offsets_ = seq_offsets.value();
+    CHECK_DEVICE(seq_offsets_);
+    CHECK_CONTIGUOUS(seq_offsets_);
+    TORCH_CHECK(
+        seq_offsets_.dtype() == torch::kInt32,
+        "seq_offsets_ must have dtype torch.int32");
+  }
+  at::Tensor sort_by_length_indices_;
+  if (sort_by_length && is_jagged) {
+    auto seq_lengths =
+        seq_offsets_.slice(0, 1).sub(seq_offsets_.slice(0, 0, -1));
+    std::tuple<torch::Tensor, torch::Tensor> sort_result = torch::sort(
+        seq_lengths, false /*stable*/, 0 /*dim*/, true /*descending*/);
+    sort_by_length_indices_ = std::get<1>(sort_result).to(torch::kInt32);
+    CHECK_DEVICE(sort_by_length_indices_);
+    CHECK_CONTIGUOUS(sort_by_length_indices_);
+    TORCH_CHECK(
+        sort_by_length_indices_.dtype() == torch::kInt32,
+        "sort_by_length_indices_ must have dtype torch.int32");
+  }
+  at::Tensor num_targets_;
+  bool const has_multiple_targets = num_targets.has_value();
+  if (has_multiple_targets) {
+    num_targets_ = num_targets.value();
+    CHECK_DEVICE(num_targets_);
+    CHECK_CONTIGUOUS(num_targets_);
+    TORCH_CHECK(
+        num_targets_.dtype() == torch::kInt32,
+        "num_targets_ must have dtype torch.int32");
+  }
+  at::Tensor attn_scale_;
+  bool scalar_scale = true;
+  bool const has_attn_scale = attn_scale.has_value();
+  if (has_attn_scale) {
+    attn_scale_ = attn_scale.value();
+    scalar_scale = attn_scale_.numel() == num_groups;
+    CHECK_DEVICE(attn_scale_);
+    TORCH_CHECK(
+        attn_scale_.dtype() == torch::kFloat32,
+        "attn_scale_ must have dtype torch.float32");
+  }
+  at::Tensor seq_offsets_q_;
+  bool const is_cross_attn = seq_offsets_q.has_value();
+  if (is_cross_attn) {
+    seq_offsets_q_ = seq_offsets_q.value();
+    CHECK_DEVICE(seq_offsets_q_);
+    CHECK_CONTIGUOUS(seq_offsets_q_);
+    TORCH_CHECK(
+        seq_offsets_q_.dtype() == torch::kInt32,
+        "seq_offsets_q_ must have dtype torch.int32");
+  } else {
+    max_q_len = max_seq_len;
+  }
+  at::Tensor max_seq_len_tensor_;
+  at::Tensor contextual_seq_len_tensor_;
+  at::Tensor max_attn_len_tensor_;
+  at::Tensor min_full_attn_seq_len_tensor_;
+  if (num_groups > 1) {
+    TORCH_CHECK(
+        max_seq_len_tensor.has_value(),
+        "max_seq_len_tensor cannot be empty for num_groups > 1.");
+    max_seq_len_tensor_ = max_seq_len_tensor.value();
+    CHECK_DEVICE(max_seq_len_tensor_);
+    TORCH_CHECK(max_seq_len_tensor_.dtype() == torch::kInt32);
+    if (!is_cross_attn) {
+      TORCH_CHECK(
+          contextual_seq_len_tensor.has_value(),
+          "contextual_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      TORCH_CHECK(
+          max_attn_len_tensor.has_value(),
+          "max_attn_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      TORCH_CHECK(
+          min_full_attn_seq_len_tensor.has_value(),
+          "min_full_attn_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
+      contextual_seq_len_tensor_ = contextual_seq_len_tensor.value();
+      max_attn_len_tensor_ = max_attn_len_tensor.value();
+      min_full_attn_seq_len_tensor_ = min_full_attn_seq_len_tensor.value();
+      CHECK_DEVICE(contextual_seq_len_tensor_);
+      CHECK_DEVICE(max_attn_len_tensor_);
+      CHECK_DEVICE(min_full_attn_seq_len_tensor_);
+      TORCH_CHECK(contextual_seq_len_tensor_.dtype() == torch::kInt32);
+      TORCH_CHECK(max_attn_len_tensor_.dtype() == torch::kInt32);
+      TORCH_CHECK(min_full_attn_seq_len_tensor_.dtype() == torch::kInt32);
+    }
+  }
+  auto const sizes_q = q.sizes();
+  auto const sizes_kv = k.sizes();
+  int const batch_size = !is_jagged ? sizes_q[0] : seq_offsets_.size(0) - 1;
+  TORCH_CHECK(
+      batch_size % num_groups == 0, "batch_size not divisible by num_groups");
+  if (!is_jagged) {
+    max_seq_len = sizes_kv[1];
+  }
+  int const total_seq_len_q = !is_jagged ? batch_size * sizes_q[1] : sizes_q[0];
+  int const total_seq_len_kv =
+      !is_jagged ? batch_size * sizes_kv[1] : sizes_kv[0];
+  int const num_heads = q.size(-2);
+  int const qk_head_size = q.size(-1);
+  int const v_head_size = v.size(-1);
+  TORCH_CHECK(
+      qk_head_size % 8 == 0 && v_head_size % 8 == 0,
+      "head_size should be a multiple of 8");
+  int const max_headdim = get_max_headdim();
+  TORCH_CHECK(
+      qk_head_size <= max_headdim && v_head_size <= max_headdim,
+      "FlashAttention backward only supports head dimension at most " +
+          std::to_string(max_headdim));
+  TORCH_CHECK(max_attn_len >= 0, "max_attn_len must be at least 0");
+  TORCH_CHECK(
+      min_full_attn_seq_len >= 0, "min_full_attn_seq_len must be at least 0");
+  TORCH_CHECK(contextual_seq_len >= 0, "contextual_seq_len must be at least 0");
+  if (!is_jagged) {
+    CHECK_SHAPE(q, batch_size, max_q_len, num_heads, qk_head_size);
+    CHECK_SHAPE(k, batch_size, max_seq_len, num_heads, qk_head_size);
+    CHECK_SHAPE(v, batch_size, max_seq_len, num_heads, v_head_size);
+    CHECK_SHAPE(dout, batch_size, max_q_len, num_heads, v_head_size);
+    CHECK_SHAPE(dq, batch_size, max_q_len, num_heads, qk_head_size);
+    CHECK_SHAPE(dk, batch_size, max_seq_len, num_heads, qk_head_size);
+    CHECK_SHAPE(dv, batch_size, max_seq_len, num_heads, v_head_size);
+  } else {
+    CHECK_SHAPE(q, total_seq_len_q, num_heads, qk_head_size);
+    CHECK_SHAPE(k, total_seq_len_kv, num_heads, qk_head_size);
+    CHECK_SHAPE(v, total_seq_len_kv, num_heads, v_head_size);
+    CHECK_SHAPE(dout, total_seq_len_q, num_heads, v_head_size);
+    CHECK_SHAPE(dq, total_seq_len_q, num_heads, qk_head_size);
+    CHECK_SHAPE(dk, total_seq_len_kv, num_heads, qk_head_size);
+    CHECK_SHAPE(dv, total_seq_len_kv, num_heads, v_head_size);
+    CHECK_SHAPE(seq_offsets_, batch_size + 1);
+  }
+  if (has_multiple_targets) {
+    CHECK_SHAPE(num_targets_, batch_size);
+  }
+  if (is_cross_attn) {
+    CHECK_SHAPE(seq_offsets_q_, batch_size + 1);
+  }
+  int const arch = at::cuda::getCurrentDeviceProperties()->major * 10 +
+      at::cuda::getCurrentDeviceProperties()->minor;
+  int const qk_head_size_rounded = round_up_headdim(qk_head_size);
+  int const v_head_size_rounded = round_up_headdim(v_head_size);
+  // Very important that these match the kernel configs
+  bool const is_local = max_attn_len > 0;
+  int const kBlockM =
+      hstu::kBlockM_bwd(arch, qk_head_size_rounded, causal, is_local);
+  auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
+  int const max_q_len_rounded = round_multiple(max_q_len, kBlockM);
+  int const total_seq_len_q_padded_rounded =
+      round_multiple(total_seq_len_q + batch_size * kBlockM, kBlockM);
+
+  TORCH_CHECK(dq.dtype() == q_type, "dq must have the same dtype as q");
+  CHECK_DEVICE(dq);
+  TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension");
+  if (!is_jagged) {
+    CHECK_SHAPE(dq, batch_size, max_q_len, num_heads, qk_head_size);
+  } else {
+    CHECK_SHAPE(dq, total_seq_len_q, num_heads, qk_head_size);
+  }
+  TORCH_CHECK(dk.dtype() == q_type, "dk must have the same dtype as q");
+  CHECK_DEVICE(dk);
+  TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension");
+  if (!is_jagged) {
+    CHECK_SHAPE(dk, batch_size, max_seq_len, num_heads, qk_head_size);
+  } else {
+    CHECK_SHAPE(dk, total_seq_len_kv, num_heads, qk_head_size);
+  }
+  TORCH_CHECK(dv.dtype() == q_type, "dv must have the same dtype as q");
+  CHECK_DEVICE(dv);
+  TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
+  if (!is_jagged) {
+    CHECK_SHAPE(dv, batch_size, max_seq_len, num_heads, v_head_size);
+  } else {
+    CHECK_SHAPE(dv, total_seq_len_kv, num_heads, v_head_size);
+  }
+
+  // Otherwise the kernel will be launched from cuda:0 device
+  // Cast to char to avoid compiler warning about narrowing
+  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
+  auto opts = q.options();
+
+  at::Tensor dq_accum;
+  if (!is_jagged) {
+    dq_accum = torch::empty(
+        {batch_size, num_heads, max_q_len_rounded * qk_head_size_rounded},
+        opts.dtype(at::kFloat));
+  } else {
+    dq_accum = torch::empty(
+        {num_heads, total_seq_len_q_padded_rounded * qk_head_size_rounded},
+        opts.dtype(at::kFloat));
+  }
+  at::Tensor softmax_d, softmax_lse_log2;
+  if (!is_jagged) {
+    // Need softmax_d to have seqlen_q_rounded since we want its address to be
+    // aligned by 16/8 bytes for TMA / LDG.64
+    softmax_d = torch::empty(
+        {batch_size, num_softmax_heads, max_q_len_rounded},
+        opts.dtype(at::kFloat));
+    softmax_lse_log2 = torch::empty(
+        {batch_size, num_softmax_heads, max_q_len_rounded},
+        opts.dtype(at::kFloat));
+  } else {
+    softmax_d = torch::empty(
+        {num_softmax_heads, total_seq_len_q_padded_rounded},
+        opts.dtype(at::kFloat));
+    softmax_lse_log2 = torch::empty(
+        {num_softmax_heads, total_seq_len_q_padded_rounded},
+        opts.dtype(at::kFloat));
+  }
+
+  // Early return for empty sequences; analog to TMA prevention guard
+  // in hstu_mha_fwd
+  if (total_seq_len_kv == 0 || total_seq_len_q == 0) {
+    return {dq, dk, dv};
+  }
+
+  hstu::Flash_bwd_params params;
+  hstu::set_params_dgrad(
+      params,
+      batch_size,
+      total_seq_len_kv,
+      total_seq_len_q,
+      max_seq_len,
+      max_q_len,
+      max_q_len_rounded,
+      num_heads,
+      qk_head_size,
+      v_head_size,
+      qk_head_size_rounded,
+      v_head_size_rounded,
+      q,
+      k,
+      v,
+      out,
+      dout,
+      dq,
+      dk,
+      dv,
+      dq_accum.data_ptr(),
+      !is_jagged ? nullptr : seq_offsets_.data_ptr(),
+      !has_multiple_targets ? nullptr : num_targets_.data_ptr(),
+      !has_attn_scale ? nullptr : attn_scale_.data_ptr(),
+      !(sort_by_length && is_jagged) ? nullptr
+                                     : sort_by_length_indices_.data_ptr(),
+      !is_cross_attn ? nullptr : seq_offsets_q_.data_ptr(),
+      num_softmax_heads == 0 ? nullptr : softmax_lse.value().data_ptr(),
+      num_softmax_heads == 0 ? nullptr : softmax_d.data_ptr(),
+      num_softmax_heads == 0 ? nullptr : softmax_lse_log2.data_ptr(),
+      num_groups > 1 ? max_seq_len_tensor_.data_ptr() : nullptr,
+      ((num_groups > 1) && (!is_cross_attn))
+          ? contextual_seq_len_tensor_.data_ptr()
+          : nullptr,
+      ((num_groups > 1) && (!is_cross_attn)) ? max_attn_len_tensor_.data_ptr()
+                                             : nullptr,
+      ((num_groups > 1) && (!is_cross_attn))
+          ? min_full_attn_seq_len_tensor_.data_ptr()
+          : nullptr,
+      num_groups,
+      scalar_scale,
+      causal,
+      alpha,
+      max_attn_len,
+      min_full_attn_seq_len,
+      contextual_seq_len,
+      num_softmax_heads,
+      deterministic,
+      sm_margin);
+
+  // auto tile_count_semaphore = (params.is_causal || params.is_local) ?
+  // torch::zeros({1}, opts.dtype(torch::kInt32)) : torch::empty({1},
+  // opts.dtype(torch::kInt32)); params.tile_count_semaphore =
+  // tile_count_semaphore.data_ptr<int>(); Will be zero'ed out in the
+  // backward preprocess kernel
+  at::Tensor dq_semaphore = torch::empty(
+      {(max_seq_len + kBlockM - 1) / kBlockM, batch_size, num_heads},
+      opts.dtype(torch::kInt32));
+  params.dq_semaphore = dq_semaphore.data_ptr<int>();
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+  TORCH_CHECK(
+      !params.is_local,
+      "This flash attention build does not support local attention.");
+#endif
+
+  if (total_seq_len_q > 0 && num_heads > 0) {
+    auto stream = at::cuda::getCurrentCUDAStream().stream();
+    run_mha_bwd(params, stream);
+  }
+  return {dq, dk, dv};
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h
new file mode 100644
index 000000000..98ca009f5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h
@@ -0,0 +1,149 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+// Include these 2 headers instead of torch/extension.h since we don't need all
+// of the torch headers.
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <c10/util/Optional.h>
+
+#include <torch/nn/functional.h>
+
+#include <torch/library.h> // @manual
+
+#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
+#define CHECK_SHAPE(x, ...)                           \
+  TORCH_CHECK(                                        \
+      x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \
+      #x " must have shape (" #__VA_ARGS__ ")")
+#define CHECK_CONTIGUOUS(x) \
+  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+
+inline int round_up_headdim(int head_size) {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+  if (head_size <= 64) {
+    return 64;
+  }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+  if (head_size <= 96) {
+    return 96;
+  }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+  if (head_size <= 128) {
+    return 128;
+  }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+  if (head_size <= 192) {
+    return 192;
+  }
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+  if (head_size <= 256) {
+    return 256;
+  }
+#endif
+  return 256;
+}
+
+inline int get_max_headdim() {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+  return 256;
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+  return 192;
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+  return 128;
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+  return 96;
+#endif
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+  return 64;
+#endif
+  return 0;
+}
+
+namespace hstu {
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+
+std::vector<at::Tensor> hstu_mha_bwd(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& dout,
+    at::Tensor& q,
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& dq,
+    at::Tensor& dk,
+    at::Tensor& dv,
+    at::Tensor& out,
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    bool sort_by_length,
+    bool const deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    const std::optional<at::Tensor>& softmax_lse = std::nullopt,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp
new file mode 100644
index 000000000..30d4f792c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp
@@ -0,0 +1,172 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#include <c10/util/Optional.h>
+#include <torch/nn/functional.h>
+
+#include "flash_common_cpu.h"
+
+namespace hstu {
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_meta(
+    const at::SymInt max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin,
+    int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    bool training,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+  auto q_type = q.scalar_type();
+  auto const sizes = q.sym_sizes();
+  at::Tensor seq_offsets_;
+  bool const is_jagged = seq_offsets.has_value();
+  if (is_jagged) {
+    seq_offsets_ = seq_offsets.value();
+  }
+  const c10::SymInt batch_size =
+      !is_jagged ? sizes[0] : seq_offsets_.sym_sizes()[0] - 1;
+  auto total_seq_len = !is_jagged ? batch_size * max_seq_len : sizes[0];
+  const auto& num_heads = sizes[sizes.size() - 2];
+  auto v_head_size = v.sym_sizes()[v.sym_sizes().size() - 1];
+  auto out_type = q_type == at::ScalarType::Float8_e4m3fn
+      ? at::ScalarType::BFloat16
+      : q_type;
+  auto opts = q.options();
+
+  at::Tensor out;
+  if (!is_jagged) {
+    out = at::empty_symint(
+        {batch_size, max_seq_len, num_heads, v_head_size},
+        opts.dtype(out_type));
+  } else {
+    out = at::empty_symint(
+        {total_seq_len, num_heads, v_head_size}, opts.dtype(out_type));
+  }
+  return {out, std::nullopt};
+};
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_dummy(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin,
+    const int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    bool training,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+  auto q_type = q.scalar_type();
+  auto const sizes = q.sizes();
+  at::Tensor seq_offsets_;
+  bool const is_jagged = seq_offsets.has_value();
+  if (is_jagged) {
+    seq_offsets_ = seq_offsets.value();
+  }
+  const int batch_size = !is_jagged ? sizes[0] : seq_offsets_.size(0) - 1;
+  int total_seq_len = !is_jagged ? batch_size * max_seq_len : sizes[0];
+  int num_heads = q.size(-2);
+  //   int const qk_head_size = q.size(-1);
+  int const v_head_size = v.size(-1);
+  //   int const max_headdim = get_max_headdim();
+  auto out_type = q_type == at::ScalarType::Float8_e4m3fn
+      ? at::ScalarType::BFloat16
+      : q_type;
+  auto opts = q.options();
+
+  at::Tensor out;
+  if (!is_jagged) {
+    out = torch::empty(
+        {batch_size, max_seq_len, num_heads, v_head_size},
+        opts.dtype(out_type));
+  } else {
+    out = torch::empty(
+        {total_seq_len, num_heads, v_head_size}, opts.dtype(out_type));
+  }
+  return {out, std::nullopt};
+};
+
+std::vector<at::Tensor> hstu_mha_bwd_dummy(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& dout,
+    at::Tensor& q,
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& dq,
+    at::Tensor& dk,
+    at::Tensor& dv,
+    at::Tensor& out,
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    bool sort_by_length,
+    bool const deterministic,
+    const int64_t sm_margin,
+    const int64_t max_q_len,
+    const std::optional<at::Tensor>& seq_offsets_q,
+    int64_t num_softmax_heads,
+    const std::optional<at::Tensor>& softmax_lse,
+    const std::optional<at::Tensor>& max_seq_len_tensor,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor,
+    const std::optional<at::Tensor>& max_attn_len_tensor,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
+    int64_t num_groups) {
+  return {dq, dk, dv};
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h
new file mode 100644
index 000000000..9d0e18a71
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h
@@ -0,0 +1,114 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#include <c10/util/Optional.h>
+#include <torch/library.h> // @manual
+#include <torch/nn/functional.h>
+
+namespace hstu {
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_dummy(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+
+std::vector<at::Tensor> hstu_mha_bwd_dummy(
+    int64_t max_seq_len,
+    double alpha,
+    at::Tensor& dout,
+    at::Tensor& q,
+    at::Tensor& k,
+    at::Tensor& v,
+    at::Tensor& dq,
+    at::Tensor& dk,
+    at::Tensor& dv,
+    at::Tensor& out,
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    bool sort_by_length,
+    bool const deterministic,
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    const std::optional<at::Tensor>& softmax_lse = std::nullopt,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+
+std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_meta(
+    const at::SymInt max_seq_len,
+    double alpha,
+    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
+    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
+    const std::optional<at::Tensor>& seq_offsets,
+    bool causal,
+    const std::optional<at::Tensor>& num_targets,
+    const std::optional<at::Tensor>& attn_scale,
+    int64_t max_attn_len,
+    int64_t min_full_attn_seq_len,
+    int64_t contextual_seq_len,
+    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
+    const std::optional<at::Tensor>& k_descale, // (b, h_k)
+    const std::optional<at::Tensor>& v_descale, // (b, h_k)
+    const int64_t sm_margin = 0,
+    int64_t max_q_len = 0,
+    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
+    int64_t num_softmax_heads = 0,
+    bool training = true,
+    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
+    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
+        std::nullopt,
+    int64_t num_groups = 1);
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h
new file mode 100644
index 000000000..2e3d0916b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h
@@ -0,0 +1,511 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+
+#include <cutlass/arch/reg_reconfig.h>
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/kernel_hardware_info.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include <cutlass/pipeline/pipeline.hpp>
+
+#include "seqlen.h"
+#include "softmax.h"
+#include "tile_scheduler.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    bool Softmax,
+    class CollectiveMainloop_,
+    class CollectiveEpilogue_,
+    class TileScheduler_>
+class FlashAttnFwdSm90 {
+ public:
+  // Type Aliases
+  using CollectiveMainloop = CollectiveMainloop_;
+  using CollectiveEpilogue = CollectiveEpilogue_;
+  static constexpr bool Is_FP8 = CollectiveMainloop::Is_FP8;
+  static constexpr bool Transpose_V = CollectiveMainloop::Transpose_V;
+  static constexpr bool Use_TMA_O = CollectiveEpilogue::Use_TMA_O;
+  static constexpr int NumProducerThreads =
+      CollectiveMainloop::NumProducerThreads;
+  using SeqlenInfo_t = typename CollectiveMainloop::SeqlenInfo_t;
+
+  // Mainloop derived types
+  using TileShape_MNK = typename CollectiveMainloop::TileShape_MNK;
+  using TiledMma0 = typename CollectiveMainloop::TiledMma0;
+  using TiledMma1 = typename CollectiveMainloop::TiledMma1;
+  using ArchTag = typename CollectiveMainloop::ArchTag;
+  using ClusterShape = typename CollectiveMainloop::ClusterShape;
+  using MainloopArguments = typename CollectiveMainloop::Arguments;
+  using MainloopParams = typename CollectiveMainloop::Params;
+  using BarrierQ = cutlass::arch::ClusterTransactionBarrier;
+
+  // Epilogue derived types
+  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
+  using EpilogueParams = typename CollectiveEpilogue::Params;
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  using TileScheduler = TileScheduler_;
+  using TileSchedulerArguments = typename hstu::TileSchedulerArguments;
+  using TileSchedulerParams = typename TileScheduler::Params;
+
+  static constexpr uint32_t NumLoadWarpGroups = 1;
+  static constexpr uint32_t NumMmaWarpGroups =
+      CUTE_STATIC_V(size(TiledMma0{})) / cutlass::NumThreadsPerWarpGroup;
+  static constexpr uint32_t MaxThreadsPerBlock =
+      CUTE_STATIC_V(size(TiledMma0{})) +
+      (NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup);
+  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
+  static_assert(
+      NumMmaWarpGroups == 1 || NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+
+  /// Register requirement for Load and Math WGs
+  // If we use cp.async to load K and V, we need more registers for the producer
+  // WG.
+  static constexpr uint32_t LoadRegisterRequirement =
+      NumMmaWarpGroups == 1 ? 56 : (NumMmaWarpGroups == 2 ? 24 : 32);
+  static constexpr uint32_t MmaRegisterRequirement =
+      NumMmaWarpGroups == 1 ? 256 : (NumMmaWarpGroups == 2 ? 240 : 160);
+  // If you want to print from the producer warp, you'd need to increase the
+  // number of registers Otherwise you'll get CUDA error. static constexpr
+  // uint32_t LoadRegisterRequirement = 40; static constexpr uint32_t
+  // MmaRegisterRequirement = NumMmaWarpGroups == 2 ? 232 : 152;
+
+  // Kernel level shared memory storage
+  // We overlap the shared memory for the mainloop and epilogue. However, we
+  // only want smem_o to overlap with smem_v and nothing else, so we'll pad in
+  // case sizeof(smem_o) > sizeof(smem_v).
+  static constexpr int mainloop_smem_padding_ =
+      int(sizeof(typename CollectiveEpilogue::TensorStorage)) -
+      int(sizeof(
+          decltype((typename CollectiveMainloop::TensorStorage{}).smem_v)));
+  static constexpr int mainloop_smem_padding =
+      mainloop_smem_padding_ < 0 ? 0 : mainloop_smem_padding_;
+  struct SharedStorage {
+    struct TensorStorage : cute::aligned_struct<128> {
+      union {
+        struct {
+          cute::array<uint32_t, mainloop_smem_padding / sizeof(uint32_t)>
+              padding_;
+          typename CollectiveMainloop::TensorStorage mainloop;
+        };
+        // We want smem_o to line up with the start of smem_v
+        typename CollectiveEpilogue::TensorStorage epilogue;
+      };
+    } tensors;
+
+    struct PipelineStorage : cute::aligned_struct<16> {
+      alignas(16) BarrierQ barrier_Q;
+      alignas(16) cutlass::arch::ClusterBarrier barrier_O;
+      alignas(16) typename CollectiveMainloop::MainloopPipelineK::SharedStorage
+          pipeline_k;
+      alignas(16) typename CollectiveMainloop::MainloopPipelineV::SharedStorage
+          pipeline_v;
+      alignas(16) typename CollectiveMainloop::MainloopPipelineVt::SharedStorage
+          pipeline_vt;
+      alignas(16) typename TileScheduler::SharedStorage smem_scheduler;
+    } pipelines;
+  };
+
+  static constexpr int SharedStorageSize = sizeof(SharedStorage);
+
+  // Device side arguments
+  struct Arguments {
+    MainloopArguments mainloop{};
+    EpilogueArguments epilogue{};
+    cutlass::KernelHardwareInfo hw_info{};
+    TileSchedulerArguments scheduler{};
+  };
+
+  // Kernel entry point API
+  struct Params {
+    MainloopParams mainloop{};
+    EpilogueParams epilogue{};
+    cutlass::KernelHardwareInfo hw_info{};
+    TileSchedulerParams scheduler{};
+  };
+
+  //
+  // Methods
+  //
+
+  // Convert to underlying arguments. In this case, a simple copy for the
+  // aliased type.
+  static Params to_underlying_arguments(Arguments const& args) {
+    CUTLASS_TRACE_HOST("to_underlying_arguments():");
+
+    // Get SM count if needed, otherwise use user supplied SM count
+    int sm_count = args.hw_info.sm_count;
+    if (sm_count <= 0) {
+      CUTLASS_TRACE_HOST(
+          "  WARNING: Arguments do not include a valid SM count.\n"
+          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
+      sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
+          args.hw_info.device_id);
+    }
+
+    CUTLASS_TRACE_HOST(
+        "to_underlying_arguments(): Setting persistent grid SM count to "
+        << sm_count);
+
+    cutlass::KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
+    return {
+        CollectiveMainloop::to_underlying_arguments(args.mainloop),
+        CollectiveEpilogue::to_underlying_arguments(args.epilogue),
+        hw_info,
+        TileScheduler::to_underlying_arguments(args.scheduler)};
+  }
+
+  // Computes the kernel launch grid shape based on runtime parameters
+  static dim3 get_grid_shape(Params const& params) {
+    return TileScheduler::get_grid_shape(
+        params.scheduler, params.hw_info.sm_count);
+  }
+
+  static dim3 get_block_shape() {
+    return dim3(MaxThreadsPerBlock, 1, 1);
+  }
+
+  CUTLASS_DEVICE
+  void operator()(Params const& params, char* smem_buf) {
+    static constexpr int NumMmaThreads =
+        NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup;
+    static constexpr int MmaThreadOffset =
+        NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup;
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+
+    using MainloopPipelineK = typename CollectiveMainloop::MainloopPipelineK;
+    using MainloopPipelineV = typename CollectiveMainloop::MainloopPipelineV;
+    using MainloopPipelineVt = typename CollectiveMainloop::MainloopPipelineVt;
+    using MainloopPipelineKVNew =
+        typename CollectiveMainloop::MainloopPipelineKVNew;
+    using PipelineState = typename CollectiveMainloop::PipelineState;
+    using PipelineParamsK = typename MainloopPipelineK::Params;
+    using PipelineParamsV = typename MainloopPipelineV::Params;
+    using PipelineParamsVt = typename MainloopPipelineVt::Params;
+    using PipelineParamsKVNew = typename MainloopPipelineKVNew::Params;
+
+    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
+
+    int const lane_predicate = cute::elect_one_sync();
+    int const warp_idx = cutlass::canonical_warp_idx_sync();
+
+    // Issue Tma Descriptor Prefetch from a single thread
+    if (warp_idx == 0 && lane_predicate) {
+      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
+      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
+    }
+
+    // Obtain warp index
+    int const warp_group_thread_idx =
+        threadIdx.x % cutlass::NumThreadsPerWarpGroup;
+    int warp_group_idx = cutlass::canonical_warp_group_idx();
+
+    if (warp_idx == 0 && lane_predicate) {
+      shared_storage.pipelines.barrier_Q.init(1 /*numThreads*/);
+      shared_storage.pipelines.barrier_O.init(
+          size(ClusterShape{}) *
+          (Use_TMA_O ? 1 : NumMmaThreads) /*numThreads*/);
+    }
+
+    // We're counting on pipeline_k to call cutlass::arch::fence_barrier_init();
+    PipelineParamsK pipeline_params_k;
+    pipeline_params_k.role = warp_group_idx == 0
+        ? MainloopPipelineK::ThreadCategory::Producer
+        : MainloopPipelineK::ThreadCategory::Consumer;
+    pipeline_params_k.transaction_bytes =
+        CollectiveMainloop::TmaTransactionBytesK;
+    pipeline_params_k.is_leader = warp_group_thread_idx == 0;
+    pipeline_params_k.num_consumers = NumMmaThreads;
+
+    MainloopPipelineK pipeline_k = [&] {
+      return MainloopPipelineK(
+          shared_storage.pipelines.pipeline_k,
+          pipeline_params_k,
+          ClusterShape{});
+    }();
+    // MainloopPipelineV pipeline_v(shared_storage.pipelines.pipeline_v,
+    // pipeline_params_v, ClusterShape{});
+    MainloopPipelineV pipeline_v = [&] {
+      if constexpr (!Transpose_V) {
+        static_assert(is_same_v<PipelineParamsK, PipelineParamsV>);
+        return MainloopPipelineV(
+            shared_storage.pipelines.pipeline_v,
+            pipeline_params_k,
+            ClusterShape{});
+      } else {
+        PipelineParamsV pipeline_params_v;
+        pipeline_params_v.role = warp_group_idx == 0
+            ? MainloopPipelineV::ThreadCategory::Producer
+            : MainloopPipelineV::ThreadCategory::Consumer;
+        pipeline_params_v.producer_arv_count = NumProducerThreads;
+        pipeline_params_v.consumer_arv_count = NumMmaThreads;
+        return MainloopPipelineV(
+            shared_storage.pipelines.pipeline_v, pipeline_params_v);
+      }
+    }();
+    static_assert(is_same_v<PipelineParamsK, PipelineParamsVt>);
+    // If we need to transpose V (e.g. FP8 and V is row-major), we use
+    // pipeline_vt for the TMA, then the producer WG will read from pipeline_vt
+    // and write to pipeline_v. If we don't need to transpose V, we use
+    // pipeline_v for the TMA, and pipeline_vt won't be used. Technically for
+    // pipeline_params_vt, warp0 of WG0 is the producer and all of WG0 are
+    // consumers. However, the thread role isn't used in the pipeline
+    // implementation.
+    MainloopPipelineVt pipeline_vt = [&] {
+      pipeline_params_k.num_consumers =
+          NumProducerThreads; // TMA_V is only consumed by the producer WG
+      return MainloopPipelineVt(
+          shared_storage.pipelines.pipeline_vt,
+          pipeline_params_k,
+          ClusterShape{});
+    }();
+
+    CollectiveMainloop collective_mainloop;
+    CollectiveEpilogue collective_epilogue;
+
+    // We need this to guarantee that the Pipeline init is visible to all
+    // producers and consumer blocks in the Cluster
+    if constexpr (size(ClusterShape{}) > 1) {
+      cute::cluster_arrive_relaxed();
+      cute::cluster_wait();
+    } else {
+      __syncthreads();
+    }
+
+    if (warp_group_idx == 0) { // Producer
+      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
+
+      // The pipelines for AppendKV and main attention are different, since e.g.
+      // main attention might use cp.async to load KV (if PagedKV) while
+      // AppendKV always uses TMA to load KV_new. Since the pipeline states are
+      // different, we have to manually sync to make sure the two pipelines
+      // don't race when accessing smem_k and smem_v.
+      PipelineState smem_pipe_write =
+          cutlass::make_producer_start_state<MainloopPipelineK>();
+      PipelineState smem_pipe_write_new =
+          cutlass::make_producer_start_state<MainloopPipelineKVNew>();
+      int work_idx = 0;
+
+      TileScheduler scheduler(
+          reinterpret_cast<typename TileScheduler::SharedStorage*>(
+              &shared_storage.pipelines.smem_scheduler));
+      int warp_idx_in_warpgroup =
+          __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+      static constexpr bool SingleProducerWarp =
+          NumProducerThreads == cutlass::NumThreadsPerWarp;
+      if constexpr (SingleProducerWarp) {
+        if (warp_idx_in_warpgroup != 0) {
+          return;
+        }
+      }
+      if (!SingleProducerWarp && warp_idx_in_warpgroup != 0) {
+        scheduler.init_consumer();
+      }
+
+      // Load Q, K, V
+      for (auto work_tile_info = SingleProducerWarp ||
+                   warp_idx_in_warpgroup == 0
+               ? scheduler.template get_initial_work</*IsProducerWarp=*/true>(
+                     params.scheduler)
+               : scheduler.template get_initial_work</*IsProducerWarp=*/false>(
+                     params.scheduler);
+           work_tile_info.is_valid(params.scheduler);
+           work_tile_info = SingleProducerWarp || warp_idx_in_warpgroup == 0
+               ? scheduler.template get_next_work</*IsProducerWarp=*/true>(
+                     params.scheduler, work_tile_info)
+               : scheduler.template get_next_work</*IsProducerWarp=*/false>(
+                     params.scheduler, work_tile_info)) {
+        auto block_coord = work_tile_info.get_block_coord(params.scheduler);
+        SeqlenInfo_t seqlen_info{
+            get<2>(block_coord) /*bidb*/,
+            get<0>(params.mainloop.shape_Q),
+            get<0>(params.mainloop.shape_K),
+            params.mainloop.seq_offsets,
+            params.mainloop.seq_offsets_q,
+            params.mainloop.num_targets,
+        };
+        auto scheduler_prefetch = [&scheduler, &params, &work_tile_info]() {
+          scheduler.prefetch_next_work(params.scheduler, work_tile_info);
+        };
+        // pipeline_vt won't be used if we don't need to transpose V.
+        collective_mainloop.load(
+            params.mainloop,
+            pipeline_k,
+            pipeline_v,
+            pipeline_vt,
+            smem_pipe_write,
+            shared_storage,
+            scheduler_prefetch,
+            seqlen_info,
+            block_coord,
+            work_idx);
+      }
+      collective_mainloop.load_tail(
+          pipeline_k,
+          pipeline_v,
+          pipeline_vt,
+          smem_pipe_write,
+          shared_storage,
+          work_idx);
+    } else { // Consumer
+      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
+
+      TileScheduler scheduler(
+          reinterpret_cast<typename TileScheduler::SharedStorage*>(
+              &shared_storage.pipelines.smem_scheduler));
+      // Initialize matmul objects.
+      TiledMma1 tiled_mma1;
+
+      PipelineState smem_pipe_read;
+      // We don't need separate variables smem_pipe_release_k and
+      // smem_pipe_release_v (like in Cutlass's gemm) because the read and
+      // release pipeline states are always the same.
+
+      scheduler.init_consumer();
+      collective_mainloop.mma_init();
+
+      int work_idx = 0;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (auto work_tile_info =
+               scheduler.template get_initial_work</*IsProducerWarp=*/false>(
+                   params.scheduler);
+           work_tile_info.is_valid(params.scheduler);
+           work_tile_info =
+               scheduler.template get_next_work</*IsProducerWarp=*/false>(
+                   params.scheduler, work_tile_info)) {
+        // Attention output (GEMM-II) accumulator.
+        Tensor tOrO =
+            partition_fragment_C(tiled_mma1, select<0, 2>(TileShape_MNK{}));
+        // If there's tanh softcap, the scaling will be done before tanh.
+        auto block_coord = work_tile_info.get_block_coord(params.scheduler);
+        int const bidb = get<2>(block_coord);
+        int const bidh = get<1>(block_coord);
+        if constexpr (Is_FP8) {
+          int const bidh_kv = bidh;
+          float const q_descale = params.mainloop.ptr_q_descale == nullptr
+              ? 1.0f
+              : params.mainloop.ptr_q_descale
+                    [bidb * get<0>(params.mainloop.stride_q_descale) +
+                     bidh_kv * get<1>(params.mainloop.stride_q_descale)];
+          float const k_descale = params.mainloop.ptr_k_descale == nullptr
+              ? 1.0f
+              : params.mainloop.ptr_k_descale
+                    [bidb * get<0>(params.mainloop.stride_k_descale) +
+                     bidh_kv * get<1>(params.mainloop.stride_k_descale)];
+        }
+
+        SeqlenInfo_t seqlen_info{
+            bidb,
+            get<0>(params.mainloop.shape_Q),
+            get<0>(params.mainloop.shape_K),
+            params.mainloop.seq_offsets,
+            params.mainloop.seq_offsets_q,
+            params.mainloop.num_targets,
+        };
+        float alpha_log2 = params.mainloop.alpha_log2;
+        bool tile_valid;
+        if constexpr (Softmax) {
+          hstu::Softmax<
+              2 * (2 * kBlockM / NumMmaThreads),
+              /*Max_offset=*/!Is_FP8 ? 0 : 8>
+              softmax(alpha_log2);
+          tile_valid = collective_mainloop.mma_softmax(
+              params.mainloop,
+              pipeline_k,
+              pipeline_v,
+              smem_pipe_read,
+              tOrO,
+              softmax,
+              threadIdx.x - MmaThreadOffset,
+              work_idx,
+              seqlen_info,
+              block_coord,
+              shared_storage);
+          if (tile_valid) {
+            collective_epilogue.store(
+                params.epilogue,
+                tOrO,
+                shared_storage,
+                tiled_mma1,
+                threadIdx.x - MmaThreadOffset,
+                block_coord);
+            collective_epilogue.store_softmax(
+                params.epilogue,
+                softmax.row_sum,
+                tiled_mma1,
+                threadIdx.x - MmaThreadOffset,
+                block_coord);
+          } else {
+            // Write 0 to gO and -inf to gLSE.
+            // If Split, we don't have to write 0 to O if the mha_combine kernel
+            // is used, since it will not use the value of O if LSE is -inf.
+            collective_epilogue.template store_zero<true /*Clear_O*/>(
+                params.epilogue, threadIdx.x - MmaThreadOffset, block_coord);
+            // collective_epilogue.store_zero(params.epilogue, threadIdx.x -
+            // MmaThreadOffset, block_coord);
+          }
+        } else {
+          tile_valid = collective_mainloop.mma(
+              params.mainloop,
+              pipeline_k,
+              pipeline_v,
+              smem_pipe_read,
+              tOrO,
+              threadIdx.x - MmaThreadOffset,
+              work_idx,
+              seqlen_info,
+              block_coord,
+              shared_storage);
+          if (tile_valid) {
+            collective_epilogue.store(
+                params.epilogue,
+                tOrO,
+                shared_storage,
+                tiled_mma1,
+                threadIdx.x - MmaThreadOffset,
+                block_coord);
+          } else {
+            // Write 0 to gO and -inf to gLSE.
+            // If Split, we don't have to write 0 to O if the mha_combine kernel
+            // is used, since it will not use the value of O if LSE is -inf.
+            collective_epilogue.template store_zero<true /*Clear_O*/>(
+                params.epilogue, threadIdx.x - MmaThreadOffset, block_coord);
+            // collective_epilogue.store_zero(params.epilogue, threadIdx.x -
+            // MmaThreadOffset, block_coord);
+          }
+        }
+      }
+      collective_epilogue.store_tail();
+    }
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h
new file mode 100644
index 000000000..c79ea3a3f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h
@@ -0,0 +1,376 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+// clang-format off
+#include "cute/tensor.hpp"
+
+#include "cutlass/cutlass.h"
+#include "cutlass/device_kernel.h"  // For device_kernel
+#include <cutlass/kernel_hardware_info.h>
+#include "cutlass/cluster_launch.hpp"
+#include "cutlass/kernel_launch.h"
+
+#include "static_switch.h"
+#include "flash.h"
+#include "tile_size.h"
+#include "tile_scheduler.h"
+#include "flash_fwd_kernel_sm90.h"
+#include "mainloop_fwd_sm90_tma_gmma_ws.h"
+#include "epilogue_fwd.h"
+// clang-format on
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    int Arch,
+    int kHeadDim,
+    int ClusterM,
+    typename Element,
+    typename ElementOut,
+    bool Causal,
+    bool Local,
+    bool Contexual_mask,
+    bool Jagged,
+    bool Has_targets,
+    bool V_colmajor,
+    bool Cross,
+    bool Softmax,
+    bool Training>
+void run_flash_fwd(hstu::Flash_fwd_params& params, cudaStream_t stream) {
+  static_assert(
+      !(Causal && Local),
+      "Causal and Local cannot be enabled at the same time");
+  static constexpr bool Is_FP8 =
+      cute::is_same_v<Element, cutlass::float_e4m3_t> ||
+      cute::is_same_v<Element, cutlass::float_e5m2_t>;
+  static constexpr bool FP8_TransposeV = Is_FP8 && !V_colmajor;
+  using ArchTag =
+      std::conditional_t<Arch >= 90, cutlass::arch::Sm90, cutlass::arch::Sm80>;
+
+  // Can't use structured binding since it's not compatible with constexpr
+  static constexpr std::tuple<int, int, bool> kBlockMN_RS =
+      hstu::tile_size_fwd_sm90(
+          kHeadDim,
+          Causal,
+          Local,
+          sizeof(Element) /*element_size*/,
+          V_colmajor,
+          Cross,
+          Training);
+  static constexpr std::tuple<int, int, int, int, bool>
+      kBlockMN_kNWarps_Stages_RS = hstu::tile_size_fwd_sm8x(
+          Arch == 86 || Arch == 89,
+          kHeadDim,
+          Causal,
+          Local,
+          sizeof(Element) /*element_size*/);
+  static constexpr int kBlockM = Arch >= 90
+      ? std::get<0>(kBlockMN_RS)
+      : std::get<0>(kBlockMN_kNWarps_Stages_RS);
+  static constexpr int kBlockN = Arch >= 90
+      ? std::get<1>(kBlockMN_RS)
+      : std::get<1>(kBlockMN_kNWarps_Stages_RS);
+  static constexpr bool Mma1_is_RS = std::get<2>(kBlockMN_RS);
+  static constexpr int kNWarps = std::get<2>(kBlockMN_kNWarps_Stages_RS);
+  static constexpr int kStages =
+      Arch >= 90 ? 2 : std::get<3>(kBlockMN_kNWarps_Stages_RS);
+  static constexpr bool Q_in_regs =
+      Arch >= 90 ? false : std::get<4>(kBlockMN_kNWarps_Stages_RS);
+
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+  std::printf(
+      "kBlockM: (%d), kBlockN: (%d), Mma1_is_RS: (%d), kNWarps: (%d), kStages: (%d), Q_in_regs: (%d)\n",
+      kBlockM,
+      kBlockN,
+      Mma1_is_RS,
+      kNWarps,
+      kStages,
+      Q_in_regs);
+#endif
+
+  using TileShape_MNK = cute::Shape<Int<kBlockM>, Int<kBlockN>, Int<kHeadDim>>;
+  using ClusterShape = cute::Shape<Int<ClusterM>, _1, _1>;
+  using CollectiveMainloop = hstu::CollectiveMainloopFwdSm90<
+      kStages,
+      ClusterShape,
+      TileShape_MNK,
+      Element,
+      float,
+      cutlass::arch::Sm90,
+      Causal,
+      Local,
+      Contexual_mask,
+      Jagged,
+      Has_targets,
+      Mma1_is_RS,
+      V_colmajor,
+      Cross>;
+  using CollectiveEpilogue = hstu::CollectiveEpilogueFwd<
+      TileShape_MNK,
+      ClusterShape,
+      ElementOut,
+      ArchTag,
+      CollectiveMainloop::NumMmaThreads,
+      Jagged,
+      FP8_TransposeV>;
+
+  static constexpr int NumProducerThreads = Arch >= 90
+      ? CollectiveMainloop::NumProducerThreads
+      : CollectiveMainloop::NumMmaThreads;
+  using SchedulerPersistent = std::conditional_t<
+      Jagged,
+      hstu::VarlenDynamicPersistentTileScheduler<
+          kBlockM,
+          CollectiveMainloop::NumMmaThreads,
+          NumProducerThreads,
+          Arch >= 90 /*WarpSpecialized*/>,
+      std::conditional_t<
+          !Causal && !Local,
+          hstu::StaticPersistentTileScheduler,
+          hstu::DynamicPersistentTileScheduler<
+              CollectiveMainloop::NumMmaThreads,
+              NumProducerThreads,
+              Arch >= 90 /*WarpSpecialized*/>>>;
+  using SchedulerSingleTile = hstu::
+      SingleTileScheduler<Jagged, kBlockM, false /*Sort_by_length_indices*/>;
+  // If Split then we probably don't have enough work for PersistentScheduler to
+  // be useful. However, if Jagged (e.g., during decode where we have
+  // max_seqlens), using PersistentScheduler is better since we'll avoid
+  // launching a bunch of thread blocks that immediately exit. On Sm80,
+  // noncausal persistent seems a bit slower.
+  using Scheduler = std::conditional_t<
+      Arch >= 90 ? false : !(Causal && !Jagged),
+      SchedulerSingleTile,
+      SchedulerPersistent>;
+  using AttnKernel = hstu::enable_sm90_or_later<hstu::FlashAttnFwdSm90<
+      Softmax,
+      CollectiveMainloop,
+      CollectiveEpilogue,
+      Scheduler>>;
+
+  int seqlen_q = !Jagged ? params.max_q_len : params.total_seq_len_q;
+  int seqlen_kv = !Jagged ? params.max_kv_len : params.total_seq_len_kv;
+  int batch = !Jagged ? params.b : 1;
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+  std::printf("max/total seqlen: (%d), batch: (%d)\n", seqlen, batch);
+#endif
+  typename CollectiveMainloop::StrideV v_strides =
+      cute::conditional_return<!V_colmajor>(
+          make_stride(
+              params.v_row_stride,
+              _1{},
+              params.v_head_stride,
+              !Jagged ? params.v_batch_stride : 0),
+          make_stride(
+              _1{},
+              params.v_dim_stride,
+              params.v_head_stride,
+              !Jagged ? params.v_batch_stride : 0));
+  typename CollectiveMainloop::Arguments mainloop_args{
+      static_cast<Element const*>(params.q_ptr),
+      {seqlen_q, params.qk_d, params.h, batch}, // shape_Q
+      {params.q_row_stride,
+       _1{},
+       params.q_head_stride,
+       !Jagged ? params.q_batch_stride : 0}, // stride_Q
+      static_cast<Element*>(params.k_ptr),
+      {seqlen_kv, params.qk_d, params.h, batch}, // shape_K
+      {params.k_row_stride,
+       _1{},
+       params.k_head_stride,
+       !Jagged ? params.k_batch_stride : 0}, // stride_K
+      static_cast<Element*>(params.v_ptr),
+      v_strides, // stride_V
+      params.q_descale_ptr,
+      params.k_descale_ptr,
+      params.v_descale_ptr,
+      {params.q_descale_batch_stride, params.q_descale_head_stride},
+      {params.k_descale_batch_stride, params.k_descale_head_stride},
+      {params.v_descale_batch_stride, params.v_descale_head_stride},
+      1.0f / params.max_kv_len,
+      params.alpha,
+      params.max_attn_len,
+      params.min_full_attn_seq_len,
+      params.contextual_seq_len,
+      params.num_softmax_heads,
+      params.num_groups,
+      params.batch_size_per_group,
+      params.seq_offsets,
+      params.seq_offsets_q,
+      params.num_targets,
+      params.max_seq_len_tensor,
+      params.contextual_seq_len_tensor,
+      params.max_attn_len_tensor,
+      params.min_full_attn_seq_len_tensor,
+      params.attn_scale,
+      params.scalar_scale,
+  };
+  typename CollectiveEpilogue::Arguments epilogue_args{
+      static_cast<ElementOut*>(params.o_ptr),
+      {seqlen_q, params.v_d, params.h, batch, 1}, // shape_O
+      {params.o_row_stride,
+       _1{},
+       params.o_head_stride,
+       !Jagged ? params.o_batch_stride : 0,
+       0}, // stride_O
+      params.h,
+      params.num_softmax_heads,
+      {_1{}, seqlen_q, !Jagged ? params.h * seqlen_q : 0, 0}, // stride_LSE}
+      static_cast<float*>(params.softmax_lse),
+      Cross ? params.seq_offsets_q : params.seq_offsets};
+
+  int num_blocks_m =
+      cutlass::ceil_div(params.max_q_len, get<0>(TileShape_MNK{}));
+  num_blocks_m = cutlass::round_up(num_blocks_m, size<0>(ClusterShape{}));
+  typename hstu::TileSchedulerArguments scheduler_args{
+      num_blocks_m,
+      params.h,
+      params.b,
+      params.max_q_len,
+      params.qk_d,
+      sizeof(Element),
+      params.tile_count_semaphore,
+      Cross ? params.seq_offsets_q : params.seq_offsets,
+      nullptr /*sort_by_length_indices*/};
+
+  int device;
+  CHECK_CUDA(cudaGetDevice(&device));
+  typename AttnKernel::Params kernel_params =
+      AttnKernel::to_underlying_arguments(
+          {mainloop_args,
+           epilogue_args,
+           {device, params.num_sm},
+           scheduler_args});
+
+  dim3 grid_dims = AttnKernel::get_grid_shape(kernel_params);
+  dim3 block_dims = AttnKernel::get_block_shape();
+  int smem_size = AttnKernel::SharedStorageSize;
+  // int smem_size_q = sizeof(decltype((typename
+  // CollectiveMainloop::TensorStorage{}).smem_q)); int smem_size_k =
+  // sizeof(decltype((typename CollectiveMainloop::TensorStorage{}).smem_k));
+  // int smem_size_v = sizeof(decltype((typename
+  // CollectiveMainloop::TensorStorage{}).smem_v)); printf("smem_size = %d, q =
+  // %d, k = %d, v = %d\n", smem_size, smem_size_q, smem_size_k, smem_size_v);
+  // Get the ptr to kernel function.
+  if constexpr (size(ClusterShape{}) > 1) {
+    void const* kernel = (void const*)cutlass::device_kernel<AttnKernel>;
+    if (smem_size >= 48 * 1024) {
+      CHECK_CUDA(cudaFuncSetAttribute(
+          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    }
+    dim3 cluster_dims(
+        size<0>(ClusterShape{}),
+        size<1>(ClusterShape{}),
+        size<2>(ClusterShape{}));
+    cutlass::ClusterLaunchParams launch_params{
+        grid_dims, block_dims, cluster_dims, smem_size, stream};
+    cutlass::launch_kernel_on_cluster(launch_params, kernel, kernel_params);
+  } else {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::cout << "ClusterShape = 1" << std::endl;
+    std::cout << "grid_dims = " << grid_dims << std::endl;
+    std::cout << "block_dims = " << block_dims << std::endl;
+    std::cout << "smem_size = " << smem_size << std::endl;
+#endif
+    auto kernel = cutlass::device_kernel<AttnKernel>;
+    if (smem_size >= 48 * 1024) {
+      CHECK_CUDA(cudaFuncSetAttribute(
+          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+    }
+    kernel<<<grid_dims, block_dims, smem_size, stream>>>(kernel_params);
+  }
+  CHECK_CUDA_KERNEL_LAUNCH();
+}
+
+template <
+    int Arch,
+    int kHeadDim,
+    bool Causal,
+    bool Local,
+    bool Softmax,
+    typename T,
+    typename T_out>
+void run_mha_fwd_dispatch(hstu::Flash_fwd_params& params, cudaStream_t stream) {
+  static constexpr bool V_colmajor = false; // V_colmajor_ && sizeof(T) == 1;
+  BOOL_SWITCH(params.num_targets, Has_targets, [&] {
+    BOOL_SWITCH(params.seq_offsets, Jagged, [&] {
+      BOOL_SWITCH(params.seq_offsets_q, Cross, [&] {
+        BOOL_SWITCH(params.has_contexual_mask, Contexual_mask, [&] {
+          BOOL_SWITCH(params.training, Training, [&] {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+            std::printf(
+                "[flash_fwd_launch_template] Local: (%d), Jagged: (%d), Has_targets: (%d), Causal: (%d), max_kv_len: (%d), kHeadDim: (%d)\n",
+                Local,
+                Jagged,
+                Has_targets,
+                Causal,
+                params.max_kv_len,
+                kHeadDim);
+#endif
+            // static constexpr bool Enable_cluster = Arch >= 90 &&
+            //     (sizeof(T) == 2 ? (kHeadDim >= 128) : (kHeadDim == 192)) &&
+            //     !Causal && !Local && !Jagged;
+            // static constexpr bool Enable_cluster = false;
+            // CLUSTER_SWITCH(
+            //     cutlass::ceil_div(params.max_q_len, kBlockM) % 2 == 0,
+            //     Use_cluster,
+            //     [&] {
+            // static constexpr int ClusterM =
+            //     Enable_cluster && Use_cluster ? 2 : 1;
+            run_flash_fwd<
+                Arch,
+                kHeadDim,
+                1, // ClusterM,
+                T,
+                T_out,
+                Causal,
+                Local,
+                Contexual_mask,
+                Jagged,
+                Has_targets,
+                V_colmajor,
+                Cross,
+                Softmax,
+                Training>(params, stream);
+          });
+        });
+      });
+    });
+  });
+}
+
+template <int Arch, typename T, int kHeadDim, bool Softmax>
+void run_mha_fwd_(hstu::Flash_fwd_params& params, cudaStream_t stream) {
+  static_assert(
+      sizeof(T) == 2 || sizeof(T) == 1, "Only 16bit and 8bit are supported");
+  static constexpr bool Is_FP8 = cute::is_same_v<T, cutlass::float_e4m3_t> ||
+      cute::is_same_v<T, cutlass::float_e5m2_t>;
+  using T_out = std::conditional_t<!Is_FP8, T, cutlass::bfloat16_t>;
+  CAUSAL_LOCAL_SWITCH(params.is_causal, params.is_local, Causal, Local, [&] {
+    // VCOLMAJOR_SWITCH(params.v_dim_stride != 1, V_colmajor_, [&] {
+    run_mha_fwd_dispatch<Arch, kHeadDim, Causal, Local, Softmax, T, T_out>(
+        params, stream);
+  });
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py
new file mode 100644
index 000000000..6c3a03188
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py
@@ -0,0 +1,236 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+
+# Copied from Driss Guessous's PR in PyTorch: https://github.com/pytorch/pytorch/pull/105602
+
+# This file is run to generate the kernel instantiations for the flash_attn kernels
+# They are written to several files in order to speed up compilation
+
+import argparse
+import itertools
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional, Union
+
+
+DTYPE_MAP = {
+    "fp16": "cutlass::half_t",
+    "bf16": "cutlass::bfloat16_t",
+    "e4m3": "cutlass::float_e4m3_t",
+}
+
+DTYPE_MAP_FWD_SM8x = {
+    "fp16": "cutlass::half_t",
+    "bf16": "cutlass::bfloat16_t",
+}
+
+DTYPE_MAP_BWD = {
+    "fp16": "cutlass::half_t",
+    "bf16": "cutlass::bfloat16_t",
+}
+
+SM = [90]  # Sm kernels support up to
+SOFTMAX = ["true", "false"]
+HEAD_DIMENSIONS = [64, 96, 128, 192, 256]
+
+KERNEL_IMPL_TEMPLATE_FWD_SM90 = """
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {{
+#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
+template void run_mha_fwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_fwd_params &params, cudaStream_t stream);
+#endif
+}} // namespace hstu
+"""
+
+KERNEL_IMPL_TEMPLATE_FWD_SM8x = """
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {{
+#ifndef FLASHATTENTION_DISABLE_SM8x
+#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
+template void run_mha_fwd_<80, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_fwd_params &params, cudaStream_t stream);
+#endif
+#endif
+}} // namespace hstu
+"""
+
+KERNEL_IMPL_TEMPLATE_BWD_SM90 = """
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {{
+#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
+template void run_mha_bwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_bwd_params &params, cudaStream_t stream);
+#endif
+}} // namespace hstu
+"""
+
+KERNEL_IMPL_TEMPLATE_BWD_SM8x = """
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {{
+#ifndef FLASHATTENTION_DISABLE_SM8x
+#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
+template void run_mha_bwd_<80, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_bwd_params &params, cudaStream_t stream);
+#endif
+#endif
+}} // namespace hstu
+"""
+
+
+@dataclass
+class Kernel:
+    sm: int
+    dtype: str
+    head_dim: int
+    softmax: str
+    direction: str
+
+    @property
+    def template(self) -> str:
+        if self.direction == "fwd":
+            if self.sm == 90:
+                return KERNEL_IMPL_TEMPLATE_FWD_SM90.format(
+                    ARCH=str(self.sm),
+                    DTYPE=DTYPE_MAP[self.dtype],
+                    HEAD_DIM=self.head_dim,
+                    SOFTMAX=self.softmax,
+                )
+            else:
+                # Always enable PackGQA for Sm8x to reduce compilation
+                return KERNEL_IMPL_TEMPLATE_FWD_SM8x.format(
+                    DTYPE=DTYPE_MAP[self.dtype],
+                    HEAD_DIM=self.head_dim,
+                    SOFTMAX=self.softmax,
+                )
+        else:
+            assert self.direction == "bwd"
+            if self.sm == 90:
+                return KERNEL_IMPL_TEMPLATE_BWD_SM90.format(
+                    ARCH=str(self.sm),
+                    DTYPE=DTYPE_MAP[self.dtype],
+                    HEAD_DIM=self.head_dim,
+                    SOFTMAX=self.softmax,
+                )
+            else:
+                return KERNEL_IMPL_TEMPLATE_BWD_SM8x.format(
+                    DTYPE=DTYPE_MAP[self.dtype],
+                    HEAD_DIM=self.head_dim,
+                    SOFTMAX=self.softmax,
+                )
+
+    @property
+    def filename(self) -> str:
+        return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_softmax{self.softmax}_sm{self.sm}.cu"
+
+
+def get_all_kernels() -> List[Kernel]:
+    kernels: List[Kernel] = []
+    for dtype, head_dim, sm, softmax in itertools.product(
+        DTYPE_MAP.keys(), HEAD_DIMENSIONS, SM, SOFTMAX
+    ):
+        # We always enable PackGQA for Sm8x or Split
+        # so we should just pass in packgqa=False to avoid the `_packgqa` in the filename.
+        if sm >= 90 or dtype in DTYPE_MAP_FWD_SM8x:
+            kernels.append(
+                Kernel(
+                    sm=sm,
+                    dtype=dtype,
+                    head_dim=head_dim,
+                    direction="fwd",
+                    softmax=softmax,
+                )
+            )
+    for dtype, head_dim, sm, softmax in itertools.product(
+        DTYPE_MAP_BWD.keys(), HEAD_DIMENSIONS, SM, SOFTMAX
+    ):
+        kernels.append(
+            Kernel(
+                sm=sm,
+                dtype=dtype,
+                head_dim=head_dim,
+                direction="bwd",
+                softmax=softmax,
+            )
+        )
+    return kernels
+
+
+def write_kernel(kernel: Union[Kernel], autogen_dir: Path) -> None:
+    prelude = """
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ \n
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
+// Splitting the different template instantiations to different files to speed up compilation.
+// This file is auto-generated. See "generate_kernels.py"\n
+"""
+    (autogen_dir / kernel.filename).write_text(prelude + kernel.template)
+
+
+def main(output_dir_name: Optional[str]) -> None:
+    output_dir = (
+        Path(output_dir_name) if output_dir_name is not None else Path(__file__).parent
+    )
+    output_dir.mkdir(parents=True, exist_ok=True)
+    kernels_all = list(get_all_kernels())
+    for kernel in kernels_all:
+        write_kernel(kernel, output_dir)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        prog="generate_kernels",
+        description="Generate the flash_attention kernels template instantiations",
+    )
+    # Set an optional output directory
+    parser.add_argument(
+        "-o",
+        "--output_dir",
+        default="instantiations",
+        required=False,
+        help="Where to generate the kernels  will default to the current directory ",
+    )
+    args = parser.parse_args()
+    main(args.output_dir)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..da0eeb2df
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 128, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..8d85c2235
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 128, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..09226cd80
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_bwd_<90, cutlass::half_t, 128, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..63e451d14
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_bwd_<90, cutlass::half_t, 128, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..e379d9918
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 192, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..7faa31376
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 192, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..5ddc7d7fc
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_bwd_<90, cutlass::half_t, 192, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..530deae2b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_bwd_<90, cutlass::half_t, 192, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..185907c5e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 256, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..39df173bb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 256, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..cdc0a9f7e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_bwd_<90, cutlass::half_t, 256, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..6f3182d34
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_bwd_<90, cutlass::half_t, 256, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..89285d0be
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 64, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..ab39c7e06
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 64, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..8d62b8827
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_bwd_<90, cutlass::half_t, 64, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..5192d945f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_bwd_<90, cutlass::half_t, 64, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..cbeeac64a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 96, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..b654969e4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_bwd_<90, cutlass::bfloat16_t, 96, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..ea81f7ee4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_bwd_<90, cutlass::half_t, 96, false>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..7439f322e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_bwd_launch_template.h"
+#else
+#include "flash_bwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_bwd_<90, cutlass::half_t, 96, true>(
+    Flash_bwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..a39bcd505
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 128, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..464a0f443
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 128, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..3075657bb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 128, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..1ab6e4394
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 128, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..be5a6cb0d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::half_t, 128, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..7c303e7ef
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM128
+template void run_mha_fwd_<90, cutlass::half_t, 128, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..6e8d906d5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 192, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..80367708f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 192, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..67ade004b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 192, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..9f40d2726
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 192, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..1779657c0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::half_t, 192, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..0037dbc17
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM192
+template void run_mha_fwd_<90, cutlass::half_t, 192, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..93440571c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 256, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..c0634db8f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 256, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..a0eb625f5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 256, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..8b7216302
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 256, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..fe89b532f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::half_t, 256, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..c0857f941
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM256
+template void run_mha_fwd_<90, cutlass::half_t, 256, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..841e9359e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 64, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..3da54d69f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 64, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..4761ca635
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 64, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..33e66d0a7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 64, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..fab2951ee
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::half_t, 64, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..2ef1f29c9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM64
+template void run_mha_fwd_<90, cutlass::half_t, 64, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..bc52514e9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 96, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..11ea3bb20
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::bfloat16_t, 96, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..9e0b05a31
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 96, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..7fa79aa76
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::float_e4m3_t, 96, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu
new file mode 100644
index 000000000..83a25a649
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::half_t, 96, false>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu
new file mode 100644
index 000000000..e0526dec8
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
+// different files to speed up compilation. This file is auto-generated. See
+// "generate_kernels.py"
+
+#ifdef OSS_ENV
+#include "hstu_attention/flash_fwd_launch_template.h"
+#else
+#include "flash_fwd_launch_template.h"
+#endif
+
+namespace hstu {
+#ifndef FLASHATTENTION_DISABLE_HDIM96
+template void run_mha_fwd_<90, cutlass::half_t, 96, true>(
+    Flash_fwd_params& params,
+    cudaStream_t stream);
+#endif
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h
new file mode 100644
index 000000000..e702faf0b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h
@@ -0,0 +1,3166 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/array.h>
+#include <cutlass/barrier.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include "cutlass/pipeline/pipeline.hpp"
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+
+#include "copy_sm90_bulk_reduce.h"
+#include "mask.h"
+#include "named_barrier.h"
+#include "seqlen.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    int Stages,
+    int Stages_dO,
+    int Stages_dS,
+    class ClusterShape_,
+    class TileShape_MNK_,
+    class Element_,
+    class ElementAccum_,
+    class ArchTag_,
+    bool Causal,
+    bool Local,
+    bool Contexual_mask,
+    bool Jagged,
+    bool Has_targets,
+    bool Deterministic,
+    bool SdP_swapAB_,
+    bool dKV_swapAB_,
+    bool dQ_swapAB_,
+    int NumMmaWarpGroups = 2,
+    int AtomLayoutMSdP = 1,
+    int AtomLayoutNdKV = 2,
+    int AtomLayoutMdQ = 1,
+    bool Mma_dP_is_RS = false,
+    bool Cross = false,
+    bool Softmax = false>
+struct CollectiveMainloopBwdSm90 {
+  static constexpr int kStages = Stages;
+  static constexpr int kStages_dO = Stages_dO;
+  static constexpr int kStages_dS = Stages_dS;
+  static_assert(kStages >= kStages_dO);
+  static_assert(Stages_dS == 1 || Stages_dS == kStages);
+  static_assert(
+      !Mma_dP_is_RS || SdP_swapAB_); // If Mma_dP_is_RS, we need SdP_SwapAB
+  using ClusterShape = ClusterShape_;
+  using TileShape_MNK = TileShape_MNK_;
+  using Element = Element_;
+  using ElementAccum = ElementAccum_;
+  using ArchTag = ArchTag_;
+  using SeqlenInfo_t = hstu::SeqlenInfoQKBwd<
+      Jagged,
+      Cross,
+      Has_targets,
+      CUTE_STATIC_V(get<0>(TileShape_MNK{}))>;
+
+  static constexpr bool SdP_swapAB = SdP_swapAB_;
+  static constexpr bool dKV_swapAB = dKV_swapAB_;
+  static constexpr bool dQ_swapAB = dQ_swapAB_;
+
+  static constexpr bool Q_dO_same_stages = kStages == kStages_dO;
+
+  static constexpr int kBlockM = get<0>(TileShape_MNK{});
+  static constexpr int kBlockN = get<1>(TileShape_MNK{});
+  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+  static_assert(get<0>(ClusterShape{}) == 1 && get<2>(ClusterShape{}) == 1);
+
+  static constexpr int NumMmaThreads =
+      NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup;
+  static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarp * 2;
+
+  static_assert(NumMmaWarpGroups % AtomLayoutMSdP == 0);
+  static_assert(NumMmaWarpGroups % AtomLayoutNdKV == 0);
+  static_assert(NumMmaWarpGroups % AtomLayoutMdQ == 0);
+  static constexpr bool Mma_dKV_is_RS = AtomLayoutMSdP == 1 &&
+      AtomLayoutNdKV == NumMmaWarpGroups && SdP_swapAB && !dKV_swapAB;
+  static constexpr bool Mma_dQ_is_RS = AtomLayoutMSdP == NumMmaWarpGroups &&
+      AtomLayoutMdQ == NumMmaWarpGroups && !SdP_swapAB &&
+      !dQ_swapAB; // If dQ_swapAB we can't use RS
+
+  static constexpr GMMA::Major PdS_Major = GMMA::Major::K;
+  // static constexpr GMMA::Major PdS_Major = GMMA::Major::MN;
+  static constexpr GMMA::Major PdSt_Major =
+      PdS_Major == GMMA::Major::K ? GMMA::Major::MN : GMMA::Major::K;
+
+  using TileShapeAtomSdP = std::conditional_t<
+      !SdP_swapAB,
+      Shape<
+          Int<kBlockM>,
+          Int<kBlockN / (NumMmaWarpGroups / AtomLayoutMSdP)>,
+          Int<kHeadDim>>,
+      Shape<Int<kBlockN>, Int<kBlockM / AtomLayoutMSdP>, Int<kHeadDim>>>;
+  using AtomLayoutSdP = std::conditional_t<
+      !SdP_swapAB,
+      Layout<Shape<
+          Int<AtomLayoutMSdP>,
+          Int<NumMmaWarpGroups / AtomLayoutMSdP>,
+          _1>>,
+      Layout<Shape<
+          Int<NumMmaWarpGroups / AtomLayoutMSdP>,
+          Int<AtomLayoutMSdP>,
+          _1>>>;
+  using TiledMmaSdP = decltype(cute::make_tiled_mma(
+      cute::GMMA::
+          ss_op_selector<Element, Element, ElementAccum, TileShapeAtomSdP>(),
+      AtomLayoutSdP{}));
+
+  using TiledMmadPRS = decltype(cute::make_tiled_mma(
+      cute::GMMA::
+          rs_op_selector<Element, Element, ElementAccum, TileShapeAtomSdP>(),
+      AtomLayoutSdP{}));
+
+  using TileShapeAtomdKV = std::conditional_t<
+      !dKV_swapAB,
+      Shape<
+          Int<kBlockN>,
+          Int<kHeadDim / (NumMmaWarpGroups / AtomLayoutNdKV)>,
+          Int<kBlockM>>,
+      Shape<Int<kHeadDim>, Int<kBlockN / AtomLayoutNdKV>, Int<kBlockM>>>;
+  using AtomLayoutdKV = std::conditional_t<
+      !dKV_swapAB,
+      Layout<Shape<
+          Int<AtomLayoutNdKV>,
+          Int<NumMmaWarpGroups / AtomLayoutNdKV>,
+          _1>>,
+      Layout<Shape<
+          Int<NumMmaWarpGroups / AtomLayoutNdKV>,
+          Int<AtomLayoutNdKV>,
+          _1>>>;
+  using TiledMmadKV = decltype(cute::make_tiled_mma(
+      std::conditional_t<
+          Mma_dKV_is_RS,
+          decltype(cute::GMMA::rs_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShapeAtomdKV,
+                   GMMA::Major::K,
+                   GMMA::Major::MN>()),
+          decltype(cute::GMMA::ss_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShapeAtomdKV,
+                   !dKV_swapAB ? PdSt_Major : GMMA::Major::MN,
+                   !dKV_swapAB ? GMMA::Major::MN : PdSt_Major>())>{},
+      AtomLayoutdKV{}));
+
+  static constexpr bool dQacc_use_TMA = kHeadDim < 256;
+  // For hdim256, we want to slice the dQ MMA (64 x 256 on 2 WGs) into two (64 x
+  // 128 on 2 WGs) so that we can do atomic add on one half before doing the
+  // other half of the MMA, to reduce register pressure.
+  static constexpr bool Slice_dQKV_Mma = kHeadDim == 256 && !dQacc_use_TMA &&
+      dQ_swapAB && AtomLayoutMdQ == 1 && NumMmaWarpGroups == 2;
+  static_assert(
+      !(Deterministic && Slice_dQKV_Mma),
+      "Deterministic mode not supported with Slice_dQKV_Mma");
+
+  static constexpr int TileShapeAtomdQ_BlockM = kBlockM / AtomLayoutMdQ;
+  static constexpr int TileShapeAtomdQ_HeadDim =
+      (Slice_dQKV_Mma ? kHeadDim / 2 : kHeadDim) /
+      (NumMmaWarpGroups / AtomLayoutMdQ);
+  static_assert(
+      !dQ_swapAB ? TileShapeAtomdQ_BlockM == 64 : TileShapeAtomdQ_HeadDim == 64,
+      "Tile_M must be 64.");
+  using TileShapeAtomdQ = std::conditional_t<
+      !dQ_swapAB,
+      Shape<
+          Int<TileShapeAtomdQ_BlockM>,
+          Int<TileShapeAtomdQ_HeadDim>,
+          Int<kBlockN>>,
+      Shape<
+          Int<TileShapeAtomdQ_HeadDim>,
+          Int<TileShapeAtomdQ_BlockM>,
+          Int<kBlockN>>>;
+  using AtomLayoutdQ = std::conditional_t<
+      !dQ_swapAB,
+      Layout<
+          Shape<Int<AtomLayoutMdQ>, Int<NumMmaWarpGroups / AtomLayoutMdQ>, _1>>,
+      Layout<Shape<
+          Int<NumMmaWarpGroups / AtomLayoutMdQ>,
+          Int<AtomLayoutMdQ>,
+          _1>>>;
+  using TiledMmadQ = decltype(cute::make_tiled_mma(
+      std::conditional_t<
+          Mma_dQ_is_RS,
+          decltype(cute::GMMA::rs_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShapeAtomdQ,
+                   GMMA::Major::K,
+                   GMMA::Major::MN>()),
+          decltype(cute::GMMA::ss_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShapeAtomdQ,
+                   !dQ_swapAB ? PdS_Major : GMMA::Major::MN,
+                   !dQ_swapAB ? GMMA::Major::MN : PdS_Major>())>{},
+      AtomLayoutdQ{}));
+
+  // We need to accommodate both Q and Q^T (and dO and dO^T) in shared memory.
+  // Q & dO are used in the SdP Mma and Q^T and dO^T are used in the dKV Mma.
+  // Since this is GMMA::Major::K, the M dimension (kBlockM) doesn't matter for
+  // the layout, only the K dimension changes the layout.
+  using SmemLayoutAtomQdO =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               Int<kBlockM>,
+               Int<kHeadDim /
+                   (NumMmaWarpGroups / AtomLayoutNdKV)>>()); // for dKV_Mma
+  using SmemLayoutQ = decltype(tile_to_shape(
+      SmemLayoutAtomQdO{},
+      make_shape(
+          shape<0>(TileShape_MNK{}),
+          shape<2>(TileShape_MNK{}),
+          Int<kStages>{})));
+  using SmemLayoutdO = decltype(tile_to_shape(
+      SmemLayoutAtomQdO{},
+      make_shape(
+          shape<0>(TileShape_MNK{}),
+          shape<2>(TileShape_MNK{}),
+          Int<kStages_dO>{})));
+
+  using SmemLayoutAtomK =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               Int<kBlockN>,
+               Int<kHeadDim / (NumMmaWarpGroups / AtomLayoutMdQ)>>());
+  using SmemLayoutK =
+      decltype(tile_to_shape(SmemLayoutAtomK{}, select<1, 2>(TileShape_MNK{})));
+
+  using SmemLayoutAtomV =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<1>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutV =
+      decltype(tile_to_shape(SmemLayoutAtomV{}, select<1, 2>(TileShape_MNK{})));
+
+  using SmemLayoutAtomPdS =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               PdS_Major,
+               Element,
+               Int<kBlockM / AtomLayoutMSdP>,
+               Int<kBlockN / (NumMmaWarpGroups / AtomLayoutMSdP)>>());
+  using SmemLayoutPdS = decltype(tile_to_shape(
+      SmemLayoutAtomPdS{},
+      make_shape(Int<kBlockM>{}, Int<kBlockN>{}, Int<kStages_dS>{}),
+      std::conditional_t<
+          PdS_Major == GMMA::Major::K,
+          cute::Step<_1, _2, _3>,
+          cute::Step<_2, _1, _3>>{}));
+  // Need stride to be multiple of 32, otherwise we get error (misaligned
+  // address) when doing TMA if e.g. kBlockM=80 We set stride to be multiple of
+  // 64 so that if ShuffleLSE, even if threads read from sLSE but out of bounds,
+  // it's still a valid smem address.
+  using SmemLayoutLSE = cute::Layout<
+      cute::Shape<Int<kBlockM>, Int<kStages>>,
+      cute::Stride<_1, Int<cute::round_up(kBlockM, 64)>>>;
+  using SmemLayoutLSEMma = std::conditional_t<
+      SdP_swapAB,
+      cute::Layout<
+          cute::Shape<Int<kBlockN>, Int<kBlockM>, Int<kStages>>,
+          cute::Stride<_0, _1, Int<cute::round_up(kBlockM, 64)>>>,
+      cute::Layout<
+          cute::Shape<Int<kBlockM>, Int<kBlockN>, Int<kStages>>,
+          cute::Stride<_1, _0, Int<cute::round_up(kBlockM, 64)>>>>;
+
+  // Note this is the transpose in terms of the view, not in terms of memory.
+  using SmemLayoutQt = decltype(cute::composition(
+      SmemLayoutQ{},
+      make_layout(
+          make_shape(
+              get<2>(TileShape_MNK{}),
+              get<0>(TileShape_MNK{}),
+              Int<kStages>{}),
+          make_stride(Int<kBlockM>{}, _1{}, Int<kBlockM * kHeadDim>{}))));
+  using SmemLayoutdOt = decltype(cute::composition(
+      SmemLayoutdO{},
+      make_layout(
+          make_shape(
+              get<2>(TileShape_MNK{}),
+              get<0>(TileShape_MNK{}),
+              Int<kStages_dO>{}),
+          make_stride(Int<kBlockM>{}, _1{}, Int<kBlockM * kHeadDim>{}))));
+  using SmemLayoutKt = decltype(cute::composition(
+      SmemLayoutK{},
+      make_layout(
+          make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})),
+          make_stride(Int<kBlockN>{}, _1{}))));
+  using SmemLayoutPdSt = decltype(cute::composition(
+      SmemLayoutPdS{},
+      make_layout(
+          make_shape(Int<kBlockN>{}, Int<kBlockM>{}, Int<kStages_dS>{}),
+          make_stride(Int<kBlockM>{}, _1{}, Int<kBlockM * kBlockN>{}))));
+
+  // Thread layout, 256 or 384 threads per row
+  // We split into NumMmaWarpGroups so that we can do Bulk reduce add for each
+  // WG separately.
+  using R2SLayoutAtomdQaccum = Layout<
+      Shape<Int<cutlass::NumThreadsPerWarpGroup>, Int<NumMmaWarpGroups>>>;
+  using R2STiledCopydQaccum = decltype(make_tiled_copy(
+      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, ElementAccum>{},
+      R2SLayoutAtomdQaccum{},
+      Layout<Shape<_4>>{})); // Val layout, 4 vals per store
+  using SmemLayoutdQaccum = Layout<
+      Shape<Int<kBlockM * kHeadDim / NumMmaWarpGroups>, Int<NumMmaWarpGroups>>>;
+
+  static constexpr int kNumPdSStore = kBlockM * kBlockN / NumMmaThreads;
+  // If !SdP_swapAB, the accum registers hold P / dS, otherwise they hold Pt /
+  // dSt. If PdS_major is MN, then we need to "transpose" the write.
+  using SmemCopyAtomPdS = Copy_Atom<
+      std::conditional_t<
+          (!SdP_swapAB) ^ (PdS_Major == GMMA::Major::MN),
+          std::conditional_t<
+              kNumPdSStore % 8 == 0,
+              cute::SM90_U32x4_STSM_N,
+              cute::SM90_U32x2_STSM_N>,
+          std::conditional_t<
+              kNumPdSStore % 8 == 0,
+              cute::SM90_U16x8_STSM_T,
+              cute::SM90_U16x4_STSM_T>>,
+      Element>;
+
+  using GmemTiledCopyQdO =
+      decltype(cutlass::gemm::collective::detail::
+                   sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape{})));
+  using GmemTiledCopyKV = cute::SM90_TMA_LOAD;
+
+  using ShapeQKV =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen, d, head,
+                                                       // batch)
+  using StrideQKV = cute::Stride<int64_t, _1, int64_t, int64_t>;
+  using ShapeLSE =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen, head, batch)
+  using StrideLSE = cute::Stride<_1, int64_t, int64_t>; // (seqlen, head, batch)
+  using ShapedQaccum =
+      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen * d, head, batch)
+  using StridedQaccum = cute::Stride<_1, int64_t, int64_t>;
+
+  using TMA_QdO = decltype(make_tma_copy_A_sm90(
+      GmemTiledCopyQdO{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQKV{}),
+      take<0, 2>(SmemLayoutQ{}),
+      TileShape_MNK{},
+      ClusterShape{})); // mcast along N mode for this M load, if any
+
+  using TMA_K = decltype(make_tma_copy_B_sm90(
+      GmemTiledCopyKV{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQKV{}),
+      SmemLayoutK{},
+      TileShape_MNK{},
+      ClusterShape{})); // no mcast for KV
+
+  using TMA_V = decltype(make_tma_copy_B_sm90(
+      GmemTiledCopyKV{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQKV{}),
+      SmemLayoutV{},
+      TileShape_MNK{},
+      ClusterShape{})); // no mcast for KV
+
+  using MainloopPipeline = typename cutlass::PipelineTmaAsync<kStages>;
+  using PipelineState = typename MainloopPipeline::PipelineState;
+  using MainloopPipeline_dO = typename cutlass::PipelineTmaAsync<kStages_dO>;
+  using PipelineState_dO = typename MainloopPipeline_dO::PipelineState;
+
+  // Set the bytes transferred in this TMA transaction (may involve multiple
+  // issues)
+  static constexpr uint32_t TmaTransactionBytesQ = static_cast<uint32_t>(
+      size(take<0, 2>(SmemLayoutQ{})) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesK = static_cast<uint32_t>(
+      size(SmemLayoutK{}) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesV = static_cast<uint32_t>(
+      size(SmemLayoutV{}) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesLSE = static_cast<uint32_t>(
+      size(select<0>(SmemLayoutLSE{})) * cutlass::sizeof_bits_v<ElementAccum> /
+      8);
+
+  // These are tuned for speed. They don't affect correctness.
+  // We have separate iterations with causal masking. Not necessary for hdim 128
+  // but for hdim 64 this helps quite a bit to not have to do causal masking for
+  // most of the iterations. For hdim 192, separating masking iterations results
+  // in register spills.
+  static constexpr bool SeparateMaskingIterations = false;
+  // Do we keep the LSE and dPsum in each thread, or split them across 8 threads
+  // that share them and then shuffle to get the value whenever we need? This
+  // can reduce register pressure when SdP_swapAB, where each thread needs to
+  // keep statistics for (kBlockM / 4) rows. If !SdP_swapAB, each thread only
+  // needs to keep statistic for 2 rows.
+  static constexpr bool ShuffleLSE = SdP_swapAB && kHeadDim <= 64;
+  static constexpr bool ShuffledPsum = SdP_swapAB && kHeadDim <= 64;
+  static constexpr size_t SmemAlignmentP =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutPdS{});
+  static constexpr size_t SmemAlignmentdS =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutPdS{});
+  // Without this SmemAlignment, with hdim 256 we get "misaligned address" error
+  // in TMA
+  static constexpr size_t SmemAlignmentQKVdO = kHeadDim % 256 == 0 ? 256 : 128;
+  static constexpr size_t SmemAlignmentV = !Mma_dP_is_RS
+      ? SmemAlignmentQKVdO
+      : cutlass::detail::alignment_for_swizzle(SmemLayoutV{});
+  static_assert(
+      SmemAlignmentP >= 128 && SmemAlignmentdS >= 128,
+      "Require at least 128B alignment");
+
+  // TODO: do we have to worry that smem_dk and smem_dv in the epilogue don't
+  // line up w smem_k and smem_v due to alignment?
+  using SmemdQacc_t = std::conditional_t<
+      !dQacc_use_TMA,
+      cute::array<ElementAccum, 0>,
+      cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutdQaccum>>>;
+  using SmemP_t = std::conditional_t<
+      Mma_dKV_is_RS,
+      cute::array<Element, 0>,
+      cute::array_aligned<
+          Element,
+          cute::cosize_v<SmemLayoutPdS>,
+          SmemAlignmentP>>;
+  struct TensorStorage
+      : cute::aligned_struct<
+            cute::max(SmemAlignmentP, SmemAlignmentdS, SmemAlignmentQKVdO)> {
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentQKVdO>
+            smem_k;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutV>, SmemAlignmentV>
+        smem_v;
+    SmemdQacc_t smem_dqacc;
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQKVdO>
+            smem_q;
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutdO>, SmemAlignmentQKVdO>
+            smem_do;
+    cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutLSE>, 128>
+        smem_lse;
+    cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutLSE>, 128>
+        smem_dpsum;
+    SmemP_t smem_p;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutPdS>, SmemAlignmentdS>
+        smem_ds;
+  };
+
+  // Host side kernel arguments
+  struct Arguments {
+    Element const* const ptr_Q;
+    ShapeQKV const shape_Q;
+    StrideQKV const stride_Q;
+    Element const* const ptr_K;
+    ShapeQKV const shape_K;
+    StrideQKV const stride_K;
+    Element const* const ptr_V;
+    ShapeQKV const shape_V;
+    StrideQKV const stride_V;
+    Element const* const ptr_dO;
+    ShapeQKV const shape_dO;
+    StrideQKV const stride_dO;
+    ElementAccum* const ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum const stride_dQaccum;
+    float const* const ptr_LSE_log2;
+    ShapeLSE const shape_LSE;
+    StrideLSE const stride_LSE_log2;
+    float const* const ptr_dPsum;
+    StrideLSE const stride_dPsum;
+    int const max_attn_len;
+    int const min_full_attn_seq_len;
+    int const contextual_seq_len;
+    float const max_seq_len_inv;
+    float const alpha;
+    int const num_batch;
+    int const num_softmax_heads;
+    int const num_groups;
+    int const batch_size_per_group;
+    int* const dq_semaphore;
+    int const* const seq_offsets = nullptr;
+    int const* const seq_offsets_q = nullptr;
+    int const* const num_targets = nullptr;
+    int const* const max_seq_len_tensor = nullptr;
+    int const* const contextual_seq_len_tensor = nullptr;
+    int const* const max_attn_len_tensor = nullptr;
+    int const* const min_full_attn_seq_len_tensor = nullptr;
+    float const* const attn_scale = nullptr;
+    bool const scalar_scale = true;
+  };
+
+  // Device side kernel params
+  struct Params {
+    ShapeQKV const shape_Q;
+    ShapeQKV const shape_K;
+    ShapeQKV const shape_V;
+    ShapeQKV const shape_dO;
+    ElementAccum* const ptr_dQaccum;
+    ShapedQaccum const shape_dQaccum;
+    StridedQaccum stride_dQaccum;
+    TMA_QdO tma_load_Q, tma_load_dO;
+    TMA_K tma_load_K;
+    TMA_V tma_load_V;
+    float const* const ptr_LSE_log2;
+    ShapeLSE const shape_LSE;
+    StrideLSE const stride_LSE_log2;
+    float const* const ptr_dPsum;
+    StrideLSE const stride_dPsum;
+    int const max_attn_len;
+    int const min_full_attn_seq_len;
+    int const contextual_seq_len;
+    float const max_seq_len_inv;
+    float const alpha;
+    float const alpha_log2;
+    int const num_batch;
+    int const num_softmax_heads;
+    int const num_groups;
+    int const batch_size_per_group;
+    int* const dq_semaphore;
+    int const* const seq_offsets = nullptr;
+    int const* const seq_offsets_q = nullptr;
+    int const* const num_targets;
+    int const* const max_seq_len_tensor = nullptr;
+    int const* const contextual_seq_len_tensor = nullptr;
+    int const* const max_attn_len_tensor = nullptr;
+    int const* const min_full_attn_seq_len_tensor = nullptr;
+    float const* const attn_scale;
+    bool const scalar_scale = true;
+  };
+
+  static Params to_underlying_arguments(Arguments const& args) {
+    Tensor mQ =
+        make_tensor(make_gmem_ptr(args.ptr_Q), args.shape_Q, args.stride_Q);
+    TMA_QdO tma_load_Q = make_tma_copy_A_sm90(
+        GmemTiledCopyQdO{},
+        mQ,
+        SmemLayoutQ{}(_, _, _0{}),
+        TileShape_MNK{},
+        ClusterShape{}); // mcast along N mode for this M load, if any
+    Tensor mdO =
+        make_tensor(make_gmem_ptr(args.ptr_dO), args.shape_Q, args.stride_dO);
+    TMA_QdO tma_load_dO = make_tma_copy_A_sm90(
+        GmemTiledCopyQdO{},
+        mdO,
+        SmemLayoutdO{}(_, _, _0{}),
+        TileShape_MNK{},
+        ClusterShape{}); // mcast along N mode for this M load, if any
+    Tensor mK =
+        make_tensor(make_gmem_ptr(args.ptr_K), args.shape_K, args.stride_K);
+    TMA_K tma_load_K = make_tma_copy_B_sm90(
+        GmemTiledCopyKV{},
+        mK,
+        SmemLayoutK{},
+        TileShape_MNK{},
+        ClusterShape{}); // no mcast for KV
+    Tensor mV =
+        make_tensor(make_gmem_ptr(args.ptr_V), args.shape_K, args.stride_V);
+    TMA_V tma_load_V = make_tma_copy_B_sm90(
+        GmemTiledCopyKV{},
+        mV,
+        SmemLayoutV{},
+        TileShape_MNK{},
+        ClusterShape{}); // no mcast for KV
+    if constexpr (Deterministic) {
+      assert(args.dq_semaphore != nullptr);
+    }
+    return {
+        args.shape_Q,
+        args.shape_K,
+        args.shape_V,
+        args.shape_dO,
+        args.ptr_dQaccum,
+        args.shape_dQaccum,
+        args.stride_dQaccum,
+        tma_load_Q,
+        tma_load_dO,
+        tma_load_K,
+        tma_load_V,
+        args.ptr_LSE_log2,
+        args.shape_LSE,
+        args.stride_LSE_log2,
+        args.ptr_dPsum,
+        args.stride_dPsum,
+        args.max_attn_len,
+        args.min_full_attn_seq_len,
+        args.contextual_seq_len,
+        args.max_seq_len_inv,
+        args.alpha,
+        float(args.alpha * M_LOG2E),
+        args.num_batch,
+        args.num_softmax_heads,
+        args.num_groups,
+        args.batch_size_per_group,
+        args.dq_semaphore,
+        args.seq_offsets,
+        args.seq_offsets_q,
+        args.num_targets,
+        args.max_seq_len_tensor,
+        args.contextual_seq_len_tensor,
+        args.max_attn_len_tensor,
+        args.min_full_attn_seq_len_tensor,
+        args.attn_scale,
+        args.scalar_scale};
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
+  /// performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& params) {
+    cute::prefetch_tma_descriptor(params.tma_load_Q.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_dO.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_K.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_V.get_tma_descriptor());
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_m_block_min_max(
+      int const max_attn_len,
+      int const contextual_seq_len,
+      int const uihlen,
+      int const seqlen,
+      int const n_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (Has_targets) {
+      int n_idx_min = n_block * kBlockN;
+      if (n_idx_min >= uihlen) {
+        int n_idx_max = (n_block + 1) * kBlockN;
+        return {
+            std::max(0, n_idx_min / kBlockM),
+            cute::ceil_div(std::min(n_idx_max, seqlen), kBlockM)};
+      }
+    }
+    // uih part
+    int m_block_max = cute::ceil_div(seqlen, kBlockM);
+    if constexpr (Local) {
+      int local_m_block_max =
+          cute::ceil_div((n_block + 1) * kBlockN + max_attn_len, kBlockM);
+      if constexpr (Contexual_mask) {
+        // row contexual without sink
+        if (n_block * kBlockN < contextual_seq_len) {
+          local_m_block_max = std::max(
+              local_m_block_max,
+              cute::ceil_div(contextual_seq_len + max_attn_len, kBlockM));
+        }
+      }
+      m_block_max = std::min(m_block_max, local_m_block_max);
+    }
+    int m_block_min = 0;
+    if constexpr (Causal || Local) {
+      m_block_min = std::max(m_block_min, (n_block * kBlockN) / kBlockM);
+    }
+    return {m_block_min, m_block_max};
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_full_m_block_min_max(
+      int const uihlen,
+      int const seqlen,
+      int const min_full_attn_seq_len,
+      int const m_block_max,
+      int const n_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (Cross) {
+      return {0, 0};
+    }
+    if constexpr (!Local) {
+      return {0, 0};
+    }
+    if constexpr (Has_targets) {
+      int n_idx_min = n_block * kBlockN;
+      if (n_idx_min >= uihlen) {
+        return {0, 0};
+      }
+    }
+    if constexpr (Local) {
+      int full_m_block_max = cute::ceil_div(seqlen, kBlockM);
+      int full_m_block_min =
+          std::max(m_block_max, (uihlen - min_full_attn_seq_len) / kBlockM);
+      return {full_m_block_min, full_m_block_max};
+    }
+    return {0, 0};
+  }
+
+  CUTLASS_DEVICE
+  int get_contexual_m_block_max(
+      int const uihlen,
+      int const contextual_seq_len,
+      int const m_block_min,
+      int const n_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (Cross) {
+      return 0;
+    }
+    if constexpr (!Contexual_mask) {
+      return 0;
+    }
+    if constexpr (Has_targets) {
+      int n_idx_min = n_block * kBlockN;
+      if (n_idx_min >= uihlen) {
+        return 0;
+      }
+    }
+    if constexpr (Causal || Local) {
+      int contexual_m_block_max =
+          std::min(m_block_min, cute::ceil_div(contextual_seq_len, kBlockM));
+      return contexual_m_block_max;
+    }
+    return 0;
+  }
+
+  CUTLASS_DEVICE
+  int get_next_m_block(
+      int const m_block,
+      int const m_block_min,
+      int const m_block_max,
+      int const contexual_m_block_max,
+      int const full_m_block_min,
+      int const full_m_block_max) {
+    int const out_m_block = m_block + 1;
+    if constexpr (Contexual_mask || Local) {
+      if (out_m_block == m_block_max) {
+        if (contexual_m_block_max > 0) {
+          return 0;
+        }
+        if (full_m_block_max > full_m_block_min) {
+          return full_m_block_min;
+        }
+        return -1;
+      }
+      if (out_m_block == contexual_m_block_max) {
+        if (full_m_block_max > full_m_block_min) {
+          return full_m_block_min;
+        }
+        return -1;
+      }
+      if (out_m_block == full_m_block_max) {
+        return -1;
+      }
+      return out_m_block;
+    }
+    if (out_m_block == m_block_max) {
+      return -1;
+    }
+    return out_m_block;
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_cross_m_block_min_max(
+      int const uihlen_q,
+      int const seqlen_q,
+      int const seqlen_kv,
+      int const n_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    int m_block_max = cute::ceil_div(seqlen_q, kBlockM);
+    if constexpr (!Causal) {
+      return {0, m_block_max};
+    }
+    int m_block_min =
+        std::max(0, (n_block * kBlockN + uihlen_q - seqlen_kv) / kBlockM);
+    return {m_block_min, m_block_max};
+  }
+
+  template <typename SchedulerPrefetch, typename SharedStorage>
+  CUTLASS_DEVICE void load(
+      Params const& params,
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_write,
+      PipelineState_dO& smem_pipe_write_do,
+      SharedStorage& shared_storage,
+      SchedulerPrefetch const& scheduler_prefetch,
+      cute::tuple<int32_t, int32_t, int32_t> block_coord) {
+    auto [n_block, bidh, bidb] = block_coord;
+    SeqlenInfo_t seqlen_info{
+        bidb,
+        get<0>(params.shape_Q),
+        get<0>(params.shape_K),
+        params.seq_offsets,
+        params.seq_offsets_q,
+        params.num_targets};
+    if constexpr (Jagged) {
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
+        scheduler_prefetch();
+        return;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int m_block_min, m_block_max;
+    if constexpr (Cross) {
+      auto m_block_min_max = get_cross_m_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    } else {
+      auto m_block_min_max = get_m_block_min_max(
+          max_attn_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    }
+    auto full_m_block_min_max = get_full_m_block_min_max(
+        seqlen_info.uihlen_q,
+        seqlen_info.seqlen_q,
+        min_full_attn_seq_len_,
+        m_block_max,
+        n_block);
+    int const full_m_block_min = get<0>(full_m_block_min_max);
+    int const full_m_block_max = get<1>(full_m_block_min_max);
+    int contexual_m_block_max = get_contexual_m_block_max(
+        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sdO = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdO{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutV{});
+    Tensor sLSE = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_lse.data()),
+        SmemLayoutLSE{});
+    Tensor sdPsum = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dpsum.data()),
+        SmemLayoutLSE{});
+
+    // Prepare the TMA loads
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+    uint2 cluster_local_block_id = {
+        block_rank_in_cluster % cluster_shape_x,
+        block_rank_in_cluster / cluster_shape_x};
+    Tensor mQ = params.tma_load_Q.get_tma_tensor(params.shape_Q)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mdO = params.tma_load_dO.get_tma_tensor(params.shape_Q)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mK = params.tma_load_K.get_tma_tensor(params.shape_K)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mV = params.tma_load_V.get_tma_tensor(params.shape_K)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mLSE = make_tensor(
+        make_gmem_ptr(params.ptr_LSE_log2),
+        params.shape_LSE,
+        params.stride_LSE_log2)(_, bidh, !Jagged ? bidb : 0);
+    Tensor mdPsum = make_tensor(
+        make_gmem_ptr(params.ptr_dPsum), params.shape_LSE, params.stride_dPsum)(
+        _, bidh, !Jagged ? bidb : 0);
+
+    Tensor gQ = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q, _0{}), mQ),
+        select<0, 2>(TileShape_MNK{}),
+        make_coord(_, _0{})); // (M, K, _)
+    Tensor gdO = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q, _0{}), mdO),
+        select<0, 2>(TileShape_MNK{}),
+        make_coord(_, _0{})); // (M, K, _)
+    Tensor gK = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_k, _0{}), mK),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(n_block, _0{})); // (N, K)
+    Tensor gV = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_k, _0{}), mV),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(n_block, _0{})); // (N, K)
+    Tensor gLSE = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q_padded), mLSE),
+        select<0>(TileShape_MNK{}),
+        make_coord(_)); // (M, _)
+    Tensor gdPsum = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q_padded), mdPsum),
+        select<0>(TileShape_MNK{}),
+        make_coord(_)); // (M, _)
+
+    Tensor sK_x =
+        make_tensor(sK.data(), make_layout(sK.layout(), Layout<_1>{}));
+    Tensor gK_x =
+        make_tensor(gK.data(), make_layout(gK.layout(), Layout<_1>{}));
+    Tensor sV_x =
+        make_tensor(sV.data(), make_layout(sV.layout(), Layout<_1>{}));
+    Tensor gV_x =
+        make_tensor(gV.data(), make_layout(gV.layout(), Layout<_1>{}));
+    // auto [tQgQ, tQsQ] = tma_partition(params.tma_load_Q,
+    // block_rank_in_cluster, Layout<ClusterShape>{},
+    //                                   group_modes<0, 2>(sQ), group_modes<0,
+    //                                   2>(gQ));  // (TMA, k), (TMA, PIPE)
+    // auto [tdOgdO, tdOsdO] = tma_partition(params.tma_load_dO,
+    // block_rank_in_cluster, Layout<ClusterShape>{},
+    //                                   group_modes<0, 2>(sdO), group_modes<0,
+    //                                   2>(gdO));  // (TMA, k), (TMA, PIPE)
+    auto block_tma_Q = params.tma_load_Q.get_slice(cluster_local_block_id.y);
+    auto block_tma_dO = params.tma_load_dO.get_slice(cluster_local_block_id.y);
+    Tensor tQgQ = group_modes<0, 3>(block_tma_Q.partition_S(gQ));
+    Tensor tQsQ = group_modes<0, 3>(block_tma_Q.partition_D(sQ));
+    Tensor tdOgdO = group_modes<0, 3>(block_tma_dO.partition_S(gdO));
+    Tensor tdOsdO = group_modes<0, 3>(block_tma_dO.partition_D(sdO));
+    auto [tKgK, tKsK] = tma_partition(
+        params.tma_load_K,
+        _0{},
+        Layout<_1>{},
+        group_modes<0, 2>(sK_x),
+        group_modes<0, 2>(gK_x)); // (TMA), (TMA)
+    auto [tVgV, tVsV] = tma_partition(
+        params.tma_load_V,
+        _0{},
+        Layout<_1>{},
+        group_modes<0, 2>(sV_x),
+        group_modes<0, 2>(gV_x)); // (TMA), (TMA)
+    auto bulk_copy = Copy_Traits<SM90_BULK_COPY_AUTO>{};
+
+    uint16_t mcast_mask_qdo = 0;
+    if constexpr (cute::is_same_v<GmemTiledCopyQdO, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<ClusterShape>{}; // (m,n) -> block_id
+      for (int n = 0; n < size<1>(block_layout); ++n) {
+        mcast_mask_qdo |=
+            (uint16_t(1) << block_layout(cluster_local_block_id.x, n, _0{}));
+      }
+    }
+
+    int m_block = m_block_min;
+    int next_m_block = -1;
+    int lane_predicate = cute::elect_one_sync();
+
+    if (lane_predicate) {
+      pipeline_q.producer_acquire(smem_pipe_write);
+      copy(
+          params.tma_load_Q.with(
+              *pipeline_q.producer_get_barrier(smem_pipe_write),
+              mcast_mask_qdo,
+              TMA::CacheHintSm90::EVICT_LAST),
+          tQgQ(_, m_block),
+          tQsQ(_, smem_pipe_write.index()));
+      if constexpr (Softmax) {
+        copy(
+            bulk_copy.with(*pipeline_q.producer_get_barrier(smem_pipe_write)),
+            gLSE(_, m_block),
+            sLSE(_, smem_pipe_write.index()));
+      }
+    }
+
+    // // Wait for the MMA warpgroups to say that smem_k and smem_v are ready
+    // cutlass::arch::NamedBarrier::sync(NumMmaThreads +
+    // cutlass::NumThreadsPerWarp,
+    // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/);
+
+    auto load_step = [&](int m_block) {
+      // If Q and dO have the same number of stages, we can use the same
+      // pipeline state variable to reduce registers
+      PipelineState_dO smem_pipe_write_do_cur =
+          cute::conditional_return<Q_dO_same_stages>(
+              smem_pipe_write, smem_pipe_write_do);
+      pipeline_do.producer_acquire(smem_pipe_write_do_cur);
+      copy(
+          params.tma_load_dO.with(
+              *pipeline_do.producer_get_barrier(smem_pipe_write_do_cur),
+              mcast_mask_qdo,
+              TMA::CacheHintSm90::EVICT_LAST),
+          tdOgdO(_, m_block),
+          tdOsdO(_, smem_pipe_write_do_cur.index()));
+      if constexpr (Softmax) {
+        copy(
+            bulk_copy.with(
+                *pipeline_do.producer_get_barrier(smem_pipe_write_do_cur)),
+            gdPsum(_, m_block),
+            sdPsum(_, smem_pipe_write_do_cur.index()));
+      }
+      if constexpr (!Q_dO_same_stages) {
+        ++smem_pipe_write_do;
+      }
+      ++smem_pipe_write;
+      next_m_block = get_next_m_block(
+          m_block,
+          m_block_min,
+          m_block_max,
+          contexual_m_block_max,
+          full_m_block_min,
+          full_m_block_max);
+      if (next_m_block != -1) {
+        pipeline_q.producer_acquire(smem_pipe_write);
+        copy(
+            params.tma_load_Q.with(
+                *pipeline_q.producer_get_barrier(smem_pipe_write),
+                mcast_mask_qdo,
+                TMA::CacheHintSm90::EVICT_LAST),
+            tQgQ(_, next_m_block),
+            tQsQ(_, smem_pipe_write.index()));
+        if constexpr (Softmax) {
+          copy(
+              bulk_copy.with(*pipeline_q.producer_get_barrier(smem_pipe_write)),
+              gLSE(_, next_m_block),
+              sLSE(_, smem_pipe_write.index()));
+        }
+      }
+    };
+
+    if (lane_predicate) {
+      // Copy K tile and V tile from GMEM to SMEM.
+      shared_storage.pipelines.barrier_KV.arrive_and_expect_tx(
+          TmaTransactionBytesK + TmaTransactionBytesV);
+      copy(
+          params.tma_load_K.with(
+              reinterpret_cast<
+                  cutlass::arch::ClusterTransactionBarrier::ValueType&>(
+                  shared_storage.pipelines.barrier_KV),
+              0 /*mcast_mask*/),
+          tKgK,
+          tKsK);
+      copy(
+          params.tma_load_V.with(
+              reinterpret_cast<
+                  cutlass::arch::ClusterTransactionBarrier::ValueType&>(
+                  shared_storage.pipelines.barrier_KV),
+              0 /*mcast_mask*/),
+          tVgV,
+          tVsV);
+
+#pragma unroll(kHeadDim < 256 ? 2 : 1)
+      for (; m_block < m_block_max; ++m_block) {
+        load_step(m_block);
+      }
+    }
+    scheduler_prefetch();
+    m_block = next_m_block;
+    if constexpr (Contexual_mask) {
+      if (lane_predicate) {
+        if (m_block >= 0) {
+#pragma unroll(kHeadDim < 256 ? 2 : 1)
+          for (; m_block < contexual_m_block_max; ++m_block) {
+            load_step(m_block);
+          }
+        }
+      }
+    }
+    m_block = next_m_block;
+    if constexpr (Local) {
+      if (lane_predicate) {
+        if (m_block >= 0) {
+#pragma unroll(kHeadDim < 256 ? 2 : 1)
+          for (; m_block < full_m_block_max; ++m_block) {
+            load_step(m_block);
+          }
+        }
+      }
+    }
+    if constexpr (Q_dO_same_stages) {
+      smem_pipe_write_do = smem_pipe_write;
+    }
+  }
+
+  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void load_tail(
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_write) {
+    static_assert(
+        Q_dO_same_stages, "Q and dO must have the same number of stages");
+    // Need to copy since pipeline_q.producer_tail(smem_pipe_write) will
+    // increment smem_pipe_write
+    PipelineState smem_pipe_write_do = smem_pipe_write;
+    // Issue the epilogue waits
+    if (cute::elect_one_sync()) {
+      /* This helps avoid early exit of blocks in Cluster
+       * Waits for all stages to either be released (all Consumer UNLOCKs), or
+       * if the stage was never used then would just be acquired since the phase
+       * was still inverted from make_producer_start_state
+       */
+      pipeline_q.producer_tail(smem_pipe_write);
+      pipeline_do.producer_tail(smem_pipe_write_do);
+    }
+  }
+
+  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
+  CUTLASS_DEVICE void load_tail(
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_write,
+      PipelineState_dO& smem_pipe_write_do) {
+    // Issue the epilogue waits
+    if (cute::elect_one_sync()) {
+      /* This helps avoid early exit of blocks in Cluster
+       * Waits for all stages to either be released (all Consumer UNLOCKs), or
+       * if the stage was never used then would just be acquired since the phase
+       * was still inverted from make_producer_start_state
+       */
+      pipeline_q.producer_tail(smem_pipe_write);
+      pipeline_do.producer_tail(smem_pipe_write_do);
+    }
+  }
+
+  template <typename SharedStorage>
+  CUTLASS_DEVICE void store_dq(
+      Params const& params,
+      SharedStorage& shared_storage,
+      cute::tuple<int32_t, int32_t, int32_t> block_coord) {
+    if constexpr (!dQacc_use_TMA) {
+      return;
+    }
+
+    auto [n_block, bidh, bidb] = block_coord;
+    SeqlenInfo_t seqlen_info{
+        bidb,
+        get<0>(params.shape_Q),
+        get<0>(params.shape_K),
+        params.seq_offsets,
+        params.seq_offsets_q,
+        params.num_targets};
+    if constexpr (Jagged) {
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
+        return;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int m_block_min, m_block_max;
+    if constexpr (Cross) {
+      auto m_block_min_max = get_cross_m_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    } else {
+      auto m_block_min_max = get_m_block_min_max(
+          max_attn_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    }
+    auto full_m_block_min_max = get_full_m_block_min_max(
+        seqlen_info.uihlen_q,
+        seqlen_info.seqlen_q,
+        min_full_attn_seq_len_,
+        m_block_max,
+        n_block);
+    int const full_m_block_min = get<0>(full_m_block_min_max);
+    int const full_m_block_max = get<1>(full_m_block_min_max);
+    int contexual_m_block_max = get_contexual_m_block_max(
+        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
+
+    Tensor sdQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dqacc.data()),
+        SmemLayoutdQaccum{});
+    static constexpr int dQ_TMA_num_bytes =
+        CUTE_STATIC_V(size<0>(sdQ)) * sizeof(ElementAccum);
+
+    Tensor mdQaccum = make_tensor(
+        make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.ptr_dQaccum)),
+        params.shape_dQaccum,
+        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+    Tensor gdQaccum_ = local_tile(
+        domain_offset(
+            make_coord(seqlen_info.offset_q_padded * kHeadDim), mdQaccum),
+        Shape<Int<kBlockM * kHeadDim>>{},
+        make_coord(_)); // (M * K, _)
+    Tensor gdQaccum = cute::flat_divide(
+        gdQaccum_,
+        Int<kBlockM * kHeadDim / NumMmaWarpGroups>{}); // (M * K / WG, WG, _)
+
+    int const num_batch = params.num_batch;
+    int const num_head = get<2>(params.shape_Q);
+    int* lock_ptr =
+        !Deterministic ? nullptr : params.dq_semaphore + bidb * num_head + bidh;
+    using Barrier = cutlass::GenericBarrier<cutlass::detail::SyncwarpSync>;
+    bool const lane_predicate = cute::elect_one_sync();
+
+    auto store_dq_step = [&](int m_block) {
+      if constexpr (Deterministic) {
+        Barrier::wait_eq(
+            lock_ptr,
+            threadIdx.x % cutlass::NumThreadsPerWarp,
+            m_block * num_batch * num_head,
+            n_block);
+      }
+#pragma unroll
+      for (int warpgroup_idx = 0; warpgroup_idx < NumMmaWarpGroups;
+           ++warpgroup_idx) {
+        cutlass::arch::NamedBarrier::sync(
+            cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+            static_cast<uint32_t>(BwdNamedBarriers::dQFullWG1) +
+                warpgroup_idx /*id*/); // sdQ full, to be written to gmem
+        if (lane_predicate) {
+          SM90_BULK_REDUCE_ADD::copy(
+              raw_pointer_cast(sdQ(_, warpgroup_idx).data()),
+              raw_pointer_cast(gdQaccum(_, warpgroup_idx, m_block).data()),
+              dQ_TMA_num_bytes,
+              static_cast<uint64_t>(TMA::CacheHintSm90::EVICT_LAST));
+          tma_store_arrive();
+        }
+      }
+      // Note, the for_each() function is required here to ensure
+      // `warpgroup_idx` is of type Int<x>.
+      for_each(make_int_sequence<NumMmaWarpGroups>{}, [&](auto warpgroup_idx) {
+        if (lane_predicate) {
+          tma_store_wait<NumMmaWarpGroups - 1 - CUTE_STATIC_V(warpgroup_idx)>();
+        }
+        cutlass::arch::NamedBarrier::arrive(
+            cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+            static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) +
+                warpgroup_idx /*id*/); // sdQ empty, ready to be written to
+      });
+      if constexpr (Deterministic) {
+        Barrier::arrive_inc(
+            lock_ptr,
+            threadIdx.x % cutlass::NumThreadsPerWarp,
+            m_block * num_batch * num_head);
+      }
+    };
+
+#pragma unroll 2
+    for (int m_block = m_block_min; m_block < m_block_max; ++m_block) {
+      store_dq_step(m_block);
+    }
+    if constexpr (Contexual_mask) {
+#pragma unroll 2
+      for (int m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+        store_dq_step(m_block);
+      }
+    }
+    if constexpr (Local) {
+#pragma unroll 2
+      for (int m_block = full_m_block_min; m_block < full_m_block_max;
+           ++m_block) {
+        store_dq_step(m_block);
+      }
+    }
+    if constexpr (Local && Deterministic) {
+      constexpr int kBlockM = get<0>(TileShape_MNK{});
+      int const m_block_global_max =
+          cute::ceil_div(seqlen_info.seqlen_q, kBlockM);
+#pragma unroll 2
+      for (int m_block = m_block_max; m_block < m_block_global_max; ++m_block) {
+        Barrier::arrive_inc(
+            lock_ptr,
+            threadIdx.x % cutlass::NumThreadsPerWarp,
+            m_block * num_batch * num_head);
+      }
+    }
+  }
+
+  CUTLASS_DEVICE void mma_init() {
+    // We're not currently using this bc we're not using persistent scheduler
+    // // Tell producer (warp 0) that smem_k and smem_v are ready
+    // cutlass::arch::NamedBarrier::arrive(NumMmaThreads +
+    // cutlass::NumThreadsPerWarp,
+    // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/);
+    int warp_idx_in_warpgroup =
+        __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+    if constexpr (dQacc_use_TMA) {
+      if (warp_idx_in_warpgroup == 0) {
+        cutlass::arch::NamedBarrier::arrive(
+            cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+            static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) - 1 +
+                hstu::canonical_warp_group_idx_nosync() /*id*/); // sdQ empty,
+                                                                 // ready to be
+                                                                 // written to
+      }
+    }
+  }
+
+  template <typename SharedStorage, typename FrgTensordKV>
+  CUTLASS_DEVICE bool mma(
+      Params const& params,
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_read,
+      PipelineState_dO& smem_pipe_read_do,
+      FrgTensordKV& tdKrdK,
+      FrgTensordKV& tdVrdV,
+      int thread_idx,
+      int& work_idx,
+      cute::tuple<int32_t, int32_t, int32_t> block_coord,
+      SharedStorage& shared_storage) {
+    static_assert(
+        is_rmem<FrgTensordKV>::value,
+        "dK and dV tensor must be rmem resident.");
+
+    int n_block = get<0>(block_coord);
+    int bidb = get<2>(block_coord);
+    SeqlenInfo_t seqlen_info{
+        bidb,
+        get<0>(params.shape_Q),
+        get<0>(params.shape_K),
+        params.seq_offsets,
+        params.seq_offsets_q,
+        params.num_targets};
+    if constexpr (Jagged) {
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
+        return false;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    float scalar_scale_val_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    if (params.num_groups > 1) {
+      int group_id = bidb / params.batch_size_per_group;
+      int max_seq_len_per_group = params.max_seq_len_tensor[group_id];
+      // attention scale
+      scalar_scale_val_ = params.scalar_scale
+          ? (params.attn_scale == nullptr ? 1.0f / max_seq_len_per_group
+                                          : params.attn_scale[group_id])
+          : 0;
+    } else {
+      // attention scale
+      scalar_scale_val_ = params.scalar_scale
+          ? (params.attn_scale == nullptr ? params.max_seq_len_inv
+                                          : params.attn_scale[0])
+          : 0;
+    }
+    int m_block_min, m_block_max;
+    if constexpr (Cross) {
+      auto m_block_min_max = get_cross_m_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    } else {
+      auto m_block_min_max = get_m_block_min_max(
+          max_attn_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    }
+    auto full_m_block_min_max = get_full_m_block_min_max(
+        seqlen_info.uihlen_q,
+        seqlen_info.seqlen_q,
+        min_full_attn_seq_len_,
+        m_block_max,
+        n_block);
+    int const full_m_block_min = get<0>(full_m_block_min_max);
+    int const full_m_block_max = get<1>(full_m_block_min_max);
+    int contexual_m_block_max = get_contexual_m_block_max(
+        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sdO = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdO{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutV{});
+    Tensor sQt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQt{});
+    Tensor sdOt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdOt{});
+    Tensor sKt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutKt{});
+    Tensor sP = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+        SmemLayoutPdS{});
+    Tensor sP_pi = cute::as_position_independent_swizzle_tensor(sP);
+    Tensor sPt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+        SmemLayoutPdSt{});
+    Tensor sPt_pi = cute::as_position_independent_swizzle_tensor(sPt);
+    Tensor sdS = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
+        SmemLayoutPdS{});
+    Tensor sdS_pi = cute::as_position_independent_swizzle_tensor(sdS);
+    Tensor sdSt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
+        SmemLayoutPdSt{});
+    Tensor sdSt_pi = cute::as_position_independent_swizzle_tensor(sdSt);
+    Tensor sdQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dqacc.data()),
+        SmemLayoutdQaccum{});
+
+    static_assert(
+        stride<0>(typename TiledMmaSdP::ALayout{}) == 0 and
+            stride<0>(typename TiledMmaSdP::BLayout{}) == 0 and
+            size<0>(typename TiledMmaSdP::ALayout{}) ==
+                cutlass::NumThreadsPerWarpGroup and
+            size<0>(typename TiledMmaSdP::BLayout{}) ==
+                cutlass::NumThreadsPerWarpGroup,
+        "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+    constexpr int MmaWarpGroups =
+        NumMmaThreads / cutlass::NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(
+        make_shape(Int<MmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+    Layout warp_group_thread_layout_dq = make_layout(
+        make_shape(Int<NumMmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+
+    int warp_group_idx = __shfl_sync(
+        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
+    TiledMmaSdP tiled_mma_SdP;
+    using TiledMmadP =
+        std::conditional_t<!Mma_dP_is_RS, TiledMmaSdP, TiledMmadPRS>;
+    TiledMmadP tiled_mma_dP;
+    TiledMmadKV tiled_mma_dKV;
+    TiledMmadQ tiled_mma_dQ;
+
+    auto wg_mma_SdP =
+        tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma_dP =
+        tiled_mma_dP.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto thread_mma_SdP = tiled_mma_SdP.get_thread_slice(thread_idx);
+    auto wg_mma_dKV =
+        tiled_mma_dKV.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma_dQ =
+        tiled_mma_dQ.get_slice(warp_group_thread_layout_dq(warp_group_idx));
+
+    auto smem_tiled_copy_PdS =
+        make_tiled_copy_C(SmemCopyAtomPdS{}, tiled_mma_SdP);
+    auto smem_thr_copy_PdS = smem_tiled_copy_PdS.get_thread_slice(thread_idx);
+
+    R2STiledCopydQaccum r2s_tiled_copy_dQaccum;
+    auto r2s_thr_copy_dQaccum =
+        r2s_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+    Tensor tdQsdQaccum = r2s_thr_copy_dQaccum.partition_D(sdQ);
+    // if (thread_idx == 0) { print(sdQ); printf("\n"); print(tdQsdQaccum);
+    // printf("\n"); }
+
+    // Allocate "fragments/descriptors"
+    // We have to use the templated mma_partition_fragment_AB instead of
+    // cute::conditional_return or lambda, because some partition_fragment_A/B
+    // don't compile.
+    // https://stackoverflow.com/questions/50051473/if-constexpr-in-c17-does-not-work-in-a-non-templated-function
+    Tensor tSrQ = mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sQ);
+    Tensor tSrK = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_SdP, sK);
+    Tensor tdPrdO =
+        mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sdO);
+    Tensor tdPrV = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_dP, sV);
+    Tensor tdVrdO =
+        mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sdOt);
+    Tensor tdKrQ = mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sQt);
+    Tensor tdQrdS = mma_partition_fragment_AB</*A=*/!dQ_swapAB>(wg_mma_dQ, sdS);
+    Tensor tdQrK = mma_partition_fragment_AB</*A=*/dQ_swapAB>(wg_mma_dQ, sKt);
+
+    Tensor tPsP = smem_thr_copy_PdS.partition_D(
+        cute::conditional_return<!SdP_swapAB>(
+            sP_pi, sPt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    Tensor tdSsdS = smem_thr_copy_PdS.partition_D(
+        cute::conditional_return<!SdP_swapAB>(
+            sdS_pi, sdSt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_PdS);
+    // print(sP_pi); printf("\n"); print(sPt_pi); printf("\n"); print(tPsP);
+    // printf("\n"); print(tdSsdS); printf("\n"); }
+
+    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+    };
+
+    int bidh = get<1>(block_coord);
+    // For the case where we do atomicAdd directly to gdQaccum instead of using
+    // TMA
+    Tensor mdQaccum = make_tensor(
+        make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.ptr_dQaccum)),
+        params.shape_dQaccum,
+        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+    Tensor gdQaccum_ = local_tile(
+        domain_offset(
+            make_coord(seqlen_info.offset_q_padded * kHeadDim), mdQaccum),
+        Shape<Int<kBlockM * kHeadDim>>{},
+        make_coord(_)); // (M * K, _)
+    Tensor gdQaccum = cute::flat_divide(
+        gdQaccum_,
+        Int<kBlockM * kHeadDim / NumMmaWarpGroups>{}); // (M * K / WG, WG, _)
+    // We can reuse r2s_thr_copy_dQaccum for this partitioning
+    Tensor tdQgdQaccum = r2s_thr_copy_dQaccum.partition_D(gdQaccum);
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(mdQaccum);
+    // printf("\n"); print(gdQaccum_); printf("\n"); print(gdQaccum);
+    // printf("\n"); print(tdQgdQaccum); printf("\n"); }
+
+    hstu::Mask<kBlockM, kBlockN, TiledMmaSdP, SdP_swapAB> mask(
+        thread_idx,
+        seqlen_info.seqlen_q,
+        seqlen_info.seqlen_kv,
+        max_attn_len_,
+        min_full_attn_seq_len_,
+        contextual_seq_len_,
+        seqlen_info.uihlen_q);
+
+    int m_block = m_block_min;
+
+    clear(tdKrdK);
+    clear(tdVrdV);
+    // tiled_mma_dKV.accumulate_ = GMMA::ScaleOut::Zero;
+
+    cutlass::ConsumerToken barrier_token = static_cast<cutlass::BarrierStatus>(
+        shared_storage.pipelines.barrier_KV.try_wait(work_idx % 2));
+    if (barrier_token == cutlass::BarrierStatus::WaitAgain) {
+      shared_storage.pipelines.barrier_KV.wait(work_idx % 2);
+    }
+
+    if constexpr (Mma_dP_is_RS) {
+      using SmemCopyAtomV = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
+      auto smem_tiled_copy_V = make_tiled_copy_A(SmemCopyAtomV{}, tiled_mma_dP);
+      auto smem_thr_copy_V = smem_tiled_copy_V.get_thread_slice(thread_idx);
+      Tensor tdPrV_copy_view = smem_thr_copy_V.retile_D(tdPrV);
+      Tensor tdPsV_copy_view = smem_thr_copy_V.partition_S(
+          cute::as_position_independent_swizzle_tensor(sV));
+      cute::copy(smem_tiled_copy_V, tdPsV_copy_view, tdPrV_copy_view);
+    }
+    static constexpr int Qdim = !SdP_swapAB ? 0 : 1;
+    auto thread0_mma_SdP = tiled_mma_SdP.get_thread_slice(_0{});
+    Tensor cS = cute::make_identity_tensor(
+        Shape<
+            Int<!SdP_swapAB ? kBlockM : kBlockN>,
+            Int<!SdP_swapAB ? kBlockN : kBlockM>>{});
+    Tensor tScS = thread_mma_SdP.partition_C(cS);
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+            tScS.layout()));
+    Tensor t0ScS = thread0_mma_SdP.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+            t0ScS.layout()));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+
+    auto bwd_step = [&](int m_block, auto mask_fn) {
+      Tensor tSrS = partition_fragment_C(
+          tiled_mma_SdP,
+          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
+      consumer_wait(pipeline_q, smem_pipe_read);
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
+          tiled_mma_SdP, tSrQ(_, _, _, smem_pipe_read.index()), tSrK, tSrS);
+      Tensor tdPrdP = partition_fragment_C(
+          tiled_mma_SdP,
+          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
+      PipelineState_dO smem_pipe_read_do_cur =
+          cute::conditional_return<Q_dO_same_stages>(
+              smem_pipe_read, smem_pipe_read_do);
+      consumer_wait(pipeline_do, smem_pipe_read_do_cur);
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
+          tiled_mma_dP,
+          tdPrdO(_, _, _, smem_pipe_read_do_cur.index()),
+          tdPrV,
+          tdPrdP);
+      warpgroup_wait<1>();
+      // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
+      // ncol=(2, MMA_N))
+      Tensor scores = make_tensor(
+          tSrS.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+              tSrS.layout()));
+      Tensor tSrS_sigmoid = make_tensor_like(tSrS);
+      Tensor sigmoid = make_tensor(
+          tSrS_sigmoid.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+              tSrS_sigmoid.layout()));
+      int qdim_offset = params.scalar_scale
+          ? 0
+          : m_block * kBlockM + thread_qdim_offset + seqlen_info.offset_q;
+      mask_fn(tSrS, m_block);
+#pragma unroll
+      for (int mi = 0; mi < size<0>(scores); ++mi) {
+        float scale = scalar_scale_val_;
+        if (!params.scalar_scale) {
+          int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
+          int q_local = q_index - seqlen_info.offset_q;
+          if (q_local < seqlen_info.seqlen_q) {
+            scale = params.attn_scale[q_index];
+          }
+        }
+#pragma unroll
+        for (int ni = 0; ni < size<1>(scores); ++ni) {
+          scores(mi, ni) = scores(mi, ni) * params.alpha;
+          sigmoid(mi, ni) =
+              __fdividef(1., 1.0f + cutlass::fast_exp(-scores(mi, ni)));
+          scores(mi, ni) = sigmoid(mi, ni) * scores(mi, ni) * scale;
+        }
+      }
+      mask_fn(tSrS_sigmoid, m_block);
+
+      warpgroup_wait<0>();
+      // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
+      // ncol=(2, MMA_N))
+      Tensor dS = make_tensor(tdPrdP.data(), scores.layout());
+#pragma unroll
+      for (int mi = 0; mi < size<0>(dS); ++mi) {
+        float scale = scalar_scale_val_;
+        if (!params.scalar_scale) {
+          int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
+          int q_local = q_index - seqlen_info.offset_q;
+          if (q_local < seqlen_info.seqlen_q) {
+            scale = params.attn_scale[q_index];
+          }
+        }
+#pragma unroll
+        for (int ni = 0; ni < size<1>(dS); ++ni) {
+          dS(mi, ni) = dS(mi, ni) * sigmoid(mi, ni) * scale +
+              dS(mi, ni) * scores(mi, ni) * (1.f - sigmoid(mi, ni));
+          dS(mi, ni) = dS(mi, ni) * params.alpha;
+          //   if (dS(mi, ni) > 0.0001) {
+          //     std::printf(
+          //         "dS(mi, ni) is (%f), (m, n) is (%d, %d), thread_idx is
+          //         (%d), blockIdx.z is (%d)\n", dS(mi, ni), mi, ni,
+          //         threadIdx.x,
+          //         blockIdx.z);
+          //   }
+        }
+      }
+      // Convert scores from fp32 to fp16/bf16
+      Tensor rP = make_tensor_like<Element>(tSrS);
+      hstu::convert_type_out(tSrS, rP);
+      if constexpr (!Mma_dKV_is_RS) {
+        // Need to sync to make sure P has already been used in the previous
+        // iteration before writing new values
+        if constexpr (kStages_dS == 1) {
+          cutlass::arch::NamedBarrier::sync(
+              NumMmaThreads,
+              static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        }
+        Tensor tPaP =
+            smem_thr_copy_PdS.retile_S(rP); // ((Atom,AtomNum), MMA_N, MMA_N)
+        cute::copy(
+            smem_tiled_copy_PdS,
+            tPaP,
+            tPsP(
+                _,
+                _,
+                _,
+                cute::conditional_return<kStages_dS == 1>(
+                    _0{}, smem_pipe_read.index())));
+      }
+      Tensor rdS = make_tensor_like<Element>(tdPrdP);
+      hstu::convert_type_out(tdPrdP, rdS);
+      // If there's double buffering on dS, we don't need to sync here.
+      // Otherwise we might have WG1 writing to dS before WG2 is done reading
+      // from it during MmadQ. But because both WGs have to sync at the end of
+      // the loop and double buffering, this race condition is not possible.
+      // This sync is to ensure (1) P is written in case of !Mma_dKV_is_RS and
+      // (2) dS is already read by the Mma in the previous iteration in case of
+      // Mma_dKV_is_RS.
+      if constexpr (!Mma_dKV_is_RS || (kStages_dS == 1 && Mma_dKV_is_RS)) {
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+      }
+      // For hdim 64, It's faster to write to smem_dS first before the dV gemm
+      Tensor tdSadS =
+          smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N)
+      cute::copy(
+          smem_tiled_copy_PdS,
+          tdSadS,
+          tdSsdS(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index())));
+
+      if constexpr (!Slice_dQKV_Mma) {
+        // Most cases take this path, except for hdim256 where we want to slice
+        // to reduce register pressure
+        if constexpr (Mma_dKV_is_RS) {
+          Tensor tdVrP = make_tensor(
+              rP.data(), convert_layout_acc_Aregs<TiledMmadKV>(tSrS.layout()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+              tiled_mma_dKV,
+              tdVrP,
+              tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+              tdVrdV);
+        } else {
+          Tensor tdVrP =
+              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
+          Tensor tdVrP_cur = tdVrP(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index()));
+          hstu::
+              gemm</*zero_init=*/false, /*wg_wait=*/-1, /*SwapAB=*/dKV_swapAB>(
+                  tiled_mma_dKV,
+                  tdVrP_cur,
+                  tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+                  tdVrdV);
+        }
+        // SMEM fence to make sure sdS is written before it's read by WGMMA
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        Tensor tdQrdQ = partition_fragment_C(
+            tiled_mma_dQ,
+            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
+        Tensor tdQrdS_cur = tdQrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm</*zero_init=*/true, /*wg_wait=*/1, /*SwapAB=*/dQ_swapAB>(
+            tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dQ
+
+        if constexpr (Mma_dKV_is_RS) {
+          Tensor tdKrdS = make_tensor(
+              rdS.data(),
+              convert_layout_acc_Aregs<TiledMmadKV>(tdPrdP.layout()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1>(
+              tiled_mma_dKV,
+              tdKrdS,
+              tdKrQ(_, _, _, smem_pipe_read.index()),
+              tdKrdK);
+        } else {
+          Tensor tdKrdS =
+              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
+          Tensor tdKrdS_cur = tdKrdS(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1, /*SwapAB=*/dKV_swapAB>(
+              tiled_mma_dKV,
+              tdKrdS_cur,
+              tdKrQ(_, _, _, smem_pipe_read.index()),
+              tdKrdK);
+        }
+        if constexpr (dQacc_use_TMA) {
+          int const warp_group_idx =
+              hstu::canonical_warp_group_idx_nosync() - 1;
+          cutlass::arch::NamedBarrier::sync(
+              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+              static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) +
+                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
+          Tensor taccdQrdQ = r2s_thr_copy_dQaccum.retile_S(tdQrdQ);
+          cute::copy(r2s_tiled_copy_dQaccum, taccdQrdQ, tdQsdQaccum);
+          cutlass::arch::fence_view_async_shared();
+          cutlass::arch::NamedBarrier::arrive(
+              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+              static_cast<uint32_t>(BwdNamedBarriers::dQFullWG1) +
+                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
+        } else {
+          // We can reuse r2s_thr_copy_dQaccum for this partitioning
+          Tensor tdQrdQ_atomic =
+              recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
+          Tensor tdQgdQaccum_atomic =
+              recast<float4>(tdQgdQaccum(_, _, _, m_block));
+          static_assert(
+              CUTE_STATIC_V(size(tdQrdQ_atomic)) ==
+              CUTE_STATIC_V(size(tdQgdQaccum_atomic)));
+#pragma unroll
+          for (int i = 0; i < size(tdQrdQ_atomic); ++i) {
+            atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+          }
+        }
+
+      } else { // Slice_dQKV_Mma
+
+        static_assert(!(Slice_dQKV_Mma && Mma_dKV_is_RS));
+        Tensor tdVrP =
+            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
+        Tensor tdVrP_cur = tdVrP(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/0>(
+            tiled_mma_dKV,
+            tdVrP_cur,
+            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+            tdVrdV);
+
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        Tensor tdQrdQ = partition_fragment_C(
+            tiled_mma_dQ,
+            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
+        Tensor tdQrdS_cur = tdQrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/true,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dQ_swapAB,
+            /*M_slice=*/0>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/1>(
+            tiled_mma_dKV,
+            tdVrP_cur,
+            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+            tdVrdV);
+        Tensor tdQrdQ_atomic =
+            recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
+        Tensor tdQgdQaccum_atomic =
+            recast<float4>(tdQgdQaccum(_, _, _, m_block));
+#pragma unroll
+        for (int i = 0; i < size(tdQrdQ_atomic) / 2; ++i) {
+          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+        }
+
+        Tensor tdKrdS =
+            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
+        Tensor tdKrdS_cur = tdKrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/0>(
+            tiled_mma_dKV,
+            tdKrdS_cur,
+            tdKrQ(_, _, _, smem_pipe_read.index()),
+            tdKrdK);
+        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dO
+
+        hstu::gemm<
+            /*zero_init=*/true,
+            /*wg_wait=*/0,
+            /*SwapAB=*/dQ_swapAB,
+            /*M_slice=*/1>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+#pragma unroll
+        for (int i = size(tdQrdQ_atomic) / 2; i < size(tdQrdQ_atomic); ++i) {
+          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+        }
+
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/1>(
+            tiled_mma_dKV,
+            tdKrdS_cur,
+            tdKrQ(_, _, _, smem_pipe_read.index()),
+            tdKrdK);
+      }
+
+      warpgroup_wait<0>();
+      pipeline_q.consumer_release(smem_pipe_read); // release Q
+      ++smem_pipe_read;
+      if constexpr (!Q_dO_same_stages) {
+        ++smem_pipe_read_do;
+      }
+    };
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+
+    if constexpr (Cross) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            false /*Local*/,
+            false /*Contexual_mask*/,
+            false /*Target_mask*/,
+            Cross,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+      if constexpr (Q_dO_same_stages) {
+        smem_pipe_read_do = smem_pipe_read;
+      }
+      ++work_idx;
+      return true;
+    }
+    if constexpr (Has_targets) {
+      if (n_block * kBlockN >= seqlen_info.uihlen_q) {
+        auto mask_fn = [&](auto& tSrS, int m_block) {
+          mask.template apply<
+              true /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal*/,
+              false /*Local*/,
+              false /*Contexual_mask*/,
+              Has_targets /*Target_mask*/,
+              false /*Cross*/,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+        CUTLASS_PRAGMA_NO_UNROLL
+        for (; m_block < m_block_max; ++m_block) {
+          bwd_step(m_block, mask_fn);
+        }
+        if constexpr (Q_dO_same_stages) {
+          smem_pipe_read_do = smem_pipe_read;
+        }
+        ++work_idx;
+        return true;
+      } else if ((n_block + 1) * kBlockN >= seqlen_info.uihlen_q) {
+        if constexpr ((Causal || Local) && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                Causal,
+                Local,
+                Contexual_mask,
+                Has_targets /*Target_mask*/,
+                false /*Cross*/,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_block_masking_max =
+              ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < std::min(m_block_max, m_block_masking_max);
+               ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        auto mask_fn = [&](auto& tSrS, int m_block) {
+          mask.template apply<
+              true /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              Causal && !SeparateMaskingIterations,
+              Local && !SeparateMaskingIterations,
+              Contexual_mask,
+              Has_targets /*Target_mask*/,
+              false /*Cross*/,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+        if constexpr (SeparateMaskingIterations) {
+          int const m_block_max_before_local_mask =
+              !Local || !SeparateMaskingIterations
+              ? m_block_max
+              : std::min(
+                    m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < m_block_max_before_local_mask; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        } else {
+          int num_m_block = m_block_max - m_block_min;
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (int i = 0; i < num_m_block + full_m_block_max -
+                   full_m_block_min + contexual_m_block_max;
+               ++i) {
+            if (i < num_m_block) {
+              m_block = m_block_min + i;
+            } else if (i < num_m_block + contexual_m_block_max) {
+              m_block = i - num_m_block;
+            } else {
+              m_block =
+                  i - num_m_block - contexual_m_block_max + full_m_block_min;
+            }
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        if constexpr (Local && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                false /*Causal_mask*/,
+                Local,
+                Contexual_mask,
+                Has_targets /*Target_mask*/,
+                false /*Cross*/,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < m_block_max; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+        if constexpr (Contexual_mask && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                Causal /*Causal_mask*/,
+                Local /*Local_mask*/,
+                Contexual_mask,
+                Has_targets,
+                false /*Cross*/,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        if constexpr (Local && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                false /*Causal_mask*/,
+                Local,
+                Contexual_mask,
+                Has_targets,
+                false /*Cross*/,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (m_block = full_m_block_min; m_block < full_m_block_max;
+               ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+        if constexpr (Q_dO_same_stages) {
+          smem_pipe_read_do = smem_pipe_read;
+        }
+        ++work_idx;
+        return true;
+      }
+    }
+    // We have separate iterations with causal masking. Not necessary for hdim
+    // 128 but for hdim 64 this helps quite a bit to not have to do causal
+    // masking for most of the iterations.
+    if constexpr ((Causal || Local) && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      int const m_block_masking_max =
+          ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < std::min(m_block_max, m_block_masking_max); ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    auto mask_fn = [&](auto& tSrS, int m_block) {
+      mask.template apply<
+          true /*Seqlenq_mask*/,
+          true /*Seqlenk_mask*/,
+          Causal && !SeparateMaskingIterations,
+          Local && !SeparateMaskingIterations,
+          Contexual_mask,
+          false /*Target_mask*/,
+          false /*Cross*/,
+          false /*Softmax*/>(tSrS, m_block, n_block);
+    };
+    if constexpr (SeparateMaskingIterations) {
+      int const m_block_max_before_local_mask =
+          !Local || !SeparateMaskingIterations
+          ? m_block_max
+          : std::min(
+                m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max_before_local_mask; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    } else {
+      int num_m_block = m_block_max - m_block_min;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int i = 0; i < num_m_block + full_m_block_max - full_m_block_min +
+               contexual_m_block_max;
+           ++i) {
+        if (i < num_m_block) {
+          m_block = m_block_min + i;
+        } else if (i < num_m_block + contexual_m_block_max) {
+          m_block = i - num_m_block;
+        } else {
+          m_block = i - num_m_block - contexual_m_block_max + full_m_block_min;
+        }
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    if constexpr (Local && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal_mask*/,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+    if constexpr (Contexual_mask && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal /*Causal_mask*/,
+            Local /*Local_mask*/,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    if constexpr (Local && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal_mask*/,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (m_block = full_m_block_min; m_block < full_m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tdVrdV); }
+    if constexpr (Q_dO_same_stages) {
+      smem_pipe_read_do = smem_pipe_read;
+    }
+    ++work_idx;
+    return true;
+  }
+
+  template <typename SharedStorage, typename FrgTensordKV>
+  CUTLASS_DEVICE bool mma_softmax(
+      Params const& params,
+      MainloopPipeline pipeline_q,
+      MainloopPipeline_dO pipeline_do,
+      PipelineState& smem_pipe_read,
+      PipelineState_dO& smem_pipe_read_do,
+      FrgTensordKV& tdKrdK,
+      FrgTensordKV& tdVrdV,
+      int thread_idx,
+      int& work_idx,
+      cute::tuple<int32_t, int32_t, int32_t> block_coord,
+      SharedStorage& shared_storage) {
+    static_assert(
+        is_rmem<FrgTensordKV>::value,
+        "dK and dV tensor must be rmem resident.");
+
+    int n_block = get<0>(block_coord);
+    int bidb = get<2>(block_coord);
+    SeqlenInfo_t seqlen_info{
+        bidb,
+        get<0>(params.shape_Q),
+        get<0>(params.shape_K),
+        params.seq_offsets,
+        params.seq_offsets_q,
+        params.num_targets};
+    if constexpr (Jagged) {
+      static constexpr int kBlockN = get<1>(TileShape_MNK{});
+      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
+        return false;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.num_groups;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int m_block_min, m_block_max;
+    if constexpr (Cross) {
+      auto m_block_min_max = get_cross_m_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    } else {
+      auto m_block_min_max = get_m_block_min_max(
+          max_attn_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          n_block);
+      m_block_min = get<0>(m_block_min_max);
+      m_block_max = get<1>(m_block_min_max);
+    }
+    auto full_m_block_min_max = get_full_m_block_min_max(
+        seqlen_info.uihlen_q,
+        seqlen_info.seqlen_q,
+        min_full_attn_seq_len_,
+        m_block_max,
+        n_block);
+    int const full_m_block_min = get<0>(full_m_block_min_max);
+    int const full_m_block_max = get<1>(full_m_block_min_max);
+    int contexual_m_block_max = get_contexual_m_block_max(
+        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sdO = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdO{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutV{});
+    Tensor sQt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQt{});
+    Tensor sdOt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
+        SmemLayoutdOt{});
+    Tensor sKt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutKt{});
+    Tensor sP = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+        SmemLayoutPdS{});
+    Tensor sP_pi = cute::as_position_independent_swizzle_tensor(sP);
+    Tensor sPt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+        SmemLayoutPdSt{});
+    Tensor sPt_pi = cute::as_position_independent_swizzle_tensor(sPt);
+    Tensor sdS = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
+        SmemLayoutPdS{});
+    Tensor sdS_pi = cute::as_position_independent_swizzle_tensor(sdS);
+    Tensor sdSt = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
+        SmemLayoutPdSt{});
+    Tensor sdSt_pi = cute::as_position_independent_swizzle_tensor(sdSt);
+    Tensor sdQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dqacc.data()),
+        SmemLayoutdQaccum{});
+    Tensor sLSEMma = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_lse.data()),
+        SmemLayoutLSEMma{});
+    Tensor sdPsumMma = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_dpsum.data()),
+        SmemLayoutLSEMma{});
+
+    static_assert(
+        stride<0>(typename TiledMmaSdP::ALayout{}) == 0 and
+            stride<0>(typename TiledMmaSdP::BLayout{}) == 0 and
+            size<0>(typename TiledMmaSdP::ALayout{}) ==
+                cutlass::NumThreadsPerWarpGroup and
+            size<0>(typename TiledMmaSdP::BLayout{}) ==
+                cutlass::NumThreadsPerWarpGroup,
+        "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+    constexpr int MmaWarpGroups =
+        NumMmaThreads / cutlass::NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(
+        make_shape(Int<MmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+    Layout warp_group_thread_layout_dq = make_layout(
+        make_shape(Int<NumMmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+
+    int warp_group_idx = __shfl_sync(
+        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
+    TiledMmaSdP tiled_mma_SdP;
+    using TiledMmadP =
+        std::conditional_t<!Mma_dP_is_RS, TiledMmaSdP, TiledMmadPRS>;
+    TiledMmadP tiled_mma_dP;
+    TiledMmadKV tiled_mma_dKV;
+    TiledMmadQ tiled_mma_dQ;
+
+    auto wg_mma_SdP =
+        tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma_dP =
+        tiled_mma_dP.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto thread_mma_SdP = tiled_mma_SdP.get_thread_slice(thread_idx);
+    auto wg_mma_dKV =
+        tiled_mma_dKV.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma_dQ =
+        tiled_mma_dQ.get_slice(warp_group_thread_layout_dq(warp_group_idx));
+
+    auto smem_tiled_copy_PdS =
+        make_tiled_copy_C(SmemCopyAtomPdS{}, tiled_mma_SdP);
+    auto smem_thr_copy_PdS = smem_tiled_copy_PdS.get_thread_slice(thread_idx);
+
+    R2STiledCopydQaccum r2s_tiled_copy_dQaccum;
+    auto r2s_thr_copy_dQaccum =
+        r2s_tiled_copy_dQaccum.get_thread_slice(thread_idx);
+    Tensor tdQsdQaccum = r2s_thr_copy_dQaccum.partition_D(sdQ);
+    // if (thread_idx == 0) { print(sdQ); printf("\n"); print(tdQsdQaccum);
+    // printf("\n"); }
+
+    // Allocate "fragments/descriptors"
+    // We have to use the templated mma_partition_fragment_AB instead of
+    // cute::conditional_return or lambda, because some partition_fragment_A/B
+    // don't compile.
+    // https://stackoverflow.com/questions/50051473/if-constexpr-in-c17-does-not-work-in-a-non-templated-function
+    Tensor tSrQ = mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sQ);
+    Tensor tSrK = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_SdP, sK);
+    Tensor tdPrdO =
+        mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sdO);
+    Tensor tdPrV = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_dP, sV);
+    Tensor tdVrdO =
+        mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sdOt);
+    Tensor tdKrQ = mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sQt);
+    Tensor tdQrdS = mma_partition_fragment_AB</*A=*/!dQ_swapAB>(wg_mma_dQ, sdS);
+    Tensor tdQrK = mma_partition_fragment_AB</*A=*/dQ_swapAB>(wg_mma_dQ, sKt);
+
+    Tensor tPsP = smem_thr_copy_PdS.partition_D(
+        cute::conditional_return<!SdP_swapAB>(
+            sP_pi, sPt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    Tensor tdSsdS = smem_thr_copy_PdS.partition_D(
+        cute::conditional_return<!SdP_swapAB>(
+            sdS_pi, sdSt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_PdS);
+    // print(sP_pi); printf("\n"); print(sPt_pi); printf("\n"); print(tPsP);
+    // printf("\n"); print(tdSsdS); printf("\n"); }
+
+    // thread_mma_SdP.partition_C(sLSEMma) has shape ((2, 2, V), MMA_M, MMA_N,
+    // PIPE), we only take the col indices or row indices, depending on whether
+    // SdP_swapAB.
+    Tensor tLSEsLSE = cute::conditional_return<!SdP_swapAB>(
+        group_modes<0, 2>(thread_mma_SdP.partition_C(sLSEMma)(
+            make_coord(_0{}, _, _0{}), _, _0{}, _)), // (2, MMA_M, PIPE)
+        group_modes<0, 3>(thread_mma_SdP.partition_C(sLSEMma)(
+            make_coord(_, _0{}, _), _0{}, _, _))); // (2, V, MMA_N, PIPE)
+    Tensor tLSEsdPsum = cute::conditional_return<!SdP_swapAB>(
+        group_modes<0, 2>(thread_mma_SdP.partition_C(sdPsumMma)(
+            make_coord(_0{}, _, _0{}), _, _0{}, _)),
+        group_modes<0, 3>(thread_mma_SdP.partition_C(sdPsumMma)(
+            make_coord(_, _0{}, _), _0{}, _, _)));
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(sLSEMma);
+    // printf("\n"); print(tLSEsLSE); printf("\n"); } If we want to split the
+    // stats among the 8 threads that share the same rows.
+    static constexpr int kStatsPerThread =
+        cute::ceil_div(decltype(size(tLSEsLSE))::value, 8);
+
+    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+    };
+
+    int bidh = get<1>(block_coord);
+    // For the case where we do atomicAdd directly to gdQaccum instead of using
+    // TMA
+    Tensor mdQaccum = make_tensor(
+        make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.ptr_dQaccum)),
+        params.shape_dQaccum,
+        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
+    Tensor gdQaccum_ = local_tile(
+        domain_offset(
+            make_coord(seqlen_info.offset_q_padded * kHeadDim), mdQaccum),
+        Shape<Int<kBlockM * kHeadDim>>{},
+        make_coord(_)); // (M * K, _)
+    Tensor gdQaccum = cute::flat_divide(
+        gdQaccum_,
+        Int<kBlockM * kHeadDim / NumMmaWarpGroups>{}); // (M * K / WG, WG, _)
+    // We can reuse r2s_thr_copy_dQaccum for this partitioning
+    Tensor tdQgdQaccum = r2s_thr_copy_dQaccum.partition_D(gdQaccum);
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(mdQaccum);
+    // printf("\n"); print(gdQaccum_); printf("\n"); print(gdQaccum);
+    // printf("\n"); print(tdQgdQaccum); printf("\n"); }
+
+    hstu::Mask<kBlockM, kBlockN, TiledMmaSdP, SdP_swapAB> mask(
+        thread_idx,
+        seqlen_info.seqlen_q,
+        seqlen_info.seqlen_kv,
+        max_attn_len_,
+        min_full_attn_seq_len_,
+        contextual_seq_len_,
+        seqlen_info.uihlen_q);
+
+    int m_block = m_block_min;
+
+    clear(tdKrdK);
+    clear(tdVrdV);
+    // tiled_mma_dKV.accumulate_ = GMMA::ScaleOut::Zero;
+
+    cutlass::ConsumerToken barrier_token = static_cast<cutlass::BarrierStatus>(
+        shared_storage.pipelines.barrier_KV.try_wait(work_idx % 2));
+    if (barrier_token == cutlass::BarrierStatus::WaitAgain) {
+      shared_storage.pipelines.barrier_KV.wait(work_idx % 2);
+    }
+
+    if constexpr (Mma_dP_is_RS) {
+      using SmemCopyAtomV = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
+      auto smem_tiled_copy_V = make_tiled_copy_A(SmemCopyAtomV{}, tiled_mma_dP);
+      auto smem_thr_copy_V = smem_tiled_copy_V.get_thread_slice(thread_idx);
+      Tensor tdPrV_copy_view = smem_thr_copy_V.retile_D(tdPrV);
+      Tensor tdPsV_copy_view = smem_thr_copy_V.partition_S(
+          cute::as_position_independent_swizzle_tensor(sV));
+      cute::copy(smem_tiled_copy_V, tdPsV_copy_view, tdPrV_copy_view);
+    }
+    static constexpr int Qdim = !SdP_swapAB ? 0 : 1;
+    auto thread0_mma_SdP = tiled_mma_SdP.get_thread_slice(_0{});
+    Tensor cS = cute::make_identity_tensor(
+        Shape<
+            Int<!SdP_swapAB ? kBlockM : kBlockN>,
+            Int<!SdP_swapAB ? kBlockN : kBlockM>>{});
+    Tensor tScS = thread_mma_SdP.partition_C(cS);
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+            tScS.layout()));
+    Tensor t0ScS = thread0_mma_SdP.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+            t0ScS.layout()));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+
+    auto bwd_step = [&](int m_block, auto mask_fn) {
+      Tensor tSrS = partition_fragment_C(
+          tiled_mma_SdP,
+          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
+      consumer_wait(pipeline_q, smem_pipe_read);
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
+          tiled_mma_SdP, tSrQ(_, _, _, smem_pipe_read.index()), tSrK, tSrS);
+      Tensor tLSErLSE = cute::conditional_return<!ShuffleLSE>(
+          make_fragment_like(tLSEsLSE(_, _0{})),
+          make_tensor<ElementAccum>(Int<kStatsPerThread>{}));
+      if constexpr (!ShuffleLSE) {
+        cute::copy(tLSEsLSE(_, smem_pipe_read.index()), tLSErLSE);
+      } else {
+#pragma unroll
+        for (int i = 0; i < kStatsPerThread; ++i) {
+          // It's ok to read OOB, since we made sure sLSE is large enough and we
+          // won't use the OOB values
+          tLSErLSE(i) =
+              tLSEsLSE((thread_idx % 32) / 4 + i * 8, smem_pipe_read.index());
+        }
+      }
+      Tensor tdPrdP = partition_fragment_C(
+          tiled_mma_SdP,
+          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
+      PipelineState_dO smem_pipe_read_do_cur =
+          cute::conditional_return<Q_dO_same_stages>(
+              smem_pipe_read, smem_pipe_read_do);
+      consumer_wait(pipeline_do, smem_pipe_read_do_cur);
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
+          tiled_mma_dP,
+          tdPrdO(_, _, _, smem_pipe_read_do_cur.index()),
+          tdPrV,
+          tdPrdP);
+      warpgroup_wait<1>();
+      // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
+      // ncol=(2, MMA_N))
+      Tensor scores = make_tensor(
+          tSrS.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
+              tSrS.layout()));
+      mask_fn(tSrS, m_block);
+#pragma unroll
+      for (int mi = 0; mi < size<0>(scores); ++mi) {
+        float const lse_scaled = [&] {
+          if constexpr (!ShuffleLSE)
+            return tLSErLSE(mi);
+          else
+            return __shfl_sync(
+                0xffffffff, tLSErLSE(mi / 8), (mi % 8) * 4 + (thread_idx % 4));
+        }();
+#pragma unroll
+        for (int ni = 0; ni < size<1>(scores); ++ni) {
+          scores(mi, ni) =
+              exp2f(scores(mi, ni) * params.alpha_log2 - lse_scaled);
+        }
+      }
+      Tensor tLSErdPsum = cute::conditional_return<!ShuffledPsum>(
+          make_fragment_like(tLSEsdPsum(_, _0{})),
+          make_tensor<ElementAccum>(Int<kStatsPerThread>{}));
+      if constexpr (!ShuffledPsum) {
+        cute::copy(tLSEsdPsum(_, smem_pipe_read_do_cur.index()), tLSErdPsum);
+      } else {
+#pragma unroll
+        for (int i = 0; i < kStatsPerThread; ++i) {
+          tLSErdPsum(i) = tLSEsdPsum(
+              (thread_idx % 32) / 4 + i * 8, smem_pipe_read_do_cur.index());
+        }
+      }
+
+      warpgroup_wait<0>();
+      // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
+      // ncol=(2, MMA_N))
+      Tensor dS = make_tensor(tdPrdP.data(), scores.layout());
+#pragma unroll
+      for (int mi = 0; mi < size<0>(dS); ++mi) {
+        float const dP_sum_cur = [&] {
+          if constexpr (!ShuffledPsum)
+            return tLSErdPsum(mi);
+          else
+            return __shfl_sync(
+                0xffffffff,
+                tLSErdPsum(mi / 8),
+                (mi % 8) * 4 + (thread_idx % 4));
+        }();
+#pragma unroll
+        for (int ni = 0; ni < size<1>(dS); ++ni) {
+          dS(mi, ni) =
+              scores(mi, ni) * (dS(mi, ni) - dP_sum_cur) * params.alpha;
+        }
+      }
+      // Convert scores from fp32 to fp16/bf16
+      Tensor rP = make_tensor_like<Element>(tSrS);
+      hstu::convert_type_out(tSrS, rP);
+      if constexpr (!Mma_dKV_is_RS) {
+        // Need to sync to make sure P has already been used in the previous
+        // iteration before writing new values
+        if constexpr (kStages_dS == 1) {
+          cutlass::arch::NamedBarrier::sync(
+              NumMmaThreads,
+              static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        }
+        Tensor tPaP =
+            smem_thr_copy_PdS.retile_S(rP); // ((Atom,AtomNum), MMA_N, MMA_N)
+        cute::copy(
+            smem_tiled_copy_PdS,
+            tPaP,
+            tPsP(
+                _,
+                _,
+                _,
+                cute::conditional_return<kStages_dS == 1>(
+                    _0{}, smem_pipe_read.index())));
+      }
+      Tensor rdS = make_tensor_like<Element>(tdPrdP);
+      hstu::convert_type_out(tdPrdP, rdS);
+      // If there's double buffering on dS, we don't need to sync here.
+      // Otherwise we might have WG1 writing to dS before WG2 is done reading
+      // from it during MmadQ. But because both WGs have to sync at the end of
+      // the loop and double buffering, this race condition is not possible.
+      // This sync is to ensure (1) P is written in case of !Mma_dKV_is_RS and
+      // (2) dS is already read by the Mma in the previous iteration in case of
+      // Mma_dKV_is_RS.
+      if constexpr (!Mma_dKV_is_RS || (kStages_dS == 1 && Mma_dKV_is_RS)) {
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+      }
+      // For hdim 64, It's faster to write to smem_dS first before the dV gemm
+      Tensor tdSadS =
+          smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N)
+      cute::copy(
+          smem_tiled_copy_PdS,
+          tdSadS,
+          tdSsdS(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index())));
+
+      if constexpr (!Slice_dQKV_Mma) {
+        // Most cases take this path, except for hdim256 where we want to slice
+        // to reduce register pressure
+        if constexpr (Mma_dKV_is_RS) {
+          Tensor tdVrP = make_tensor(
+              rP.data(), convert_layout_acc_Aregs<TiledMmadKV>(tSrS.layout()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+              tiled_mma_dKV,
+              tdVrP,
+              tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+              tdVrdV);
+        } else {
+          Tensor tdVrP =
+              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
+          Tensor tdVrP_cur = tdVrP(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index()));
+          hstu::
+              gemm</*zero_init=*/false, /*wg_wait=*/-1, /*SwapAB=*/dKV_swapAB>(
+                  tiled_mma_dKV,
+                  tdVrP_cur,
+                  tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+                  tdVrdV);
+        }
+        // SMEM fence to make sure sdS is written before it's read by WGMMA
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        Tensor tdQrdQ = partition_fragment_C(
+            tiled_mma_dQ,
+            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
+        Tensor tdQrdS_cur = tdQrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm</*zero_init=*/true, /*wg_wait=*/1, /*SwapAB=*/dQ_swapAB>(
+            tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dQ
+
+        if constexpr (Mma_dKV_is_RS) {
+          Tensor tdKrdS = make_tensor(
+              rdS.data(),
+              convert_layout_acc_Aregs<TiledMmadKV>(tdPrdP.layout()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1>(
+              tiled_mma_dKV,
+              tdKrdS,
+              tdKrQ(_, _, _, smem_pipe_read.index()),
+              tdKrdK);
+        } else {
+          Tensor tdKrdS =
+              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
+          Tensor tdKrdS_cur = tdKrdS(
+              _,
+              _,
+              _,
+              cute::conditional_return<kStages_dS == 1>(
+                  _0{}, smem_pipe_read.index()));
+          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1, /*SwapAB=*/dKV_swapAB>(
+              tiled_mma_dKV,
+              tdKrdS_cur,
+              tdKrQ(_, _, _, smem_pipe_read.index()),
+              tdKrdK);
+        }
+        if constexpr (dQacc_use_TMA) {
+          int const warp_group_idx =
+              hstu::canonical_warp_group_idx_nosync() - 1;
+          cutlass::arch::NamedBarrier::sync(
+              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+              static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) +
+                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
+          Tensor taccdQrdQ = r2s_thr_copy_dQaccum.retile_S(tdQrdQ);
+          cute::copy(r2s_tiled_copy_dQaccum, taccdQrdQ, tdQsdQaccum);
+          cutlass::arch::fence_view_async_shared();
+          cutlass::arch::NamedBarrier::arrive(
+              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
+              static_cast<uint32_t>(BwdNamedBarriers::dQFullWG1) +
+                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
+        } else {
+          // We can reuse r2s_thr_copy_dQaccum for this partitioning
+          Tensor tdQrdQ_atomic =
+              recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
+          Tensor tdQgdQaccum_atomic =
+              recast<float4>(tdQgdQaccum(_, _, _, m_block));
+          static_assert(
+              CUTE_STATIC_V(size(tdQrdQ_atomic)) ==
+              CUTE_STATIC_V(size(tdQgdQaccum_atomic)));
+#pragma unroll
+          for (int i = 0; i < size(tdQrdQ_atomic); ++i) {
+            atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+          }
+        }
+
+      } else { // Slice_dQKV_Mma
+
+        static_assert(!(Slice_dQKV_Mma && Mma_dKV_is_RS));
+        Tensor tdVrP =
+            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
+        Tensor tdVrP_cur = tdVrP(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/0>(
+            tiled_mma_dKV,
+            tdVrP_cur,
+            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+            tdVrdV);
+
+        cutlass::arch::fence_view_async_shared();
+        cutlass::arch::NamedBarrier::sync(
+            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
+        Tensor tdQrdQ = partition_fragment_C(
+            tiled_mma_dQ,
+            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
+        Tensor tdQrdS_cur = tdQrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/true,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dQ_swapAB,
+            /*M_slice=*/0>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/1>(
+            tiled_mma_dKV,
+            tdVrP_cur,
+            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
+            tdVrdV);
+        Tensor tdQrdQ_atomic =
+            recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
+        Tensor tdQgdQaccum_atomic =
+            recast<float4>(tdQgdQaccum(_, _, _, m_block));
+#pragma unroll
+        for (int i = 0; i < size(tdQrdQ_atomic) / 2; ++i) {
+          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+        }
+
+        Tensor tdKrdS =
+            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
+        Tensor tdKrdS_cur = tdKrdS(
+            _,
+            _,
+            _,
+            cute::conditional_return<kStages_dS == 1>(
+                _0{}, smem_pipe_read.index()));
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/0>(
+            tiled_mma_dKV,
+            tdKrdS_cur,
+            tdKrQ(_, _, _, smem_pipe_read.index()),
+            tdKrdK);
+        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dO
+
+        hstu::gemm<
+            /*zero_init=*/true,
+            /*wg_wait=*/0,
+            /*SwapAB=*/dQ_swapAB,
+            /*M_slice=*/1>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
+#pragma unroll
+        for (int i = size(tdQrdQ_atomic) / 2; i < size(tdQrdQ_atomic); ++i) {
+          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
+        }
+
+        hstu::gemm<
+            /*zero_init=*/false,
+            /*wg_wait=*/-1,
+            /*SwapAB=*/dKV_swapAB,
+            /*M_slice=*/1>(
+            tiled_mma_dKV,
+            tdKrdS_cur,
+            tdKrQ(_, _, _, smem_pipe_read.index()),
+            tdKrdK);
+      }
+
+      warpgroup_wait<0>();
+      pipeline_q.consumer_release(smem_pipe_read); // release Q
+      ++smem_pipe_read;
+      if constexpr (!Q_dO_same_stages) {
+        ++smem_pipe_read_do;
+      }
+    };
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+
+    if constexpr (Cross) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            false /*Local*/,
+            false /*Contexual_mask*/,
+            false /*Target_mask*/,
+            Cross,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+      if constexpr (Q_dO_same_stages) {
+        smem_pipe_read_do = smem_pipe_read;
+      }
+      ++work_idx;
+      return true;
+    }
+    if constexpr (Has_targets) {
+      if (n_block * kBlockN >= seqlen_info.uihlen_q) {
+        auto mask_fn = [&](auto& tSrS, int m_block) {
+          mask.template apply<
+              true /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal*/,
+              false /*Local*/,
+              false /*Contexual_mask*/,
+              Has_targets /*Target_mask*/,
+              false /*Cross*/,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+        CUTLASS_PRAGMA_NO_UNROLL
+        for (; m_block < m_block_max; ++m_block) {
+          bwd_step(m_block, mask_fn);
+        }
+        if constexpr (Q_dO_same_stages) {
+          smem_pipe_read_do = smem_pipe_read;
+        }
+        ++work_idx;
+        return true;
+      } else if ((n_block + 1) * kBlockN >= seqlen_info.uihlen_q) {
+        if constexpr ((Causal || Local) && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                Causal,
+                Local,
+                Contexual_mask,
+                Has_targets /*Target_mask*/,
+                false /*Cross*/,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_block_masking_max =
+              ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < std::min(m_block_max, m_block_masking_max);
+               ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        auto mask_fn = [&](auto& tSrS, int m_block) {
+          mask.template apply<
+              true /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              Causal && !SeparateMaskingIterations,
+              Local && !SeparateMaskingIterations,
+              Contexual_mask,
+              Has_targets /*Target_mask*/,
+              false /*Cross*/,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+        if constexpr (SeparateMaskingIterations) {
+          int const m_block_max_before_local_mask =
+              !Local || !SeparateMaskingIterations
+              ? m_block_max
+              : std::min(
+                    m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < m_block_max_before_local_mask; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        } else {
+          int num_m_block = m_block_max - m_block_min;
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (int i = 0; i < num_m_block + full_m_block_max -
+                   full_m_block_min + contexual_m_block_max;
+               ++i) {
+            if (i < num_m_block) {
+              m_block = m_block_min + i;
+            } else if (i < num_m_block + contexual_m_block_max) {
+              m_block = i - num_m_block;
+            } else {
+              m_block =
+                  i - num_m_block - contexual_m_block_max + full_m_block_min;
+            }
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        if constexpr (Local && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                false /*Causal_mask*/,
+                Local,
+                Contexual_mask,
+                Has_targets /*Target_mask*/,
+                false /*Cross*/,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (; m_block < m_block_max; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+        if constexpr (Contexual_mask && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                Causal /*Causal_mask*/,
+                Local /*Local_mask*/,
+                Contexual_mask,
+                Has_targets,
+                false /*Cross*/,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+
+        if constexpr (Local && SeparateMaskingIterations) {
+          auto mask_fn = [&](auto& tSrS, int m_block) {
+            mask.template apply<
+                true /*Seqlenq_mask*/,
+                true /*Seqlenk_mask*/,
+                false /*Causal_mask*/,
+                Local,
+                Contexual_mask,
+                Has_targets,
+                false /*Cross*/,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          CUTLASS_PRAGMA_NO_UNROLL
+          for (m_block = full_m_block_min; m_block < full_m_block_max;
+               ++m_block) {
+            bwd_step(m_block, mask_fn);
+          }
+        }
+        if constexpr (Q_dO_same_stages) {
+          smem_pipe_read_do = smem_pipe_read;
+        }
+        ++work_idx;
+        return true;
+      }
+    }
+    // We have separate iterations with causal masking. Not necessary for hdim
+    // 128 but for hdim 64 this helps quite a bit to not have to do causal
+    // masking for most of the iterations.
+    if constexpr ((Causal || Local) && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      int const m_block_masking_max =
+          ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < std::min(m_block_max, m_block_masking_max); ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    auto mask_fn = [&](auto& tSrS, int m_block) {
+      mask.template apply<
+          true /*Seqlenq_mask*/,
+          true /*Seqlenk_mask*/,
+          Causal && !SeparateMaskingIterations,
+          Local && !SeparateMaskingIterations,
+          Contexual_mask,
+          false /*Target_mask*/,
+          false /*Cross*/,
+          true /*Softmax*/>(tSrS, m_block, n_block);
+    };
+    if constexpr (SeparateMaskingIterations) {
+      int const m_block_max_before_local_mask =
+          !Local || !SeparateMaskingIterations
+          ? m_block_max
+          : std::min(
+                m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max_before_local_mask; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    } else {
+      int num_m_block = m_block_max - m_block_min;
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (int i = 0; i < num_m_block + full_m_block_max - full_m_block_min +
+               contexual_m_block_max;
+           ++i) {
+        if (i < num_m_block) {
+          m_block = m_block_min + i;
+        } else if (i < num_m_block + contexual_m_block_max) {
+          m_block = i - num_m_block;
+        } else {
+          m_block = i - num_m_block - contexual_m_block_max + full_m_block_min;
+        }
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    if constexpr (Local && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal_mask*/,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (; m_block < m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+    if constexpr (Contexual_mask && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal /*Causal_mask*/,
+            Local /*Local_mask*/,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    if constexpr (Local && SeparateMaskingIterations) {
+      auto mask_fn = [&](auto& tSrS, int m_block) {
+        mask.template apply<
+            true /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal_mask*/,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            false /*Cross*/,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      };
+      CUTLASS_PRAGMA_NO_UNROLL
+      for (m_block = full_m_block_min; m_block < full_m_block_max; ++m_block) {
+        bwd_step(m_block, mask_fn);
+      }
+    }
+
+    // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tdVrdV); }
+    if constexpr (Q_dO_same_stages) {
+      smem_pipe_read_do = smem_pipe_read;
+    }
+    ++work_idx;
+    return true;
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h
new file mode 100644
index 000000000..7c8a447af
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h
@@ -0,0 +1,2180 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+#include "cutlass/pipeline/pipeline.hpp"
+
+#include "cute/tensor.hpp"
+
+#include "cutlass/gemm/collective/builders/sm90_common.inl"
+
+#include "mask.h"
+#include "named_barrier.h"
+#include "seqlen.h"
+#include "sm90_pipeline_no_cluster.h"
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <
+    int Stages,
+    class ClusterShape_,
+    class TileShape_MNK_,
+    class Element_,
+    class ElementAccum_,
+    class ArchTag_,
+    bool Causal,
+    bool Local,
+    bool Contexual_mask,
+    bool Jagged,
+    bool Has_targets,
+    bool Mma1_is_RS,
+    bool V_colmajor_,
+    bool Cross>
+struct CollectiveMainloopFwdSm90 {
+  static constexpr int kStages = Stages;
+  using ClusterShape = ClusterShape_;
+  using TileShape_MNK = TileShape_MNK_;
+  using Element = Element_;
+  using ElementAccum = ElementAccum_;
+  using ArchTag = ArchTag_;
+  static constexpr bool Is_FP8 =
+      cute::is_same_v<Element, cutlass::float_e4m3_t> ||
+      cute::is_same_v<Element, cutlass::float_e5m2_t>;
+  ;
+  static constexpr bool V_colmajor = V_colmajor_;
+  static constexpr bool Transpose_V = Is_FP8 && !V_colmajor;
+  using SeqlenInfo_t = hstu::SeqlenInfoQKFwd<Jagged, Cross, Has_targets>;
+
+  static_assert(ArchTag::kMinComputeCapability >= 90);
+
+  static constexpr cute::GMMA::Major MmaMajorV =
+      !Is_FP8 && !V_colmajor ? GMMA::Major::MN : GMMA::Major::K;
+  static constexpr cute::GMMA::Major TmaMajorV =
+      !V_colmajor ? GMMA::Major::MN : GMMA::Major::K;
+
+  static constexpr int kBlockM = get<0>(TileShape_MNK{});
+  static constexpr int kBlockN = get<1>(TileShape_MNK{});
+  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
+
+  // Register bandwidth is actually a bottleneck so we don't want Q to be in
+  // registers. Leaving this option here for reference.
+  static constexpr bool Mma0_is_RS = false;
+  // We can have Mma1 (P @ V) with P in smem in rmem to reduce register pressure
+  // at the cost of more smem.
+  static_assert(!(!Mma1_is_RS && Is_FP8), "Mma1 must be RS if FP8");
+  static_assert(
+      !(!Mma1_is_RS && Transpose_V),
+      "Mma1 must be RS if Transpose_V");
+
+  using AtomLayoutMNK = Layout<Shape<Int<kBlockM / 64>, _1, _1>>;
+  using TiledMma0 = decltype(cute::make_tiled_mma(
+      std::conditional_t<
+          !Mma0_is_RS,
+          decltype(cute::GMMA::ss_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShape_MNK>()),
+          decltype(cute::GMMA::rs_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   TileShape_MNK>())>{},
+      AtomLayoutMNK{}));
+  using TiledMma1 = decltype(cute::make_tiled_mma(
+      std::conditional_t<
+          !Mma1_is_RS,
+          decltype(cute::GMMA::ss_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   decltype(select<0, 2, 1>(TileShape_MNK{})),
+                   GMMA::Major::K,
+                   MmaMajorV>()),
+          decltype(cute::GMMA::rs_op_selector<
+                   Element,
+                   Element,
+                   ElementAccum,
+                   decltype(select<0, 2, 1>(TileShape_MNK{})),
+                   GMMA::Major::K,
+                   MmaMajorV>())>{},
+      AtomLayoutMNK{}));
+
+  static constexpr int NumMmaThreads = size(TiledMma0{});
+  static constexpr int NumProducerThreads = !Transpose_V
+      ? cutlass::NumThreadsPerWarp
+      : cutlass::NumThreadsPerWarpGroup;
+  static_assert(NumMmaThreads % cutlass::NumThreadsPerWarpGroup == 0);
+  static constexpr int NumMmaWarpGroups =
+      NumMmaThreads / cutlass::NumThreadsPerWarpGroup;
+  static_assert(
+      NumMmaWarpGroups == 1 || NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+
+  using SmemLayoutAtomQ =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<0>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutQ =
+      decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{})));
+
+  using SmemLayoutAtomK =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<1>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutK = decltype(tile_to_shape(
+      SmemLayoutAtomK{},
+      make_shape(
+          shape<1>(TileShape_MNK{}),
+          shape<2>(TileShape_MNK{}),
+          Int<kStages>{})));
+
+  using SmemLayoutAtomVt =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               TmaMajorV,
+               Element,
+               decltype(cute::get<2>(TileShape_MNK{})),
+               decltype(cute::get<1>(TileShape_MNK{}))>());
+  using SmemLayoutVt = decltype(tile_to_shape(
+      SmemLayoutAtomVt{},
+      make_shape(
+          shape<2>(TileShape_MNK{}),
+          shape<1>(TileShape_MNK{}),
+          Int<kStages>{}),
+      std::conditional_t<
+          TmaMajorV == GMMA::Major::K,
+          cute::Step<_1, _2, _3>,
+          cute::Step<_2, _1, _3>>{}));
+
+  using SmemLayoutAtomVtMma =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               MmaMajorV,
+               Element,
+               decltype(cute::get<2>(TileShape_MNK{})),
+               decltype(cute::get<1>(TileShape_MNK{}))>());
+  using SmemLayoutVtMma = decltype(tile_to_shape(
+      SmemLayoutAtomVtMma{},
+      make_shape(
+          shape<2>(TileShape_MNK{}),
+          shape<1>(TileShape_MNK{}),
+          Int<kStages>{}),
+      std::conditional_t<
+          MmaMajorV == GMMA::Major::K,
+          cute::Step<_1, _2, _3>,
+          cute::Step<_2, _1, _3>>{}));
+
+  // Only used if we're using cp.async to load V
+  using SmemLayoutAtomVCpAsync =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<1>(TileShape_MNK{})),
+               decltype(cute::get<2>(TileShape_MNK{}))>());
+  using SmemLayoutVCpAsync = decltype(tile_to_shape(
+      SmemLayoutAtomVCpAsync{},
+      make_shape(
+          shape<1>(TileShape_MNK{}),
+          shape<2>(TileShape_MNK{}),
+          Int<kStages>{})));
+
+  using SmemLayoutAtomP =
+      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
+               GMMA::Major::K,
+               Element,
+               decltype(cute::get<0>(TileShape_MNK{})),
+               decltype(cute::get<1>(TileShape_MNK{}))>());
+  using SmemLayoutP =
+      decltype(tile_to_shape(SmemLayoutAtomP{}, select<0, 1>(TileShape_MNK{})));
+
+  using SmemCopyAtomP = Copy_Atom<cute::SM90_U32x4_STSM_N, Element>;
+
+  // Use LDSM.T and STSM to transpose V in the case of FP8 and V being
+  // row-major. For FP16/BF16 we don't do any transposing.
+  static_assert(!Transpose_V || (kHeadDim % 32 == 0 && kBlockN % 32 == 0));
+  static constexpr bool kHeadDim_multiple_64 = kHeadDim % 64 == 0;
+  // Either kHeadDim is a multiple of 64 (in which case we use a block size of
+  // 64 x 32 for the transpose), or we need kBlockN to be a multiple of 64 (in
+  // which case we use a block size of 32 x 64 for the transpose).
+  static_assert(!Transpose_V || (kHeadDim_multiple_64 || kBlockN % 64 == 0));
+  using LDSM_thread_shape = std::conditional_t<
+      kHeadDim_multiple_64,
+      Shape<_32, _4, _1, _1>,
+      Shape<_16, _4, _1, _2>>;
+  using LDSM_thread_stride = std::conditional_t<
+      kHeadDim_multiple_64,
+      Stride<_4, _1, _0, _0>,
+      Stride<_4, _1, _0, _64>>;
+  using LDSM_value_shape = Shape<_2, _2, _1, _4>;
+  using LDSM_value_stride = Stride<_1, _2, _16, _4>;
+  using LDSM_divide_shape =
+      std::conditional_t<kHeadDim_multiple_64, Shape<_64, _8>, Shape<_32, _8>>;
+  using S2RTiledCopyVt = decltype(make_tiled_copy(
+      Copy_Atom<SM75_U16x8_LDSM_T, Element>{},
+      Layout<LDSM_thread_shape, LDSM_thread_stride>{},
+      Layout<LDSM_value_shape, LDSM_value_stride>{}));
+
+  using STSM_thread_shape = std::conditional_t<
+      kHeadDim_multiple_64,
+      Shape<_8, _4, _4, _1>,
+      Shape<_8, _4, _2, _2>>;
+  using STSM_thread_stride = std::conditional_t<
+      kHeadDim_multiple_64,
+      Stride<_4, _1, _32, _0>,
+      Stride<_4, _1, _32, _64>>;
+  using STSM_value_shape = Shape<_1, _4, _2, _2>;
+  using STSM_value_stride = Stride<_0, _1, _4, _8>;
+  using STSM_divide_shape = Shape<_8, _16>;
+  // These will not permute the columns of V (the kHeadDim dimension) but incur
+  // bank conflicts so a little slower (e.g. 1150 TFLOPS for hdim 256 instead of
+  // 1200 TFLOPS). Instead we will permute the cols of V, and un-permute the
+  // cols of O in the epilogue. using STSM_value_shape = Shape<_2, _4, _1, _2>;
+  // using STSM_value_stride = Stride<_4, _1, _0, _8>;
+  // using STSM_divide_shape = Shape<_16, _16>;
+  using R2STiledCopyV = decltype(make_tiled_copy(
+      Copy_Atom<SM90_U32x4_STSM_N, Element>{},
+      Layout<STSM_thread_shape, STSM_thread_stride>{},
+      Layout<STSM_value_shape, STSM_value_stride>{}));
+
+  using GmemTiledCopyQ = cute::SM90_TMA_LOAD;
+  using GmemTiledCopyKV =
+      decltype(cutlass::gemm::collective::detail::
+                   sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape{})));
+
+  // We use CpAsync for K and V if PagedKV and AppendKV, since TMA doesn't work
+  // there
+  static constexpr int kGmemElemsPerLoad =
+      sizeof(cute::uint128_t) / sizeof(Element);
+  static_assert(
+      kHeadDim % kGmemElemsPerLoad == 0,
+      "Headdim must be a multiple of kGmemElemsPerLoad");
+  // We want each "row" to have 64 elements (128 bytes, i.e. 1 cache line). E.g.
+  // if hdim=128, we want each thread to have 4 loads in the M direction and 2
+  // vectorized load in the K direction. We want each thread to have at least 2
+  // loads in the K direction since in the case of non-interleaved rotary
+  // (combining elements at indices 0 and rotary_dim/2, 1 and rotary_dim/2+1,
+  // etc), each thread will load twice from the same row.
+  static constexpr int kBytePerHalfRow = kHeadDim / 2 * sizeof(Element);
+  static constexpr int kBlockKGmem =
+      (kBytePerHalfRow % 128 == 0 ? 128
+                                  : (kBytePerHalfRow % 64 == 0 ? 64 : 32)) /
+      sizeof(Element);
+  static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerLoad;
+  static_assert(
+      NumMmaThreads % kGmemThreadsPerRow == 0,
+      "NumMmaThreads must be a multiple of kGmemThreadsPerRow");
+  // We assume threads loading the same row are in the same warp. This is for an
+  // optimization in PagedKV where these threads share the same page table entry
+  // and share the work of computing pointers to paged K and paged V.
+  static_assert(
+      cutlass::NumThreadsPerWarp % kGmemThreadsPerRow == 0,
+      "kGmemThreadsPerRow must divide NumThreadsPerWarp");
+  using GmemLayoutAtom = Layout<
+      Shape<Int<NumMmaThreads / kGmemThreadsPerRow>, Int<kGmemThreadsPerRow>>,
+      Stride<Int<kGmemThreadsPerRow>, _1>>;
+  // If AppendKV, we'll be loading Q for rotary, and we assume divisibility to
+  // avoid predication
+  static_assert(
+      kBlockM % CUTE_STATIC_V(shape<0>(GmemLayoutAtom{})) == 0,
+      "kBlockM must be a multiple of NumMmaThreads / kGmemThreadsPerRow");
+
+  using ShapeQKV =
+      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen, d, head,
+                                                       // batch)
+  using StrideQK = cute::Stride<int64_t, _1, int64_t, int64_t>;
+  using StrideV = std::conditional_t<
+      !V_colmajor,
+      StrideQK,
+      cute::Stride<_1, int64_t, int64_t, int64_t>>;
+  // ((qhead_per_khead, seqlen), d, nheads_kv, batch, num_splits)
+  using ShapeQPacked = ShapeQKV;
+  using StrideQPacked = StrideQK;
+  using StrideDescale = cute::Stride<int64_t, int64_t>;
+
+  using TMA_Q = decltype(make_tma_copy_A_sm90(
+      GmemTiledCopyQ{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQK{}),
+      SmemLayoutQ{},
+      TileShape_MNK{},
+      ClusterShape{}));
+
+  using TMA_K = decltype(make_tma_copy_B_sm90(
+      GmemTiledCopyKV{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          StrideQK{}),
+      take<0, 2>(SmemLayoutK{}),
+      TileShape_MNK{},
+      ClusterShape{})); // mcast along M mode for this N load, if any
+
+  using TMA_V = decltype(make_tma_copy(
+      GmemTiledCopyKV{},
+      make_tensor(
+          make_gmem_ptr(static_cast<Element const*>(nullptr)),
+          ShapeQKV{},
+          select<1, 0, 2, 3>(StrideV{})),
+      take<0, 2>(SmemLayoutVt{}),
+      select<2, 1>(TileShape_MNK{}),
+      size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
+
+  // Set the bytes transferred in this TMA transaction (may involve multiple
+  // issues)
+  static constexpr uint32_t TmaTransactionBytesQ = static_cast<uint32_t>(
+      size(SmemLayoutQ{}) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesK = static_cast<uint32_t>(
+      size(take<0, 2>(SmemLayoutK{})) * cutlass::sizeof_bits_v<Element> / 8);
+  static constexpr uint32_t TmaTransactionBytesV = static_cast<uint32_t>(
+      size(take<0, 2>(SmemLayoutVt{})) * cutlass::sizeof_bits_v<Element> / 8);
+  static_assert(TmaTransactionBytesK == TmaTransactionBytesV);
+
+  using PipelineTmaAsync = std::conditional_t<
+      CUTE_STATIC_V(size(ClusterShape{})) == 1,
+      typename cutlass::PipelineTmaAsyncNoCluster<kStages>,
+      typename cutlass::PipelineTmaAsync<kStages>>;
+  using MainloopPipelineK = PipelineTmaAsync;
+  using MainloopPipelineV = std::conditional_t<
+      !Transpose_V,
+      PipelineTmaAsync,
+      typename cutlass::PipelineAsync<kStages>>;
+  using MainloopPipelineVt = PipelineTmaAsync;
+  // We always use TMA for K_new and V_new
+  using MainloopPipelineKVNew = PipelineTmaAsync;
+  using PipelineState = cutlass::PipelineState<kStages>;
+
+  // If PackGQA, we use cp.async (instead of TMA) to load Q, so we want smem_q
+  // to be aligned and have sQ being position_independent_swizzle_tensor. If
+  // !Use_TMA_KV, we use cp.async (instead of TMA) to load K & V, so we want
+  // smem_k and smem_v to be aligned.
+  static constexpr size_t SmemAlignmentQ =
+      !Mma0_is_RS ? 128 : cutlass::detail::alignment_for_swizzle(SmemLayoutQ{});
+  static constexpr size_t SmemAlignmentK = 128;
+  static constexpr size_t SmemAlignmentVtNoTranspose =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutVt{});
+  static_assert(
+      SmemAlignmentQ >= 128 and SmemAlignmentK >= 128 &&
+          SmemAlignmentVtNoTranspose >= 128,
+      "Require at least 128B alignment");
+  static constexpr size_t SmemAlignmentP =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutP{});
+  static_assert(SmemAlignmentP >= 128, "Require at least 128B alignment");
+
+  using SmemP_t = std::conditional_t<
+      Mma1_is_RS,
+      cute::array<Element, 0>,
+      cute::
+          array_aligned<Element, cute::cosize_v<SmemLayoutP>, SmemAlignmentP>>;
+  // Sometimes even with SmemP_t = cute::array<Element, 0>, putting it in the
+  // TensorStorage struct causes smem size to go from 227KB to 228KB and we get
+  // "invalid argument".
+
+  struct TensorStorageWithoutPNoTranspose : cute::aligned_struct<cute::max(
+                                                SmemAlignmentQ,
+                                                SmemAlignmentK,
+                                                SmemAlignmentVtNoTranspose)> {
+    cute::array_aligned<
+        Element,
+        cute::cosize_v<SmemLayoutVt>,
+        SmemAlignmentVtNoTranspose>
+        smem_v;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQ>
+        smem_q;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentK>
+        smem_k;
+  };
+
+  struct TensorStorageWithPNoTranspose : cute::aligned_struct<cute::max(
+                                             SmemAlignmentQ,
+                                             SmemAlignmentK,
+                                             SmemAlignmentVtNoTranspose,
+                                             SmemAlignmentP)> {
+    cute::array_aligned<
+        Element,
+        cute::cosize_v<SmemLayoutVt>,
+        SmemAlignmentVtNoTranspose>
+        smem_v;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQ>
+        smem_q;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentK>
+        smem_k;
+    SmemP_t smem_p;
+  };
+
+  using TensorStorageNoTranspose = std::conditional_t<
+      Mma1_is_RS,
+      TensorStorageWithoutPNoTranspose,
+      TensorStorageWithPNoTranspose>;
+
+  static constexpr size_t SmemAlignmentVt =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutVt{});
+  static constexpr size_t SmemAlignmentV =
+      cutlass::detail::alignment_for_swizzle(SmemLayoutVtMma{});
+  static_assert(
+      SmemAlignmentVt >= 128 and SmemAlignmentV >= 128,
+      "Require at least 128B alignment");
+  struct TensorStorageTransposeV
+      : cute::aligned_struct<
+            cute::max(SmemAlignmentQ, SmemAlignmentK, SmemAlignmentV)> {
+    cute::
+        array_aligned<Element, cute::cosize_v<SmemLayoutVtMma>, SmemAlignmentV>
+            smem_v;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutVt>, SmemAlignmentVt>
+        smem_vt;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQ>
+        smem_q;
+    cute::array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentK>
+        smem_k;
+  };
+
+  using TensorStorage = std::conditional_t<
+      !Transpose_V,
+      TensorStorageNoTranspose,
+      TensorStorageTransposeV>;
+
+  // These are tuned for speed. They don't affect correctness.
+  static constexpr bool UseSchedulerBarrier =
+      (NumMmaWarpGroups >= 2) && (!Is_FP8 ? kHeadDim <= 128 : kHeadDim >= 128);
+  static constexpr bool RescaleOBeforeGemm =
+      kHeadDim > 128 && (!Is_FP8 || V_colmajor);
+
+  // Host side kernel arguments
+  struct Arguments {
+    Element const* const ptr_Q;
+    ShapeQKV const shape_Q;
+    StrideQK const stride_Q;
+    Element* const
+        ptr_K; // Not Element const* since we might append to KV cache in-place
+    ShapeQKV const shape_K;
+    StrideQK const stride_K;
+    Element* const ptr_V;
+    StrideV const stride_V;
+    float const *ptr_q_descale, *ptr_k_descale, *ptr_v_descale;
+    StrideDescale const stride_q_descale, stride_k_descale, stride_v_descale;
+    float const max_seq_len_inv;
+    float const alpha;
+    int const max_attn_len;
+    int const min_full_attn_seq_len;
+    int const contextual_seq_len;
+    int const num_softmax_heads;
+    int const num_groups;
+    int const batch_size_per_group;
+    int const* const seq_offsets = nullptr;
+    int const* const seq_offsets_q = nullptr;
+    int const* const num_targets = nullptr;
+    int const* const max_seq_len_tensor = nullptr;
+    int const* const contextual_seq_len_tensor = nullptr;
+    int const* const max_attn_len_tensor = nullptr;
+    int const* const min_full_attn_seq_len_tensor = nullptr;
+    float const* const attn_scale = nullptr;
+    bool const scalar_scale = true;
+  };
+
+  // Device side kernel params
+  struct Params {
+    Element const* const ptr_Q;
+    ShapeQKV const shape_Q;
+    StrideQK const stride_Q;
+    ShapeQPacked const shape_Q_packed;
+    StrideQPacked const stride_Q_packed;
+    Element* const ptr_K;
+    ShapeQKV const shape_K;
+    StrideQK const stride_K;
+    Element* const ptr_V;
+    StrideV const stride_V;
+    TMA_Q tma_load_Q;
+    TMA_K tma_load_K;
+    TMA_V tma_load_V;
+    float const *ptr_q_descale, *ptr_k_descale, *ptr_v_descale;
+    StrideDescale const stride_q_descale, stride_k_descale, stride_v_descale;
+    float const max_seq_len_inv;
+    float const alpha;
+    float const alpha_log2;
+    int const max_attn_len;
+    int const min_full_attn_seq_len;
+    int const contextual_seq_len;
+    int const num_softmax_heads;
+    int const num_groups;
+    int const batch_size_per_group;
+    int const* const seq_offsets = nullptr;
+    int const* const seq_offsets_q = nullptr;
+    int const* const num_targets = nullptr;
+    int const* const max_seq_len_tensor = nullptr;
+    int const* const contextual_seq_len_tensor = nullptr;
+    int const* const max_attn_len_tensor = nullptr;
+    int const* const min_full_attn_seq_len_tensor = nullptr;
+    float const* const attn_scale = nullptr;
+    bool const scalar_scale = true;
+  };
+
+  static Params to_underlying_arguments(Arguments const& args) {
+    Tensor mQ =
+        make_tensor(make_gmem_ptr(args.ptr_Q), args.shape_Q, args.stride_Q);
+    TMA_Q tma_load_Q = make_tma_copy_A_sm90(
+        GmemTiledCopyQ{},
+        mQ,
+        SmemLayoutQ{},
+        TileShape_MNK{},
+        ClusterShape{}); // no mcast for Q
+    Tensor mK =
+        make_tensor(make_gmem_ptr(args.ptr_K), args.shape_K, args.stride_K);
+    TMA_K tma_load_K = make_tma_copy_B_sm90(
+        GmemTiledCopyKV{},
+        mK,
+        take<0, 2>(SmemLayoutK{}),
+        TileShape_MNK{},
+        ClusterShape{}); // mcast along M mode for this N load, if any
+    Tensor mV = make_tensor(
+        make_gmem_ptr(args.ptr_V),
+        select<1, 0, 2, 3>(args.shape_K),
+        select<1, 0, 2, 3>(args.stride_V));
+    TMA_V tma_load_V = make_tma_copy(
+        GmemTiledCopyKV{},
+        mV,
+        take<0, 2>(SmemLayoutVt{}),
+        select<2, 1>(TileShape_MNK{}),
+        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
+    auto const shape_Q_packed = cute::conditional_return<true>(
+        args.shape_Q,
+        make_shape(
+            make_shape(1, get<0>(args.shape_Q)),
+            get<1>(args.shape_Q),
+            get<2>(args.shape_K),
+            get<3>(args.shape_Q)));
+    auto const stride_Q_packed = cute::conditional_return<true>(
+        args.stride_Q,
+        make_stride(
+            make_stride(get<2>(args.stride_Q), get<0>(args.stride_Q)),
+            get<1>(args.stride_Q),
+            get<2>(args.stride_Q),
+            get<3>(args.stride_Q)));
+    return {
+        args.ptr_Q,
+        args.shape_Q,
+        args.stride_Q,
+        shape_Q_packed,
+        stride_Q_packed,
+        args.ptr_K,
+        args.shape_K,
+        args.stride_K,
+        args.ptr_V,
+        args.stride_V,
+        tma_load_Q,
+        tma_load_K,
+        tma_load_V,
+        args.ptr_q_descale,
+        args.ptr_k_descale,
+        args.ptr_v_descale,
+        args.stride_q_descale,
+        args.stride_k_descale,
+        args.stride_v_descale,
+        args.max_seq_len_inv,
+        args.alpha,
+        float(args.alpha * M_LOG2E),
+        args.max_attn_len,
+        args.min_full_attn_seq_len,
+        args.contextual_seq_len,
+        args.num_softmax_heads,
+        args.num_groups,
+        args.batch_size_per_group,
+        args.seq_offsets,
+        args.seq_offsets_q,
+        args.num_targets,
+        args.max_seq_len_tensor,
+        args.contextual_seq_len_tensor,
+        args.max_attn_len_tensor,
+        args.min_full_attn_seq_len_tensor,
+        args.attn_scale,
+        args.scalar_scale};
+  }
+
+  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
+  /// performance
+  CUTLASS_DEVICE
+  static void prefetch_tma_descriptors(Params const& params) {
+    cute::prefetch_tma_descriptor(params.tma_load_Q.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_K.get_tma_descriptor());
+    cute::prefetch_tma_descriptor(params.tma_load_V.get_tma_descriptor());
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_n_block_min_max(
+      int max_attn_len,
+      int min_full_attn_seq_len,
+      int contextual_seq_len,
+      int uihlen,
+      int m_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (Contexual_mask) {
+      if (m_block * kBlockM < contextual_seq_len) {
+        return {0, cute::ceil_div(uihlen, kBlockN)};
+      }
+    }
+    if constexpr (Has_targets) {
+      int m_idx_max = (m_block + 1) * kBlockM;
+      if (m_idx_max > uihlen) {
+        return {0, cute::ceil_div(uihlen, kBlockN)};
+      }
+    }
+    int n_block_max;
+    int n_block_min;
+    // Non-target part, n_block_max
+    if constexpr (Causal || Local) {
+      int m_idx_max = (m_block + 1) * kBlockM;
+      n_block_max = cute::ceil_div(std::min(m_idx_max, uihlen), kBlockN);
+    } else {
+      n_block_max = cute::ceil_div(uihlen, kBlockN);
+    }
+    // Non-target part, n_block_min
+    if constexpr (Local) {
+      int m_idx_min = m_block * kBlockM;
+      int m_idx_max = (m_block + 1) * kBlockM;
+      if (min_full_attn_seq_len == 0 ||
+          m_idx_max <= uihlen - min_full_attn_seq_len) {
+        n_block_min = std::max(int(0), (m_idx_min - max_attn_len) / kBlockN);
+        if constexpr (Contexual_mask) {
+          // row contexual without sink
+          if (n_block_min * kBlockN < contextual_seq_len) {
+            n_block_min = 0;
+          }
+        }
+      } else {
+        n_block_min = 0;
+      }
+    } else {
+      n_block_min = 0;
+    }
+    return {n_block_min, n_block_max};
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_target_n_block_min_max(
+      int n_block_max,
+      int uihlen,
+      int seqlen,
+      int m_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    int m_idx_max = (m_block + 1) * kBlockM;
+    if (m_idx_max <= uihlen) { // Non-target part
+      return {n_block_max, n_block_max};
+    } else { // Target part
+      int m_idx_min = m_block * kBlockM;
+      return {
+          std::max(n_block_max, m_idx_min / kBlockN),
+          cute::ceil_div(std::min(m_idx_max, seqlen), kBlockN)};
+    }
+  }
+
+  CUTLASS_DEVICE
+  int get_contexual_n_block_max(
+      int n_block_min,
+      int min_full_attn_seq_len,
+      int contextual_seq_len,
+      int uihlen,
+      int m_block) {
+    return 0;
+    // TODO: reenable below once contexual + semi local implementation is
+    // finalized
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (!Local) {
+      return 0;
+    }
+    if (m_block * kBlockM < contextual_seq_len) {
+      return 0;
+    }
+    int m_idx_max = (m_block + 1) * kBlockM;
+    if constexpr (Has_targets) {
+      if (m_idx_max > uihlen) {
+        return 0;
+      }
+    }
+    if (min_full_attn_seq_len == 0 ||
+        m_idx_max <= uihlen - min_full_attn_seq_len) {
+      return std::min(n_block_min, cute::ceil_div(contextual_seq_len, kBlockN));
+    }
+    return 0;
+  }
+
+  CUTLASS_DEVICE
+  cute::tuple<int, int> get_cross_n_block_min_max(
+      int const uihlen_q,
+      int const seqlen_q,
+      int const seqlen_kv,
+      int const m_block) {
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+    if constexpr (!Causal) {
+      return {0, cute::ceil_div(seqlen_kv, kBlockN)};
+    }
+    int n_block_max =
+        std::min(seqlen_kv, (m_block + 1) * kBlockM + seqlen_kv - uihlen_q);
+    return {0, cute::ceil_div(n_block_max, kBlockN)};
+  }
+
+  template <typename SchedulerPrefetch, typename SharedStorage>
+  CUTLASS_DEVICE void load(
+      Params const& params,
+      MainloopPipelineK pipeline_k,
+      MainloopPipelineV pipeline_v,
+      MainloopPipelineVt pipeline_vt,
+      PipelineState& smem_pipe_write,
+      SharedStorage& shared_storage,
+      SchedulerPrefetch const& scheduler_prefetch,
+      SeqlenInfo_t const& seqlen_info,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> block_coord,
+      int& work_idx) {
+    auto [m_block, bidh, bidb, split_idx] = block_coord;
+    if constexpr (Jagged) {
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      if (m_block * kBlockM >= seqlen_info.seqlen_q) {
+        scheduler_prefetch();
+        return;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int n_block_min, n_block_max;
+    if constexpr (Cross) {
+      auto n_block_min_max = get_cross_n_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    } else {
+      auto n_block_min_max = get_n_block_min_max(
+          max_attn_len_,
+          min_full_attn_seq_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    }
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    if (n_block_max <= n_block_min) {
+      std::printf(
+          "mainloop_fwd_sm90: n_block_max <= n_block_min not expected.");
+      scheduler_prefetch();
+      return;
+    }
+#endif
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sK_pi = as_position_independent_swizzle_tensor(sK);
+    // as_position_independent_swizzle_tensor makes address calculation easier
+    // when we do LDSM & STSM to transpose. But it requires smem_vt and smem_v
+    // to be aligned to e.g 512 bytes.
+    Tensor sVt = [&] {
+      if constexpr (!Transpose_V) {
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+            SmemLayoutVt{});
+      } else {
+        return cute::as_position_independent_swizzle_tensor(make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_vt.data()),
+            SmemLayoutVt{}));
+      }
+    }();
+    // Only used if Transpose_V
+    Tensor sV = cute::as_position_independent_swizzle_tensor(make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutVtMma{}));
+
+    int const thread_idx = threadIdx.x % NumProducerThreads;
+
+    // Prepare the TMA loads
+    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
+    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
+    uint2 cluster_local_block_id = {
+        block_rank_in_cluster % cluster_shape_x,
+        block_rank_in_cluster / cluster_shape_x};
+
+    Tensor mQ = params.tma_load_Q.get_tma_tensor(params.shape_Q)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mK_TMA = params.tma_load_K.get_tma_tensor(params.shape_K)(
+        _, _, bidh, !Jagged ? bidb : 0);
+    Tensor mVt_TMA = params.tma_load_V.get_tma_tensor(
+        select<1, 0, 2, 3>(params.shape_K))(_, _, bidh, !Jagged ? bidb : 0);
+
+    Tensor gQ = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_q, _0{}), mQ),
+        select<0, 2>(TileShape_MNK{}),
+        make_coord(m_block, _0{})); // (M, K)
+    // if (cute::thread0()) { printf("Jagged = %d, params.leftpad_k = %p,
+    // leftpad_k = %d\n", Jagged, params.leftpad_k, leftpad_k); }
+    Tensor gK_TMA = local_tile(
+        domain_offset(make_coord(seqlen_info.offset_k, _0{}), mK_TMA),
+        select<1, 2>(TileShape_MNK{}),
+        make_coord(_, _0{})); // (N, K, _)
+    Tensor gVt_TMA = local_tile(
+        domain_offset(make_coord(_0{}, seqlen_info.offset_k), mVt_TMA),
+        select<2, 1>(TileShape_MNK{}),
+        make_coord(_0{}, _)); // (K, N, _)
+
+    auto block_tma_Q = params.tma_load_Q.get_slice(_0{});
+    Tensor tQgQ = group_modes<0, 3>(block_tma_Q.partition_S(gQ)); // (TMA)
+    Tensor tQsQ = group_modes<0, 3>(block_tma_Q.partition_D(sQ)); // (TMA)
+    // tma_partition doesn't handle position_independent_swizzle_tensor
+    // correctly, so we need to do it manually
+    auto block_tma_K = params.tma_load_K.get_slice(cluster_local_block_id.x);
+    Tensor tKgK_TMA =
+        group_modes<0, 3>(block_tma_K.partition_S(gK_TMA)); // (TMA, k)
+    Tensor tKsK_TMA =
+        group_modes<0, 3>(block_tma_K.partition_D(sK)); // (TMA, PIPE)
+    auto block_tma_V = params.tma_load_V.get_slice(cluster_local_block_id.x);
+    Tensor tVgVt_TMA =
+        group_modes<0, 3>(block_tma_V.partition_S(gVt_TMA)); // (TMA, k)
+    Tensor tVsVt_TMA =
+        group_modes<0, 3>(block_tma_V.partition_D(sVt)); // (TMA, PIPE)
+
+    // Set up for transposing V, only used if Transpose_V
+    S2RTiledCopyVt s2r_tiled_copy_vt;
+    R2STiledCopyV r2s_tiled_copy_v;
+    auto s2r_thr_copy_vt = s2r_tiled_copy_vt.get_thread_slice(thread_idx);
+    auto r2s_thr_copy_v = r2s_tiled_copy_v.get_thread_slice(thread_idx);
+    // flat_divide(sVt, LDSM_divide_shape{}):  (64, 8, kHeadDim / 64, kBlockN /
+    // 8, kStages)
+    Tensor tTranssVt_ = s2r_thr_copy_vt.partition_S(
+        flat_divide(sVt, LDSM_divide_shape{})); // ((16, 1), 1, 1, kHeadDim /
+                                                // 64, kBlockN / 32, kStages)
+    // flat_divide(sV, STSM_divide_shape{}):  (8, 16, kHeadDim / 8, (4, kBlockN
+    // / 64), kStages)
+    Tensor tTranssV_ = r2s_thr_copy_v.partition_D(
+        flat_divide(sV, STSM_divide_shape{})); // ((16, 1), 1, 1, kHeadDim / 64,
+                                               // (2, kBlockN / 64), kStages)
+    CUTE_STATIC_ASSERT_V(rank(tTranssVt_) == rank(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<0>(tTranssVt_) == size<0>(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<1>(tTranssVt_) == size<1>(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<2>(tTranssVt_) == size<2>(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<3>(tTranssVt_) == size<3>(tTranssV_));
+    CUTE_STATIC_ASSERT_V(size<4>(tTranssVt_) == size<4>(tTranssV_));
+    // Faster to have 2 LDSM.T, byte permute, STSM for better ILP
+    static constexpr int Transpose_ILP =
+        (size<2>(tTranssVt_) * size<3>(tTranssVt_)) % 2 == 0 ? 2 : 1;
+    Tensor tTranssVt = logical_divide(
+        group_modes<1, rank(tTranssVt_) - 1>(tTranssVt_),
+        Shape<Underscore, Int<Transpose_ILP>>{}); // ((16, 1), (2, kHeadDim / 64
+                                                  // * kBlockN / 32 / 2),
+                                                  // kStages)
+    Tensor tTranssV = logical_divide(
+        group_modes<1, rank(tTranssV_) - 1>(tTranssV_),
+        Shape<Underscore, Int<Transpose_ILP>>{}); // ((16, 1), (2, kHeadDim / 64
+                                                  // * kBlockN / 32 / 2),
+                                                  // kStages)
+    auto transpose_V = [&](int stage) {
+      if constexpr (Transpose_V) {
+#pragma unroll
+        for (int i = 0; i < size<1, 1>(tTranssVt); ++i) {
+          Tensor tTransrV =
+              make_fragment_like(tTranssV(_, make_coord(_, _0{}), _0{}));
+          static_assert(size<0>(tTransrV) == 16);
+          Tensor tTransrV_64 = recast<uint2>(tTransrV);
+          cute::copy(
+              s2r_tiled_copy_vt,
+              tTranssVt(_, make_coord(_, i), stage),
+              tTransrV);
+#pragma unroll
+          for (int j = 0; j < size(tTransrV_64); ++j) {
+            uint32_t upper = tTransrV_64[j].x;
+            uint32_t lower = tTransrV_64[j].y;
+            tTransrV_64[j].x = __byte_perm(upper, lower, 0x6420);
+            tTransrV_64[j].y = __byte_perm(upper, lower, 0x7531);
+          }
+          cute::copy(
+              r2s_tiled_copy_v, tTransrV, tTranssV(_, make_coord(_, i), stage));
+        }
+      }
+    };
+
+    uint16_t mcast_mask_kv = 0;
+    if constexpr (cute::is_same_v<GmemTiledCopyKV, SM90_TMA_LOAD_MULTICAST>) {
+      auto block_layout = Layout<ClusterShape>{}; // (m,n) -> block_id
+      for (int m = 0; m < size<0>(block_layout); ++m) {
+        mcast_mask_kv |=
+            (uint16_t(1) << block_layout(m, cluster_local_block_id.y, _0{}));
+      }
+    }
+
+    auto load_K = [&](int const n_block, auto const& smem_pipe_write) {
+      pipeline_k.producer_acquire(smem_pipe_write);
+      copy(
+          params.tma_load_K.with(
+              *pipeline_k.producer_get_barrier(smem_pipe_write),
+              mcast_mask_kv,
+              TMA::CacheHintSm90::EVICT_LAST),
+          tKgK_TMA(_, n_block),
+          tKsK_TMA(_, smem_pipe_write.index()));
+    };
+
+    auto load_V = [&](int const n_block, auto const& smem_pipe_write) {
+      auto pipeline_v_load =
+          cute::conditional_return<!Transpose_V>(pipeline_v, pipeline_vt);
+      pipeline_v_load.producer_acquire(smem_pipe_write);
+      copy(
+          params.tma_load_V.with(
+              *pipeline_v_load.producer_get_barrier(smem_pipe_write),
+              mcast_mask_kv,
+              TMA::CacheHintSm90::EVICT_LAST),
+          tVgVt_TMA(_, n_block),
+          tVsVt_TMA(_, smem_pipe_write.index()));
+    };
+
+    auto copy_Vt_to_V = [&](auto const& smem_pipe_write) {
+      // Instead of maintaining smem_pipe_read as a separate variable, we can
+      // just use smem_pipe_write, and exploit the invariance that
+      // smem_pipe_write.phase() == smem_pipe_read.phase() ^ 1. This saves 1 or
+      // 2 registers.
+      PipelineState smem_pipe_read{
+          smem_pipe_write.index(),
+          smem_pipe_write.phase() ^ 1,
+          smem_pipe_write.count()};
+      pipeline_vt.consumer_wait(smem_pipe_read);
+      pipeline_v.producer_acquire(smem_pipe_write);
+      transpose_V(smem_pipe_write.index());
+      // SMEM fence to make sure V is transposed before math
+      cutlass::arch::fence_view_async_shared();
+      pipeline_v.producer_commit(smem_pipe_write);
+      // Very important: PipelineTmaAsync::consumer_release assumes that the
+      // warpgroup is synchronized before calling. Without this we get race
+      // conditions.
+      cutlass::arch::NamedBarrier::sync(
+          cutlass::NumThreadsPerWarpGroup,
+          static_cast<uint32_t>(FwdNamedBarriers::ProducerWG) /*id*/);
+      pipeline_vt.consumer_release(smem_pipe_read);
+    };
+
+    int n_block = n_block_max - 1;
+
+    int warp_idx_in_warpgroup =
+        __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+    // If this is true, we're guaranteed that only the first warp will execute
+    // this function
+    static constexpr bool SingleProducerWarp =
+        NumProducerThreads == cutlass::NumThreadsPerWarp;
+    bool should_load_KV =
+        ((SingleProducerWarp || warp_idx_in_warpgroup == 0) &&
+         cute::elect_one_sync());
+
+    if (should_load_KV) {
+      if constexpr (Transpose_V) {
+        load_V(n_block, smem_pipe_write);
+      }
+      // if (thread_idx == 0) { printf("Producer: main load, before load_K,
+      // index = %d\n", smem_pipe_write.index());}
+      load_K(n_block, smem_pipe_write);
+      // if (thread_idx == 0) { printf("Producer: main load, after load K, index
+      // = %d\n", smem_pipe_write.index());}
+    }
+
+    // TMA_Q, Wait for the MMA warpgroups to signal that smem_q is ready
+    if (SingleProducerWarp || warp_idx_in_warpgroup == 0) {
+      cutlass::arch::NamedBarrier::sync(
+          NumMmaThreads + cutlass::NumThreadsPerWarp,
+          static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
+    }
+    if ((SingleProducerWarp || warp_idx_in_warpgroup == 0) &&
+        cute::elect_one_sync()) {
+      shared_storage.pipelines.barrier_Q.arrive_and_expect_tx(
+          TmaTransactionBytesQ);
+      copy(
+          params.tma_load_Q.with(
+              reinterpret_cast<typename cutlass::arch::
+                                   ClusterTransactionBarrier::ValueType&>(
+                  shared_storage.pipelines.barrier_Q),
+              0 /*mcast_mask*/,
+              TMA::CacheHintSm90::EVICT_FIRST),
+          tQgQ,
+          tQsQ);
+    }
+
+    // Wait for the MMA WGs to signal that smem_v are ready and V can be copied
+    // from gmem Need ClusterBarrier, not just NamedBarrier. Otherwise we might
+    // have CTA 0 finishing the TMA store on O first, call TMA multicast load on
+    // V, before CTA 1 can finishing TMA store on O. if (thread_idx == 0) {
+    // printf("Producer: main load, before barrier_O, work_idx = %d\n",
+    // work_idx);}
+    shared_storage.pipelines.barrier_O.wait((work_idx + 1) % 2);
+    // if (thread_idx == 0) { printf("Producer: main load, after barrier_O\n");}
+
+    int n_block_prev = n_block;
+    --n_block;
+#pragma unroll(!Transpose_V ? 2 : 1)
+    for (; n_block >= n_block_min; --n_block) {
+      PipelineState smem_pipe_write_v =
+          smem_pipe_write; // copy the state, write_v is always 1 step behind
+      ++smem_pipe_write;
+      if (should_load_KV) {
+        if constexpr (Transpose_V) {
+          load_V(n_block, smem_pipe_write);
+        } else {
+          load_V(n_block_prev, smem_pipe_write_v);
+        }
+        load_K(n_block, smem_pipe_write);
+      }
+      n_block_prev = n_block;
+      if constexpr (Transpose_V) {
+        copy_Vt_to_V(smem_pipe_write_v);
+      }
+    }
+    scheduler_prefetch();
+    if constexpr (!Transpose_V) {
+      if (should_load_KV) {
+        load_V(n_block_prev, smem_pipe_write);
+      }
+    }
+    if constexpr (Transpose_V) {
+      copy_Vt_to_V(smem_pipe_write);
+    }
+    ++smem_pipe_write;
+    if constexpr (!Cross) {
+      if constexpr (Has_targets) {
+        auto [target_n_block_min, target_n_block_max] =
+            get_target_n_block_min_max(
+                n_block_max,
+                seqlen_info.uihlen_q,
+                seqlen_info.seqlen_kv,
+                m_block);
+#pragma unroll 1
+        for (n_block = target_n_block_max - 1; n_block >= target_n_block_min;
+             --n_block) {
+          if (should_load_KV) {
+            load_V(n_block, smem_pipe_write);
+            load_K(n_block, smem_pipe_write);
+          }
+          if constexpr (Transpose_V) {
+            copy_Vt_to_V(smem_pipe_write);
+          }
+          ++smem_pipe_write;
+        }
+      }
+      if constexpr (Contexual_mask) {
+        int contexual_n_block_max = get_contexual_n_block_max(
+            n_block_min,
+            min_full_attn_seq_len_,
+            contextual_seq_len_,
+            seqlen_info.uihlen_q,
+            m_block);
+#pragma unroll 1
+        for (n_block = contexual_n_block_max - 1; n_block >= 0; --n_block) {
+          if (should_load_KV) {
+            load_V(n_block, smem_pipe_write);
+            load_K(n_block, smem_pipe_write);
+          }
+          if constexpr (Transpose_V) {
+            copy_Vt_to_V(smem_pipe_write);
+          }
+          ++smem_pipe_write;
+        }
+      }
+    }
+    // At the end, all threads have the correct smem_pipe_write.
+    ++work_idx;
+  }
+
+  template <typename SharedStorage>
+  CUTLASS_DEVICE void load_tail(
+      MainloopPipelineK pipeline_k,
+      MainloopPipelineV pipeline_v,
+      MainloopPipelineVt pipeline_vt,
+      PipelineState& smem_pipe_write,
+      SharedStorage& shared_storage,
+      int const work_idx) {
+    // If we don't wait for barrier_O here, when using Cluster, CTA0 might exit
+    // early and CTA1 will try to arrive on barrier_O of CTA0, causing
+    // "unspecified launch failure".
+    shared_storage.pipelines.barrier_O.wait((work_idx + 1) % 2);
+    int warp_idx_in_warpgroup =
+        __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
+    // Issue the epilogue waits
+    // TODO: check if this should be called by 1 thread or more
+    if (warp_idx_in_warpgroup == 0 && cute::elect_one_sync()) {
+      /* This helps avoid early exit of blocks in Cluster
+       *  Waits for all stages to either be released (all Consumer UNLOCKs), or
+       * if the stage was never used then would just be acquired since the phase
+       * was still inverted from make_producer_start_state
+       */
+      pipeline_k.producer_tail(smem_pipe_write);
+      pipeline_v.producer_tail(smem_pipe_write);
+      if constexpr (Transpose_V) {
+        pipeline_vt.producer_tail(smem_pipe_write);
+      }
+    }
+  }
+
+  CUTLASS_DEVICE void warp_scheduler_barrier_sync() {
+    if constexpr (UseSchedulerBarrier) {
+      cutlass::arch::NamedBarrier::sync(
+          2 * cutlass::NumThreadsPerWarpGroup,
+          static_cast<uint32_t>(FwdNamedBarriers::WarpSchedulerWG1) - 1 +
+              hstu::canonical_warp_group_idx_nosync() /*id*/);
+    }
+  }
+
+  CUTLASS_DEVICE void warp_scheduler_barrier_arrive() {
+    if constexpr (UseSchedulerBarrier) {
+      static_assert(NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+      int const cur_WG = hstu::canonical_warp_group_idx_nosync() - 1;
+      int const next_WG = NumMmaWarpGroups == 2
+          ? 1 - cur_WG
+          : (cur_WG < NumMmaWarpGroups - 1 ? cur_WG + 1 : 0);
+      cutlass::arch::NamedBarrier::arrive(
+          2 * cutlass::NumThreadsPerWarpGroup,
+          static_cast<uint32_t>(FwdNamedBarriers::WarpSchedulerWG1) +
+              next_WG /*id*/);
+    }
+  }
+
+  CUTLASS_DEVICE void mma_init() {
+    // Tell producers that smem_q is ready
+    cutlass::arch::NamedBarrier::arrive(
+        NumMmaThreads + cutlass::NumThreadsPerWarp,
+        static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
+    if constexpr (UseSchedulerBarrier) {
+      // We have NamedBarrier for up to 3 WGs
+      static_assert(NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
+      // WG1 needs the very first signal to start
+      if (hstu::canonical_warp_group_idx_nosync() == 1) {
+        cutlass::arch::NamedBarrier::arrive(
+            2 * cutlass::NumThreadsPerWarpGroup,
+            static_cast<uint32_t>(FwdNamedBarriers::WarpSchedulerWG1) /*id*/);
+      }
+    }
+  }
+
+  template <typename SharedStorage, typename FrgTensorO>
+  CUTLASS_DEVICE bool mma(
+      Params const& params,
+      MainloopPipelineK pipeline_k,
+      MainloopPipelineV pipeline_v,
+      PipelineState& smem_pipe_read,
+      FrgTensorO& tOrO,
+      int const thread_idx,
+      int& work_idx,
+      SeqlenInfo_t const& seqlen_info,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> block_coord,
+      SharedStorage& shared_storage) {
+    static_assert(
+        is_rmem<FrgTensorO>::value, "O tensor must be rmem resident.");
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+
+    // can't use auto [m_block, ...] = block_coord since structured binding
+    // cannot be captured in lambda
+    int const m_block = get<0>(block_coord);
+    int const bidh = get<1>(block_coord);
+    int const bidb = get<2>(block_coord);
+    int const split_idx = get<3>(block_coord);
+    if constexpr (Jagged) {
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      if (m_block * kBlockM >= seqlen_info.seqlen_q) {
+        return false;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    float scalar_scale_val_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    if (params.num_groups > 1) {
+      int group_id = bidb / params.batch_size_per_group;
+      int max_seq_len_per_group = params.max_seq_len_tensor[group_id];
+      // attention scale
+      scalar_scale_val_ = params.scalar_scale
+          ? (params.attn_scale == nullptr ? 1.0f / max_seq_len_per_group
+                                          : params.attn_scale[group_id])
+          : 0;
+    } else {
+      // attention scale
+      scalar_scale_val_ = params.scalar_scale
+          ? (params.attn_scale == nullptr ? params.max_seq_len_inv
+                                          : params.attn_scale[0])
+          : 0;
+    }
+    int n_block_min, n_block_max;
+    if constexpr (Cross) {
+      auto n_block_min_max = get_cross_n_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    } else {
+      auto n_block_min_max = get_n_block_min_max(
+          max_attn_len_,
+          min_full_attn_seq_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    }
+
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    if (n_block_max <= n_block_min) {
+      std::printf(
+          "mainloop_fwd_sm90: n_block_max <= n_block_min not expected.");
+      return false;
+    }
+#endif
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutVtMma{});
+    Tensor sP = [&] {
+      if constexpr (Mma1_is_RS) {
+        // We might not have smem_p if !Mma1_is_RS1, just use smem_q as a
+        // placeholder since we don't use it
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+            SmemLayoutP{});
+      } else {
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+            SmemLayoutP{});
+      }
+    }();
+
+    if constexpr (!Mma0_is_RS) {
+      static_assert(
+          stride<0>(typename TiledMma0::ALayout{}) == 0 and
+              stride<0>(typename TiledMma0::BLayout{}) == 0 and
+              size<0>(typename TiledMma0::ALayout{}) ==
+                  cutlass::NumThreadsPerWarpGroup and
+              size<0>(typename TiledMma0::BLayout{}) ==
+                  cutlass::NumThreadsPerWarpGroup,
+          "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+    }
+    constexpr int MmaWarpGroups =
+        size(TiledMma0{}) / cutlass::NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(
+        make_shape(Int<MmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+
+    int warp_group_idx = __shfl_sync(
+        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
+    TiledMma0 tiled_mma0;
+    TiledMma1 tiled_mma1;
+    auto wg_mma0 =
+        tiled_mma0.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma1 =
+        tiled_mma1.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    auto smem_tiled_copy_P = make_tiled_copy_C(SmemCopyAtomP{}, tiled_mma0);
+    auto smem_thr_copy_P = smem_tiled_copy_P.get_thread_slice(thread_idx);
+
+    // Allocate "fragments/descriptors"
+    Tensor tSrQ = wg_mma0.partition_fragment_A(sQ);
+    Tensor tSrK = wg_mma0.partition_fragment_B(sK);
+    Tensor tOrV = wg_mma1.partition_fragment_B(sV);
+    Tensor tOsP = wg_mma1.partition_fragment_A(sP);
+    Tensor tPsP = smem_thr_copy_P.partition_D(
+        cute::as_position_independent_swizzle_tensor(sP));
+
+    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+    };
+
+    clear(tOrO);
+
+    int n_block = n_block_max - 1;
+
+    hstu::Mask<kBlockM, kBlockN, TiledMma0> mask(
+        thread_idx,
+        seqlen_info.seqlen_q,
+        seqlen_info.seqlen_kv,
+        max_attn_len_,
+        min_full_attn_seq_len_,
+        contextual_seq_len_,
+        seqlen_info.uihlen_q);
+
+    auto& barrier_Q = shared_storage.pipelines.barrier_Q;
+    barrier_Q.wait(work_idx % 2);
+
+    static constexpr int Qdim = 0;
+    auto thread_mma = tiled_mma0.get_thread_slice(thread_idx);
+    auto thread0_mma = tiled_mma0.get_thread_slice(_0{});
+    Tensor cS = cute::make_identity_tensor(Shape<Int<kBlockM>, Int<kBlockN>>{});
+    Tensor tScS = thread_mma.partition_C(cS);
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tScS.layout()));
+    Tensor t0ScS = thread0_mma.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(t0ScS.layout()));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+    SiluScaleOp<float> silu_scale_op;
+    int qdim_offset = params.scalar_scale
+        ? 0
+        : m_block * kBlockM + thread_qdim_offset + seqlen_info.offset_q;
+
+    if constexpr (Mma0_is_RS) {
+      using SmemCopyAtomQ = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
+      auto smem_tiled_copy_Q = make_tiled_copy_A(SmemCopyAtomQ{}, tiled_mma0);
+      auto smem_thr_copy_Q = smem_tiled_copy_Q.get_thread_slice(thread_idx);
+      Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
+      Tensor tSsQ_copy_view = smem_thr_copy_Q.partition_S(
+          cute::as_position_independent_swizzle_tensor(sQ));
+      cute::copy(smem_tiled_copy_Q, tSsQ_copy_view, tSrQ_copy_view);
+    }
+
+    Tensor tSrS =
+        partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
+    Tensor tSrS_rowcol = make_tensor(
+        tSrS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
+    consumer_wait(pipeline_k, smem_pipe_read);
+    hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
+        tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
+    warpgroup_wait<0>();
+    pipeline_k.consumer_release(smem_pipe_read);
+#pragma unroll
+    for (int mi = 0; mi < size<0>(tSrS_rowcol); ++mi) {
+      float scale = scalar_scale_val_;
+      if (!params.scalar_scale) {
+        int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
+        // Convert global index to local sequence position for bounds checking
+        int q_local = q_index - seqlen_info.offset_q;
+        if (q_local < seqlen_info.seqlen_q) {
+          scale = params.attn_scale[q_index];
+        }
+      }
+#pragma unroll
+      for (int ni = 0; ni < size<1>(tSrS_rowcol); ++ni) {
+        tSrS_rowcol(mi, ni) =
+            silu_scale_op(tSrS_rowcol(mi, ni) * params.alpha, scale);
+      }
+    }
+    int const m_idx_max = (m_block + 1) * kBlockM;
+    if constexpr (Cross) {
+      mask.template apply<
+          false /*Seqlenq_mask*/,
+          true /*Seqlenk_mask*/,
+          Causal,
+          false /*Local*/,
+          false /*Contexual_mask*/,
+          false /*Target_mask*/,
+          Cross,
+          false /*Softmax*/>(tSrS, m_block, n_block);
+    } else {
+      if (m_idx_max <= seqlen_info.uihlen_q) {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            false /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            Cross,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      } else if (
+          m_idx_max <=
+          cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            Has_targets,
+            Cross,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      } else {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal*/,
+            false,
+            Contexual_mask,
+            Has_targets,
+            Cross,
+            false /*Softmax*/>(tSrS, m_block, n_block);
+      }
+    }
+    if constexpr (Is_FP8 && !V_colmajor) {
+      hstu::permute_Cregs_fp8(tSrS);
+    }
+    Tensor tOrP_acc = make_tensor(
+        tSrS.data(), hstu::convert_layout_acc_Aregs<TiledMma1>(tSrS.layout()));
+    Tensor tOrP = make_tensor_like<Element>(tOrP_acc);
+    convert_type_out(tOrP_acc, tOrP);
+    if constexpr (Is_FP8 && V_colmajor) {
+      hstu::permute_Aregs_fp8(tOrP);
+    }
+    if constexpr (!Mma1_is_RS) {
+      cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
+      cutlass::arch::fence_view_async_shared();
+      __syncwarp(); // Only need syncwarp since each warp is using its own P
+                    // values for Mma1
+    }
+    --n_block;
+
+    // Each step does gemm0 and silu for iter n_block and gemm1 for prev iter.
+    auto fwd_step_intra_warp_pipeline = [&](int const n_block, auto mask_fn) {
+      PipelineState smem_pipe_read_v(
+          smem_pipe_read.index(),
+          smem_pipe_read.phase(),
+          smem_pipe_read.count());
+      ++smem_pipe_read;
+      Tensor tSrS =
+          partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
+      Tensor tSrS_rowcol = make_tensor(
+          tSrS.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
+      if (!UseSchedulerBarrier || warp_group_idx == 0) {
+        consumer_wait(pipeline_k, smem_pipe_read);
+      }
+      warp_scheduler_barrier_sync();
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
+          tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
+      if (!UseSchedulerBarrier || warp_group_idx == 0) {
+        consumer_wait(pipeline_v, smem_pipe_read_v);
+      }
+      hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+          tiled_mma1,
+          cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
+          tOrV(_, _, _, smem_pipe_read_v.index()),
+          tOrO);
+      warp_scheduler_barrier_arrive();
+      warpgroup_wait<1>();
+      pipeline_k.consumer_release(smem_pipe_read); // release K
+#pragma unroll
+      for (int mi = 0; mi < size<0>(tSrS_rowcol); ++mi) {
+        float scale = scalar_scale_val_;
+        if (!params.scalar_scale) {
+          int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
+          // Convert global index to local sequence position for bounds checking
+          int q_local = q_index - seqlen_info.offset_q;
+          if (q_local < seqlen_info.seqlen_q) {
+            scale = params.attn_scale[q_index];
+          }
+        }
+#pragma unroll
+        for (int ni = 0; ni < size<1>(tSrS_rowcol); ++ni) {
+          tSrS_rowcol(mi, ni) =
+              silu_scale_op(tSrS_rowcol(mi, ni) * params.alpha, scale);
+        }
+      }
+      mask_fn(tSrS, n_block);
+      warpgroup_wait<0>();
+      pipeline_v.consumer_release(smem_pipe_read_v); // release V
+      if constexpr (Is_FP8 && !V_colmajor) {
+        hstu::permute_Cregs_fp8(tSrS);
+      }
+      convert_type_out(make_tensor(tSrS.data(), tOrP.layout()), tOrP);
+      if constexpr (Is_FP8 && V_colmajor) {
+        hstu::permute_Aregs_fp8(tOrP);
+      }
+      if constexpr (!Mma1_is_RS) {
+        cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
+      }
+      if constexpr (!Mma1_is_RS) {
+        cutlass::arch::fence_view_async_shared();
+        __syncwarp();
+      }
+    };
+
+    if constexpr (Cross) {
+      if constexpr (Causal) {
+        if (m_idx_max <=
+            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+          auto mask_fn = [&](auto& tSrS, int n_block) {
+            mask.template apply<
+                false /*Seqlenq_mask*/,
+                false /*Seqlenk_mask*/,
+                Causal,
+                false /*Local*/,
+                false /*Contexual_mask*/,
+                false /*Target_mask*/,
+                Cross,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_idx_min = m_block * kBlockM;
+          int const n_block_min_causal_mask = std::max(
+              n_block_min,
+              (m_idx_min + seqlen_info.seqlen_kv - seqlen_info.uihlen_q) /
+                  kBlockN);
+#pragma unroll 1
+          for (; n_block >= n_block_min_causal_mask; --n_block) {
+            fwd_step_intra_warp_pipeline(n_block, mask_fn);
+          }
+        }
+      }
+      auto no_mask_fn = [](auto& tSrS, int n_block) {};
+#pragma unroll 1
+      for (; n_block >= n_block_min; --n_block) {
+        fwd_step_intra_warp_pipeline(n_block, no_mask_fn);
+      }
+    } else {
+      if constexpr (Causal || Local) { // Separate iterations with causal
+                                       // or local masking
+        if (m_idx_max <=
+            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+          auto mask_fn = [&](auto& tSrS, int n_block) {
+            mask.template apply<
+                false /*Seqlenq_mask*/,
+                false /*Seqlenk_mask*/,
+                Causal,
+                Local,
+                Contexual_mask,
+                false /*Has_targets*/,
+                Cross,
+                false /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_idx_min = m_block * kBlockM;
+          int const n_block_min_causal_local_mask =
+              std::max(n_block_min, m_idx_min / kBlockN);
+#pragma unroll 1
+          for (; n_block >= n_block_min_causal_local_mask; --n_block) {
+            fwd_step_intra_warp_pipeline(n_block, mask_fn);
+          }
+        }
+      }
+      int n_block_min_before_local_mask = n_block_min;
+      if constexpr (Local) {
+        if (m_idx_max <=
+            cute::ceil_div(
+                seqlen_info.uihlen_q - min_full_attn_seq_len_, kBlockM) *
+                kBlockM) {
+          n_block_min_before_local_mask = std::max(
+              n_block_min, cute::ceil_div(m_idx_max - max_attn_len_, kBlockN));
+        }
+      }
+      auto no_mask_fn = [](auto& tSrS, int n_block) {};
+#pragma unroll 1
+      for (; n_block >= n_block_min_before_local_mask; --n_block) {
+        fwd_step_intra_warp_pipeline(n_block, no_mask_fn);
+      }
+      // Separate masking iterations on the left for local attention
+      if constexpr (Local) {
+        auto local_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              false /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              Local,
+              Contexual_mask,
+              false /*Has_targets*/,
+              Cross,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (; n_block >= n_block_min; --n_block) {
+          fwd_step_intra_warp_pipeline(n_block, local_mask_fn);
+        }
+      }
+      // Target part GEMM
+      if constexpr (Has_targets) {
+        auto [target_n_block_min, target_n_block_max] =
+            get_target_n_block_min_max(
+                n_block_max,
+                seqlen_info.uihlen_q,
+                seqlen_info.seqlen_kv,
+                m_block);
+        auto target_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              false /*Local*/,
+              Contexual_mask,
+              Has_targets,
+              Cross,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (n_block = target_n_block_max - 1; n_block >= target_n_block_min;
+             --n_block) {
+          fwd_step_intra_warp_pipeline(n_block, target_mask_fn);
+        }
+      }
+      if constexpr (Contexual_mask) {
+        int contexual_n_block_max = get_contexual_n_block_max(
+            n_block_min,
+            min_full_attn_seq_len_,
+            contextual_seq_len_,
+            seqlen_info.uihlen_q,
+            m_block);
+        auto contexual_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              Local,
+              Contexual_mask,
+              Has_targets,
+              Cross,
+              false /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (n_block = contexual_n_block_max - 1; n_block >= 0; --n_block) {
+          fwd_step_intra_warp_pipeline(n_block, contexual_mask_fn);
+        }
+      }
+    }
+    // Tell producers that smem_q is ready
+    cutlass::arch::NamedBarrier::arrive(
+        NumMmaThreads + cutlass::NumThreadsPerWarp,
+        static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
+    consumer_wait(pipeline_v, smem_pipe_read);
+    hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+        tiled_mma1,
+        cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
+        tOrV(_, _, _, smem_pipe_read.index()),
+        tOrO);
+    warpgroup_wait<0>();
+    pipeline_v.consumer_release(
+        smem_pipe_read); // release V, otherwise producers will hang
+    if constexpr (Is_FP8 && !V_colmajor) {
+      hstu::permute_output_fp8(tOrO);
+    }
+    ++smem_pipe_read;
+    ++work_idx;
+    return true;
+  }
+
+  template <typename SharedStorage, typename FrgTensorO, typename Softmax>
+  CUTLASS_DEVICE bool mma_softmax(
+      Params const& params,
+      MainloopPipelineK pipeline_k,
+      MainloopPipelineV pipeline_v,
+      PipelineState& smem_pipe_read,
+      FrgTensorO& tOrO,
+      Softmax& softmax,
+      int const thread_idx,
+      int& work_idx,
+      SeqlenInfo_t const& seqlen_info,
+      cute::tuple<int32_t, int32_t, int32_t, int32_t> block_coord,
+      SharedStorage& shared_storage) {
+    static_assert(
+        is_rmem<FrgTensorO>::value, "O tensor must be rmem resident.");
+    static constexpr int kBlockM = get<0>(TileShape_MNK{});
+    static constexpr int kBlockN = get<1>(TileShape_MNK{});
+
+    // can't use auto [m_block, ...] = block_coord since structured binding
+    // cannot be captured in lambda
+    int const m_block = get<0>(block_coord);
+    int const bidh = get<1>(block_coord);
+    int const bidb = get<2>(block_coord);
+    int const split_idx = get<3>(block_coord);
+    if constexpr (Jagged) {
+      static constexpr int kBlockM = get<0>(TileShape_MNK{});
+      if (m_block * kBlockM >= seqlen_info.seqlen_q) {
+        return false;
+      }
+    }
+    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
+    if constexpr (!Cross) {
+      if (params.num_groups > 1) {
+        int group_id = bidb / params.batch_size_per_group;
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
+        max_attn_len_ = params.max_attn_len_tensor[group_id];
+        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
+      } else {
+        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
+        max_attn_len_ = params.max_attn_len;
+        contextual_seq_len_ = params.contextual_seq_len;
+      }
+    }
+    int n_block_min, n_block_max;
+    if constexpr (Cross) {
+      auto n_block_min_max = get_cross_n_block_min_max(
+          seqlen_info.uihlen_q,
+          seqlen_info.seqlen_q,
+          seqlen_info.seqlen_kv,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    } else {
+      auto n_block_min_max = get_n_block_min_max(
+          max_attn_len_,
+          min_full_attn_seq_len_,
+          contextual_seq_len_,
+          seqlen_info.uihlen_q,
+          m_block);
+      n_block_min = get<0>(n_block_min_max);
+      n_block_max = get<1>(n_block_min_max);
+    }
+
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    if (n_block_max <= n_block_min) {
+      std::printf(
+          "mainloop_fwd_sm90: n_block_max <= n_block_min not expected.");
+      return false;
+    }
+#endif
+
+    Tensor sQ = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+        SmemLayoutQ{});
+    Tensor sK = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
+        SmemLayoutK{});
+    Tensor sV = make_tensor(
+        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
+        SmemLayoutVtMma{});
+    Tensor sP = [&] {
+      if constexpr (Mma1_is_RS) {
+        // We might not have smem_p if !Mma1_is_RS1, just use smem_q as a
+        // placeholder since we don't use it
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
+            SmemLayoutP{});
+      } else {
+        return make_tensor(
+            make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
+            SmemLayoutP{});
+      }
+    }();
+
+    if constexpr (!Mma0_is_RS) {
+      static_assert(
+          stride<0>(typename TiledMma0::ALayout{}) == 0 and
+              stride<0>(typename TiledMma0::BLayout{}) == 0 and
+              size<0>(typename TiledMma0::ALayout{}) ==
+                  cutlass::NumThreadsPerWarpGroup and
+              size<0>(typename TiledMma0::BLayout{}) ==
+                  cutlass::NumThreadsPerWarpGroup,
+          "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
+    }
+    constexpr int MmaWarpGroups =
+        size(TiledMma0{}) / cutlass::NumThreadsPerWarpGroup;
+    Layout warp_group_thread_layout = make_layout(
+        make_shape(Int<MmaWarpGroups>{}),
+        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
+
+    int warp_group_idx = __shfl_sync(
+        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
+    TiledMma0 tiled_mma0;
+    TiledMma1 tiled_mma1;
+    auto wg_mma0 =
+        tiled_mma0.get_slice(warp_group_thread_layout(warp_group_idx));
+    auto wg_mma1 =
+        tiled_mma1.get_slice(warp_group_thread_layout(warp_group_idx));
+
+    auto smem_tiled_copy_P = make_tiled_copy_C(SmemCopyAtomP{}, tiled_mma0);
+    auto smem_thr_copy_P = smem_tiled_copy_P.get_thread_slice(thread_idx);
+
+    // Allocate "fragments/descriptors"
+    Tensor tSrQ = wg_mma0.partition_fragment_A(sQ);
+    Tensor tSrK = wg_mma0.partition_fragment_B(sK);
+    Tensor tOrV = wg_mma1.partition_fragment_B(sV);
+    Tensor tOsP = wg_mma1.partition_fragment_A(sP);
+    Tensor tPsP = smem_thr_copy_P.partition_D(
+        cute::as_position_independent_swizzle_tensor(sP));
+
+    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
+      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
+      pipeline.consumer_wait(smem_pipe_read, barrier_token);
+    };
+
+    clear(tOrO);
+
+    int n_block = n_block_max - 1;
+
+    hstu::Mask<kBlockM, kBlockN, TiledMma0> mask(
+        thread_idx,
+        seqlen_info.seqlen_q,
+        seqlen_info.seqlen_kv,
+        max_attn_len_,
+        min_full_attn_seq_len_,
+        contextual_seq_len_,
+        seqlen_info.uihlen_q);
+
+    auto& barrier_Q = shared_storage.pipelines.barrier_Q;
+    barrier_Q.wait(work_idx % 2);
+    static constexpr int Qdim = 0;
+    auto thread_mma = tiled_mma0.get_thread_slice(thread_idx);
+    auto thread0_mma = tiled_mma0.get_thread_slice(_0{});
+    Tensor cS = cute::make_identity_tensor(Shape<Int<kBlockM>, Int<kBlockN>>{});
+    Tensor tScS = thread_mma.partition_C(cS);
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tScS.layout()));
+    Tensor t0ScS = thread0_mma.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(t0ScS.layout()));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+    int qdim_offset = params.scalar_scale
+        ? 0
+        : m_block * kBlockM + thread_qdim_offset + seqlen_info.offset_q;
+
+    if constexpr (Mma0_is_RS) {
+      using SmemCopyAtomQ = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
+      auto smem_tiled_copy_Q = make_tiled_copy_A(SmemCopyAtomQ{}, tiled_mma0);
+      auto smem_thr_copy_Q = smem_tiled_copy_Q.get_thread_slice(thread_idx);
+      Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
+      Tensor tSsQ_copy_view = smem_thr_copy_Q.partition_S(
+          cute::as_position_independent_swizzle_tensor(sQ));
+      cute::copy(smem_tiled_copy_Q, tSsQ_copy_view, tSrQ_copy_view);
+    }
+
+    Tensor tSrS =
+        partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
+    Tensor tSrS_rowcol = make_tensor(
+        tSrS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
+    consumer_wait(pipeline_k, smem_pipe_read);
+    hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
+        tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
+    warpgroup_wait<0>();
+    pipeline_k.consumer_release(smem_pipe_read);
+    int const m_idx_max = (m_block + 1) * kBlockM;
+    if constexpr (Cross) {
+      mask.template apply<
+          false /*Seqlenq_mask*/,
+          true /*Seqlenk_mask*/,
+          Causal,
+          false /*Local*/,
+          false /*Contexual_mask*/,
+          false /*Target_mask*/,
+          Cross,
+          true /*Softmax*/>(tSrS, m_block, n_block);
+    } else {
+      if (m_idx_max <= seqlen_info.uihlen_q) {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            false /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            false /*Target_mask*/,
+            Cross,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      } else if (
+          m_idx_max <=
+          cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            Causal,
+            Local,
+            Contexual_mask,
+            Has_targets,
+            Cross,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      } else {
+        mask.template apply<
+            false /*Seqlenq_mask*/,
+            true /*Seqlenk_mask*/,
+            false /*Causal*/,
+            false,
+            Contexual_mask,
+            Has_targets,
+            Cross,
+            true /*Softmax*/>(tSrS, m_block, n_block);
+      }
+    }
+    Tensor scores_scale = softmax.template max_get_scale<
+        /*Is_first=*/true,
+        /*Check_inf=*/true>(tSrS);
+    softmax.template online_softmax</*Is_first=*/true, /*Check_inf=*/true>(
+        tSrS);
+    if constexpr (Is_FP8 && !V_colmajor) {
+      hstu::permute_Cregs_fp8(tSrS);
+    }
+    Tensor tOrP_acc = make_tensor(
+        tSrS.data(), hstu::convert_layout_acc_Aregs<TiledMma1>(tSrS.layout()));
+    Tensor tOrP = make_tensor_like<Element>(tOrP_acc);
+    convert_type_out(tOrP_acc, tOrP);
+    if constexpr (Is_FP8 && V_colmajor) {
+      hstu::permute_Aregs_fp8(tOrP);
+    }
+    if constexpr (!Mma1_is_RS) {
+      cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
+      cutlass::arch::fence_view_async_shared();
+      __syncwarp(); // Only need syncwarp since each warp is using its own P
+                    // values for Mma1
+    }
+    --n_block;
+
+    // Each step does gemm0 and softmax for iter n_block and gemm1 for prev
+    auto fwd_step_intra_warp_pipeline = [&](int const n_block,
+                                            auto mask_fn,
+                                            auto check_inf_type) {
+      static constexpr bool Check_inf = decltype(check_inf_type)::value;
+      PipelineState smem_pipe_read_v(
+          smem_pipe_read.index(),
+          smem_pipe_read.phase(),
+          smem_pipe_read.count());
+      ++smem_pipe_read;
+      Tensor tSrS =
+          partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
+      Tensor tSrS_rowcol = make_tensor(
+          tSrS.data(),
+          hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
+      if (!UseSchedulerBarrier || warp_group_idx == 0) {
+        consumer_wait(pipeline_k, smem_pipe_read);
+      }
+      warp_scheduler_barrier_sync();
+      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
+          tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
+      if (!UseSchedulerBarrier || warp_group_idx == 0) {
+        consumer_wait(pipeline_v, smem_pipe_read_v);
+      }
+      hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+          tiled_mma1,
+          cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
+          tOrV(_, _, _, smem_pipe_read_v.index()),
+          tOrO);
+      warp_scheduler_barrier_arrive();
+      warpgroup_wait<1>();
+      pipeline_k.consumer_release(smem_pipe_read); // release K
+      mask_fn(tSrS, n_block);
+      cute::copy(
+          softmax.template max_get_scale</*Is_first=*/false, Check_inf>(tSrS),
+          scores_scale);
+      softmax.template online_softmax</*Is_first=*/false, Check_inf>(tSrS);
+      warpgroup_wait<0>();
+      pipeline_v.consumer_release(smem_pipe_read_v); // release V
+      if constexpr (Is_FP8 && !V_colmajor) {
+        hstu::permute_Cregs_fp8(tSrS);
+      }
+      convert_type_out(make_tensor(tSrS.data(), tOrP.layout()), tOrP);
+      if constexpr (Is_FP8 && V_colmajor) {
+        hstu::permute_Aregs_fp8(tOrP);
+      }
+      softmax.rescale_o(tOrO, scores_scale);
+      if constexpr (!Mma1_is_RS) {
+        cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
+      }
+      if constexpr (!Mma1_is_RS) {
+        cutlass::arch::fence_view_async_shared();
+        __syncwarp();
+      }
+    };
+
+    if constexpr (Cross) {
+      if constexpr (Causal) {
+        if (m_idx_max <=
+            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+          auto mask_fn = [&](auto& tSrS, int n_block) {
+            mask.template apply<
+                false /*Seqlenq_mask*/,
+                false /*Seqlenk_mask*/,
+                Causal,
+                false /*Local*/,
+                false /*Contexual_mask*/,
+                false /*Target_mask*/,
+                Cross,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_idx_min = m_block * kBlockM;
+          int const n_block_min_causal_mask = std::max(
+              n_block_min,
+              (m_idx_min + seqlen_info.seqlen_kv - seqlen_info.uihlen_q) /
+                  kBlockN);
+#pragma unroll 1
+          for (; n_block >= n_block_min_causal_mask; --n_block) {
+            fwd_step_intra_warp_pipeline(n_block, mask_fn, cute::true_type{});
+          }
+        }
+      }
+      auto no_mask_fn = [](auto& tSrS, int n_block) {};
+#pragma unroll 1
+      for (; n_block >= n_block_min; --n_block) {
+        fwd_step_intra_warp_pipeline(n_block, no_mask_fn, cute::false_type{});
+      }
+    } else {
+      if constexpr (Causal || Local) { // Separate iterations with causal
+                                       // or local masking
+        if (m_idx_max <=
+            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
+          auto mask_fn = [&](auto& tSrS, int n_block) {
+            mask.template apply<
+                false /*Seqlenq_mask*/,
+                false /*Seqlenk_mask*/,
+                Causal,
+                Local,
+                Contexual_mask,
+                false /*Has_targets*/,
+                Cross,
+                true /*Softmax*/>(tSrS, m_block, n_block);
+          };
+          int const m_idx_min = m_block * kBlockM;
+          int const n_block_min_causal_local_mask =
+              std::max(n_block_min, m_idx_min / kBlockN);
+#pragma unroll 1
+          for (; n_block >= n_block_min_causal_local_mask; --n_block) {
+            fwd_step_intra_warp_pipeline(n_block, mask_fn, cute::true_type{});
+          }
+        }
+      }
+      int n_block_min_before_local_mask = n_block_min;
+      if constexpr (Local) {
+        if (m_idx_max <=
+            cute::ceil_div(
+                seqlen_info.uihlen_q - min_full_attn_seq_len_, kBlockM) *
+                kBlockM) {
+          n_block_min_before_local_mask = std::max(
+              n_block_min, cute::ceil_div(m_idx_max - max_attn_len_, kBlockN));
+        }
+      }
+      auto no_mask_fn = [](auto& tSrS, int n_block) {};
+#pragma unroll 1
+      for (; n_block >= n_block_min_before_local_mask; --n_block) {
+        fwd_step_intra_warp_pipeline(n_block, no_mask_fn, cute::false_type{});
+      }
+      // Separate masking iterations on the left for local attention
+      if constexpr (Local) {
+        auto local_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              false /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              Local,
+              Contexual_mask,
+              false /*Has_targets*/,
+              Cross,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (; n_block >= n_block_min; --n_block) {
+          fwd_step_intra_warp_pipeline(
+              n_block, local_mask_fn, cute::true_type{});
+        }
+      }
+      // Target part GEMM
+      if constexpr (Has_targets) {
+        auto [target_n_block_min, target_n_block_max] =
+            get_target_n_block_min_max(
+                n_block_max,
+                seqlen_info.uihlen_q,
+                seqlen_info.seqlen_kv,
+                m_block);
+        auto target_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              false /*Local*/,
+              Contexual_mask,
+              Has_targets,
+              Cross,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (n_block = target_n_block_max - 1; n_block >= target_n_block_min;
+             --n_block) {
+          fwd_step_intra_warp_pipeline(
+              n_block, target_mask_fn, cute::true_type{});
+        }
+      }
+      if constexpr (Contexual_mask) {
+        int contexual_n_block_max = get_contexual_n_block_max(
+            n_block_min,
+            min_full_attn_seq_len_,
+            contextual_seq_len_,
+            seqlen_info.uihlen_q,
+            m_block);
+        auto contexual_mask_fn = [&](auto& tSrS, int n_block) {
+          mask.template apply<
+              false /*Seqlenq_mask*/,
+              true /*Seqlenk_mask*/,
+              false /*Causal_mask*/,
+              Local,
+              Contexual_mask,
+              Has_targets,
+              Cross,
+              true /*Softmax*/>(tSrS, m_block, n_block);
+        };
+#pragma unroll 1
+        for (n_block = contexual_n_block_max - 1; n_block >= 0; --n_block) {
+          fwd_step_intra_warp_pipeline(
+              n_block, contexual_mask_fn, cute::true_type{});
+        }
+      }
+    }
+    // Tell producers that smem_q is ready
+    cutlass::arch::NamedBarrier::arrive(
+        NumMmaThreads + cutlass::NumThreadsPerWarp,
+        static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
+    consumer_wait(pipeline_v, smem_pipe_read);
+    hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
+        tiled_mma1,
+        cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
+        tOrV(_, _, _, smem_pipe_read.index()),
+        tOrO);
+    cute::copy(softmax.finalize(1.0f), scores_scale);
+    warpgroup_wait<0>();
+    pipeline_v.consumer_release(
+        smem_pipe_read); // release V, otherwise producers will hang
+    softmax.rescale_o(tOrO, scores_scale);
+    if constexpr (Is_FP8 && !V_colmajor) {
+      hstu::permute_output_fp8(tOrO);
+    }
+    ++smem_pipe_read;
+    ++work_idx;
+    return true;
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h
new file mode 100644
index 000000000..e35af5193
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h
@@ -0,0 +1,396 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/tensor.hpp>
+
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+template <int BlockM, int BlockN, typename TiledMma, bool SwapAB = false>
+struct Mask {
+  int const thread_idx;
+  int const max_q_len;
+  int const max_kv_len;
+  int const max_attn_len;
+  int const min_full_attn_seq_len;
+  int const contextual_seq_len;
+  int const max_uih_len;
+
+  CUTLASS_DEVICE
+  Mask(
+      const int thread_idx,
+      const int max_q_len,
+      const int max_kv_len,
+      const int max_attn_len,
+      const int min_full_attn_seq_len,
+      const int contextual_seq_len,
+      const int max_uih_len)
+      : thread_idx(thread_idx),
+        max_q_len(max_q_len),
+        max_kv_len(max_kv_len),
+        max_attn_len(max_attn_len),
+        min_full_attn_seq_len(min_full_attn_seq_len),
+        contextual_seq_len(contextual_seq_len),
+        max_uih_len(max_uih_len) {};
+
+  template <
+      bool Seqlenq_mask = false,
+      bool Seqlenk_mask = false,
+      bool Causal_mask = false,
+      bool Local_mask = false,
+      bool Contexual_mask = false,
+      bool Target_mask = false, // If Target_mask, Seqlenk_mask will be disabled
+      bool Cross = false,
+      bool Softmax = false,
+      typename Engine,
+      typename Layout>
+  CUTLASS_DEVICE void apply(
+      Tensor<Engine, Layout>& tSrS,
+      const int m_block,
+      const int n_block) const {
+    static_assert(
+        !(Causal_mask && Local_mask), "Cannot be both causal and local");
+    static_assert(Layout::rank == 3, "Only support 3D Tensor");
+    if constexpr (Cross) {
+      static_assert(
+          (!Local_mask) && (!Contexual_mask) && (!Target_mask),
+          "Local, contexual, and target masks not supported under cross attention");
+    }
+    if (!Seqlenq_mask && !Seqlenk_mask && !Causal_mask && !Local_mask &&
+        !Target_mask) {
+      return;
+    }
+
+    auto thread_mma = TiledMma{}.get_thread_slice(thread_idx);
+    auto thread0_mma = TiledMma{}.get_thread_slice(_0{});
+
+    static constexpr int Qdim = !SwapAB ? 0 : 1, Kdim = !SwapAB ? 1 : 0;
+
+    Tensor cS = cute::make_identity_tensor(
+        Shape<
+            Int<!SwapAB ? BlockM : BlockN>,
+            Int<!SwapAB ? BlockN : BlockM>>{});
+    Tensor tScS = thread_mma.partition_C(cS);
+    Tensor tSrS_rowcol = make_tensor(
+        tSrS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
+    Tensor tScS_rowcol = make_tensor(
+        tScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tScS.layout()));
+    Tensor t0ScS = thread0_mma.partition_C(cS);
+    Tensor t0ScS_rowcol = make_tensor(
+        t0ScS.data(),
+        hstu::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(t0ScS.layout()));
+    // We want to use the col indices of thread0 to compare, since that is known
+    // at compile time. So we subtract the limit by the first col index of this
+    // thread
+    int const thread_kdim_offset = get<Kdim>(tScS_rowcol(_0{}, _0{}));
+    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
+    int const seqlen_k_limit =
+        max_kv_len - n_block * BlockN - thread_kdim_offset;
+    int const uihlen_k_limit =
+        max_uih_len - n_block * BlockN - thread_kdim_offset;
+    int const seqlen_q_limit =
+        max_q_len - m_block * BlockM - thread_qdim_offset;
+    if constexpr (Seqlenq_mask) {
+#pragma unroll
+      for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+        if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
+#pragma unroll
+          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+            if constexpr (Softmax) {
+              tSrS_rowcol(m, n) = -INFINITY;
+            } else {
+              tSrS_rowcol(m, n) = 0.0f;
+            }
+          }
+        }
+      }
+    }
+    if constexpr (Cross) {
+      if constexpr (!Causal_mask) {
+        if constexpr (Seqlenk_mask) {
+#pragma unroll
+          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+            int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+            if (t0_col_idx >= seqlen_k_limit) {
+#pragma unroll
+              for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+                if constexpr (Softmax) {
+                  tSrS_rowcol(m, n) = -INFINITY;
+                } else {
+                  tSrS_rowcol(m, n) = 0.0f;
+                }
+              }
+            }
+          }
+        }
+      } else {
+        int const causal_row_offset = max_kv_len - max_uih_len + 1 -
+            n_block * BlockN - thread_kdim_offset;
+#pragma unroll
+        for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+          if constexpr (Seqlenq_mask) {
+            if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
+              continue;
+            }
+          }
+          int const row_idx = get<Qdim>(t0ScS_rowcol(m, _0{})) +
+              m_block * BlockM + thread_qdim_offset;
+          int const col_limit_right = !Seqlenk_mask
+              ? row_idx + causal_row_offset
+              : __viaddmin_s32(row_idx, causal_row_offset, seqlen_k_limit);
+#pragma unroll
+          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+            int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+            if (t0_col_idx >= col_limit_right) {
+              if constexpr (Softmax) {
+                tSrS_rowcol(m, n) = -INFINITY;
+              } else {
+                tSrS_rowcol(m, n) = 0.0f;
+              }
+            }
+          }
+        }
+      }
+    } else {
+      if constexpr (!Causal_mask && !Local_mask) {
+        if constexpr (Seqlenk_mask || Target_mask) {
+#pragma unroll
+          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+            int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+            if constexpr (Target_mask) {
+              if (t0_col_idx >= uihlen_k_limit) {
+                bool const oob_predicate = (t0_col_idx >= seqlen_k_limit);
+                int const col_offset =
+                    t0_col_idx - seqlen_k_limit + seqlen_q_limit;
+#pragma unroll
+                for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+                  int const t0_row_idx = int(get<Qdim>(t0ScS_rowcol(m, _0{})));
+                  if ((t0_row_idx != col_offset) || oob_predicate) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+              }
+            } else if constexpr (Seqlenk_mask) {
+              if (t0_col_idx >= seqlen_k_limit) {
+#pragma unroll
+                for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+                  if constexpr (Softmax) {
+                    tSrS_rowcol(m, n) = -INFINITY;
+                  } else {
+                    tSrS_rowcol(m, n) = 0.0f;
+                  }
+                }
+              }
+            }
+          }
+        }
+      } else { // Causal_mask or Local_mask
+        int const causal_row_offset = 1 - n_block * BlockN - thread_kdim_offset;
+        if constexpr (Causal_mask) {
+#pragma unroll
+          for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+            if constexpr (Seqlenq_mask) {
+              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
+                continue;
+              }
+            }
+            if constexpr (Contexual_mask) {
+              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) <
+                  contextual_seq_len - m_block * BlockM - thread_qdim_offset) {
+#pragma unroll
+                for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                  int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                  if (t0_col_idx >= uihlen_k_limit) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+                continue;
+              }
+            }
+            int const row_idx = get<Qdim>(t0ScS_rowcol(m, _0{})) +
+                m_block * BlockM + thread_qdim_offset;
+            if constexpr (!Target_mask) {
+              int const col_limit_right = !Seqlenk_mask
+                  ? row_idx + causal_row_offset
+                  : __viaddmin_s32(row_idx, causal_row_offset, seqlen_k_limit);
+#pragma unroll
+              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                if (t0_col_idx >= col_limit_right) {
+                  if constexpr (Softmax) {
+                    tSrS_rowcol(m, n) = -INFINITY;
+                  } else {
+                    tSrS_rowcol(m, n) = 0.0f;
+                  }
+                }
+              }
+            } else {
+#pragma unroll
+              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                int const col_idx =
+                    t0_col_idx + n_block * BlockN + thread_kdim_offset;
+                bool const uih_cond =
+                    (t0_col_idx >= row_idx + causal_row_offset) &&
+                    (row_idx < max_uih_len);
+                bool const target_cond = (row_idx != col_idx) &&
+                    (row_idx >= max_uih_len) && (col_idx >= max_uih_len);
+                bool const seqlen_k_cond = (t0_col_idx >= seqlen_k_limit);
+                if (uih_cond || target_cond || seqlen_k_cond) {
+                  if constexpr (Softmax) {
+                    tSrS_rowcol(m, n) = -INFINITY;
+                  } else {
+                    tSrS_rowcol(m, n) = 0.0f;
+                  }
+                }
+              }
+            }
+          }
+        } else { // Local_mask
+          int const local_row_offset_left =
+              causal_row_offset - 1 - max_attn_len;
+          int const col_limit_sink = 0 - n_block * BlockN - thread_kdim_offset;
+#pragma unroll
+          for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
+            if constexpr (Seqlenq_mask) {
+              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
+                continue;
+              }
+            }
+            if constexpr (Contexual_mask) {
+              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) <
+                  contextual_seq_len - m_block * BlockM - thread_qdim_offset) {
+#pragma unroll
+                for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                  int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                  if (t0_col_idx >= uihlen_k_limit) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+                continue;
+              }
+            }
+            int const row_idx = get<Qdim>(t0ScS_rowcol(m, _0{})) +
+                m_block * BlockM + thread_qdim_offset;
+            int col_limit_left = row_idx + local_row_offset_left;
+            if constexpr (Contexual_mask) {
+              // row contexual without sink
+              if (col_limit_left + n_block * BlockN + thread_kdim_offset <
+                  contextual_seq_len) {
+                col_limit_left = 0;
+              }
+            }
+            if constexpr (!Target_mask) {
+              int const col_limit_right = !Seqlenk_mask
+                  ? row_idx + causal_row_offset
+                  : __viaddmin_s32(row_idx, causal_row_offset, seqlen_k_limit);
+#pragma unroll
+              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(m, n)));
+                if (row_idx < max_uih_len - min_full_attn_seq_len) {
+                  bool const local_left_cond = Contexual_mask
+                      ? (t0_col_idx < col_limit_left &&
+                         t0_col_idx >= col_limit_sink)
+                      : (t0_col_idx < col_limit_left);
+                  if (local_left_cond) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+                if (t0_col_idx >= col_limit_right) {
+                  if constexpr (Softmax) {
+                    tSrS_rowcol(m, n) = -INFINITY;
+                  } else {
+                    tSrS_rowcol(m, n) = 0.0f;
+                  }
+                }
+              }
+            } else {
+              int const col_limit_right = row_idx + causal_row_offset;
+#pragma unroll
+              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
+                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
+                if (row_idx < max_uih_len) {
+                  if (row_idx < max_uih_len - min_full_attn_seq_len) {
+                    bool const local_left_cond = Contexual_mask
+                        ? (t0_col_idx < col_limit_left &&
+                           t0_col_idx >= col_limit_sink)
+                        : (t0_col_idx < col_limit_left);
+                    if (local_left_cond) {
+                      if constexpr (Softmax) {
+                        tSrS_rowcol(m, n) = -INFINITY;
+                      } else {
+                        tSrS_rowcol(m, n) = 0.0f;
+                      }
+                    }
+                  }
+                  if (t0_col_idx >= col_limit_right) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                } else {
+                  int const col_idx =
+                      t0_col_idx + n_block * BlockN + thread_kdim_offset;
+                  bool const target_cond = (row_idx != col_idx) &&
+                      (row_idx >= max_uih_len) && (col_idx >= max_uih_len);
+                  bool const seqlen_k_cond = (t0_col_idx >= seqlen_k_limit);
+                  if (target_cond || seqlen_k_cond) {
+                    if constexpr (Softmax) {
+                      tSrS_rowcol(m, n) = -INFINITY;
+                    } else {
+                      tSrS_rowcol(m, n) = 0.0f;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  };
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h
new file mode 100644
index 000000000..79dce0dd9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h
@@ -0,0 +1,101 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cutlass/arch/barrier.h"
+
+namespace hstu {
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// cutlass::arch::NamedBarrier::sync/arrive are only enabled Sm90 even though
+// they work for Sm80 as well. We reimplement them here, enabled for both Sm90
+// and Sm80.
+
+CUTLASS_DEVICE
+static void named_barrier_sync(uint32_t num_threads, uint32_t barrier_id_) {
+  static constexpr uint32_t ReservedNamedBarrierCount = static_cast<uint32_t>(
+      cutlass::arch::ReservedNamedBarriers::FirstUserBarrier);
+  uint32_t barrier_id = barrier_id_ + ReservedNamedBarrierCount;
+  asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+  cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(
+      __LINE__, num_threads, barrier_id);
+}
+
+CUTLASS_DEVICE
+static void named_barrier_sync(
+    uint32_t num_threads,
+    cutlass::arch::ReservedNamedBarriers reserved_named_barriers) {
+  uint32_t barrier_id = static_cast<uint32_t>(reserved_named_barriers);
+  asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+  cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(
+      __LINE__, num_threads, barrier_id);
+}
+
+CUTLASS_DEVICE
+static void named_barrier_arrive(uint32_t num_threads, uint32_t barrier_id_) {
+  static constexpr uint32_t ReservedNamedBarrierCount = static_cast<uint32_t>(
+      cutlass::arch::ReservedNamedBarriers::FirstUserBarrier);
+  uint32_t barrier_id = barrier_id_ + ReservedNamedBarrierCount;
+  cutlass::arch::synclog_emit_named_barrier_arrive(
+      __LINE__, num_threads, barrier_id);
+  asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+}
+
+CUTLASS_DEVICE
+static void named_barrier_arrive(
+    uint32_t num_threads,
+    cutlass::arch::ReservedNamedBarriers reserved_named_barriers) {
+  uint32_t barrier_id = static_cast<uint32_t>(reserved_named_barriers);
+  cutlass::arch::synclog_emit_named_barrier_arrive(
+      __LINE__, num_threads, barrier_id);
+  asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+// Enumerates the reserved named barriers to avoid potential conflicts
+
+enum class FwdNamedBarriers {
+  QueryEmpty = 0,
+  ProducerWG = 1,
+  TileCountSmemEmpty = 2,
+  TileCountSmemFull = 3,
+  WarpSchedulerWG1 = 4,
+  WarpSchedulerWG2 = 5,
+  WarpSchedulerWG3 = 6,
+};
+
+enum class BwdNamedBarriers {
+  KVEmpty = 0,
+  PdS = 1,
+  // This needs to match FwdNamedBarriers::TileCountSmemEmpty since
+  // TileScheduler uses it
+  TileCountSmemEmpty = 2,
+  TileCountSmemFull = 3,
+  dQEmptyWG1 = 4,
+  dQEmptyWG2 = 5,
+  dQEmptyWG3 = 6,
+  dQFullWG1 = 7,
+  dQFullWG2 = 8,
+  dQFullWG3 = 9,
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h
new file mode 100644
index 000000000..c5721b272
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h
@@ -0,0 +1,134 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/tensor.hpp>
+
+namespace hstu {
+
+// We consolidate all the info related to sequence length here. This is so that
+// we can do all the gmem reads once at the beginning of each tile, rather than
+// having to repeat these reads to compute various things like n_block_min,
+// n_block_max, etc.
+
+template <bool Jagged, int kBlock>
+struct SeqlenInfo {
+  int const offset, offset_padded;
+  int const seqlen;
+
+  CUTLASS_DEVICE
+  SeqlenInfo(
+      int const bidb,
+      int const seqlen_static,
+      int const* const seq_offsets)
+      : offset(!Jagged ? 0 : seq_offsets[bidb]),
+        offset_padded(
+            !Jagged ? 0
+                    : (seq_offsets[bidb] + bidb * kBlock) / kBlock * kBlock),
+        seqlen(
+            !Jagged ? seqlen_static
+                    : (seq_offsets[bidb + 1] - seq_offsets[bidb])) {}
+};
+
+template <bool Jagged, bool Cross, bool Has_targets, int kBlockM>
+struct SeqlenInfoQKBwd {
+  int const offset_q, offset_k, offset_q_padded;
+  int const seqlen_q, seqlen_kv, uihlen_q;
+
+  CUTLASS_DEVICE
+  SeqlenInfoQKBwd(
+      int const bidb,
+      int const max_q_len,
+      int const max_kv_len,
+      int const* const seq_offsets,
+      int const* const seq_offsets_q,
+      int const* const num_targets)
+      : offset_q(
+            !Jagged ? 0 : (Cross ? seq_offsets_q[bidb] : seq_offsets[bidb])),
+        offset_k(!Jagged ? 0 : seq_offsets[bidb])
+        // If jagged, the layout for dQaccum is that we pad
+        // each sequence in the batch by an extra kBlockM, so that the write for
+        // each sequence doesn't touch the next sequence. Sequence i starts at
+        // seq_offsets[i] + i * kBlockM and ends at seq_offsets[i + 1] + i *
+        // kBlockM However, the start must align to multiples of kBlockM.
+        ,
+        offset_q_padded(
+            !Jagged ? 0
+                : Cross
+                ? ((seq_offsets_q[bidb] + bidb * kBlockM) / kBlockM * kBlockM)
+                : ((seq_offsets[bidb] + bidb * kBlockM) / kBlockM * kBlockM)),
+        seqlen_q(
+            !Jagged ? max_q_len
+                    : (Cross ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
+                             : (seq_offsets[bidb + 1] - seq_offsets[bidb]))),
+        seqlen_kv(
+            !Jagged ? max_kv_len : (seq_offsets[bidb + 1] - seq_offsets[bidb])),
+        uihlen_q(
+            !Jagged
+                ? (Has_targets ? max_q_len - num_targets[bidb] : max_q_len)
+                : (Has_targets
+                       ? (Cross ? (seq_offsets_q[bidb + 1] -
+                                   seq_offsets_q[bidb] - num_targets[bidb])
+                                : (seq_offsets[bidb + 1] - seq_offsets[bidb] -
+                                   num_targets[bidb]))
+                       : (Cross
+                              ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
+                              : (seq_offsets[bidb + 1] - seq_offsets[bidb])))) {
+  }
+};
+
+template <bool Jagged, bool Cross, bool Has_targets>
+struct SeqlenInfoQKFwd {
+  int const offset_q, offset_k;
+  int const seqlen_q, seqlen_kv, uihlen_q;
+
+  CUTLASS_DEVICE
+  SeqlenInfoQKFwd(
+      int const bidb,
+      int const max_q_len,
+      int const max_kv_len,
+      int const* const seq_offsets,
+      int const* const seq_offsets_q,
+      int const* const num_targets)
+      : offset_q(
+            !Jagged ? 0 : (Cross ? seq_offsets_q[bidb] : seq_offsets[bidb])),
+        offset_k(!Jagged ? 0 : seq_offsets[bidb]),
+        seqlen_q(
+            !Jagged ? max_q_len
+                    : (Cross ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
+                             : (seq_offsets[bidb + 1] - seq_offsets[bidb]))),
+        seqlen_kv(
+            !Jagged ? max_kv_len : (seq_offsets[bidb + 1] - seq_offsets[bidb])),
+        uihlen_q(
+            !Jagged
+                ? (Has_targets ? max_q_len - num_targets[bidb] : max_q_len)
+                : (Has_targets
+                       ? (Cross ? (seq_offsets_q[bidb + 1] -
+                                   seq_offsets_q[bidb] - num_targets[bidb])
+                                : (seq_offsets[bidb + 1] - seq_offsets[bidb] -
+                                   num_targets[bidb]))
+                       : (Cross
+                              ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
+                              : (seq_offsets[bidb + 1] - seq_offsets[bidb])))) {
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h
new file mode 100644
index 000000000..b6428e79c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h
@@ -0,0 +1,150 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cutlass/pipeline/sm90_pipeline.hpp>
+
+namespace cutlass {
+
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// As of Cutlass v3.6.0, if size(ClusterShape) == 1, PipelineTmaAsync has all
+// threads signaling the barrier during consumer_release. This causes a perf
+// regression in FA3 forward pass (especially hdim 128 causal). We instead
+// reimplement the version of PipelineTmaAsync before v3.6.0 where only 1 out of
+// 128 threads signals the barrier.
+//
+// Assumption: params.num_consumers % NumThreadsPerWarpGroup == 0
+template <int Stages_, class Base = cutlass::PipelineTmaAsync<Stages_>>
+class PipelineTmaAsyncNoCluster : public Base {
+ public:
+  using FullBarrier = typename Base::FullBarrier;
+  using EmptyBarrier = typename Base::EmptyBarrier;
+  static constexpr uint32_t Stages = Stages_;
+  using PipelineState = typename Base::PipelineState;
+
+  using SharedStorage = typename Base::SharedStorage;
+  using ThreadCategory = typename Base::ThreadCategory;
+  using Params = typename Base::Params;
+
+  static CUTLASS_DEVICE void init_barriers(
+      SharedStorage& storage,
+      Params params) {
+    int warp_idx = canonical_warp_idx_sync();
+    bool is_initializing_warp = (warp_idx == 0);
+    if (is_initializing_warp) {
+      // Barrier FULL and EMPTY init
+      constexpr int producer_arv_cnt = 1;
+      uint32_t const num_consumer_warpgroups_per_cluster =
+          params.num_consumers / NumThreadsPerWarpGroup;
+      uint32_t const multicast_consumer_arrival_count =
+          num_consumer_warpgroups_per_cluster;
+
+      cutlass::arch::detail::initialize_barrier_array_pair_aligned<
+          decltype(storage.full_barrier_),
+          decltype(storage.empty_barrier_),
+          Stages>(
+          storage.full_barrier_,
+          storage.empty_barrier_,
+          producer_arv_cnt,
+          multicast_consumer_arrival_count);
+    }
+    cutlass::arch::fence_barrier_init();
+  }
+
+  template <class ClusterShape, class InitBarriers, class InitMasks>
+  CUTLASS_DEVICE PipelineTmaAsyncNoCluster(
+      SharedStorage& storage,
+      Params params,
+      ClusterShape cluster_shape,
+      InitBarriers = {},
+      InitMasks = {})
+      : Base(
+            storage,
+            params,
+            make_shape(_1{}, _1{}, _1{}) /*cluster_shape*/,
+            cute::false_type{} /*init_barriers*/,
+            cute::false_type{} /*init_masks*/),
+        empty_barrier_ptr_(&storage.empty_barrier_[0]) {
+    int warp_idx = canonical_warp_idx_sync();
+    int lane_predicate = cute::elect_one_sync();
+
+    static_assert(
+        cute::is_same_v<InitBarriers, cute::true_type> ||
+        cute::is_same_v<InitBarriers, cute::false_type>);
+    static_assert(
+        cute::is_same_v<InitMasks, cute::true_type> ||
+        cute::is_same_v<InitMasks, cute::false_type>);
+    if constexpr (cute::is_same_v<InitBarriers, cute::true_type>) {
+      init_barriers(storage, params);
+    }
+  }
+
+  // Constructor
+  template <class ClusterShape>
+  CUTLASS_DEVICE PipelineTmaAsyncNoCluster(
+      SharedStorage& storage,
+      Params params,
+      ClusterShape cluster_shape)
+      : PipelineTmaAsyncNoCluster(
+            storage,
+            params,
+            cluster_shape,
+            cute::true_type{},
+            cute::true_type{}) {}
+
+  template <class ClusterShape, class InitBarriers>
+  CUTLASS_DEVICE PipelineTmaAsyncNoCluster(
+      SharedStorage& storage,
+      Params params,
+      ClusterShape cluster_shape,
+      InitBarriers = {})
+      : PipelineTmaAsyncNoCluster(
+            storage,
+            params,
+            cluster_shape,
+            InitBarriers{},
+            cute::true_type{}) {}
+
+  CUTLASS_DEVICE
+  void consumer_release(PipelineState state) {
+    consumer_release(state.index());
+  }
+
+ private:
+  EmptyBarrier* const empty_barrier_ptr_ = nullptr;
+
+  // Consumer signalling Producer of completion
+  // Ensures all blocks in the Same Row and Column get notifed.
+  CUTLASS_DEVICE
+  void consumer_release(uint32_t stage, uint32_t skip = false) {
+    empty_barrier_ptr_[stage].arrive(
+        0 /*dst_blockid_*/,
+        uint32_t(threadIdx.x % cutlass::NumThreadsPerWarpGroup == 0) &
+            (!skip) /*is_signaling_thread*/);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // end namespace cutlass
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h
new file mode 100644
index 000000000..1bd6131c6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h
@@ -0,0 +1,256 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <cute/tensor.hpp>
+#include <cutlass/numeric_types.h>
+#include <cmath>
+#include "utils.h"
+
+namespace hstu {
+
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool zero_init = true,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1,
+    typename Operator>
+__device__ __forceinline__ void thread_reduce_(
+    Tensor<Engine0, Layout0> const& tensor,
+    Tensor<Engine1, Layout1>& summary,
+    Operator& op) {
+  static_assert(Layout0::rank == 2, "Only support 2D Tensor");
+  static_assert(Layout1::rank == 1, "Only support 1D Tensor");
+  CUTE_STATIC_ASSERT_V(size<0>(summary) == size<0>(tensor));
+#pragma unroll
+  for (int ni = 0; ni < size<1>(tensor); ni++) {
+#pragma unroll
+    for (int mi = 0; mi < size<0>(tensor); mi++) {
+      summary(mi) = zero_init && ni == 0 ? tensor(mi, ni)
+                                         : op(summary(mi), tensor(mi, ni));
+    }
+  }
+}
+
+template <
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1,
+    typename Operator>
+__device__ __forceinline__ void quad_allreduce_(
+    Tensor<Engine0, Layout0>& dst,
+    Tensor<Engine1, Layout1>& src,
+    Operator& op) {
+  CUTE_STATIC_ASSERT_V(size(dst) == size(src));
+#pragma unroll
+  for (int i = 0; i < size(dst); i++) {
+    dst(i) = Allreduce<4>::run(src(i), op);
+  }
+}
+
+template <
+    bool zero_init = true,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1,
+    typename Operator>
+__device__ __forceinline__ void reduce_(
+    Tensor<Engine0, Layout0> const& tensor,
+    Tensor<Engine1, Layout1>& summary,
+    Operator& op) {
+  thread_reduce_<zero_init>(tensor, summary, op);
+  quad_allreduce_(summary, summary, op);
+}
+
+template <
+    bool zero_init = true,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1>
+__device__ __forceinline__ void reduce_max(
+    Tensor<Engine0, Layout0> const& tensor,
+    Tensor<Engine1, Layout1>& max) {
+  MaxOp<float> max_op;
+  reduce_<zero_init>(tensor, max, max_op);
+}
+
+template <
+    bool zero_init = true,
+    bool warp_reduce = true,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1>
+__device__ __forceinline__ void reduce_sum(
+    Tensor<Engine0, Layout0> const& tensor,
+    Tensor<Engine1, Layout1>& sum) {
+  SumOp<float> sum_op;
+  thread_reduce_<zero_init>(tensor, sum, sum_op);
+  if constexpr (warp_reduce) {
+    quad_allreduce_(sum, sum, sum_op);
+  }
+}
+
+// Apply the exp to all the elements.
+template <
+    bool Scale_max = true,
+    bool Check_inf = true,
+    int Max_offset = 0,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1>
+__forceinline__ __device__ void scale_apply_exp2(
+    Tensor<Engine0, Layout0>& tensor,
+    Tensor<Engine1, Layout1> const& max,
+    const float scale) {
+  // For FP8, we can subtract max by 8.0 so that the value after exp2 is in the
+  // range of [0, 256]. This lets us use more of the FP8 range (instead of just
+  // [0, 1]) to reduce underflow.
+  static constexpr float max_offset =
+      float(Max_offset); // We can only template on int, not float
+  static_assert(Layout0::rank == 2, "Only support 2D Tensor");
+  static_assert(Layout1::rank == 1, "Only support 1D Tensor");
+  CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor));
+#pragma unroll
+  for (int mi = 0; mi < size<0>(tensor); ++mi) {
+    // If max is -inf, then all elements must have been -inf (possibly due to
+    // masking). We don't want (-inf - (-inf)) since that would give NaN.
+    const float max_scaled = Check_inf
+        ? (max(mi) == -INFINITY
+               ? 0.f
+               : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset)
+        : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset;
+#pragma unroll
+    for (int ni = 0; ni < size<1>(tensor); ++ni) {
+      // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
+      // max * log_2(e)). This allows the compiler to use the ffma
+      // instruction instead of fadd and fmul separately.
+      tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int kNRows, int Max_offset = 0>
+struct Softmax {
+  using TensorT = decltype(make_tensor<float>(Shape<Int<kNRows>>{}));
+  TensorT row_max, row_sum;
+  float const softmax_scale_log2;
+
+  CUTLASS_DEVICE Softmax(float const softmax_scale_log2_)
+      : softmax_scale_log2(softmax_scale_log2_) {};
+
+  template <bool Is_first, bool Check_inf = false, typename Tensor0>
+  __forceinline__ __device__ TensorT max_get_scale(Tensor0& acc_s) {
+    // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M),
+    // ncol=(2, V, MMA_N))
+    Tensor scores = make_tensor(
+        acc_s.data(), hstu::convert_layout_acc_rowcol(acc_s.layout()));
+    static_assert(CUTE_STATIC_V(size<0>(scores)) == kNRows);
+    TensorT scores_scale;
+    if constexpr (Is_first) {
+      hstu::template reduce_max</*zero_init=*/true>(scores, row_max);
+      cute::fill(scores_scale, 1.f);
+    } else {
+      Tensor scores_max_prev = make_fragment_like(row_max);
+      cute::copy(row_max, scores_max_prev);
+      hstu::template reduce_max</*zero_init=*/false>(scores, row_max);
+#pragma unroll
+      for (int mi = 0; mi < size(row_max); ++mi) {
+        float scores_max_cur = !Check_inf
+            ? row_max(mi)
+            : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
+        scores_scale(mi) =
+            exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
+        row_sum(mi) *= scores_scale(mi);
+      }
+    }
+    return scores_scale;
+  };
+
+  template <bool Is_first, bool Check_inf = false, typename Tensor0>
+  __forceinline__ __device__ void online_softmax(Tensor0& acc_s) {
+    // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M),
+    // ncol=(2, V, MMA_N))
+    Tensor scores = make_tensor(
+        acc_s.data(), hstu::convert_layout_acc_rowcol(acc_s.layout()));
+    static_assert(CUTE_STATIC_V(size<0>(scores)) == kNRows);
+    hstu::template scale_apply_exp2</*Scale_max=*/true, Check_inf, Max_offset>(
+        scores, row_max, softmax_scale_log2);
+    // We don't do the reduce across threads here since we don't need to use the
+    // row_sum. We do that reduce at the end when we need to normalize the
+    // softmax.
+    hstu::reduce_sum</*zero_init=*/Is_first, /*warp_reduce=*/false>(
+        scores, row_sum);
+  };
+
+  __forceinline__ __device__ TensorT finalize(float const final_scale = 1.f) {
+    SumOp<float> sum_op;
+    quad_allreduce_(row_sum, row_sum, sum_op);
+    TensorT scores_scale;
+#pragma unroll
+    for (int mi = 0; mi < size(row_sum); ++mi) {
+      float sum = row_sum(mi);
+      float inv_sum = (sum == 0.f || sum != sum) ? 0.f : 1.f / sum;
+      scores_scale(mi) = inv_sum * final_scale;
+      // For FP8, we might have scaled the output of exp by 2**8 so we need to
+      // divide sum by that amount.
+      if constexpr (Max_offset != 0) {
+        static constexpr float sum_scale = 1.f / float(1 << Max_offset);
+        sum *= sum_scale;
+      }
+      row_sum(mi) = (sum == 0.f || sum != sum)
+          ? -INFINITY
+          : row_max(mi) * (softmax_scale_log2 * float(M_LN2)) + __logf(sum);
+    }
+    return scores_scale;
+  };
+
+  template <typename Tensor1>
+  __forceinline__ __device__ void rescale_o(
+      Tensor1& acc_o,
+      TensorT const& scores_scale) {
+    // Reshape acc_o from (MMA=4, MMA_M, MMA_K) to (nrow=(2, MMA_M), ncol=(2,
+    // MMA_K))
+    Tensor acc_o_rowcol = make_tensor(
+        acc_o.data(), hstu::convert_layout_acc_rowcol(acc_o.layout()));
+    static_assert(CUTE_STATIC_V(size<0>(acc_o_rowcol)) == kNRows);
+#pragma unroll
+    for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) {
+#pragma unroll
+      for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
+        acc_o_rowcol(mi, ni) *= scores_scale(mi);
+      }
+    }
+  };
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h
new file mode 100644
index 000000000..c5759c9d2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h
@@ -0,0 +1,135 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Inspired by
+// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
+// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
+
+#pragma once
+
+/// @param COND       - a boolean expression to switch by
+/// @param CONST_NAME - a name given for the constexpr bool variable.
+/// @param ...       - code to execute for true and false
+///
+/// Usage:
+/// ```
+/// BOOL_SWITCH(flag, BoolConst, [&] {
+///     some_function<BoolConst>(...);
+/// });
+/// ```
+//
+
+#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
+  [&] {                                         \
+    if (COND) {                                 \
+      constexpr static bool CONST_NAME = true;  \
+      return __VA_ARGS__();                     \
+    } else {                                    \
+      constexpr static bool CONST_NAME = false; \
+      return __VA_ARGS__();                     \
+    }                                           \
+  }()
+
+#ifdef FLASHATTENTION_DISABLE_LOCAL
+#define CAUSAL_LOCAL_SWITCH(                                           \
+    CAUSAL_COND, LOCAL_COND, CAUSAL_CONST_NAME, LOCAL_CONST_NAME, ...) \
+  [&] {                                                                \
+    constexpr static bool LOCAL_CONST_NAME = false;                    \
+    if (CAUSAL_COND) {                                                 \
+      constexpr static bool CAUSAL_CONST_NAME = true;                  \
+      return __VA_ARGS__();                                            \
+    } else {                                                           \
+      constexpr static bool CAUSAL_CONST_NAME = false;                 \
+      return __VA_ARGS__();                                            \
+    }                                                                  \
+  }()
+#else
+#define CAUSAL_LOCAL_SWITCH(                                           \
+    CAUSAL_COND, LOCAL_COND, CAUSAL_CONST_NAME, LOCAL_CONST_NAME, ...) \
+  [&] {                                                                \
+    if (CAUSAL_COND) {                                                 \
+      constexpr static bool CAUSAL_CONST_NAME = true;                  \
+      constexpr static bool LOCAL_CONST_NAME = false;                  \
+      return __VA_ARGS__();                                            \
+    } else if (LOCAL_COND) {                                           \
+      constexpr static bool CAUSAL_CONST_NAME = false;                 \
+      constexpr static bool LOCAL_CONST_NAME = true;                   \
+      return __VA_ARGS__();                                            \
+    } else {                                                           \
+      constexpr static bool CAUSAL_CONST_NAME = false;                 \
+      constexpr static bool LOCAL_CONST_NAME = false;                  \
+      return __VA_ARGS__();                                            \
+    }                                                                  \
+  }()
+#endif
+
+#ifdef FLASHATTENTION_DISABLE_CLUSTER
+#define CLUSTER_SWITCH(COND, CONST_NAME, ...) \
+  [&] {                                       \
+    constexpr static bool CONST_NAME = false; \
+    return __VA_ARGS__();                     \
+  }()
+#else
+#define CLUSTER_SWITCH BOOL_SWITCH
+#endif
+
+// #ifdef FLASHATTENTION_DISABLE_SM8x
+#define ARCH_SWITCH(ARCH, ARCH_NAME, ...) \
+  [&] {                                   \
+    constexpr static int ARCH_NAME = 90;  \
+    return __VA_ARGS__();                 \
+  }()
+// #else
+// #define ARCH_SWITCH(ARCH, ARCH_NAME, ...)  \
+//   [&] {                                    \
+//     if (ARCH < 90) {                       \
+//       constexpr static int ARCH_NAME = 80; \
+//       return __VA_ARGS__();                \
+//     } else {                               \
+//       constexpr static int ARCH_NAME = 90; \
+//       return __VA_ARGS__();                \
+//     }                                      \
+//   }()
+// #endif
+
+#ifndef FLASHATTENTION_ENABLE_VCOLMAJOR
+#define VCOLMAJOR_SWITCH(COND, CONST_NAME, ...) \
+  [&] {                                         \
+    constexpr static bool CONST_NAME = false;   \
+    return __VA_ARGS__();                       \
+  }()
+#else
+#define VCOLMAJOR_SWITCH BOOL_SWITCH
+#endif
+
+#define HEADDIM_SWITCH(HEADDIM, ...)        \
+  [&] {                                     \
+    if (HEADDIM == 64) {                    \
+      constexpr static int kHeadSize = 64;  \
+      return __VA_ARGS__();                 \
+    } else if (HEADDIM == 96) {             \
+      constexpr static int kHeadSize = 96;  \
+      return __VA_ARGS__();                 \
+    } else if (HEADDIM == 128) {            \
+      constexpr static int kHeadSize = 128; \
+      return __VA_ARGS__();                 \
+    } else if (HEADDIM == 96) {             \
+      constexpr static int kHeadSize = 96;  \
+      return __VA_ARGS__();                 \
+    } else if (HEADDIM == 256) {            \
+      constexpr static int kHeadSize = 256; \
+      return __VA_ARGS__();                 \
+    }                                       \
+  }()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h
new file mode 100644
index 000000000..cde7837ce
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h
@@ -0,0 +1,616 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include "cute/tensor.hpp"
+#include "cutlass/arch/barrier.h"
+#include "cutlass/fast_math.h"
+
+#include "named_barrier.h"
+
+namespace hstu {
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Host side kernel arguments
+struct TileSchedulerArguments {
+  int const num_blocks, num_head, num_batch;
+  int const max_seq_len, headdim,
+      element_size; // Used to calculate L2 swizzling
+  int* const tile_count_semaphore = nullptr;
+  int* const seq_offsets = nullptr;
+  int* const sort_by_length_indices = nullptr;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool Jagged = false,
+    int kBlock = 128,
+    bool Sort_by_length_indices = false>
+class SingleTileScheduler {
+ public:
+  using SharedStorage = int;
+
+  // Device side kernel params
+  struct Params {
+    int const num_blocks, num_head, num_batch;
+    int const max_seq_len;
+    int* const seq_offsets;
+    int* const sort_by_length_indices;
+  };
+
+  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
+    return {
+        args.num_blocks,
+        args.num_head,
+        args.num_batch,
+        args.max_seq_len,
+        !Jagged ? nullptr : args.seq_offsets,
+        !Sort_by_length_indices ? nullptr : args.sort_by_length_indices};
+  }
+
+  static dim3 get_grid_shape(Params const& params, int num_sm) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::printf(
+        "SingleTileScheduler::get_grid_shape: %d, %d, %d\n",
+        params.num_blocks,
+        params.num_head,
+        params.num_batch);
+#endif
+    return {
+        uint32_t(params.num_blocks),
+        uint32_t(params.num_head),
+        uint32_t(params.num_batch)};
+  }
+
+  struct WorkTileInfo {
+    int block_idx = 0;
+    int bidh = 0;
+    int bidb = 0;
+    bool is_valid_tile = false;
+
+    CUTLASS_DEVICE
+    bool is_valid(Params const& params) const {
+      return is_valid_tile;
+    }
+
+    CUTLASS_DEVICE
+    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
+        Params const& params) const {
+      return {block_idx, bidh, bidb, 0 /*split_idx*/};
+    }
+  };
+
+  CUTLASS_DEVICE
+  SingleTileScheduler(SharedStorage* const smem_scheduler) {}
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
+    int bidb = int(blockIdx.z);
+    if constexpr (Sort_by_length_indices) {
+      bidb = params.sort_by_length_indices[bidb];
+    }
+    WorkTileInfo work_info{int(blockIdx.x), int(blockIdx.y), bidb, true};
+    if constexpr (Jagged) {
+      int seqlen =
+          (params.seq_offsets ? params.seq_offsets[work_info.bidb + 1] -
+                   params.seq_offsets[work_info.bidb]
+                              : params.max_seq_len);
+      work_info.is_valid_tile = work_info.block_idx * kBlock < seqlen;
+    }
+    return work_info;
+  }
+
+  CUTLASS_DEVICE
+  void init_consumer() const {}
+
+  CUTLASS_DEVICE
+  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
+      const {}
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo
+  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
+    return {-1, -1, -1, false};
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class StaticPersistentTileScheduler {
+ public:
+  using SharedStorage = int;
+
+  // Device side kernel params
+  struct Params {
+    int total_blocks;
+    cutlass::FastDivmod m_block_divmod, head_divmod;
+    cutlass::FastDivmod nsplits_divmod;
+  };
+
+  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
+    return {
+        args.num_blocks * args.num_head * args.num_batch,
+        cutlass::FastDivmod(args.num_blocks),
+        cutlass::FastDivmod(args.num_head),
+        cutlass::FastDivmod(1)};
+  }
+
+  static dim3 get_grid_shape(Params const& params, int num_sm) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::printf("StaticPersistentTileScheduler::get_grid_shape %d\n", num_sm);
+#endif
+    return {uint32_t(num_sm)};
+  }
+
+  struct WorkTileInfo {
+    int tile_idx;
+
+    CUTLASS_DEVICE
+    bool is_valid(Params const& params) const {
+      return tile_idx < params.total_blocks;
+    }
+
+    CUTLASS_DEVICE
+    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
+        Params const& params) const {
+      int block, bidh, bidb;
+      bidb = params.head_divmod.divmod(
+          bidh, params.m_block_divmod.divmod(block, tile_idx));
+      int split_idx = 0;
+      return {block, bidh, bidb, split_idx};
+    }
+  };
+
+  CUTLASS_DEVICE
+  StaticPersistentTileScheduler(SharedStorage* const smem_scheduler) {};
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
+    return {int(blockIdx.x)};
+  }
+
+  CUTLASS_DEVICE
+  void init_consumer() const {}
+
+  CUTLASS_DEVICE
+  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
+      const {}
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo
+  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
+    return {current_work.tile_idx + int(gridDim.x)};
+  }
+};
+
+template <
+    int NumMmaThreads = 2 * cutlass::NumThreadsPerWarpGroup,
+    int NumProducerThreads = cutlass::NumThreadsPerWarp,
+    bool WarpSpecialized = true>
+class DynamicPersistentTileScheduler {
+  // This scheduler targets the causal (or local) case where each tile takes
+  // different amount of time. We use longest-processing-time-first scheduling:
+  // the longest remaining tile is assigned to the first SM that's free.
+  // SM indicates they are free by incrementing a semaphore.
+  // However, we have to make sure K & V still fit into L2 cache, so we perform
+  // scheduling on "sections" of the head & batch dimension, each section
+  // consisting of e.g. 8 heads. This is the L2 swizzling part. The size of each
+  // section is precomputed based on the size of K & V and the L2 cache size.
+
+  static_assert(WarpSpecialized || NumProducerThreads == NumMmaThreads);
+  static constexpr int NumThreads =
+      WarpSpecialized ? NumMmaThreads + NumProducerThreads : NumMmaThreads;
+
+ public:
+  using SharedStorage = int;
+
+ protected:
+  SharedStorage* const tile_count_smem;
+
+ public:
+  // Device side kernel params
+  struct Params {
+    int const total_blocks;
+    cutlass::FastDivmod const m_block_divmod, head_divmod;
+    cutlass::FastDivmod const l2_minor_divmod, l2_major_divmod;
+    cutlass::FastDivmod const l2_minor_residual_divmod;
+    int const num_hb_quotient;
+    int* const tile_count_semaphore;
+  };
+
+  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
+    int const size_one_kv_head =
+        args.max_seq_len * args.headdim * args.element_size * 2;
+    int const size_l2 = 32 * 1024 * 1024; // 32 MB for K & V
+    // Swizzle is the size of each "section". Round swizzle to a power of 2
+    // If not PackGQA already, the size of each section can increase by
+    // qhead_per_khead
+    int const swizzle = (1 << cutlass::find_log2(size_l2 / size_one_kv_head));
+    // If we're in the last section (called residual), we don't want to divide
+    // by swizzle. Instead we want to divide by the remainder.
+    int const num_hb_remainder = (args.num_head * args.num_batch) % swizzle;
+    int const num_split_blocks = args.num_blocks;
+    // printf("num_split_blocks = %d, num_head = %d, num_batch = %d, swizzle =
+    // %d, PackGQA = %d, qhead_per_khead = %d, num_hb_remainder = %d\n",
+    // num_split_blocks, args.num_head, args.num_batch, swizzle, int(PackGQA),
+    // args.qhead_per_khead, num_hb_remainder);
+    assert(args.tile_count_semaphore != nullptr);
+    return {
+        num_split_blocks * args.num_head * args.num_batch,
+        cutlass::FastDivmod(args.num_blocks),
+        cutlass::FastDivmod(args.num_head),
+        cutlass::FastDivmod(swizzle),
+        cutlass::FastDivmod(swizzle * num_split_blocks),
+        // don't divide by 0
+        cutlass::FastDivmod(num_hb_remainder > 0 ? num_hb_remainder : 1),
+        (args.num_head * args.num_batch) / swizzle,
+        args.tile_count_semaphore};
+  }
+
+  static dim3 get_grid_shape(Params const& params, int num_sm) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::printf("DynamicPersistentTileScheduler::get_grid_shape %d\n", num_sm);
+#endif
+    return {uint32_t(num_sm)};
+  }
+
+  struct WorkTileInfo {
+    int tile_idx;
+
+    CUTLASS_DEVICE
+    bool is_valid(Params const& params) const {
+      return tile_idx < params.total_blocks;
+    }
+
+    CUTLASS_DEVICE
+    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
+        Params const& params) const {
+      int block, bidh, bidb;
+      int l2_mod, bidhb, bidhb_residual;
+      bidhb = params.l2_major_divmod.divmod(l2_mod, tile_idx);
+      // If we're in the last section (called residual), we don't want to divide
+      // by swizzle. Instead we want to divide by the remainder.
+      if (bidhb < params.num_hb_quotient) {
+        block = params.l2_minor_divmod.divmod(bidhb_residual, l2_mod);
+      } else {
+        block = params.l2_minor_residual_divmod.divmod(bidhb_residual, l2_mod);
+      }
+      bidb = params.head_divmod.divmod(
+          bidh, bidhb * params.l2_minor_divmod.divisor + bidhb_residual);
+      int split_idx = 0;
+      // Longest-processing-time-first
+      block = params.m_block_divmod.divisor - 1 - block;
+      return {block, bidh, bidb, split_idx};
+    }
+  };
+
+  CUTLASS_DEVICE
+  DynamicPersistentTileScheduler(SharedStorage* const smem_scheduler)
+      : tile_count_smem(smem_scheduler) {};
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
+    return {int(blockIdx.x)};
+  }
+
+  CUTLASS_DEVICE
+  void init_consumer() const {
+    if (WarpSpecialized || cutlass::canonical_warp_idx_sync() > 0) {
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+    }
+  }
+
+  CUTLASS_DEVICE
+  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
+      const {
+    if (threadIdx.x % NumProducerThreads == 0) {
+      current_work.tile_idx =
+          atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x);
+    }
+  }
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo
+  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
+    if constexpr (IsProducerWarp) {
+      // thread 0 already has the right tile_idx, just need to broadcast to the
+      // rest of warp 0
+      int new_tile_idx =
+          __shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/);
+      hstu::named_barrier_sync(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+      if (threadIdx.x % NumProducerThreads == 0) {
+        *tile_count_smem = current_work.tile_idx;
+      }
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      return {new_tile_idx};
+    } else {
+      hstu::named_barrier_sync(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      int tile_idx = *tile_count_smem;
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+      return {tile_idx};
+    }
+  }
+};
+
+template <
+    int kBlock,
+    int NumMmaThreads = 2 * cutlass::NumThreadsPerWarpGroup,
+    int NumProducerThreads = cutlass::NumThreadsPerWarp,
+    bool WarpSpecialized = true>
+class VarlenDynamicPersistentTileScheduler {
+  static_assert(WarpSpecialized || NumProducerThreads == NumMmaThreads);
+  static constexpr int NumThreads =
+      WarpSpecialized ? NumMmaThreads + NumProducerThreads : NumMmaThreads;
+
+ public:
+  using SharedStorage = int4;
+
+ protected:
+  SharedStorage* const work_info_smem;
+
+ public:
+  // Device side kernel params
+  struct Params {
+    int num_head, num_batch;
+    int const max_seq_len;
+    cutlass::FastDivmod nsplits_divmod;
+    int* const tile_count_semaphore;
+    int* const seq_offsets;
+  };
+
+  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
+    // If Split, for the purpose of scheduling, we pretend that instead there
+    // are (args.num_splits * args.num_head) number of heads.
+    assert(args.tile_count_semaphore != nullptr);
+    return {
+        args.num_head,
+        args.num_batch,
+        args.max_seq_len,
+        cutlass::FastDivmod(1),
+        args.tile_count_semaphore,
+        args.seq_offsets};
+  }
+
+  static dim3 get_grid_shape(Params const& params, int num_sm) {
+#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
+    std::printf(
+        "VarlenDynamicPersistentTileScheduler::get_grid_shape %d\n", num_sm);
+#endif
+    return {uint32_t(num_sm)};
+  }
+
+  struct WorkTileInfo {
+    int tile_idx, block, bidh, bidb;
+
+    CUTLASS_DEVICE
+    bool is_valid(Params const& params) const {
+      // if (blockIdx.x >= 0 && (threadIdx.x == 128 || threadIdx.x == 0)) {
+      // printf("blockIdx.x = %d, threadIdx.x = %d, checking valid, bidb = %d,
+      // params.num_batch = %d\n", blockIdx.x, threadIdx.x, bidb,
+      // params.num_batch); }
+      return bidb < params.num_batch;
+    }
+
+    CUTLASS_DEVICE
+    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
+        Params const& params) const {
+      return {block, bidh, bidb, 0 /*split_idx*/};
+    }
+  };
+
+  CUTLASS_DEVICE
+  VarlenDynamicPersistentTileScheduler(SharedStorage* const smem_scheduler)
+      : work_info_smem(smem_scheduler) {};
+
+  CUTLASS_DEVICE
+  WorkTileInfo tile_idx_to_work_tile(
+      Params const& params,
+      int next_tile_idx,
+      WorkTileInfo const& current_work) const {
+    auto prefix_sum = [](int val) {
+      auto lane = threadIdx.x % cutlass::NumThreadsPerWarp;
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 1; i < cutlass::NumThreadsPerWarp; i <<= 1) {
+        int32_t partial_sum = __shfl_up_sync(0xffffffff, val, i);
+        if (lane >= i) {
+          val += partial_sum;
+        }
+      }
+      return val;
+    };
+
+    auto get_num_m_blocks = [&](int bidb_start) {
+      auto lane = threadIdx.x % cutlass::NumThreadsPerWarp;
+      int seqlen;
+      if (params.seq_offsets) {
+        int cur_cu_seqlen = lane + bidb_start <= params.num_batch
+            ? params.seq_offsets[lane + bidb_start]
+            : 0;
+        int next_cu_seqlen = __shfl_down_sync(0xffffffff, cur_cu_seqlen, 1);
+        seqlen = next_cu_seqlen - cur_cu_seqlen;
+      } else {
+        seqlen = params.max_seq_len;
+      }
+      return lane + bidb_start < params.num_batch &&
+              lane < cutlass::NumThreadsPerWarp - 1
+          ? cute::ceil_div(seqlen, kBlock)
+          : 0;
+    };
+
+    int num_m_blocks =
+        get_num_m_blocks(current_work.bidb); // Different for each lane
+    // Cumulative number of blocks for the next 31 batches
+    int num_m_blocks_cumulative = prefix_sum(num_m_blocks);
+    // Total number of blocks for the next 31 batches
+    int m_blocks_in_group = __shfl_sync(
+        0xffffffff, num_m_blocks_cumulative, cutlass::NumThreadsPerWarp - 1);
+    int group_end_tile = current_work.tile_idx - current_work.block -
+        current_work.bidh * __shfl_sync(0xffffffff, num_m_blocks, 0 /*lane*/) +
+        m_blocks_in_group * params.num_head; // Same for all lanes
+    int bidb = current_work.bidb;
+    // if (blockIdx.x <= 9 && threadIdx.x == 0) {
+    //     printf("Before while, blockIdx.x = %d, threadIdx.x = %d, bidb = %d,
+    //     num_m_blocks = %d, next_tile_idx = %d, group_end_tile = %d,
+    //     m_blocks_in_group = %d\n", blockIdx.x, threadIdx.x, bidb,
+    //     num_m_blocks, next_tile_idx, group_end_tile, m_blocks_in_group);
+    // }
+    while (group_end_tile <= next_tile_idx) {
+      bidb += cutlass::NumThreadsPerWarp - 1;
+      if (bidb >= params.num_batch) {
+        // if (blockIdx.x <= 9 && threadIdx.x == 0) {
+        //     printf("Returning early, blockIdx.x = %d, threadIdx.x = %d, bidb
+        //     = %d, num_m_blocks = %d, next_tile_idx = %d, group_end_tile = %d,
+        //     m_blocks_in_group = %d\n", blockIdx.x, threadIdx.x, bidb,
+        //     num_m_blocks, next_tile_idx, group_end_tile, m_blocks_in_group);
+        // }
+        return {next_tile_idx, 0, 0, params.num_batch};
+      }
+      num_m_blocks = get_num_m_blocks(bidb);
+      num_m_blocks_cumulative = prefix_sum(num_m_blocks);
+      m_blocks_in_group = __shfl_sync(
+          0xffffffff, num_m_blocks_cumulative, cutlass::NumThreadsPerWarp - 1);
+      group_end_tile += m_blocks_in_group * params.num_head;
+      // if (blockIdx.x <= 9 && threadIdx.x == 0) {
+      //     printf("Bottom of while, blockIdx.x = %d, threadIdx.x = %d, bidb =
+      //     %d, num_m_blocks = %d, next_tile_idx = %d, group_end_tile = %d,
+      //     m_blocks_in_group = %d\n", blockIdx.x, threadIdx.x, bidb,
+      //     num_m_blocks, next_tile_idx, group_end_tile, m_blocks_in_group);
+      // }
+    }
+    int group_start_tile = group_end_tile - m_blocks_in_group * params.num_head;
+    // The next problem to process is the first one that does not have ending
+    // tile position that is greater than or equal to tile index.
+    int batch_idx_in_group = __popc(__ballot_sync(
+        0xffffffff,
+        group_start_tile + num_m_blocks_cumulative * params.num_head <=
+            next_tile_idx));
+    bidb += batch_idx_in_group;
+    num_m_blocks = __shfl_sync(0xffffffff, num_m_blocks, batch_idx_in_group);
+    int mh_block = next_tile_idx - group_start_tile -
+        (batch_idx_in_group == 0 ? 0
+                                 : __shfl_sync(
+                                       0xffffffff,
+                                       num_m_blocks_cumulative,
+                                       batch_idx_in_group - 1)) *
+            params.num_head;
+    int bidh = mh_block / num_m_blocks;
+    int block = mh_block - bidh * num_m_blocks;
+    // if (blockIdx.x <= 9 && threadIdx.x == 0) {
+    //     printf("blockIdx.x = %d, threadIdx.x = %d, batch_idx_in_group = %d,
+    //     bidb = %d, num_m_blocks = %d, next_tile_idx = %d, group_end_tile =
+    //     %d, m_blocks_in_group = %d, mh_block = %d, bidh = %d, block = %d\n",
+    //     blockIdx.x, threadIdx.x, batch_idx_in_group, bidb, num_m_blocks,
+    //     next_tile_idx, group_end_tile, m_blocks_in_group, mh_block, bidh,
+    //     block);
+    // }
+    return {next_tile_idx, block, bidh, bidb};
+  }
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
+    if constexpr (IsProducerWarp) {
+      WorkTileInfo work_info =
+          tile_idx_to_work_tile(params, int(blockIdx.x), {0, 0, 0, 0});
+      if (threadIdx.x % cutlass::NumThreadsPerWarp == 0) {
+        *work_info_smem = make_int4(
+            work_info.tile_idx,
+            work_info.block,
+            work_info.bidh,
+            work_info.bidb);
+      }
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      return work_info;
+    } else {
+      return get_next_work<false>(params, {0, 0, 0, 0});
+    }
+  }
+
+  CUTLASS_DEVICE
+  void init_consumer() const {
+    // Don't arrive at the TileCountSmemEmpty barrier here, because
+    // get_initial_work will do that
+  }
+
+  CUTLASS_DEVICE
+  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
+      const {
+    if (threadIdx.x % NumProducerThreads == 0) {
+      current_work.tile_idx =
+          atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x);
+    }
+  }
+
+  template <bool IsProducerWarp = false>
+  CUTLASS_DEVICE WorkTileInfo
+  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
+    if constexpr (IsProducerWarp) {
+      // thread 0 has the next tile_idx, just need to broadcast to the rest of
+      // warp 0
+      int new_tile_idx =
+          __shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/);
+      WorkTileInfo work_info = {
+          __shfl_sync(0xffffffff, current_work.tile_idx, 1 /*lane*/),
+          current_work.block,
+          current_work.bidh,
+          current_work.bidb};
+      work_info = tile_idx_to_work_tile(params, new_tile_idx, work_info);
+      hstu::named_barrier_sync(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+      if (threadIdx.x % cutlass::NumThreadsPerWarp == 0) {
+        *work_info_smem = make_int4(
+            work_info.tile_idx,
+            work_info.block,
+            work_info.bidh,
+            work_info.bidb);
+      }
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      return work_info;
+    } else {
+      hstu::named_barrier_sync(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
+      int4 work_info = *work_info_smem;
+      hstu::named_barrier_arrive(
+          NumThreads,
+          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
+      return WorkTileInfo{work_info.x, work_info.y, work_info.z, work_info.w};
+    }
+  }
+};
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h
new file mode 100644
index 000000000..3c8968bda
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h
@@ -0,0 +1,220 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
+ *Pradeep Ramani, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <tuple>
+
+namespace hstu {
+
+constexpr int kBlockM_bwd(
+    const int arch,
+    const int headdim,
+    const bool causal,
+    const bool is_local) {
+  int const kBlockM_sm90 = headdim <= 64
+      ? 64
+      : (headdim <= 96
+             ? 64
+             : (headdim <= 128 ? (causal || is_local ? 64 : 80) : 64));
+  int const kBlockM_sm80 = headdim <= 64 ? 128 : 64;
+  int const kBlockM = arch >= 90 ? kBlockM_sm90 : kBlockM_sm80;
+  return kBlockM;
+}
+
+constexpr int kBlockN_bwd(const int arch, const int headdim) {
+  int const kBlockN_sm90 = headdim <= 128 ? 128 : (headdim <= 192 ? 96 : 80);
+  int const kBlockN_sm80 = headdim <= 128 ? 128 : (headdim <= 192 ? 80 : 64);
+  int const kBlockN = arch >= 90 ? kBlockN_sm90 : kBlockN_sm80;
+  return kBlockN;
+}
+
+constexpr int NumMmaWarpGroups_bwd(const int arch, const int headdim) {
+  if (headdim <= 128) {
+    return 2;
+  } else if (headdim == 192) {
+    return arch >= 90 ? 3 : 2;
+  } else {
+    return 2;
+  }
+}
+
+constexpr bool V_in_regs_bwd(const int arch, const int headdim) {
+  if (arch >= 90 && headdim == 96) {
+    return true;
+  }
+  return false;
+}
+
+// Stages_dO, Stages_dS_or_QSm80
+constexpr std::tuple<int, int> Stages_bwd(const int arch, const int headdim) {
+  if (headdim <= 128) {
+    return {2, 2};
+  }
+  if (headdim == 192) {
+    if (arch >= 90) {
+      return {1, 1};
+    } else {
+      return {1, 2};
+    }
+  } else {
+    return {1, 1};
+  }
+}
+
+// AtomLayoutMSdP, AtomLayoutNdKV, AtomLayoutMdQ
+constexpr std::tuple<int, int, int> AtomLayout_bwd(
+    const int arch,
+    const int headdim) {
+  if (headdim <= 64) {
+    if (arch >= 90) {
+      return {1, 2, 1};
+    } else {
+      return {4, 4, 4};
+    }
+  } else if (headdim <= 96) {
+    if (arch >= 90) {
+      return {1, 2, 1};
+    } else {
+      return {2, 4, 2};
+    }
+  } else if (headdim <= 128) {
+    if (arch >= 90) {
+      return {1, 2, 1};
+    } else {
+      return {2, 2, 2};
+    }
+  } else {
+    if (arch >= 90) {
+      return {1, 1, 1};
+    } else {
+      return {4, 2, 2};
+    }
+  }
+}
+
+// SdP_swapAB, dKV_swapAB, dQ_swapAB
+constexpr std::tuple<bool, bool, bool> swapAB_bwd(
+    const int arch,
+    const int headdim,
+    const bool causal,
+    const bool local) {
+  if (headdim <= 96) {
+    return {arch >= 90 ? true : false, false, false};
+  } else if (headdim == 128) {
+    bool SdP_swapAB = arch >= 90 ? true : false;
+    bool dKV_swapAB = false;
+    bool dQ_swapAB = arch >= 90 ? ((causal || local) ? false : true) : false;
+    return {SdP_swapAB, dKV_swapAB, dQ_swapAB};
+  } else if (headdim == 192) {
+    return {false, true, false};
+  } else {
+    return {false, arch >= 90 ? true : false, arch >= 90 ? true : false};
+  }
+}
+
+// Return {kBlockM, kBlockN, Mma1_is_RS}
+constexpr std::tuple<int, int, bool> tile_size_fwd_sm90(
+    int headdim,
+    bool is_causal,
+    bool is_local,
+    int element_size = 2,
+    bool v_colmajor = false,
+    bool Cross = false,
+    bool Training = true) {
+  // for cross attention, q is usually much smaller than k/v, so we reduce the
+  // BlockM size to increase parallelism
+  bool small_blockm = Cross && (!Training);
+  if (element_size == 2) {
+    if (headdim <= 64) {
+      return {small_blockm ? 64 : 192, 128, true};
+      // Good for long seqlen (>= 4k) but suffers from tile quantization at
+      // short seqlen return {192, is_causal || is_local ? 192 : 176, true,
+      // false};
+    } else if (headdim <= 96) {
+      return {small_blockm ? 64 : 192, is_local ? 128 : 144, false};
+    } else if (headdim <= 128) {
+      return {small_blockm ? 64 : 128, is_causal || is_local ? 128 : 176, true};
+      // {128, 192, false, false} and {192, 128, false, true} are quite good too
+      // 128 x 192 hits the limit of smem if Mma1_is_RS, 128 x 144 hits the
+      // limit if !Mma1_is_RS
+    } else if (headdim <= 192) {
+      return {
+          small_blockm ? 64 : 128,
+          is_local ? 96 : 112,
+          true}; // 128 x 112 hits the limit of smem
+    } else {
+      return {
+          small_blockm ? 64 : 128,
+          is_local ? 64 : 80,
+          true}; // 128 x 80 hits the limit of smem
+    }
+  } else {
+    if (headdim <= 64) {
+      return {192, 160, true};
+    } else if (headdim <= 96) {
+      return {192, 128, true};
+    } else if (headdim <= 128) {
+      return {128, (v_colmajor ? 192 : 224), true};
+    } else if (headdim <= 192) {
+      return {128, 160, true};
+    } else {
+      return {128, is_local ? 64 : 128, true};
+    }
+  }
+}
+
+// Return {kBlockM, kBlockN, kNWarps, kStages, Q_in_regs}
+constexpr std::tuple<int, int, int, int, bool> tile_size_fwd_sm8x(
+    bool sm86_or_89,
+    int headdim,
+    bool is_causal,
+    bool is_local,
+    int element_size = 2) {
+  if (element_size == 2) {
+    if (headdim <= 64) {
+      return {128, (is_local ? 96 : 112), 4, 1, false};
+    } else if (headdim <= 96) {
+      return {128, is_local ? 48 : 64, 4, 1, false};
+    } else if (headdim <= 128) {
+      bool const use_8_warps = sm86_or_89;
+      return {
+          128,
+          use_8_warps ? (is_local ? 96 : 128) : (is_local ? 48 : 64),
+          use_8_warps ? 8 : 4,
+          1,
+          use_8_warps};
+    } else if (headdim <= 192) {
+      bool const kBlockN_64 = is_local;
+      return {128, kBlockN_64 ? 64 : 96, 8, sm86_or_89 ? 1 : 2, !kBlockN_64};
+    } else {
+      return {
+          128,
+          sm86_or_89 ? (is_local ? 48 : 64) : (is_local ? 64 : 96),
+          8,
+          1,
+          false};
+    }
+  } else {
+    // Placeholder for now
+    return {128, 64, 8, 2, false};
+  }
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h
new file mode 100644
index 000000000..50a065ed4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h
@@ -0,0 +1,789 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/******************************************************************************
+ * Copyright (c) 2024, Tri Dao.
+ ******************************************************************************/
+
+#pragma once
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <cuda_fp16.h>
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#include <cuda_bf16.h>
+#endif
+
+#include <cute/tensor.hpp>
+#include <cutlass/gemm/device/gemm.h>
+
+#include <cutlass/array.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/numeric_conversion.h>
+#include <cutlass/numeric_types.h>
+
+#define CHECK_CUDA(call)                \
+  do {                                  \
+    cudaError_t status_ = call;         \
+    if (status_ != cudaSuccess) {       \
+      fprintf(                          \
+          stderr,                       \
+          "CUDA error (%s:%d): %s\n",   \
+          __FILE__,                     \
+          __LINE__,                     \
+          cudaGetErrorString(status_)); \
+      exit(1);                          \
+    }                                   \
+  } while (0)
+
+#define CHECK_CUDA_KERNEL_LAUNCH() CHECK_CUDA(cudaGetLastError())
+
+#ifndef M_LOG2E
+#define M_LOG2E 1.44269504088896340735992468100 /* log_2 (e) */
+#endif
+
+namespace hstu {
+
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// A wrapper for the kernel that is used to guard against compilation on
+// architectures that will never use the kernel. The purpose of this is to
+// reduce the size of the compiled binary.
+// Adapted from
+// https://github.com/vllm-project/vllm/blob/4d29e91be84d27ca313d657eee92c067439a4c23/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh#L55
+template <typename Kernel>
+struct enable_sm90_or_later : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    Kernel::operator()(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm80_to_sm89 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ <= 890)
+    Kernel::operator()(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct MaxOp {
+  __device__ __forceinline__ T operator()(T const& x, T const& y) {
+    return x > y ? x : y;
+  }
+};
+
+template <>
+struct MaxOp<float> {
+  // This is slightly faster
+  __device__ __forceinline__ float operator()(float const& x, float const& y) {
+    return max(x, y);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct SumOp {
+  __device__ __forceinline__ T operator()(T const& x, T const& y) {
+    return x + y;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct SiluScaleOp {
+  cutlass::epilogue::thread::SiLu<T> silu;
+  __device__ __forceinline__ T operator()(T const& t, T const& scale) {
+    float t2 = t / 2;
+    return t2 * (1 + cutlass::fast_tanh(t2)) *
+        scale; // __fdividef(t, 1.0f + cutlass::fast_exp(-t)) * scale
+  }
+};
+
+template <typename T, typename Engine, typename Layout>
+CUTLASS_DEVICE void inplace_silu_scale(
+    Tensor<Engine, Layout>& tensor,
+    T const& scale_before,
+    T const& scale_after) {
+  SiluScaleOp<T> silu_scale_op;
+#pragma unroll
+  for (int i = 0; i < size(tensor); ++i) {
+    tensor(i) = silu_scale_op(tensor(i) * scale_before, scale_after);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <int THREADS>
+struct Allreduce {
+  static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
+  template <typename T, typename Operator>
+  static __device__ __forceinline__ T run(T x, Operator& op) {
+    constexpr int OFFSET = THREADS / 2;
+    x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
+    return Allreduce<OFFSET>::run(x, op);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <>
+struct Allreduce<2> {
+  template <typename T, typename Operator>
+  static __device__ __forceinline__ T run(T x, Operator& op) {
+    x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
+    return x;
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to (nrow=(2,
+// MMA_M), ncol=(2, MMA_N)). For SM90, convert acc_layout from ((2, 2, V),
+// MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N))
+template <bool Transposed = false, typename Layout0>
+CUTLASS_DEVICE auto convert_layout_acc_rowcol(Layout0 acc_layout) {
+  if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90
+    static_assert(decltype(size<0, 0>(acc_layout))::value == 2);
+    static_assert(decltype(size<0, 1>(acc_layout))::value == 2);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    auto l = acc_layout;
+    if constexpr (!Transposed) {
+      return make_layout(
+          make_layout(get<0, 1>(l), get<1>(l)),
+          make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l)));
+    } else {
+      return make_layout(
+          make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l)),
+          make_layout(get<0, 1>(l), get<1>(l)));
+    }
+
+  } else { // SM80
+    static_assert(decltype(size<0>(acc_layout))::value == 4);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    auto l = logical_divide(acc_layout, Shape<_2>{}); // ((2, 2), MMA_M, MMA_N)
+    if constexpr (!Transposed) {
+      return make_layout(
+          make_layout(get<0, 1>(l), get<1>(l)),
+          make_layout(get<0, 0>(l), get<2>(l)));
+    } else {
+      return make_layout(
+          make_layout(get<0, 0>(l), get<2>(l)),
+          make_layout(get<0, 1>(l), get<1>(l)));
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M,
+// MMA_N / 2) if using m16n8k16, or to (4, MMA_M, MMA_N) if using m16n8k8. For
+// SM90, FP16/BF16, convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to
+// ((2, 2, 2), MMA_M, (N / 16, MMA_N)) For SM90, FP8, convert acc_layout from
+// ((2, 2, N / 8), MMA_M, MMA_N) to ((4, 2, 2), MMA_M, (N / 32, MMA_N))
+template <typename MMA_Traits, typename Layout0>
+CUTLASS_DEVICE auto convert_layout_acc_Aregs(Layout0 acc_layout) {
+  using X = Underscore;
+  if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90
+    static_assert(decltype(size<0, 0>(acc_layout))::value == 2);
+    static_assert(decltype(size<0, 1>(acc_layout))::value == 2);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    static_assert(decltype(rank(get<0>(acc_layout)))::value == 3);
+    if constexpr (sizeof(typename MMA_Traits::ValTypeA) == 2) {
+      auto l =
+          logical_divide(get<0, 2>(acc_layout), Tile<_2>{}); // ((2, N / 16))
+      return make_layout(
+          make_layout(
+              get<0, 0>(acc_layout), get<0, 1>(acc_layout), get<0, 0>(l)),
+          get<1>(acc_layout),
+          coalesce(make_layout(get<0, 1>(l), get<2>(acc_layout))));
+    } else {
+      static_assert(sizeof(typename MMA_Traits::ValTypeA) == 1);
+      static_assert(decltype(stride<0, 0>(acc_layout))::value == 1);
+      static_assert(decltype(stride<0, 1>(acc_layout))::value == 2);
+      auto l = logical_divide(
+          get<0, 2>(acc_layout),
+          Tile<Layout<Shape<_2, _2>>>{}); // (((2, 2), N / 32))
+      // This combines the first two modes (<0, 0> and <0, 1>) into one mode.
+      // Will require register shuffling later to be correct.
+      return make_layout(
+          make_layout(Layout<_4>{}, get<0, 0, 0>(l), get<0, 0, 1>(l)),
+          get<1>(acc_layout),
+          coalesce(make_layout(
+              get<0, 1>(l),
+              get<2>(acc_layout)))); // ((4, 2, 2), MMA_M, N / 32 * MMA_N)
+      // This combination is right but doesn't work with register shuffling.
+      // return make_layout(make_layout(coalesce(make_layout(get<0,
+      // 0>(acc_layout), get<0, 0, 0>(l))), get<0, 1>(acc_layout), get<0, 0,
+      // 1>(l)),
+      //                    get<1>(acc_layout),
+      //                    coalesce(make_layout(get<0, 1>(l),
+      //                    get<2>(acc_layout))));
+    }
+  } else { // SM80
+    static_assert(decltype(size<0>(acc_layout))::value == 4);
+    static_assert(decltype(rank(acc_layout))::value == 3);
+    constexpr int mma_shape_K = get<2>(typename MMA_Traits::Shape_MNK{});
+    static_assert(mma_shape_K == 8 || mma_shape_K == 16);
+    if constexpr (mma_shape_K == 8) {
+      return acc_layout;
+    } else {
+      auto l = logical_divide(
+          acc_layout, Shape<X, X, _2>{}); // (4, MMA_M, (2, MMA_N / 2)))
+      return make_layout(
+          make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l));
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename To_type, typename Engine, typename Layout>
+CUTLASS_DEVICE auto convert_type_unsafe(Tensor<Engine, Layout> const& tensor) {
+  using From_type = typename Engine::value_type;
+  static constexpr int numel = decltype(size(tensor))::value;
+  cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
+  // HACK: this requires tensor to be "contiguous"
+  auto frag =
+      convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel>*>(
+          tensor.data()));
+  return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
+  // Unsafe because we're returning a tensor with memory allocated on the
+  // stack. If the compiler does not inline this function, then the memory
+  // might not be valid.
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Engine, typename Layout, typename EngineOut>
+CUTLASS_DEVICE void convert_type_out(
+    Tensor<Engine, Layout> const& tensor,
+    Tensor<EngineOut, Layout>& out) {
+  // Somehow if we allocate out inside this function and return it, e2e is
+  // slower and the output can be wrong.
+  using From_type = typename Engine::value_type;
+  using To_type = typename EngineOut::value_type;
+  static constexpr int FragmentSize = std::max(
+      sizeof(From_type) / sizeof(To_type), sizeof(To_type) / sizeof(From_type));
+  static_assert(
+      CUTE_STATIC_V(size(tensor)) % FragmentSize == 0,
+      "Fragment size does not vectorize properly");
+  Tensor frag = recast<cutlass::Array<From_type, FragmentSize> const>(tensor);
+  Tensor out_frg = recast<cutlass::Array<To_type, FragmentSize>>(out);
+  static_assert(size(frag) == size(out_frg));
+  cutlass::NumericArrayConverter<To_type, From_type, FragmentSize> convert_op;
+#pragma unroll
+  for (int i = 0; i < size(frag); ++i) {
+    out_frg[i] = convert_op(frag[i]);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Blocks until all but N previous cp.async.commit_group operations have
+// committed. This differs from cute::cp_async_wait in that when N = 0 we
+// don't call cp.async.wait_all (which is equivalent to commit_group then
+// wait_group 0). Instead we just call cp.async.wait_group 0, which is
+// slightly faster.
+// https://github.com/NVIDIA/cutlass/blob/master/include/cute/arch/copy_sm80.hpp#L113
+template <int N>
+CUTE_HOST_DEVICE void cp_async_wait() {
+#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
+  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
+#endif
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <bool A, class Mma, class Tensor0>
+CUTLASS_DEVICE auto mma_partition_fragment_AB(
+    Mma const& mma,
+    Tensor0 const& tensor0) {
+  if constexpr (A) {
+    return mma.partition_fragment_A(tensor0);
+  } else {
+    return mma.partition_fragment_B(tensor0);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool zero_init = false,
+    int wg_wait = 0,
+    bool SwapAB = false,
+    int M_slice = -1,
+    typename Tensor0,
+    typename Tensor1,
+    typename Tensor2,
+    typename TiledMma>
+CUTLASS_DEVICE void gemm(
+    TiledMma& tiled_mma,
+    Tensor0 const& tCrA,
+    Tensor1 const& tCrB,
+    Tensor2& tCrC) {
+  if constexpr (M_slice >= 0) {
+    static constexpr int MMA_M = decltype(size<1>(tCrC))::value;
+    static_assert(M_slice < MMA_M);
+    // After logical_divide, C has shape ((2,2,V), (MMA_M, 1), MMA_N)
+    Tensor tCrC_slice =
+        cute::logical_divide(tCrC, Shape<cute::Underscore, Int<MMA_M>>{})(
+            _, make_coord(Int<M_slice>{}, _), _);
+    if constexpr (!SwapAB) {
+      Tensor tCrA_slice =
+          cute::logical_divide(tCrA, Shape<cute::Underscore, Int<MMA_M>>{})(
+              _, make_coord(Int<M_slice>{}, _), _);
+      gemm<zero_init, wg_wait, SwapAB, /*M_slice=*/-1>(
+          tiled_mma, tCrA_slice, tCrB, tCrC_slice);
+    } else {
+      Tensor tCrB_slice =
+          cute::logical_divide(tCrB, Shape<cute::Underscore, Int<MMA_M>>{})(
+              _, make_coord(Int<M_slice>{}, _), _);
+      gemm<zero_init, wg_wait, SwapAB, /*M_slice=*/-1>(
+          tiled_mma, tCrA, tCrB_slice, tCrC_slice);
+    }
+  } else {
+    constexpr bool Is_RS = !cute::is_base_of<
+        cute::GMMA::DescriptorIterator,
+        typename TiledMma::FrgTypeA>::value;
+    // Need to cast away const on tCrA since warpgroup_fence_operand doesn't
+    // take const
+    if constexpr (Is_RS) {
+      if constexpr (!SwapAB) {
+        warpgroup_fence_operand(const_cast<Tensor0&>(tCrA));
+      } else {
+        warpgroup_fence_operand(const_cast<Tensor1&>(tCrB));
+      }
+    }
+    warpgroup_fence_operand(tCrC);
+    warpgroup_arrive();
+    if constexpr (zero_init) {
+      tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
+    }
+    // Unroll the K mode manually to set scale D to 1
+    CUTLASS_PRAGMA_UNROLL
+    for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
+      if constexpr (!SwapAB) {
+        cute::gemm(tiled_mma, tCrA(_, _, k_block), tCrB(_, _, k_block), tCrC);
+      } else {
+        cute::gemm(tiled_mma, tCrB(_, _, k_block), tCrA(_, _, k_block), tCrC);
+      }
+      tiled_mma.accumulate_ = GMMA::ScaleOut::One;
+    }
+    warpgroup_commit_batch();
+    if constexpr (wg_wait >= 0) {
+      warpgroup_wait<wg_wait>();
+    }
+    warpgroup_fence_operand(tCrC);
+    if constexpr (Is_RS) {
+      if constexpr (!SwapAB) {
+        warpgroup_fence_operand(const_cast<Tensor0&>(tCrA));
+      } else {
+        warpgroup_fence_operand(const_cast<Tensor1&>(tCrB));
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool A_in_regs = false,
+    bool B_in_regs = false,
+    bool SwapAB = false,
+    typename Tensor0,
+    typename Tensor1,
+    typename Tensor2,
+    typename Tensor3,
+    typename Tensor4,
+    typename TiledMma,
+    typename TiledCopyA,
+    typename TiledCopyB,
+    typename ThrCopyA,
+    typename ThrCopyB,
+    typename Hook>
+CUTLASS_DEVICE void gemm_sm80(
+    Tensor0& acc,
+    Tensor1& tCrA,
+    Tensor2& tCrB,
+    Tensor3 const& tCsA,
+    Tensor4 const& tCsB,
+    TiledMma tiled_mma,
+    TiledCopyA smem_tiled_copy_A,
+    TiledCopyB smem_tiled_copy_B,
+    ThrCopyA smem_thr_copy_A,
+    ThrCopyB smem_thr_copy_B,
+    Hook fn) {
+  if constexpr (SwapAB) {
+    gemm_sm80<B_in_regs, A_in_regs>(
+        acc,
+        tCrB,
+        tCrA,
+        tCsB,
+        tCsA,
+        tiled_mma,
+        smem_tiled_copy_B,
+        smem_tiled_copy_A,
+        smem_thr_copy_B,
+        smem_thr_copy_A,
+        fn);
+  } else {
+    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc)); // MMA_M
+    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc)); // MMA_N
+    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB)); // MMA_K
+    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view)); // M
+    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
+    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N
+    if (!A_in_regs) {
+      cute::copy(
+          smem_tiled_copy_A, tCsA(_, _, _0{}), tCrA_copy_view(_, _, _0{}));
+    }
+    if (!B_in_regs) {
+      cute::copy(
+          smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
+    }
+#pragma unroll
+    for (int i = 0; i < size<2>(tCrA); ++i) {
+      if (i < size<2>(tCrA) - 1) {
+        if (!A_in_regs) {
+          cute::copy(
+              smem_tiled_copy_A,
+              tCsA(_, _, i + 1),
+              tCrA_copy_view(_, _, i + 1));
+        }
+        if (!B_in_regs) {
+          cute::copy(
+              smem_tiled_copy_B,
+              tCsB(_, _, i + 1),
+              tCrB_copy_view(_, _, i + 1));
+        }
+      }
+      if constexpr (!std::is_same_v<Hook, std::nullptr_t>) {
+        if (i == 0) {
+          fn();
+        }
+      }
+      cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    typename Tensor0,
+    typename Tensor1,
+    typename Tensor2,
+    typename Tensor3,
+    typename TiledMma,
+    typename TiledCopy,
+    typename ThrCopy>
+CUTLASS_DEVICE void gemm_rs_sm80(
+    Tensor0& acc,
+    Tensor1& tCrA,
+    Tensor2& tCrB,
+    Tensor3 const& tCsB,
+    TiledMma tiled_mma,
+    TiledCopy smem_tiled_copy_B,
+    ThrCopy smem_thr_copy_B) {
+  CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc)); // MMA_M
+  CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc)); // MMA_N
+  CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB)); // MMA_K
+  Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
+  CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N
+  cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
+#pragma unroll
+  for (int i = 0; i < size<2>(tCrA); ++i) {
+    if (i < size<2>(tCrA) - 1) {
+      cute::copy(
+          smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1));
+    }
+    cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <
+    bool Is_even_MN = true,
+    bool Is_even_K = true,
+    bool Clear_OOB_MN = false,
+    bool Clear_OOB_K = true,
+    class CopyAtom,
+    class TV,
+    class Tiler,
+    typename Engine0,
+    typename Layout0,
+    typename Engine1,
+    typename Layout1,
+    typename Engine2,
+    typename Layout2,
+    typename Engine3,
+    typename Layout3>
+CUTLASS_DEVICE void copy(
+    TiledCopy<CopyAtom, TV, Tiler> const& tiled_copy,
+    Tensor<Engine0, Layout0> const& S,
+    Tensor<Engine1, Layout1>& D,
+    Tensor<Engine2, Layout2> const& identity_MN,
+    Tensor<Engine3, Layout3> const& predicate_K,
+    const int max_MN = 0) {
+  // Decay TiledCopy to CopyAtom
+  auto copy_atom = static_cast<CopyAtom const&>(tiled_copy);
+  CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
+  CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
+  CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA
+  CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D)); // MMA_M
+  CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D)); // MMA_K
+  // There's no case where !Clear_OOB_K && Clear_OOB_MN
+  static_assert(!(Clear_OOB_MN && !Clear_OOB_K));
+  auto has_with_bool = cute::is_valid(
+      [](auto t) -> void_t<decltype(declval<typename decltype(t)::Traits>()
+                                        .with(true))> {},
+      copy_atom);
+#pragma unroll
+  for (int m = 0; m < size<1>(S); ++m) {
+    bool predicate_mn =
+        Is_even_MN || get<0>(identity_MN(_0{}, m, _0{})) < max_MN;
+    if constexpr (Is_even_MN || !Clear_OOB_MN) {
+      if (Is_even_MN || predicate_mn) {
+#pragma unroll
+        for (int k = 0; k < size<2>(S); ++k) {
+          if constexpr (Is_even_K || !Clear_OOB_K) {
+            if (Is_even_K || predicate_K(k)) {
+              cute::copy(copy_atom, S(_, m, k), D(_, m, k));
+            }
+          } else { // Clear_OOB_K == true && Is_even_K == false
+            // If copy traits can be transformed with a predicate value, do
+            // it, otherwise branch here
+            if constexpr (has_with_bool) {
+              cute::copy(
+                  copy_atom.with(predicate_K(k)), S(_, m, k), D(_, m, k));
+            } else {
+              if (predicate_K(k)) {
+                cute::copy(copy_atom, S(_, m, k), D(_, m, k));
+              } else {
+                cute::clear(D(_, m, k));
+              }
+            }
+          }
+        }
+      }
+    } else { // Clear_OOB_MN == true && Is_even_MN == false, also implies
+             // Clear_OOB_K == true
+      if constexpr (!has_with_bool) {
+        if (predicate_mn) {
+#pragma unroll
+          for (int k = 0; k < size<2>(S); ++k) {
+            if (Is_even_K || predicate_K(k)) {
+              cute::copy(copy_atom, S(_, m, k), D(_, m, k));
+            } else if (Clear_OOB_K) {
+              cute::clear(D(_, m, k));
+            }
+          }
+        } else {
+          cute::clear(D(_, m, _));
+        }
+      } else { // combine the mn predicate with the k predicate
+#pragma unroll
+        for (int k = 0; k < size<2>(S); ++k) {
+          cute::copy(
+              copy_atom.with(predicate_mn && (Is_even_K || predicate_K(k))),
+              S(_, m, k),
+              D(_, m, k));
+        }
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Byte permute and shuffle to match register layout of
+// (FP8 downcasted) accumulator of GEMM-I to FP8 operand A of GEMM-II.
+template <typename Fragment>
+CUTLASS_DEVICE void permute_Aregs_fp8(Fragment& frag) {
+  // frag has shape ((4, 2, 2), MMA_M, MMA_N), each element is 8 bits
+  static_assert(decltype(size<0, 0>(frag))::value == 4);
+  static_assert(decltype(size<0, 1>(frag))::value == 2);
+  static_assert(decltype(stride<0, 0>(frag))::value == 1);
+  static_assert(decltype(stride<0, 1>(frag))::value == 4);
+  static_assert(sizeof(typename Fragment::value_type) == 1);
+
+  auto quad_idx = threadIdx.x % 4;
+  bool lane_03 = quad_idx == 0 || quad_idx == 3;
+  int selector_upper = lane_03 ? 0x5410 : 0x1054;
+  int selector_lower = lane_03 ? 0x7632 : 0x3276;
+
+  static constexpr int upper_map[4] = {0, 3, 1, 2};
+  // static constexpr int lower_map[4] = {1, 2, 0, 3};
+
+  Tensor frag_64b = recast<uint2>(frag); // ((1, 1, 2), MMA_M, MMA_N)
+#pragma unroll
+  for (int i = 0; i < size(frag_64b); ++i) {
+    uint32_t upper = frag_64b[i].x;
+    uint32_t lower = frag_64b[i].y;
+    uint32_t upper0 = lane_03 ? upper : lower;
+    uint32_t lower0 = lane_03 ? lower : upper;
+    upper0 = __shfl_sync(uint32_t(-1), upper0, upper_map[quad_idx], 4);
+    // lower0 = __shfl_sync(uint32_t(-1), lower0, lower_map[quad_idx], 4);
+    lower0 = __shfl_sync(uint32_t(-1), lower0, upper_map[quad_idx] ^ 1, 4);
+    frag_64b[i].x = __byte_perm(upper0, lower0, selector_upper);
+    frag_64b[i].y = __byte_perm(upper0, lower0, selector_lower);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Fragment>
+CUTLASS_DEVICE void permute_Cregs_fp8(Fragment& frag) {
+  // frag has shape ((2, 2, N / 8), MMA_M, MMA_N), each element is 32 bits
+  static_assert(decltype(size<0, 0>(frag))::value == 2);
+  static_assert(decltype(size<0, 1>(frag))::value == 2);
+  static_assert(decltype(size<0, 2>(frag))::value % 2 == 0);
+  static_assert(decltype(stride<0, 0>(frag))::value == 1);
+  static_assert(sizeof(typename Fragment::value_type) == 4);
+  Tensor frag_64b =
+      group_modes<1, 3>(recast<uint2>(frag)); // ((1, 2, N / 8), (MMA_M, MMA_N))
+#pragma unroll
+  for (int mi = 0; mi < size<1>(frag_64b); ++mi) {
+#pragma unroll
+    for (int i = 0; i < size<0, 2>(frag_64b) / 2; ++i) {
+      cutlass::swap(
+          frag_64b(make_coord(_0{}, _1{}, 2 * i), mi),
+          frag_64b(make_coord(_0{}, _0{}, 2 * i + 1), mi));
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Fragment>
+CUTLASS_DEVICE void permute_output_fp8(Fragment& out) {
+  // out has shape ((2, 2, N / 8), MMA_M, MMA_N), each element is 32 bits
+  static_assert(decltype(size<0, 0>(out))::value == 2);
+  static_assert(decltype(size<0, 1>(out))::value == 2);
+  static_assert(decltype(size<0, 2>(out))::value % 2 == 0);
+  static_assert(decltype(stride<0, 0>(out))::value == 1);
+  static_assert(sizeof(typename Fragment::value_type) == 4);
+  Tensor frag = group_modes<1, 3>(out); // ((2, 2, N / 8), (MMA_M, MMA_N))
+#pragma unroll
+  for (int mi = 0; mi < size<1>(frag); ++mi) {
+#pragma unroll
+    for (int j = 0; j < size<0, 1>(frag); ++j) {
+#pragma unroll
+      for (int i = 0; i < size<0, 2>(frag) / 2; ++i) {
+        cutlass::swap(
+            frag(make_coord(_1{}, j, 2 * i), mi),
+            frag(make_coord(_0{}, j, 2 * i + 1), mi));
+      }
+    }
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Fragment>
+CUTLASS_DEVICE void permute_output_fp8_Vcolmajor(Fragment& frag) {
+  // frag has shape ((2, 2, N / 8), MMA_M, MMA_N), each element is 16 bits
+  static_assert(decltype(size<0, 0>(frag))::value == 2);
+  static_assert(decltype(size<0, 1>(frag))::value == 2);
+  static_assert(decltype(stride<0, 0>(frag))::value == 1);
+  static_assert(
+      sizeof(typename Fragment::value_type) == 2 ||
+      sizeof(typename Fragment::value_type) == 4);
+
+  auto quad_idx = threadIdx.x % 4;
+  bool lane_03 = quad_idx == 0 || quad_idx == 3;
+
+  static constexpr int upper_map[4] = {0, 2, 3, 1};
+  // static constexpr int lower_map[4] = {2, 0, 1, 3};
+
+  // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(frag); }
+  using type2 = std::conditional_t<
+      sizeof(typename Fragment::value_type) == 2,
+      uint32_t,
+      uint64_t>;
+  Tensor frag_2 =
+      group_modes<1, 3>(recast<type2>(frag)); // ((1, 2, N / 8), (MMA_M, MMA_N))
+// if (blockIdx.x == 0 && threadIdx.x == 128) { print(frag); printf("\n");
+// print(frag_2); }
+#pragma unroll
+  for (int mi = 0; mi < size<1>(frag_2); ++mi) {
+#pragma unroll
+    for (int j = 0; j < size<0, 1>(frag_2); ++j) {
+#pragma unroll
+      for (int i = 0; i < size<0, 2>(frag_2) / 2; ++i) {
+        type2 upper = frag_2(make_coord(_0{}, j, 2 * i), mi);
+        type2 lower = frag_2(make_coord(_0{}, j, 2 * i + 1), mi);
+        type2 upper0 = lane_03 ? upper : lower;
+        type2 lower0 = lane_03 ? lower : upper;
+        upper0 = __shfl_sync(uint32_t(-1), upper0, upper_map[quad_idx], 4);
+        // lower0 = __shfl_sync(uint32_t(-1), lower0, lower_map[quad_idx], 4);
+        lower0 = __shfl_sync(uint32_t(-1), lower0, upper_map[quad_idx] ^ 2, 4);
+        frag_2(make_coord(_0{}, j, 2 * i), mi) = lane_03 ? upper0 : lower0;
+        frag_2(make_coord(_0{}, j, 2 * i + 1), mi) = lane_03 ? lower0 : upper0;
+      }
+    }
+  }
+  // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(frag); }
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <typename Engine, typename Layout>
+CUTLASS_DEVICE void apply_softcap(
+    Tensor<Engine, Layout>& tensor,
+    float const softcap) {
+#pragma unroll
+  for (int i = 0; i < size(tensor); ++i) {
+    tensor(i) = cutlass::fast_tanh(tensor(i) * softcap);
+  }
+}
+
+template <typename Engine, typename Layout>
+CUTLASS_DEVICE auto calculate_dtanh(Tensor<Engine, Layout>& tensor) {
+  Tensor out = make_fragment_like<float>(tensor);
+#pragma unroll
+  for (int i = 0; i < size(tensor); ++i) {
+    out(i) = 1.f - (tensor(i) * tensor(i));
+  }
+  return out;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+CUTLASS_DEVICE
+int canonical_warp_group_idx_nosync() {
+  return threadIdx.x / cutlass::NumThreadsPerWarpGroup;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt
new file mode 100644
index 000000000..04d34e1a3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt
@@ -0,0 +1 @@
+5231d95
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp
new file mode 100644
index 000000000..d3a2e9421
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp
@@ -0,0 +1,130 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void _jagged_transpose_1d_cpu_kernel(
+    int32_t size1,
+    int32_t size2,
+    int32_t max_len,
+    const at::TensorAccessor<index_t, 1>& offsets,
+    const at::TensorAccessor<val_t, 1>& values,
+    const at::TensorAccessor<index_t, 1>& lengths,
+    const at::TensorAccessor<index_t, 1>& trans_offsets,
+    at::TensorAccessor<val_t, 1> trans_values) {
+  for (auto i : c10::irange(size1)) {
+    for (auto j : c10::irange(size2)) {
+      auto src_idx = i * size2 + j;
+      auto dst_idx = j * size1 + i;
+      auto src_offset = offsets[src_idx];
+      auto src_length = lengths[src_idx];
+      auto dst_offset = trans_offsets[dst_idx];
+
+      for (auto k = 0; k < src_length; ++k) {
+        trans_values[dst_offset + k] = values[src_offset + k];
+      }
+    }
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cpu(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2) {
+  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(lengths.device().type() == at::DeviceType::CPU);
+  TORCH_CHECK(offsets.size(0) == size1 * size2 + 1);
+  TORCH_CHECK(lengths.size(0) == size1 * size2);
+
+  auto trans_lengths =
+      lengths.view({size1, size2}).transpose(0, 1).contiguous().view({-1});
+  auto trans_offsets =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(trans_lengths);
+  auto L_out = trans_offsets[-1].item<int64_t>();
+  auto trans_values = at::empty({L_out}, values.options());
+
+  if (L_out == 0) {
+    return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+  }
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths.scalar_type(), "jagged_transpose_1d_cpu_kernel_input1", [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values.scalar_type(),
+            "jagged_transpose_1d_cpu_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _jagged_transpose_1d_cpu_kernel<index_t, val_t>(
+                  size1,
+                  size2,
+                  max_len,
+                  offsets.accessor<index_t, 1>(),
+                  values.accessor<val_t, 1>(),
+                  lengths.accessor<index_t, 1>(),
+                  trans_offsets.accessor<index_t, 1>(),
+                  trans_values.accessor<val_t, 1>());
+            });
+      });
+
+  return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_meta(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2) {
+  auto trans_lengths =
+      lengths.view({size1, size2}).transpose(0, 1).contiguous().view({-1});
+  auto L_out = trans_lengths.sum().item<int64_t>();
+
+  auto trans_values = at::native::empty_meta_symint(
+      {L_out},
+      /*dtype=*/::std::make_optional(values.scalar_type()),
+      /*layout=*/::std::make_optional(values.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  auto trans_offsets = at::native::empty_meta_symint(
+      {size1 * size2 + 1},
+      /*dtype=*/::std::make_optional(lengths.scalar_type()),
+      /*layout=*/::std::make_optional(lengths.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu
new file mode 100644
index 000000000..380100962
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu
@@ -0,0 +1,127 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
+
+namespace hstu {
+
+static constexpr int32_t kMaxThreads = 1024;
+
+template <typename index_t, typename val_t>
+__global__ __launch_bounds__(kMaxThreads) void _jagged_transpose_1d_cuda_kernel(
+    int32_t size1,
+    int32_t size2,
+    int32_t max_len,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> offsets,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> lengths,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        trans_offsets,
+    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> trans_values) {
+  for (auto idx = blockIdx.x * blockDim.y + threadIdx.y;
+       idx < static_cast<uint32_t>(size1 * size2);
+       idx += gridDim.x * blockDim.y) {
+    auto i = idx / size2;
+    auto j = idx % size2;
+    auto src_idx = i * size2 + j;
+    auto dst_idx = j * size1 + i;
+    auto src_offset = offsets[src_idx];
+    auto src_length = lengths[src_idx];
+    auto dst_offset = trans_offsets[dst_idx];
+
+    for (auto k = threadIdx.x; k < static_cast<uint32_t>(src_length);
+         k += blockDim.x) {
+      trans_values[dst_offset + k] = values[src_offset + k];
+    }
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cuda(
+    const at::Tensor& values,
+    const at::Tensor& offsets,
+    const at::Tensor& lengths,
+    const int64_t max_len,
+    const int64_t size1,
+    const int64_t size2) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values.get_device());
+  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(lengths.device().type() == at::DeviceType::CUDA);
+  TORCH_CHECK(offsets.size(0) == size1 * size2 + 1);
+  TORCH_CHECK(lengths.size(0) == size1 * size2);
+  TORCH_CHECK(values.get_device() == offsets.get_device());
+  TORCH_CHECK(values.get_device() == lengths.get_device());
+
+  auto trans_lengths =
+      lengths.view({size1, size2}).transpose(0, 1).contiguous().view({-1});
+  auto trans_offsets =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(trans_lengths);
+  auto L_out = trans_offsets[-1].item<int64_t>();
+  TORCH_CHECK(L_out < std::numeric_limits<int32_t>::max());
+  auto trans_values = at::empty({L_out}, values.options());
+
+  if (L_out == 0) {
+    return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+  }
+
+  // Optimized thread block configuration based on benchmark results
+  uint32_t B_blocks = 4;
+  dim3 threads(256, B_blocks);
+  auto blocks = div_round_up(size1 * size2, B_blocks);
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths.scalar_type(), "jagged_transpose_1d_cuda_kernel_input1", [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values.scalar_type(),
+            "jagged_transpose_1d_cuda_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _jagged_transpose_1d_cuda_kernel<index_t, val_t><<<
+                  blocks,
+                  threads,
+                  0,
+                  at::cuda::getCurrentCUDAStream()>>>(
+                  size1,
+                  size2,
+                  max_len,
+                  offsets
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  values.packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                  lengths
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  trans_offsets
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  trans_values
+                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>());
+            });
+      });
+
+  return std::make_tuple(trans_values, trans_offsets, trans_lengths);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp
new file mode 100644
index 000000000..fa8eaac09
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp
@@ -0,0 +1,139 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void _replace_last_n_with_jagged_cpu_kernel(
+    int32_t B,
+    const at::TensorAccessor<index_t, 1>& lengths_left,
+    const at::TensorAccessor<index_t, 1>& offsets_left,
+    const at::TensorAccessor<val_t, 2>& values_left,
+    const at::TensorAccessor<index_t, 1>& lengths_right,
+    const at::TensorAccessor<index_t, 1>& offsets_right,
+    const at::TensorAccessor<val_t, 2>& values_right,
+    const at::TensorAccessor<index_t, 1>& output_offsets,
+    at::TensorAccessor<val_t, 2> output) {
+  for (auto b : c10::irange(B)) {
+    auto left_start = offsets_left[b];
+    auto left_len = lengths_left[b];
+    auto right_start = offsets_right[b];
+    auto right_len = lengths_right[b];
+    auto output_start = output_offsets[b];
+
+    auto keep_len = left_len - right_len;
+
+    for (auto i = 0; i < left_len; ++i) {
+      for (auto d = 0; d < values_left.size(1); ++d) {
+        if (i < keep_len) {
+          output[output_start + i][d] = values_left[left_start + i][d];
+        } else {
+          auto right_idx = i - keep_len;
+          if (right_idx < right_len) {
+            output[output_start + i][d] =
+                values_right[right_start + right_idx][d];
+          }
+        }
+      }
+    }
+  }
+}
+
+at::Tensor replace_last_n_with_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CPU);
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+  TORCH_CHECK(values_left.size(1) == values_right.size(1));
+
+  auto B = lengths_left.size(0);
+  auto D = values_left.size(1);
+
+  auto L_out = lengths_left.sum().item<int64_t>();
+
+  auto output = at::empty({L_out, D}, values_left.options());
+
+  if (L_out == 0) {
+    return output;
+  }
+
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_right.view({-1}));
+  const auto output_offsets = offsets_left;
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "replace_last_n_with_jagged_cpu_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values_left.scalar_type(),
+            "replace_last_n_with_jagged_cpu_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _replace_last_n_with_jagged_cpu_kernel<index_t, val_t>(
+                  B,
+                  lengths_left.accessor<index_t, 1>(),
+                  offsets_left.accessor<index_t, 1>(),
+                  values_left.accessor<val_t, 2>(),
+                  lengths_right.accessor<index_t, 1>(),
+                  offsets_right.accessor<index_t, 1>(),
+                  values_right.accessor<val_t, 2>(),
+                  output_offsets.accessor<index_t, 1>(),
+                  output.accessor<val_t, 2>());
+            });
+      });
+
+  return output;
+}
+
+at::Tensor replace_last_n_with_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  auto L_out = lengths_left.sum().item<int64_t>();
+  auto D = values_left.size(1);
+
+  auto output = at::native::empty_meta_symint(
+      {L_out, D},
+      /*dtype=*/::std::make_optional(values_left.scalar_type()),
+      /*layout=*/::std::make_optional(values_left.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  return output;
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu
new file mode 100644
index 000000000..00a589eb9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu
@@ -0,0 +1,156 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
+
+namespace hstu {
+
+static constexpr int32_t kMaxThreads = 1024;
+
+template <typename index_t, typename val_t>
+__global__
+__launch_bounds__(kMaxThreads) void _replace_last_n_with_jagged_cuda_kernel(
+    int32_t B,
+    int32_t D,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        lengths_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_left,
+    const at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits>
+        values_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        lengths_right,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_right,
+    const at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits>
+        values_right,
+    at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits> output) {
+  for (auto b = blockIdx.x * blockDim.y + threadIdx.y;
+       b < static_cast<uint32_t>(B);
+       b += gridDim.x * blockDim.y) {
+    auto left_start = offsets_left[b];
+    auto left_len = lengths_left[b];
+    auto right_start = offsets_right[b];
+    auto right_len = lengths_right[b];
+    auto output_start = offsets_left[b];
+    auto keep_len = left_len - right_len;
+
+    for (auto i = threadIdx.x; i < static_cast<uint32_t>(left_len * D);
+         i += blockDim.x) {
+      auto seq_pos = i / D;
+      auto dim_pos = i % D;
+      if (seq_pos < static_cast<uint32_t>(keep_len)) {
+        output[output_start + seq_pos][dim_pos] =
+            values_left[left_start + seq_pos][dim_pos];
+      } else {
+        auto right_idx = seq_pos - keep_len;
+        if (right_idx < static_cast<uint32_t>(right_len)) {
+          output[output_start + seq_pos][dim_pos] =
+              values_right[right_start + right_idx][dim_pos];
+        }
+      }
+    }
+  }
+}
+
+at::Tensor replace_last_n_with_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& values_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& values_right) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(values_left.get_device());
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CUDA);
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+  TORCH_CHECK(values_left.size(1) == values_right.size(1));
+
+  auto B = lengths_left.size(0);
+  auto D = values_left.size(1);
+  auto L_out = lengths_left.sum().item<int64_t>();
+  TORCH_CHECK(L_out < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(values_left.get_device() == lengths_left.get_device());
+  TORCH_CHECK(values_left.get_device() == lengths_right.get_device());
+  TORCH_CHECK(values_left.get_device() == values_right.get_device());
+
+  auto output = at::empty({L_out, D}, values_left.options());
+
+  if (L_out == 0) {
+    return output;
+  }
+
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_right.view({-1}));
+
+  // Optimized thread block configuration based on benchmark results
+  uint32_t B_blocks, threads_x;
+  B_blocks = 4;
+  threads_x = 256;
+
+  dim3 threads(threads_x, B_blocks);
+  auto blocks = div_round_up(B, B_blocks);
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "replace_last_n_with_jagged_cuda_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            values_left.scalar_type(),
+            "replace_last_n_with_jagged_cuda_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _replace_last_n_with_jagged_cuda_kernel<index_t, val_t><<<
+                  blocks,
+                  threads,
+                  0,
+                  at::cuda::getCurrentCUDAStream()>>>(
+                  B,
+                  D,
+                  lengths_left
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  offsets_left
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  values_left
+                      .packed_accessor32<val_t, 2, at::RestrictPtrTraits>(),
+                  lengths_right
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  offsets_right
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  values_right
+                      .packed_accessor32<val_t, 2, at::RestrictPtrTraits>(),
+                  output.packed_accessor32<val_t, 2, at::RestrictPtrTraits>());
+            });
+      });
+
+  return output;
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/setup.py b/recommendation_v4/generative_recommenders/ops/cpp/setup.py
new file mode 100644
index 000000000..2a06a9d05
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/setup.py
@@ -0,0 +1,487 @@
+# pyre-unsafe
+"""
+Modified from https://github.com/Dao-AILab/flash-attention/blob/main/hopper/setup.py
+"""
+
+import itertools
+import os
+import platform
+import subprocess
+import sys
+import sysconfig
+import warnings
+from pathlib import Path
+
+import torch
+from packaging.version import parse, Version
+from setuptools import find_packages, setup
+from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension
+
+# ninja build does not work unless include_dirs are abs path
+this_dir = os.path.dirname(os.path.abspath(__file__))
+PACKAGE_NAME = "hstu"
+# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels
+# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
+FORCE_BUILD = os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE"
+SKIP_CUDA_BUILD = os.getenv("FLASH_ATTENTION_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
+# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI
+FORCE_CXX11_ABI = os.getenv("FLASH_ATTENTION_FORCE_CXX11_ABI", "FALSE") == "TRUE"
+
+# HACK: we monkey patch pytorch's _write_ninja_file to pass
+# "-gencode arch=compute_sm90a,code=sm_90a" to files ending in '_sm90.cu',
+# and pass "-gencode arch=compute_sm80,code=sm_80" to files ending in '_sm80.cu'
+from torch.utils.cpp_extension import (
+    _is_cuda_file,
+    _join_cuda_home,
+    _join_rocm_home,
+    _maybe_write,
+    COMMON_HIP_FLAGS,
+    get_cxx_compiler,
+    IS_HIP_EXTENSION,
+    IS_WINDOWS,
+    SUBPROCESS_DECODE_ARGS,
+)
+
+DISABLE_BACKWARD = os.getenv("FLASH_ATTENTION_DISABLE_BACKWARD", "FALSE") == "TRUE"
+DISABLE_FP16 = os.getenv("FLASH_ATTENTION_DISABLE_FP16", "TRUE") == "TRUE"
+DISABLE_HDIM64 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM64", "TRUE") == "TRUE"
+DISABLE_HDIM96 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM96", "TRUE") == "TRUE"
+DISABLE_HDIM128 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM128", "FALSE") == "TRUE"
+DISABLE_HDIM192 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM192", "TRUE") == "TRUE"
+DISABLE_HDIM256 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM256", "TRUE") == "TRUE"
+DISABLE_SM8x = os.getenv("FLASH_ATTENTION_DISABLE_SM80", "TRUE") == "TRUE"
+
+
+def _write_ninja_file(
+    path,
+    cflags,
+    post_cflags,
+    cuda_cflags,
+    cuda_post_cflags,
+    cuda_dlink_post_cflags,
+    sources,
+    objects,
+    ldflags,
+    library_target,
+    with_cuda,
+    **kwargs,  # kwargs (ignored) to absorb new flags in torch.utils.cpp_extension
+) -> None:
+    r"""Write a ninja file that does the desired compiling and linking.
+
+    `path`: Where to write this file
+    `cflags`: list of flags to pass to $cxx. Can be None.
+    `post_cflags`: list of flags to append to the $cxx invocation. Can be None.
+    `cuda_cflags`: list of flags to pass to $nvcc. Can be None.
+    `cuda_postflags`: list of flags to append to the $nvcc invocation. Can be None.
+    `sources`: list of paths to source files
+    `objects`: list of desired paths to objects, one per source.
+    `ldflags`: list of flags to pass to linker. Can be None.
+    `library_target`: Name of the output library. Can be None; in that case,
+                      we do no linking.
+    `with_cuda`: If we should be compiling with CUDA.
+    """
+
+    def sanitize_flags(flags):
+        if flags is None:
+            return []
+        else:
+            return [flag.strip() for flag in flags]
+
+    cflags = sanitize_flags(cflags)
+    post_cflags = sanitize_flags(post_cflags)
+    cuda_cflags = sanitize_flags(cuda_cflags)
+    cuda_post_cflags = sanitize_flags(cuda_post_cflags)
+    cuda_dlink_post_cflags = sanitize_flags(cuda_dlink_post_cflags)
+    ldflags = sanitize_flags(ldflags)
+
+    # Sanity checks...
+    assert len(sources) == len(objects)
+    assert len(sources) > 0
+
+    compiler = get_cxx_compiler()
+
+    # Version 1.3 is required for the `deps` directive.
+    config = ["ninja_required_version = 1.3"]
+    config.append(f"cxx = {compiler}")
+    if with_cuda or cuda_dlink_post_cflags:
+        if IS_HIP_EXTENSION:
+            nvcc = _join_rocm_home("bin", "hipcc")
+        else:
+            nvcc = _join_cuda_home("bin", "nvcc")
+        if "PYTORCH_NVCC" in os.environ:
+            nvcc_from_env = os.getenv(
+                "PYTORCH_NVCC"
+            )  # user can set nvcc compiler with ccache using the environment variable here
+        else:
+            nvcc_from_env = nvcc
+        config.append(f"nvcc_from_env = {nvcc_from_env}")
+        config.append(f"nvcc = {nvcc}")
+
+    if IS_HIP_EXTENSION:
+        post_cflags = COMMON_HIP_FLAGS + post_cflags
+    flags = [f"cflags = {' '.join(cflags)}"]
+    flags.append(f"post_cflags = {' '.join(post_cflags)}")
+    if with_cuda:
+        flags.append(f"cuda_cflags = {' '.join(cuda_cflags)}")
+        flags.append(f"cuda_post_cflags = {' '.join(cuda_post_cflags)}")
+        cuda_post_cflags_sm80 = [
+            s if s != "arch=compute_90a,code=sm_90a" else "arch=compute_80,code=sm_80"
+            for s in cuda_post_cflags
+        ]
+        flags.append(f"cuda_post_cflags_sm80 = {' '.join(cuda_post_cflags_sm80)}")
+        cuda_post_cflags_sm80_sm90 = cuda_post_cflags + [
+            "-gencode",
+            "arch=compute_80,code=sm_80",
+        ]
+        flags.append(
+            f"cuda_post_cflags_sm80_sm90 = {' '.join(cuda_post_cflags_sm80_sm90)}"
+        )
+        cuda_post_cflags_sm100 = [
+            s
+            if s != "arch=compute_90a,code=sm_90a"
+            else "arch=compute_100a,code=sm_100a"
+            for s in cuda_post_cflags
+        ]
+        flags.append(f"cuda_post_cflags_sm100 = {' '.join(cuda_post_cflags_sm100)}")
+    flags.append(f"cuda_dlink_post_cflags = {' '.join(cuda_dlink_post_cflags)}")
+    flags.append(f"ldflags = {' '.join(ldflags)}")
+
+    # Turn into absolute paths so we can emit them into the ninja build
+    # file wherever it is.
+    sources = [os.path.abspath(file) for file in sources]
+
+    # See https://ninja-build.org/build.ninja.html for reference.
+    compile_rule = ["rule compile"]
+    if IS_WINDOWS:
+        compile_rule.append(
+            "  command = cl /showIncludes $cflags -c $in /Fo$out $post_cflags"
+        )
+        compile_rule.append("  deps = msvc")
+    else:
+        compile_rule.append(
+            "  command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags"
+        )
+        compile_rule.append("  depfile = $out.d")
+        compile_rule.append("  deps = gcc")
+
+    if with_cuda:
+        cuda_compile_rule = ["rule cuda_compile"]
+        nvcc_gendeps = ""
+        # --generate-dependencies-with-compile is not supported by ROCm
+        # Nvcc flag `--generate-dependencies-with-compile` is not supported by sccache, which may increase build time.
+        if (
+            torch.version.cuda is not None
+            and os.getenv("TORCH_EXTENSION_SKIP_NVCC_GEN_DEPENDENCIES", "0") != "1"
+        ):
+            cuda_compile_rule.append("  depfile = $out.d")
+            cuda_compile_rule.append("  deps = gcc")
+            # Note: non-system deps with nvcc are only supported
+            # on Linux so use --generate-dependencies-with-compile
+            # to make this work on Windows too.
+            nvcc_gendeps = (
+                "--generate-dependencies-with-compile --dependency-output $out.d"
+            )
+        cuda_compile_rule_sm80 = (
+            ["rule cuda_compile_sm80"]
+            + cuda_compile_rule[1:]
+            + [
+                f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags_sm80"
+            ]
+        )
+        cuda_compile_rule_sm80_sm90 = (
+            ["rule cuda_compile_sm80_sm90"]
+            + cuda_compile_rule[1:]
+            + [
+                f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags_sm80_sm90"
+            ]
+        )
+        cuda_compile_rule_sm100 = (
+            ["rule cuda_compile_sm100"]
+            + cuda_compile_rule[1:]
+            + [
+                f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags_sm100"
+            ]
+        )
+        cuda_compile_rule.append(
+            f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags"
+        )
+
+    # Emit one build rule per source to enable incremental build.
+    build = []
+    for source_file, object_file in zip(sources, objects):
+        is_cuda_source = _is_cuda_file(source_file) and with_cuda
+        if is_cuda_source:
+            if source_file.endswith("_sm90.cu"):
+                rule = "cuda_compile"
+            elif source_file.endswith("_sm80.cu"):
+                rule = "cuda_compile_sm80"
+            elif source_file.endswith("_sm100.cu"):
+                rule = "cuda_compile_sm100"
+            else:
+                rule = "cuda_compile_sm80_sm90"
+        else:
+            rule = "compile"
+        if IS_WINDOWS:
+            source_file = source_file.replace(":", "$:")
+            object_file = object_file.replace(":", "$:")
+        source_file = source_file.replace(" ", "$ ")
+        object_file = object_file.replace(" ", "$ ")
+        build.append(f"build {object_file}: {rule} {source_file}")
+
+    if cuda_dlink_post_cflags:
+        devlink_out = os.path.join(os.path.dirname(objects[0]), "dlink.o")
+        devlink_rule = ["rule cuda_devlink"]
+        devlink_rule.append("  command = $nvcc $in -o $out $cuda_dlink_post_cflags")
+        devlink = [f"build {devlink_out}: cuda_devlink {' '.join(objects)}"]
+        objects += [devlink_out]
+    else:
+        devlink_rule, devlink = [], []
+
+    if library_target is not None:
+        link_rule = ["rule link"]
+        if IS_WINDOWS:
+            cl_paths = (
+                subprocess.check_output(["where", "cl"])
+                .decode(*SUBPROCESS_DECODE_ARGS)
+                .split("\r\n")
+            )
+            if len(cl_paths) >= 1:
+                cl_path = os.path.dirname(cl_paths[0]).replace(":", "$:")
+            else:
+                raise RuntimeError("MSVC is required to load C++ extensions")
+            link_rule.append(
+                f'  command = "{cl_path}/link.exe" $in /nologo $ldflags /out:$out'
+            )
+        else:
+            link_rule.append("  command = $cxx $in $ldflags -o $out")
+
+        link = [f"build {library_target}: link {' '.join(objects)}"]
+
+        default = [f"default {library_target}"]
+    else:
+        link_rule, link, default = [], [], []
+
+    # 'Blocks' should be separated by newlines, for visual benefit.
+    blocks = [config, flags, compile_rule]
+    if with_cuda:
+        blocks.append(cuda_compile_rule)  # type: ignore[possibly-undefined]
+        blocks.append(cuda_compile_rule_sm80)  # type: ignore[possibly-undefined]
+        blocks.append(cuda_compile_rule_sm80_sm90)  # type: ignore[possibly-undefined]
+        blocks.append(cuda_compile_rule_sm100)  # type: ignore[possibly-undefined]
+    blocks += [devlink_rule, link_rule, build, devlink, link, default]
+    content = "\n\n".join("\n".join(b) for b in blocks)
+    # Ninja requires a new lines at the end of the .ninja file
+    content += "\n"
+    _maybe_write(path, content)
+
+
+# Monkey patching
+torch.utils.cpp_extension._write_ninja_file = _write_ninja_file
+
+
+def get_platform():
+    """
+    Returns the platform name as used in wheel filenames.
+    """
+    if sys.platform.startswith("linux"):
+        return "linux_x86_64"
+    elif sys.platform == "darwin":
+        mac_version = ".".join(platform.mac_ver()[0].split(".")[:2])
+        return f"macosx_{mac_version}_x86_64"
+    elif sys.platform == "win32":
+        return "win_amd64"
+    else:
+        raise ValueError("Unsupported platform: {}".format(sys.platform))
+
+
+def get_cuda_bare_metal_version(cuda_dir):
+    raw_output = subprocess.check_output(
+        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
+    )
+    output = raw_output.split()
+    release_idx = output.index("release") + 1
+    bare_metal_version = parse(output[release_idx].split(",")[0])
+
+    return raw_output, bare_metal_version
+
+
+def check_if_cuda_home_none(global_option: str) -> None:
+    if CUDA_HOME is not None:
+        return
+    # warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary
+    # in that case.
+    warnings.warn(
+        f"{global_option} was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  "
+        "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, "
+        "only images whose names contain 'devel' will provide nvcc."
+    )
+
+
+def nvcc_threads_args():
+    nvcc_threads = os.getenv("NVCC_THREADS") or "4"
+    return ["--threads", nvcc_threads]
+
+
+exe_extension = sysconfig.get_config_var("EXE")
+
+
+cmdclass = {}
+ext_modules = []
+
+# We want this even if SKIP_CUDA_BUILD because when we run python setup.py sdist we want the .hpp
+# files included in the source distribution, in case the user compiles from source.
+subprocess.run(["git", "submodule", "update", "--init", "cutlass"])
+
+if not SKIP_CUDA_BUILD:
+    print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
+    TORCH_MAJOR = int(torch.__version__.split(".")[0])
+    TORCH_MINOR = int(torch.__version__.split(".")[1])
+
+    check_if_cuda_home_none(PACKAGE_NAME)
+    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
+    if bare_metal_version < Version("12.3"):
+        raise RuntimeError(
+            f"FlashAttention-3 is only supported on CUDA 12.3 and above, get {bare_metal_version} from {CUDA_HOME}"
+        )
+
+    cc_flag = []
+    cc_flag.append("-gencode")
+    cc_flag.append("arch=compute_90a,code=sm_90a")
+
+    # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as
+    # torch._C._GLIBCXX_USE_CXX11_ABI
+    # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920
+    if FORCE_CXX11_ABI:
+        torch._C._GLIBCXX_USE_CXX11_ABI = True
+    repo_dir = Path(this_dir).parent
+    cutlass_dir = repo_dir / "cpp" / "cutlass"
+
+    feature_args = (
+        []
+        + ["-DOSS_ENV"]
+        + (["-DFLASHATTENTION_DISABLE_BACKWARD"] if DISABLE_BACKWARD else [])
+        + (["-DFLASHATTENTION_DISABLE_FP16"] if DISABLE_FP16 else [])
+        + ["-DFLASHATTENTION_DISABLE_FP8"]
+        + (["-DFLASHATTENTION_DISABLE_HDIM64"] if DISABLE_HDIM64 else [])
+        + (["-DFLASHATTENTION_DISABLE_HDIM96"] if DISABLE_HDIM96 else [])
+        + (["-DFLASHATTENTION_DISABLE_HDIM128"] if DISABLE_HDIM128 else [])
+        + (["-DFLASHATTENTION_DISABLE_HDIM192"] if DISABLE_HDIM192 else [])
+        + (["-DFLASHATTENTION_DISABLE_HDIM256"] if DISABLE_HDIM256 else [])
+        + (["-DFLASHATTENTION_DISABLE_SM8x"] if DISABLE_SM8x else [])
+    )
+
+    DTYPE = ["bf16"] + (["fp16"] if not DISABLE_FP16 else [])
+    HEAD_DIMENSIONS = (
+        []
+        + ([64] if not DISABLE_HDIM64 else [])
+        + ([96] if not DISABLE_HDIM96 else [])
+        + ([128] if not DISABLE_HDIM128 else [])
+        + ([192] if not DISABLE_HDIM192 else [])
+        + ([256] if not DISABLE_HDIM256 else [])
+    )
+    sources_fwd_sm80 = [
+        f"hstu_attention/instantiations/flash_fwd_hdim{hdim}_{dtype}_sm80.cu"
+        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
+    ]
+    sources_bwd_sm80 = [
+        f"hstu_attention/instantiations/flash_bwd_hdim{hdim}_{dtype}_sm80.cu"
+        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
+    ]
+    sources_fwd_sm90 = [
+        f"hstu_attention/instantiations/flash_fwd_hdim{hdim}_{dtype}_sm90.cu"
+        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
+    ]
+    sources_bwd_sm90 = [
+        f"hstu_attention/instantiations/flash_bwd_hdim{hdim}_{dtype}_sm90.cu"
+        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
+    ]
+    if DISABLE_BACKWARD:
+        sources_bwd_sm90 = []
+        sources_bwd_sm80 = []
+    sources = (
+        [
+            "hstu_attention/flash_api.cpp",
+            "hstu_attention/flash_common.cpp",
+            "hstu_attention/flash_cpu_dummy.cpp",
+            "hstu_attention/flash_meta.cpp",
+        ]
+        + (sources_fwd_sm80 if not DISABLE_SM8x else [])
+        + sources_fwd_sm90
+        + (sources_bwd_sm80 if not DISABLE_SM8x else [])
+        + sources_bwd_sm90
+    )
+    nvcc_flags = [
+        "-O3",
+        "-std=c++17",
+        "--ftemplate-backtrace-limit=0",  # To debug template code
+        "--use_fast_math",
+        # "--keep",
+        # "--ptxas-options=--verbose,--register-usage-level=5,--warn-on-local-memory-usage",  # printing out number of registers
+        "--resource-usage",  # printing out number of registers
+        # f"--split-compile={os.getenv('NVCC_THREADS', '4')}",  # split-compile is faster
+        "-lineinfo",  # TODO: disable this for release to reduce binary size
+        "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",  # Necessary for the WGMMA shapes that we use
+        "-DCUTLASS_ENABLE_GDC_FOR_SM90",  # For PDL
+        "-DCUTLASS_DEBUG_TRACE_LEVEL=0",  # Can toggle for debugging
+        "-DNDEBUG",  # Important, otherwise performance is severely impacted
+        "-Xfatbin",  # compress all binary sections
+        "-compress-all",
+    ]
+    if get_platform() == "win_amd64":
+        nvcc_flags.extend(
+            [
+                "-D_USE_MATH_DEFINES",  # for M_LN2
+                "-Xcompiler=/Zc:__cplusplus",  # sets __cplusplus correctly, CUTLASS_CONSTEXPR_IF_CXX17 needed for cutlass::gcd
+            ]
+        )
+    include_dirs = [
+        Path(this_dir),
+        cutlass_dir / "include",
+    ]
+
+    ext_modules.append(
+        CUDAExtension(
+            name=f"{PACKAGE_NAME}._C",
+            sources=sources,
+            extra_compile_args={
+                "cxx": ["-O3", "-std=c++17", "-DPy_LIMITED_API=0x03090000"]
+                + feature_args,
+                "nvcc": nvcc_threads_args() + nvcc_flags + cc_flag + feature_args,
+            },
+            include_dirs=include_dirs,
+            py_limited_api=True,
+        )
+    )
+
+
+setup(
+    name=PACKAGE_NAME,
+    version="0.1.0",
+    packages=find_packages(
+        exclude=(
+            "build",
+            "csrc",
+            "include",
+            "tests",
+            "dist",
+            "docs",
+            "benchmarks",
+        )
+    ),
+    py_modules=["cuda_hstu_attention"],
+    description="FlashAttention HSTU",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: Unix",
+    ],
+    ext_modules=ext_modules,
+    cmdclass={"build_ext": BuildExtension},
+    python_requires=">=3.8",
+    install_requires=[
+        "torch",
+        "einops",
+        "packaging",
+        "ninja==1.11.1.1",
+    ],
+)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp
new file mode 100644
index 000000000..3925aefd7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp
@@ -0,0 +1,40 @@
+#include "common.h"
+#include "sort_kv_pairs_cuda_kernels_template.h"
+
+namespace hstu {
+
+DLL_PUBLIC std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_cuda(
+    const at::Tensor& keys,
+    const at::Tensor& values,
+    const std::optional<int64_t>& end_bit,
+    const bool descending = false) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(keys.get_device());
+  TORCH_CHECK(
+      keys.dtype() == at::kInt || keys.dtype() == at::kLong ||
+      keys.dtype() == at::kByte || keys.dtype() == at::kShort);
+  TORCH_CHECK(keys.numel() < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(keys.dim() == 1);
+  TORCH_CHECK(values.dim() == 1);
+  at::Tensor sorted_keys;
+  at::Tensor sorted_values;
+
+  AT_DISPATCH_INTEGRAL_TYPES(keys.scalar_type(), "sort_pairs_cuda_input1", [&] {
+    using key_t = scalar_t;
+    AT_DISPATCH_ALL_TYPES_AND2(
+        at::ScalarType::Half,
+        at::ScalarType::BFloat16,
+        values.scalar_type(),
+        "sort_pairs_cuda_input2",
+        [&] {
+          using val_t = scalar_t;
+          std::tie(sorted_keys, sorted_values) =
+              sort_kv_pairs_cuda_dispatched<key_t, val_t>(
+                  keys, values, end_bit, descending);
+        });
+  });
+
+  return {std::move(sorted_keys), std::move(sorted_values)};
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu
new file mode 100644
index 000000000..8cd175c71
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu
@@ -0,0 +1,82 @@
+#include <generative_recommenders/ops/cpp/common.h>
+#include <generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h>
+
+#include <cub/device/device_radix_sort.cuh>
+
+namespace hstu {
+
+template <>
+DLL_PUBLIC std::tuple<at::Tensor, at::Tensor>
+sort_kv_pairs_cuda_dispatched<SUB_KEY_T, SUB_VALUE_T>(
+    const at::Tensor& keys,
+    const at::Tensor& values,
+    const std::optional<int64_t>& end_bit,
+    const bool descending) {
+  size_t temp_storage_bytes = 0;
+  auto keys_contig = keys.contiguous();
+  auto values_contig = values.contiguous();
+  auto sorted_keys = at::empty_like(keys_contig);
+  auto sorted_values = at::empty_like(values_contig);
+
+  if (descending) {
+    AT_CUDA_CHECK(
+        cub::DeviceRadixSort::SortPairsDescending(
+            nullptr,
+            temp_storage_bytes,
+            keys_contig.data_ptr<SUB_KEY_T>(),
+            sorted_keys.data_ptr<SUB_KEY_T>(),
+            values_contig.data_ptr<SUB_VALUE_T>(),
+            sorted_values.data_ptr<SUB_VALUE_T>(),
+            keys_contig.numel(),
+            0,
+            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
+            at::cuda::getCurrentCUDAStream()));
+    auto temp_storage = at::empty(
+        {static_cast<int64_t>(temp_storage_bytes)},
+        keys_contig.options().dtype(at::kByte));
+    AT_CUDA_CHECK(
+        cub::DeviceRadixSort::SortPairsDescending(
+            temp_storage.data_ptr(),
+            temp_storage_bytes,
+            keys_contig.data_ptr<SUB_KEY_T>(),
+            sorted_keys.data_ptr<SUB_KEY_T>(),
+            values_contig.data_ptr<SUB_VALUE_T>(),
+            sorted_values.data_ptr<SUB_VALUE_T>(),
+            keys_contig.numel(),
+            0,
+            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
+            at::cuda::getCurrentCUDAStream()));
+  } else {
+    AT_CUDA_CHECK(
+        cub::DeviceRadixSort::SortPairs(
+            nullptr,
+            temp_storage_bytes,
+            keys_contig.data_ptr<SUB_KEY_T>(),
+            sorted_keys.data_ptr<SUB_KEY_T>(),
+            values_contig.data_ptr<SUB_VALUE_T>(),
+            sorted_values.data_ptr<SUB_VALUE_T>(),
+            keys_contig.numel(),
+            0,
+            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
+            at::cuda::getCurrentCUDAStream()));
+    auto temp_storage = at::empty(
+        {static_cast<int64_t>(temp_storage_bytes)},
+        keys_contig.options().dtype(at::kByte));
+    AT_CUDA_CHECK(
+        cub::DeviceRadixSort::SortPairs(
+            temp_storage.data_ptr(),
+            temp_storage_bytes,
+            keys_contig.data_ptr<SUB_KEY_T>(),
+            sorted_keys.data_ptr<SUB_KEY_T>(),
+            values_contig.data_ptr<SUB_VALUE_T>(),
+            sorted_values.data_ptr<SUB_VALUE_T>(),
+            keys_contig.numel(),
+            0,
+            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
+            at::cuda::getCurrentCUDAStream()));
+  }
+
+  return {std::move(sorted_keys), std::move(sorted_values)};
+}
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h
new file mode 100644
index 000000000..e599eccb0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <ATen/ATen.h>
+#include <optional>
+
+namespace hstu {
+
+template <typename key_t, typename val_t>
+std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_cuda_dispatched(
+    const at::Tensor& keys_contig,
+    const at::Tensor& values_contig,
+    const std::optional<int64_t>& end_bit,
+    const bool descending);
+
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp
new file mode 100644
index 000000000..c361488fa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp
@@ -0,0 +1,136 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/extension.h>
+#include <torch/library.h>
+
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+
+namespace hstu {
+
+template <typename index_t, typename val_t>
+void _split_1d_jagged_jagged_cpu_kernel(
+    int32_t B,
+    const at::TensorAccessor<index_t, 1>& combined_offsets,
+    const at::TensorAccessor<val_t, 1>& combined_values,
+    const at::TensorAccessor<index_t, 1>& lengths_left,
+    const at::TensorAccessor<index_t, 1>& offsets_left,
+    const at::TensorAccessor<index_t, 1>& offsets_right,
+    at::TensorAccessor<val_t, 1> values_left,
+    at::TensorAccessor<val_t, 1> values_right) {
+  for (auto b : c10::irange(B)) {
+    auto combined_start = combined_offsets[b];
+    auto left_len = lengths_left[b];
+    auto left_start = offsets_left[b];
+    auto right_start = offsets_right[b];
+
+    for (auto i = 0; i < left_len; ++i) {
+      values_left[left_start + i] = combined_values[combined_start + i];
+    }
+
+    auto right_len = combined_offsets[b + 1] - combined_offsets[b] - left_len;
+    for (auto i = 0; i < right_len; ++i) {
+      values_right[right_start + i] =
+          combined_values[combined_start + left_len + i];
+    }
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cpu(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values) {
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CPU);
+  TORCH_INTERNAL_ASSERT(combined_values.device().type() == at::DeviceType::CPU);
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+  auto B = lengths_left.size(0);
+
+  auto L_left = lengths_left.sum().item<int64_t>();
+  auto L_right = lengths_right.sum().item<int64_t>();
+  TORCH_CHECK(L_left + L_right == combined_values.numel());
+
+  auto values_left = at::empty({L_left}, combined_values.options());
+  auto values_right = at::empty({L_right}, combined_values.options());
+
+  if (L_left == 0 && L_right == 0) {
+    return std::make_tuple(values_left, values_right);
+  }
+
+  const auto combined_lengths = lengths_left + lengths_right;
+  const auto combined_offsets =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(combined_lengths.view({-1}));
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_right.view({-1}));
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "split_1d_jagged_jagged_values_cpu_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            combined_values.scalar_type(),
+            "split_1d_jagged_jagged_values_cpu_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _split_1d_jagged_jagged_cpu_kernel<index_t, val_t>(
+                  B,
+                  combined_offsets.accessor<index_t, 1>(),
+                  combined_values.accessor<val_t, 1>(),
+                  lengths_left.accessor<index_t, 1>(),
+                  offsets_left.accessor<index_t, 1>(),
+                  offsets_right.accessor<index_t, 1>(),
+                  values_left.accessor<val_t, 1>(),
+                  values_right.accessor<val_t, 1>());
+            });
+      });
+
+  return std::make_tuple(values_left, values_right);
+}
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_meta(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values) {
+  auto L_left = lengths_left.sum().item<int64_t>();
+  auto L_right = lengths_right.sum().item<int64_t>();
+
+  auto values_left = at::native::empty_meta_symint(
+      {L_left},
+      /*dtype=*/::std::make_optional(combined_values.scalar_type()),
+      /*layout=*/::std::make_optional(combined_values.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  auto values_right = at::native::empty_meta_symint(
+      {L_right},
+      /*dtype=*/::std::make_optional(combined_values.scalar_type()),
+      /*layout=*/::std::make_optional(combined_values.layout()),
+      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
+      /*pin_memory=*/::std::nullopt);
+
+  return std::make_tuple(values_left, values_right);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu
new file mode 100644
index 000000000..181489bae
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu
@@ -0,0 +1,147 @@
+/* Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <ATen/ATen.h>
+#include <ATen/Dispatch.h>
+#include <ATen/core/op_registration/op_registration.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <torch/library.h>
+
+#include "common.h"
+#include "fbgemm_gpu/sparse_ops.h" // @manual
+#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
+
+namespace hstu {
+
+static constexpr int32_t kMaxThreads = 1024;
+
+template <typename index_t, typename val_t>
+__global__
+__launch_bounds__(kMaxThreads) void _split_1d_jagged_jagged_cuda_kernel(
+    int32_t B,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        combined_offsets,
+    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
+        combined_values,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        lengths_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_left,
+    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
+        offsets_right,
+    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values_left,
+    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values_right) {
+  for (auto b = blockIdx.x * blockDim.y + threadIdx.y;
+       b < static_cast<uint32_t>(B);
+       b += gridDim.x * blockDim.y) {
+    auto combined_start = combined_offsets[b];
+    auto left_len = lengths_left[b];
+    auto right_len = combined_offsets[b + 1] - combined_offsets[b] - left_len;
+    auto left_start = offsets_left[b];
+    auto right_start = offsets_right[b];
+
+    for (auto i = threadIdx.x; i < static_cast<uint32_t>(left_len + right_len);
+         i += blockDim.x) {
+      if (i < static_cast<uint32_t>(left_len)) {
+        values_left[left_start + i] = combined_values[combined_start + i];
+      } else {
+        values_right[right_start + i - left_len] =
+            combined_values[combined_start + i];
+      }
+    }
+  }
+}
+
+std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cuda(
+    const at::Tensor& lengths_left,
+    const at::Tensor& lengths_right,
+    const at::Tensor& combined_values) {
+  at::cuda::OptionalCUDAGuard device_guard;
+  device_guard.set_index(combined_values.get_device());
+  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CUDA);
+  TORCH_INTERNAL_ASSERT(
+      combined_values.device().type() == at::DeviceType::CUDA);
+  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
+
+  auto B = lengths_left.size(0);
+  auto L_left = lengths_left.sum().item<int64_t>();
+  auto L_right = lengths_right.sum().item<int64_t>();
+  TORCH_CHECK(L_left + L_right == combined_values.numel());
+  TORCH_CHECK(L_left < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(L_right < std::numeric_limits<int32_t>::max());
+  TORCH_CHECK(combined_values.get_device() == lengths_left.get_device());
+  TORCH_CHECK(combined_values.get_device() == lengths_right.get_device());
+
+  auto values_left = at::empty({L_left}, combined_values.options());
+  auto values_right = at::empty({L_right}, combined_values.options());
+
+  if (L_left == 0 && L_right == 0) {
+    return std::make_tuple(values_left, values_right);
+  }
+
+  const auto combined_lengths = lengths_left + lengths_right;
+  const auto combined_offsets =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(combined_lengths.view({-1}));
+  const auto offsets_left =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_left.view({-1}));
+  const auto offsets_right =
+      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_right.view({-1}));
+
+  // Optimized thread block configuration based on benchmark results
+  uint32_t B_blocks = 4;
+  dim3 threads(256, B_blocks);
+  auto blocks = div_round_up(B, B_blocks);
+
+  AT_DISPATCH_INTEGRAL_TYPES(
+      lengths_left.scalar_type(),
+      "split_1d_jagged_jagged_values_cuda_kernel_input1",
+      [&] {
+        using index_t = scalar_t;
+        AT_DISPATCH_ALL_TYPES_AND2(
+            at::ScalarType::BFloat16,
+            at::ScalarType::Half,
+            combined_values.scalar_type(),
+            "split_1d_jagged_jagged_values_cuda_kernel_input2",
+            [&] {
+              using val_t = scalar_t;
+              _split_1d_jagged_jagged_cuda_kernel<index_t, val_t><<<
+                  blocks,
+                  threads,
+                  0,
+                  at::cuda::getCurrentCUDAStream()>>>(
+                  B,
+                  combined_offsets
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  combined_values
+                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                  lengths_left
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  offsets_left
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  offsets_right
+                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
+                  values_left
+                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
+                  values_right
+                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>());
+            });
+      });
+
+  return std::make_tuple(values_left, values_right);
+}
+} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py
new file mode 100644
index 000000000..8c27a787b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from hammer.ops.jagged import concat_1D_jagged_jagged
+from hypothesis import given, settings, strategies as st, Verbosity
+
+# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:concat_1d_jagged_jagged_test
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+class OpsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(10, 500),
+        max_seq_len_left=st.integers(10, 1000),
+        max_seq_len_right=st.integers(10, 1000),
+        val_dtype=st.sampled_from([torch.float32, torch.float16, torch.bfloat16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=100,
+        deadline=None,
+    )
+    def test_concat_1d_jagged_jagged(
+        self,
+        batch_size: int,
+        max_seq_len_left: int,
+        max_seq_len_right: int,
+        val_dtype: torch.dtype,
+    ) -> None:
+        batch_size = 3
+        max_seq_len_left = 4
+        max_seq_len_right = 2
+        lengths_left = torch.randint(
+            0, max_seq_len_left + 1, (batch_size,), device="cpu"
+        )
+        values_left = torch.rand(
+            (int(torch.sum(lengths_left).cpu().item()),), dtype=val_dtype, device="cpu"
+        )
+        offsets_left = torch.zeros(
+            (batch_size + 1,),
+            dtype=lengths_left.dtype,
+            device=lengths_left.device,
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        lengths_right = torch.randint(
+            0, max_seq_len_right + 1, (batch_size,), device="cpu"
+        )
+        values_right = torch.rand(
+            (int(torch.sum(lengths_right).cpu().item()),), dtype=val_dtype, device="cpu"
+        )
+        offsets_right = torch.zeros(
+            (batch_size + 1,),
+            dtype=lengths_right.dtype,
+            device=lengths_right.device,
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+        custom_cpu_result = torch.ops.hstu.concat_1d_jagged_jagged(
+            lengths_left=lengths_left,
+            values_left=values_left,
+            lengths_right=lengths_right,
+            values_right=values_right,
+        )
+
+        custom_cuda_result = torch.ops.hstu.concat_1d_jagged_jagged(
+            lengths_left=lengths_left.cuda(),
+            values_left=values_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            values_right=values_right.cuda(),
+        )
+        torch.testing.assert_close(custom_cuda_result.cpu(), custom_cpu_result)
+
+    @unittest.skipIf(*gpu_unavailable)
+    def test_concat_1d_jagged_jagged_vs_hammer(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        max_seq_len_left = 50
+        max_seq_len_right = 30
+
+        lengths_left = torch.randint(
+            0, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            0, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = (
+            torch.randn(total_left, dtype=torch.float32)
+            if total_left > 0
+            else torch.empty(0, dtype=torch.float32)
+        )
+        values_right = (
+            torch.randn(total_right, dtype=torch.float32)
+            if total_right > 0
+            else torch.empty(0, dtype=torch.float32)
+        )
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        combined_values_ref = concat_1D_jagged_jagged(
+            max_seq_len_left=max_seq_len_left,
+            offsets_left=offsets_left,
+            values_left=values_left,
+            max_seq_len_right=max_seq_len_right,
+            offsets_right=offsets_right,
+            values_right=values_right,
+        )
+
+        custom_cuda_result = torch.ops.hstu.concat_1d_jagged_jagged(
+            lengths_left=lengths_left.cuda(),
+            values_left=values_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            values_right=values_right.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_result.cpu(), combined_values_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py
new file mode 100644
index 000000000..cb787ea61
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py
@@ -0,0 +1,39 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+# cmd: buck2 run @//mode/opt -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -c fbcode.nvcc_arch=b200a  //generative_recommenders/ops/cpp/tests:hstu_mha_cpu_test
+
+import unittest
+
+import torch
+
+torch.ops.load_library(
+    "//generative_recommenders/ops/cpp/hstu_attention:hstu_flash_attention"
+)
+
+
+class TestHstuMhaFwd(unittest.TestCase):
+    def test_hstu_mha_fwd(self) -> None:
+        q: torch.Tensor = torch.randn([100, 4, 64], dtype=torch.bfloat16, device="cpu")
+        k: torch.Tensor = torch.randn([100, 4, 64], dtype=torch.bfloat16, device="cpu")
+        v: torch.Tensor = torch.randn([100, 4, 64], dtype=torch.bfloat16, device="cpu")
+        res = torch.ops.hstu.hstu_mha_fwd(
+            10,
+            0.25,
+            q,
+            k,
+            v,
+            torch.empty([0], dtype=torch.int32, device="cpu"),
+            True,  # causal
+            None,
+            None,
+            0,
+            0,
+            0,
+            None,  # q_descale
+            None,  # k_descale
+            None,  # v_descale
+            0,  # sm_margin
+        )
+        self.assertIsNotNone(res)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py
new file mode 100644
index 000000000..6a5f5997b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from hammer.ops.jagged import jagged_transpose_1D
+from hypothesis import given, settings, strategies as st, Verbosity
+
+# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:jagged_transpose_1d_test
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+class OpsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        size1=st.integers(2, 10),
+        size2=st.integers(2, 10),
+        max_len=st.integers(5, 50),
+        val_dtype=st.sampled_from([torch.float32, torch.float16, torch.bfloat16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=100,
+        deadline=None,
+    )
+    def test_jagged_transpose_1d(
+        self,
+        size1: int,
+        size2: int,
+        max_len: int,
+        val_dtype: torch.dtype,
+    ) -> None:
+        lengths = torch.randint(
+            0, max_len + 1, (size1 * size2,), dtype=torch.int32, device="cpu"
+        )
+        offsets = torch.zeros(
+            (size1 * size2 + 1,), dtype=lengths.dtype, device=lengths.device
+        )
+        offsets[1:] = torch.cumsum(lengths.view(-1), dim=0)
+
+        values = torch.randn(int(offsets[-1].item()), dtype=val_dtype, device="cpu")
+
+        (
+            custom_cpu_values,
+            custom_cpu_offsets,
+            custom_cpu_lengths,
+        ) = torch.ops.hstu.jagged_transpose_1d(
+            values=values,
+            offsets=offsets,
+            lengths=lengths,
+            max_len=max_len,
+            size1=size1,
+            size2=size2,
+        )
+
+        (
+            custom_cuda_values,
+            custom_cuda_offsets,
+            custom_cuda_lengths,
+        ) = torch.ops.hstu.jagged_transpose_1d(
+            values=values.cuda(),
+            offsets=offsets.cuda(),
+            lengths=lengths.cuda(),
+            max_len=max_len,
+            size1=size1,
+            size2=size2,
+        )
+
+        torch.testing.assert_close(custom_cuda_values.cpu(), custom_cpu_values)
+        torch.testing.assert_close(custom_cuda_offsets.cpu(), custom_cpu_offsets)
+        torch.testing.assert_close(custom_cuda_lengths.cpu(), custom_cpu_lengths)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        size1=st.integers(2, 10),
+        size2=st.integers(2, 10),
+        max_len=st.integers(5, 50),
+        val_dtype=st.sampled_from([torch.float32, torch.float16, torch.bfloat16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=100,
+        deadline=None,
+    )
+    def test_jagged_transpose_1d_vs_hammer(
+        self,
+        size1: int,
+        size2: int,
+        max_len: int,
+        val_dtype: torch.dtype,
+    ) -> None:
+        lengths = torch.randint(0, max_len + 1, (size1 * size2,), dtype=torch.int32)
+        offsets = torch.zeros(
+            (size1 * size2 + 1,), dtype=lengths.dtype, device=lengths.device
+        )
+        offsets[1:] = torch.cumsum(lengths.view(-1), dim=0)
+
+        values = torch.randn(int(offsets[-1].item()), dtype=val_dtype)
+
+        values_ref, offsets_ref, lengths_ref = jagged_transpose_1D(
+            values=values,
+            offsets=offsets,
+            lengths=lengths,
+            max_len=max_len,
+            size1=size1,
+            size2=size2,
+        )
+
+        (
+            custom_cuda_values,
+            custom_cuda_offsets,
+            custom_cuda_lengths,
+        ) = torch.ops.hstu.jagged_transpose_1d(
+            values=values.cuda(),
+            offsets=offsets.cuda(),
+            lengths=lengths.cuda(),
+            max_len=max_len,
+            size1=size1,
+            size2=size2,
+        )
+
+        torch.testing.assert_close(custom_cuda_values.cpu(), values_ref)
+        torch.testing.assert_close(custom_cuda_offsets.cpu(), offsets_ref)
+        torch.testing.assert_close(custom_cuda_lengths.cpu(), lengths_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py
new file mode 100644
index 000000000..9826f199d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from hammer.ops.jagged import replace_last_n_with_jagged
+
+# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:replace_last_n_with_jagged_test
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+class OpsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_replace_last_n_with_jagged(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        embedding_dim = 64
+        max_seq_len_left = 25
+        max_seq_len_right = 10
+
+        lengths_left = torch.randint(
+            max_seq_len_right, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        lengths_right = torch.min(lengths_right, lengths_left)
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = torch.randn(total_left, embedding_dim, dtype=torch.float32)
+        values_right = torch.randn(total_right, embedding_dim, dtype=torch.float32)
+
+        custom_cpu_result = torch.ops.hstu.replace_last_n_with_jagged(
+            lengths_left=lengths_left,
+            values_left=values_left,
+            lengths_right=lengths_right,
+            values_right=values_right,
+        )
+
+        custom_cuda_result = torch.ops.hstu.replace_last_n_with_jagged(
+            lengths_left=lengths_left.cuda(),
+            values_left=values_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            values_right=values_right.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_result.cpu(), custom_cpu_result)
+
+    @unittest.skipIf(*gpu_unavailable)
+    def test_replace_last_n_with_jagged_vs_hammer(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        embedding_dim = 32
+        max_seq_len_left = 20
+        max_seq_len_right = 8
+
+        lengths_left = torch.randint(
+            max_seq_len_right, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            1, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        lengths_right = torch.min(lengths_right, lengths_left)
+
+        total_left = int(lengths_left.sum().item())
+        total_right = int(lengths_right.sum().item())
+
+        values_left = torch.randn(total_left, embedding_dim, dtype=torch.float32)
+        values_right = torch.randn(total_right, embedding_dim, dtype=torch.float32)
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        result_ref = replace_last_n_with_jagged(
+            max_seq_len_left=max_seq_len_left,
+            offsets_left=offsets_left,
+            values_left=values_left,
+            offsets_right=offsets_right,
+            values_right=values_right,
+        )
+
+        custom_cuda_result = torch.ops.hstu.replace_last_n_with_jagged(
+            lengths_left=lengths_left.cuda(),
+            values_left=values_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            values_right=values_right.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_result.cpu(), result_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py
new file mode 100644
index 000000000..24f12c4a2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable
+from hammer.ops.jagged import split_1D_jagged_jagged
+
+# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:split_1d_jagged_jagged_test
+
+torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+
+
+class OpsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    def test_split_1d_jagged_jagged(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        max_seq_len_left = 25
+        max_seq_len_right = 20
+
+        lengths_left = torch.randint(
+            0, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            0, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        combined_lengths = lengths_left + lengths_right
+        combined_offsets = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        combined_offsets[1:] = torch.cumsum(combined_lengths.view(-1), dim=0)
+
+        combined_values = torch.randn(
+            int(combined_offsets[-1].item()), dtype=torch.float32
+        )
+
+        custom_cpu_left, custom_cpu_right = torch.ops.hstu.split_1d_jagged_jagged(
+            lengths_left=lengths_left,
+            lengths_right=lengths_right,
+            combined_values=combined_values,
+        )
+
+        custom_cuda_left, custom_cuda_right = torch.ops.hstu.split_1d_jagged_jagged(
+            lengths_left=lengths_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            combined_values=combined_values.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_left.cpu(), custom_cpu_left)
+        torch.testing.assert_close(custom_cuda_right.cpu(), custom_cpu_right)
+
+    @unittest.skipIf(*gpu_unavailable)
+    def test_split_1d_jagged_jagged_vs_hammer(self) -> None:
+        torch.manual_seed(42)
+        batch_size = 8
+        max_seq_len_left = 25
+        max_seq_len_right = 20
+
+        lengths_left = torch.randint(
+            0, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
+        )
+        lengths_right = torch.randint(
+            0, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
+        )
+
+        offsets_left = torch.zeros(
+            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
+        )
+        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
+        offsets_right = torch.zeros(
+            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
+        )
+        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
+
+        combined_offsets = offsets_left + offsets_right
+        combined_values = torch.randn(
+            int(combined_offsets[-1].item()), dtype=torch.float32
+        )
+
+        left_ref, right_ref = split_1D_jagged_jagged(
+            max_seq_len=max_seq_len_left + max_seq_len_right,
+            values=combined_values,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+
+        custom_cuda_left, custom_cuda_right = torch.ops.hstu.split_1d_jagged_jagged(
+            lengths_left=lengths_left.cuda(),
+            lengths_right=lengths_right.cuda(),
+            combined_values=combined_values.cuda(),
+        )
+
+        torch.testing.assert_close(custom_cuda_left.cpu(), left_ref)
+        torch.testing.assert_close(custom_cuda_right.cpu(), right_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/hstu_attention.py b/recommendation_v4/generative_recommenders/ops/hstu_attention.py
new file mode 100644
index 000000000..137482227
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/hstu_attention.py
@@ -0,0 +1,353 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import HammerKernel, switch_to_contiguous_if_needed
+from generative_recommenders.ops.pytorch.pt_hstu_attention import (
+    pytorch_cached_hstu_mha,
+    pytorch_hstu_mha,
+)
+from generative_recommenders.ops.triton.triton_hstu_attention import (
+    triton_cached_hstu_mha,
+    triton_hstu_mha,
+)
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_ragged_hstu_attention
+    from generative_recommenders.ops.triton_aot.triton_ragged_hstu_attention import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_cached_hstu_mha,
+        aot_triton_kernel_wrapper_ragged_hstu_mha,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_cached_hstu_mha(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE cached_hstu_mha kernel."
+        )
+
+    def aot_triton_kernel_wrapper_ragged_hstu_mha(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE ragged_hstu_mha kernel."
+        )
+
+
+try:
+    from hammer.ops.triton.cc.hstu_attention.triton_cc_hstu_attention import (
+        triton_cc_hstu_mha,
+    )
+    from hammer.v2.ops.triton.template.tlx_bw_hstu_attention import (
+        tlx_bw_hstu_mha_wrapper,
+    )
+except ImportError:
+    tlx_bw_hstu_mha_wrapper = None
+    from generative_recommenders.ops.triton.triton_hstu_attention import (
+        triton_hstu_mha as triton_cc_hstu_mha,
+    )
+from torch.fx._symbolic_trace import is_fx_tracing
+
+torch.fx.wrap("triton_hstu_mha")
+torch.fx.wrap("triton_cached_hstu_mha")
+
+
+@torch.fx.wrap
+def hstu_mha_cuda(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+) -> torch.Tensor:
+    """TorchScript-friendly inference forwarder onto ``torch.ops.hstu.hstu_mha``.
+
+    Bypasses the ``HammerKernel`` enum dispatch in :func:`hstu_mha` so the
+    scripted graph has a single concrete C++ op to call. Mirrors the
+    inference-only path of
+    :func:`generative_recommenders.ops.cpp.cuda_hstu_attention.cuda_hstu_mha_inference_wrapper`
+    with the subset of arguments :class:`STULayer` actually uses.
+    """
+    return torch.ops.hstu.hstu_mha(
+        max_seq_len,
+        alpha,
+        q,
+        k,
+        v,
+        seq_offsets,
+        True,  # causal
+        num_targets,
+        None,  # attn_scale
+        max_attn_len,
+        0,  # min_full_attn_seq_len
+        contextual_seq_len,
+        None,  # q_descale
+        None,  # k_descale
+        None,  # v_descale
+        False,  # sort_by_length
+        False,  # deterministic
+        0,  # sm_margin
+        0,  # max_q_len
+        None,  # seq_offsets_q
+        0,  # num_softmax_heads
+        False,  # training
+        None,  # max_seq_len_tensor
+        None,  # contextual_seq_len_tensor
+        None,  # max_attn_len_tensor
+        None,  # min_full_attn_seq_len_tensor
+        1,  # num_groups
+    )
+
+
+def hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    causal: bool = True,
+    dropout_pr: float = 0.0,
+    training: bool = True,
+    num_targets: Optional[torch.Tensor] = None,
+    attn_scale: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+    sort_by_length: bool = False,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+    enable_tma: bool = False,
+) -> torch.Tensor:
+    _, H, _ = q.shape
+    if not is_fx_tracing():
+        torch._assert(max_seq_len > 0, "max_seq_len must be larger than 0")
+        torch._assert(q.dim() == 3, "q must be 3-D")
+        torch._assert(k.shape == q.shape, "k must be the same shape as q")
+        torch._assert(v.dim() == 3, "v must be 3-D")
+        torch._assert(v.shape[0] == q.shape[0], "wrong v shape[0]")
+        torch._assert(v.shape[1] == H, "wrong v shape[1]")
+        torch._assert(causal, "only support causal attention")
+
+    if kernel in [
+        HammerKernel.TRITON,
+        HammerKernel.TLX,
+        HammerKernel.TRITON_CC,
+        HammerKernel.TRITON_INFERENCE,
+    ]:
+        if not is_fx_tracing() and kernel == HammerKernel.TRITON:
+            torch._assert(q.is_cuda, "q must be CUDA tensor")
+            torch._assert(k.is_cuda, "k must be CUDA tensor")
+            torch._assert(v.is_cuda, "v must be CUDA tensor")
+            torch._assert(seq_offsets.is_cuda, "seq_offsets must be CUDA tensor")
+            torch._assert(dropout_pr < 1e-6, "dropout for triton path not implemented")
+            torch._assert(
+                min_full_attn_seq_len == 0, "min_full_attn_seq_len not implemented"
+            )
+        assert attn_scale is None, "attn_scale not implemented"
+        q = switch_to_contiguous_if_needed(q)
+        k = switch_to_contiguous_if_needed(k)
+        v = switch_to_contiguous_if_needed(v)
+        seq_offsets = seq_offsets.contiguous()
+
+    if kernel == HammerKernel.TRITON:
+        return triton_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length=sort_by_length,
+            enable_tma=enable_tma,
+        )
+    elif kernel == HammerKernel.TLX:
+        if tlx_bw_hstu_mha_wrapper is None:
+            raise ImportError(
+                "hammer.v2 is required for the TLX kernel. "
+                "Falling back to TRITON or PYTORCH kernel instead."
+            )
+        return tlx_bw_hstu_mha_wrapper(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            attn_scale=torch.tensor(1.0 / max_seq_len, device=q.device),
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length=sort_by_length,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        return triton_cc_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_ragged_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            invalid_attn_mask_type="causal",
+            num_targets=num_targets,
+            attn_scale=attn_scale,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            full_attn_size=min_full_attn_seq_len,
+            num_softmax_heads=0,
+        )
+    else:
+        return pytorch_hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=True,
+            dropout_pr=dropout_pr,
+            training=training,
+            num_targets=num_targets,
+            attn_scale=attn_scale,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            min_full_attn_seq_len=min_full_attn_seq_len,
+        )
+
+
+def delta_hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+    enable_tma: bool = False,
+) -> torch.Tensor:
+    L, H, D = delta_q.shape
+    B = seq_offsets.size(0) - 1
+    DeltaSize = L // B
+    if not is_fx_tracing():
+        torch._assert(max_seq_len > 0, "max_seq_len must be larger than 0")
+        torch._assert(delta_q.dim() == 3, "delta_q must be 3-D")
+        torch._assert(L % B == 0, "delta_q must be padded")
+        torch._assert(k.dim() == 3, "k must be 3-D")
+        torch._assert(k.shape[1] == H, "wrong k shape[1]")
+        torch._assert(k.shape[2] == D, "wrong k shape[2]")
+        torch._assert(v.dim() == 3, "v must be 3-D")
+        torch._assert(v.shape[1] == H, "wrong v shape[1]")
+    if kernel in [
+        HammerKernel.TRITON,
+        HammerKernel.TRITON_CC,
+        HammerKernel.TRITON_INFERENCE,
+    ]:
+        if not is_fx_tracing() and kernel == HammerKernel.TRITON:
+            torch._assert(delta_q.is_cuda, "q must be CUDA tensor")
+            torch._assert(seq_offsets.is_cuda, "seq_offsets must be CUDA tensor")
+            if num_targets is not None:
+                torch._assert(num_targets.is_cuda, "num_targets must be CUDA tensor")
+        seq_offsets = seq_offsets.contiguous()
+        delta_q = switch_to_contiguous_if_needed(delta_q)
+        k = switch_to_contiguous_if_needed(k)
+        v = switch_to_contiguous_if_needed(v)
+
+    if kernel == HammerKernel.TRITON:
+        return triton_cached_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            enable_tma=enable_tma,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        return triton_cc_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            is_delta_q=True,
+            delta_size=DeltaSize,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        delta_x_offsets = torch.arange(
+            0,
+            L + 1,
+            DeltaSize,
+            device=delta_q.device,
+            dtype=seq_offsets.dtype,
+        )
+        return aot_triton_kernel_wrapper_cached_hstu_mha(
+            N=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            delta_x_offsets=delta_x_offsets,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            attn_scale=None,
+            max_attn_len=max_attn_len,
+            full_attn_size=0,
+        )
+    else:
+        return pytorch_cached_hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/hstu_compute.py b/recommendation_v4/generative_recommenders/ops/hstu_compute.py
new file mode 100644
index 000000000..7728c1454
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/hstu_compute.py
@@ -0,0 +1,390 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.ops.layer_norm import layer_norm
+from generative_recommenders.ops.mm import addmm
+from generative_recommenders.ops.pytorch.pt_hstu_linear import (
+    pytorch_hstu_compute_output,
+)
+
+try:
+    from hammer.ops.triton.cc.addmm.triton_cc_addmm import triton_cc_addmm
+    from hammer.ops.triton.cc.group_norm_mul_dropout.triton_cc_group_norm_mul_dropout import (
+        triton_cc_group_norm_mul_dropout_wrapper,
+    )
+    from hammer.ops.triton.cc.layer_norm_mul_dropout.triton_cc_layer_norm_mul_dropout import (
+        triton_cc_layer_norm_mul_dropout_wrapper,
+    )
+except ImportError:
+    triton_cc_addmm = None
+    triton_cc_group_norm_mul_dropout_wrapper = None
+    triton_cc_layer_norm_mul_dropout_wrapper = None
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.hstu_attention import hstu_mha, hstu_mha_cuda
+from generative_recommenders.ops.triton.triton_hstu_linear import (
+    triton_hstu_compute_output,
+)
+from generative_recommenders.ops.triton.triton_hstu_preprocess_and_attention import (
+    triton_hstu_preprocess_and_attention,
+)
+from torch.fx._symbolic_trace import is_fx_tracing
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_group_norm_mul_dropout
+    from generative_recommenders.ops.triton_aot.triton_group_norm_mul_dropout import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_group_norm_mul_dropout,
+    )
+
+    # @manual=//generative_recommenders/ops/triton_aot:triton_layer_norm_mul_dropout
+    from generative_recommenders.ops.triton_aot.triton_layer_norm_mul_dropout import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_layer_norm_mul_dropout,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_group_norm_mul_dropout(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE group_norm_mul_dropout kernel."
+        )
+
+    def aot_triton_kernel_wrapper_layer_norm_mul_dropout(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE layer_norm_mul_dropout kernel."
+        )
+
+
+torch.fx.wrap("triton_hstu_compute_output")
+
+
+def hstu_compute_uqvk(
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    norm_eps: float,
+    num_heads: int,
+    attn_dim: int,
+    hidden_dim: int,
+    uvqk_weight: torch.Tensor,
+    uvqk_bias: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: pure PyTorch, no HammerKernel dispatch.
+        normed_x = F.layer_norm(
+            x,
+            normalized_shape=(x.shape[-1],),
+            weight=norm_weight,
+            bias=norm_bias,
+            eps=norm_eps,
+        )
+        uvqk = torch.addmm(uvqk_bias, normed_x, uvqk_weight)
+    else:
+        normed_x = layer_norm(
+            x,
+            weight=norm_weight,
+            bias=norm_bias,
+            eps=norm_eps,
+            kernel=kernel,
+        )
+        # NOTE: for AMD training, we go with torch.addmm instead of the triton
+        # version before Triton on AMD achieves on-par perf with NV GPU.
+        if torch.version.hip and kernel == HammerKernel.TRITON:
+            uvqk = torch.addmm(uvqk_bias, normed_x, uvqk_weight)
+        else:
+            uvqk = addmm(uvqk_bias, normed_x, uvqk_weight, kernel)
+    u, v, q, k = torch.split(
+        uvqk,
+        [
+            hidden_dim * num_heads,
+            hidden_dim * num_heads,
+            attn_dim * num_heads,
+            attn_dim * num_heads,
+        ],
+        dim=1,
+    )
+    u = F.silu(u)
+    q = q.view(-1, num_heads, attn_dim)
+    k = k.view(-1, num_heads, attn_dim)
+    v = v.view(-1, num_heads, hidden_dim)
+    return u, q, k, v
+
+
+def hstu_compute_output(
+    attn: torch.Tensor,
+    u: torch.Tensor,
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    norm_eps: float,
+    output_weight: torch.Tensor,
+    num_heads: int,
+    linear_dim: int,
+    dropout_ratio: float,
+    training: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    group_norm: bool,
+    recompute_y_in_backward: bool,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        return pytorch_hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            output_weight=output_weight,
+            eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=linear_dim,
+        )
+    if kernel == HammerKernel.TRITON:
+        return triton_hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            output_weight=output_weight,
+            eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=linear_dim,
+            seed=None,
+            recompute_y_in_backward=recompute_y_in_backward,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        if group_norm:
+            y = aot_triton_kernel_wrapper_group_norm_mul_dropout(
+                x=attn,
+                u=u,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                silu_u=mul_u_activation_type == "silu",
+                concat_ux=concat_u and concat_x,
+                num_heads=num_heads,
+                linear_dim=linear_dim,
+            )
+        else:
+            y = aot_triton_kernel_wrapper_layer_norm_mul_dropout(
+                x=attn,
+                u=u,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                silu_u=mul_u_activation_type == "silu",
+                concat_ux=concat_u and concat_x,
+                mul_u_activation_type=mul_u_activation_type,
+            )
+        return addmm(x, y, output_weight, kernel)
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_group_norm_mul_dropout_wrapper is None or triton_cc_addmm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in hstu_compute_output."
+            )
+        if group_norm:
+            y = triton_cc_group_norm_mul_dropout_wrapper(
+                x=attn,
+                u=u,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                concat_ux=concat_u and concat_x,
+                num_heads=num_heads,
+                linear_dim=linear_dim,
+            )
+        else:
+            y = triton_cc_layer_norm_mul_dropout_wrapper(
+                x=attn,
+                u=u,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=norm_eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                concat_u=concat_u,
+                concat_x=concat_x,
+                mul_u_activation_type=mul_u_activation_type,
+            )
+        return triton_cc_addmm(x, y, output_weight)
+    else:
+        return pytorch_hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            output_weight=output_weight,
+            eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=linear_dim,
+        )
+
+
+def hstu_preprocess_and_attention(
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    norm_eps: float,
+    num_heads: int,
+    attn_dim: int,
+    hidden_dim: int,
+    uvqk_weight: torch.Tensor,
+    uvqk_bias: torch.Tensor,
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    attn_alpha: float,
+    causal: bool,
+    num_targets: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    recompute_uvqk_in_backward: bool,
+    recompute_normed_x_in_backward: bool,
+    sort_by_length: bool,
+    prefill: bool = False,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+    enable_tma: Optional[bool] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    if not is_fx_tracing():
+        torch._assert(max_seq_len > 0, "max_seq_len must be larger than 0")
+        torch._assert(x.dim() == 2, "x must be 2-D")
+        torch._assert(
+            x.shape[1] == uvqk_weight.shape[0],
+            "x.shape[1] must equal uvqk_weight.shape[0]",
+        )
+        torch._assert(
+            uvqk_weight.shape[1] == 2 * num_heads * (hidden_dim + attn_dim),
+            "uvqk_weight.shape[1] must equal 2 * num_heads * (hidden_dim + attn_dim)",
+        )
+        torch._assert(causal is True, "only causal attention is supported.")
+    if torch.jit.is_scripting():
+        # Script-mode: compute uvqk via PyTorch fallback then call the
+        # libtorch-callable CUDA HSTU MHA op directly. Avoids both the
+        # HammerKernel enum dispatch and the Triton-only fused path.
+        u, q, k, v = hstu_compute_uqvk(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=num_heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            kernel=HammerKernel.PYTORCH,
+        )
+        attn_output = hstu_mha_cuda(
+            max_seq_len=max_seq_len,
+            alpha=attn_alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+        ).view(-1, hidden_dim * num_heads)
+        return u, attn_output, k, v
+    if kernel == HammerKernel.TRITON and prefill is False:
+        u, attn_output = triton_hstu_preprocess_and_attention(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=num_heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            attn_alpha=attn_alpha,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            recompute_uvqk_in_backward=recompute_uvqk_in_backward,
+            recompute_normed_x_in_backward=recompute_normed_x_in_backward,
+            sort_by_length=sort_by_length,
+            enable_tma=enable_tma,
+        )
+        attn_output = attn_output.view(-1, hidden_dim * num_heads)
+        k = None
+        v = None
+    else:
+        u, q, k, v = hstu_compute_uqvk(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=num_heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            kernel=kernel,
+        )
+        attn_output = hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=attn_alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=causal,
+            dropout_pr=0.0,
+            training=False,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length=sort_by_length,
+            kernel=kernel,
+        ).view(-1, hidden_dim * num_heads)
+    return u, attn_output, k, v
diff --git a/recommendation_v4/generative_recommenders/ops/jagged_tensors.py b/recommendation_v4/generative_recommenders/ops/jagged_tensors.py
new file mode 100644
index 000000000..73e3c4a73
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/jagged_tensors.py
@@ -0,0 +1,451 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.pytorch.pt_jagged import pytorch_jagged_dense_bmm_add
+from generative_recommenders.ops.pytorch.pt_jagged_tensors import (
+    pytorch_concat_2D_jagged,
+    pytorch_hstu_concat_l2_embeddings,
+    pytorch_hstu_split_l2_embeddings,
+    pytorch_split_2D_jagged,
+)
+from generative_recommenders.ops.triton.triton_jagged import triton_jagged_dense_bmm_add
+from generative_recommenders.ops.triton.triton_jagged_tensors import (
+    triton_concat_2D_jagged,
+    triton_concat_2D_jagged_multirow,
+    triton_split_2D_jagged,
+    triton_split_2D_jagged_multirow,
+)
+from torch.fx._symbolic_trace import is_fx_tracing
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_concat_2d_jagged
+    from generative_recommenders.ops.triton_aot.triton_concat_2d_jagged import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_concat_2D_jagged,
+    )
+
+    # @manual=//generative_recommenders/ops/triton_aot:triton_split_2d_jagged
+    from generative_recommenders.ops.triton_aot.triton_split_2d_jagged import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_split_2D_jagged,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_concat_2D_jagged(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE concat_2D_jagged kernel."
+        )
+
+    def aot_triton_kernel_wrapper_split_2D_jagged(
+        *args: object,
+        **kwargs: object,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE split_2D_jagged kernel."
+        )
+
+
+torch.fx.wrap("triton_jagged_dense_bmm_add")
+
+try:
+    from hammer.ops.triton.cc.jagged_dense_bmm.triton_cc_jagged_dense_bmm import (
+        triton_cc_jagged_dense_bmm,
+    )
+except ImportError:
+    triton_cc_jagged_dense_bmm = None
+
+
+torch.fx.wrap("triton_concat_2D_jagged")
+torch.fx.wrap("triton_split_2D_jagged")
+torch.fx.wrap("triton_concat_2D_jagged_multirow")
+torch.fx.wrap("triton_split_2D_jagged_multirow")
+
+
+def concat_2D_jagged(
+    max_seq_len: int,
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    max_len_left: Optional[int] = None,
+    max_len_right: Optional[int] = None,
+    offsets_left: Optional[torch.Tensor] = None,
+    offsets_right: Optional[torch.Tensor] = None,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        return pytorch_concat_2D_jagged(
+            values_left=values_left,
+            values_right=values_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    if not is_fx_tracing():
+        torch._assert(values_left.dim() == 2, "values_left must be 2D")
+        torch._assert(values_right.dim() == 2, "values_right must be 2D")
+        torch._assert(
+            values_right.shape[1] == values_left.shape[1],
+            f"values_left shape[1] must be equal to values_right shape[1] {values_left.shape[1]} vs {values_right.shape[1]}",
+        )
+    if kernel == HammerKernel.TRITON:
+        return triton_concat_2D_jagged(
+            max_seq_len=max_seq_len,
+            values_left=values_left,
+            values_right=values_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        aott_values_left = values_left
+        aott_values_right = values_right
+        if offsets_left is None:
+            assert max_len_left is not None
+            aott_values_left = values_left.reshape(
+                -1,
+                max_len_left,
+                values_left.shape[-1],
+            )
+        if offsets_right is None:
+            assert max_len_right is not None
+            aott_values_right = values_right.reshape(
+                -1,
+                max_len_right,
+                values_right.shape[-1],
+            )
+        return aot_triton_kernel_wrapper_concat_2D_jagged(
+            max_seq_len=max_seq_len,
+            values_a=aott_values_left,
+            values_b=aott_values_right,
+            offsets_a=offsets_left,
+            offsets_b=offsets_right,
+        )
+    else:
+        return pytorch_concat_2D_jagged(
+            values_left=values_left,
+            values_right=values_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+
+
+def split_2D_jagged(
+    max_seq_len: int,
+    values: torch.Tensor,
+    total_len_left: Optional[int] = None,
+    total_len_right: Optional[int] = None,
+    max_len_left: Optional[int] = None,
+    max_len_right: Optional[int] = None,
+    offsets_left: Optional[torch.Tensor] = None,
+    offsets_right: Optional[torch.Tensor] = None,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if torch.jit.is_scripting():
+        return pytorch_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    if not is_fx_tracing():
+        torch._assert(values.dim() == 2, "values must be 2D")
+        torch._assert(
+            offsets_left is not None or offsets_right is not None,
+            "offsets_left and offsets_right cannot be None at the same time",
+        )
+        if offsets_left is None:
+            torch._assert(
+                max_len_left is not None,
+                "max_len_left must be provided when offsets_left is None",
+            )
+        if offsets_right is None:
+            torch._assert(
+                max_len_right is not None,
+                "max_len_right must be provided when offsets_right is None",
+            )
+        if offsets_left is not None and offsets_right is not None:
+            torch._assert(
+                offsets_left.shape[0] == offsets_right.shape[0],
+                "offsets_left shape[0] must be equal to offsets_right shape[0]",
+            )
+    if kernel == HammerKernel.TRITON:
+        return triton_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            total_len_left=total_len_left,
+            total_len_right=total_len_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        dense_size = 0
+        if offsets_left is None and max_len_left is not None:
+            dense_size = max_len_left
+        elif offsets_right is None and max_len_right is not None:
+            dense_size = max_len_right
+        split_left, split_right = aot_triton_kernel_wrapper_split_2D_jagged(
+            values=values,
+            max_seq_len=max_seq_len,
+            offsets_a=offsets_left,
+            offsets_b=offsets_right,
+            dense_size=dense_size,
+        )
+        if offsets_left is None:
+            split_left = split_left.reshape(-1, split_left.shape[-1])
+        if offsets_right is None:
+            split_right = split_right.reshape(-1, split_right.shape[-1])
+        return split_left, split_right
+    else:
+        return pytorch_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+
+
+def hstu_split_l2_embeddings(
+    max_seq_len: int,
+    x: torch.Tensor,
+    prefix_offsets: torch.Tensor,
+    l2_offsets: torch.Tensor,
+    contextual_seq_len: int,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if kernel == HammerKernel.TRITON:
+        return triton_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=x,
+            total_len_right=None,
+            total_len_left=None,
+            max_len_left=None,
+            max_len_right=None,
+            offsets_left=prefix_offsets,
+            offsets_right=l2_offsets,
+            n_prefix_to_right=contextual_seq_len,
+        )
+    else:
+        return pytorch_hstu_split_l2_embeddings(
+            max_seq_len=max_seq_len,
+            x=x,
+            prefix_offsets=prefix_offsets,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+        )
+
+
+def hstu_concat_l2_embeddings(
+    max_prefix_len: int,
+    prefix_x: torch.Tensor,
+    prefix_offsets: torch.Tensor,
+    max_l2_len: int,
+    l2_x: torch.Tensor,
+    l2_offsets: torch.Tensor,
+    contextual_seq_len: int,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if kernel == HammerKernel.TRITON:
+        return triton_concat_2D_jagged(
+            max_seq_len=max_prefix_len + max_l2_len,
+            values_left=prefix_x,
+            values_right=l2_x,
+            max_len_left=max_prefix_len,
+            max_len_right=max_l2_len,
+            offsets_left=prefix_offsets,
+            offsets_right=l2_offsets,
+            n_prefix_from_right=contextual_seq_len,
+        )
+    else:
+        return pytorch_hstu_concat_l2_embeddings(
+            contextual_seq_len=contextual_seq_len,
+            max_prefix_len=max_prefix_len,
+            prefix_x=prefix_x,
+            prefix_offsets=prefix_offsets,
+            max_l2_len=max_l2_len,
+            l2_x=l2_x,
+            l2_offsets=l2_offsets,
+        )
+
+
+def jagged_dense_bmm_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    bias: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    """
+    Computing out = jagged x dense + bias
+    jagged has shape (sum_B(M_i), K), dense has shape (B, K, N), and bias has shape (B, N)
+    out has shape (sum_B(M_i), N)
+    """
+    if not is_fx_tracing():
+        _, K = jagged.shape
+        B, _, N = dense.shape
+        torch._assert(dense.shape[1] == K, "wrong dense shape[1]")
+        torch._assert(seq_offsets.shape[0] == B + 1, "wrong seq_offsets shape[0]")
+        torch._assert(bias.shape[0] == B, "wrong bias shape[0]")
+        torch._assert(bias.shape[1] == N, "wrong bias shape[1]")
+    if kernel == HammerKernel.TRITON:
+        return triton_jagged_dense_bmm_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+            elementwise=False,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_jagged_dense_bmm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in jagged_dense_bmm_broadcast_add."
+            )
+        return triton_cc_jagged_dense_bmm(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+        )
+    else:
+        return pytorch_jagged_dense_bmm_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+        )
+
+
+def concat_2D_jagged_multirow(
+    max_seq_len: int,
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+    max_len_left: int,
+    max_len_right: int,
+    kernel: HammerKernel = HammerKernel.TRITON,
+) -> torch.Tensor:
+    if not is_fx_tracing():
+        torch._assert(values_left.dim() == 2, "values_left must be 2D")
+        torch._assert(values_right.dim() == 2, "values_right must be 2D")
+        torch._assert(
+            values_right.shape[1] == values_left.shape[1],
+            f"values_left shape[1] must be equal to values_right shape[1] {values_left.shape[1]} vs {values_right.shape[1]}",
+        )
+        if offsets_left is not None and offsets_right is not None:
+            torch._assert(
+                offsets_left.shape[0] == offsets_right.shape[0],
+                "offsets_left and offsets_right must have the same batch dimension",
+            )
+
+    if kernel == HammerKernel.TRITON:
+        return triton_concat_2D_jagged_multirow(
+            max_seq_len=max_seq_len,
+            values_a=values_left,
+            values_b=values_right,
+            offsets_a=offsets_left,
+            offsets_b=offsets_right,
+            max_len_a=max_len_left,
+            max_len_b=max_len_right,
+        )
+    else:
+        return concat_2D_jagged(
+            max_seq_len=max_seq_len,
+            values_left=values_left,
+            values_right=values_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+            kernel=kernel,
+        )
+
+
+def split_2D_jagged_multirow(
+    max_seq_len: int,
+    values: torch.Tensor,
+    total_len_left: Optional[int] = None,
+    total_len_right: Optional[int] = None,
+    max_len_left: Optional[int] = None,
+    max_len_right: Optional[int] = None,
+    offsets_left: Optional[torch.Tensor] = None,
+    offsets_right: Optional[torch.Tensor] = None,
+    kernel: HammerKernel = HammerKernel.TRITON,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if not is_fx_tracing():
+        torch._assert(values.dim() == 2, "values must be 2D")
+        torch._assert(
+            offsets_left is not None or offsets_right is not None,
+            "offsets_left and offsets_right cannot be None at the same time",
+        )
+        if offsets_left is None:
+            torch._assert(
+                max_len_left is not None,
+                "max_len_left must be provided when offsets_left is None",
+            )
+        if offsets_right is None:
+            torch._assert(
+                max_len_right is not None,
+                "max_len_right must be provided when offsets_right is None",
+            )
+        if offsets_left is not None and offsets_right is not None:
+            torch._assert(
+                offsets_left.shape[0] == offsets_right.shape[0],
+                "offsets_left and offsets_right must have the same batch dimension",
+            )
+
+    if kernel == HammerKernel.TRITON:
+        return triton_split_2D_jagged_multirow(
+            max_seq_len=max_seq_len,
+            values=values,
+            total_len_left=total_len_left,
+            total_len_right=total_len_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+        )
+    else:
+        return split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            total_len_left=total_len_left,
+            total_len_right=total_len_right,
+            max_len_left=max_len_left,
+            max_len_right=max_len_right,
+            offsets_left=offsets_left,
+            offsets_right=offsets_right,
+            kernel=kernel,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/layer_norm.py b/recommendation_v4/generative_recommenders/ops/layer_norm.py
new file mode 100644
index 000000000..74ed377d6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/layer_norm.py
@@ -0,0 +1,330 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+
+from typing import List
+
+import torch
+from generative_recommenders.ops.pytorch.pt_layer_norm import (
+    pytorch_layer_norm,
+    pytorch_rms_norm,
+    pytorch_swish_layer_norm,
+)
+from generative_recommenders.ops.triton.triton_layer_norm import triton_rms_norm
+
+try:
+    from hammer.ops.triton.cc.rms_norm.triton_cc_rms_norm import triton_cc_rms_norm
+    from hammer.ops.triton.cc.swish_layer_norm.triton_cc_swish_layer_norm import (
+        triton_cc_swish_layer_norm,
+    )
+except ImportError:
+    triton_cc_swish_layer_norm = None
+    triton_cc_rms_norm = None
+from generative_recommenders.common import HammerKernel, HammerModule
+from generative_recommenders.ops.triton.triton_layer_norm import (
+    triton_layer_norm,
+    triton_swish_layer_norm,
+)
+from torch.fx._symbolic_trace import is_fx_tracing
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_layer_norm
+    from generative_recommenders.ops.triton_aot.triton_layer_norm import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_swish_layer_norm,
+    )
+
+    # @manual=//generative_recommenders/ops/triton_aot:triton_rms_norm
+    from generative_recommenders.ops.triton_aot.triton_rms_norm import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_rms_norm,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_swish_layer_norm(
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+        is_swish: bool,
+    ) -> torch.Tensor:
+        raise ImportError(
+            "AOT-T is required for the TRITON_INFERENCE swish_layer_norm kernel."
+        )
+
+    def aot_triton_kernel_wrapper_rms_norm(
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        silu: bool,
+    ) -> torch.Tensor:
+        raise ImportError("AOT-T is required for the TRITON_INFERENCE rms_norm kernel.")
+
+
+torch.fx.wrap("triton_layer_norm")
+torch.fx.wrap("triton_swish_layer_norm")
+torch.fx.wrap("triton_rms_norm")
+
+
+def layer_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-5,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: bypass the HammerKernel ladder (which would
+        # drag in is_fx_tracing()'s closed-over global bool).
+        return torch.nn.functional.layer_norm(
+            x,
+            normalized_shape=(x.shape[-1],),
+            weight=weight,
+            bias=bias,
+            eps=eps,
+        )
+    if kernel == HammerKernel.TRITON:
+        if not is_fx_tracing():
+            torch._assert(not x.is_cpu, "x must be device tensor")
+            torch._assert(not weight.is_cpu, "weight must be device tensor")
+            torch._assert(not bias.is_cpu, "bias must be device tensor")
+        return triton_layer_norm(x, weight, bias, eps)
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_swish_layer_norm(
+            x,
+            weight,
+            bias,
+            eps,
+            is_swish=False,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_swish_layer_norm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in layer_norm."
+            )
+        return triton_cc_swish_layer_norm(
+            x,
+            weight,
+            bias,
+            eps,
+            is_swish=False,
+        )
+    else:
+        return pytorch_layer_norm(
+            x,
+            [
+                x.shape[-1],
+            ],
+            weight,
+            bias,
+            eps,
+        )
+
+
+def rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-5,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+    silu: bool = False,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: bypass the HammerKernel ladder.
+        x_f = x.float()
+        norm = torch.rsqrt(x_f.pow(2).mean(-1, keepdim=True) + eps)
+        out = (x_f * norm * weight.float()).to(x.dtype)
+        if silu:
+            out = torch.nn.functional.silu(out)
+        return out
+    if kernel == HammerKernel.TRITON:
+        if not is_fx_tracing():
+            torch._assert(not x.is_cpu, "x must be device tensor")
+            torch._assert(not weight.is_cpu, "weight must be device tensor")
+        return triton_rms_norm(x, weight, eps, silu)
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_rms_norm(x, weight, eps, silu)
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_rms_norm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in rms_norm."
+            )
+        return triton_cc_rms_norm(
+            x,
+            weight,
+            eps,
+            silu=silu,
+        )
+    else:
+        return pytorch_rms_norm(
+            x,
+            [
+                x.shape[-1],
+            ],
+            weight,
+            eps,
+            silu,
+        )
+
+
+def swish_layer_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float = 1e-5,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: bypass the HammerKernel ladder (which
+        # otherwise drags in is_fx_tracing(), Triton/Triton_CC closures,
+        # etc.) and call pure PyTorch directly.
+        return pytorch_swish_layer_norm(
+            x,
+            [x.shape[-1]],
+            weight,
+            bias,
+            eps,
+        )
+    if kernel == HammerKernel.TRITON:
+        if not is_fx_tracing():
+            torch._assert(not x.is_cpu, "x must be device tensor")
+            torch._assert(not weight.is_cpu, "weight must be device tensor")
+            torch._assert(not bias.is_cpu, "bias must be device tensor")
+        return triton_swish_layer_norm(x, [x.shape[-1]], weight, bias, eps)
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_swish_layer_norm(
+            x,
+            weight,
+            bias,
+            eps,
+            is_swish=True,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_swish_layer_norm is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in swish_layer_norm."
+            )
+        return triton_cc_swish_layer_norm(
+            x,
+            weight,
+            bias,
+            eps,
+            is_swish=True,
+        )
+    else:
+        return pytorch_swish_layer_norm(
+            x,
+            [
+                x.shape[-1],
+            ],
+            weight,
+            bias,
+            eps,
+        )
+
+
+class LayerNorm(HammerModule):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._normalized_shape: List[int] = [dim]
+        self._eps = eps
+        self.weight = torch.nn.Parameter(
+            torch.ones(self._normalized_shape),
+        )
+        self.bias = torch.nn.Parameter(
+            torch.zeros(self._normalized_shape),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return layer_norm(
+            x=x,
+            weight=self.weight,
+            bias=self.bias,
+            eps=self._eps,
+            kernel=self.hammer_kernel(),
+        )
+
+
+class RMSNorm(HammerModule):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._eps = eps
+        self.weight = torch.nn.Parameter(torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return rms_norm(
+            x,
+            self.weight,
+            self._eps,
+            silu=False,
+            kernel=self.hammer_kernel(),
+        )
+
+
+class RMSNormSilu(HammerModule):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._eps = eps
+        self.weight = torch.nn.Parameter(torch.ones(dim))
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return rms_norm(
+            x,
+            self.weight,
+            self._eps,
+            silu=True,
+            kernel=self.hammer_kernel(),
+        )
+
+
+class SwishLayerNorm(HammerModule):
+    def __init__(
+        self,
+        dim: int,
+        eps: float = 1e-5,
+        is_inference: bool = False,
+    ) -> None:
+        super().__init__(is_inference=is_inference)
+        self._normalized_shape: List[int] = [dim]
+        self.weight = torch.nn.Parameter(torch.ones(self._normalized_shape))
+        self.bias = torch.nn.Parameter(torch.zeros(self._normalized_shape))
+        self._eps = eps
+
+    def forward(
+        self,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        return swish_layer_norm(
+            x=x,
+            weight=self.weight,
+            bias=self.bias,
+            eps=self._eps,
+            kernel=self.hammer_kernel(),
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/mm.py b/recommendation_v4/generative_recommenders/ops/mm.py
new file mode 100644
index 000000000..31a5c5d36
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/mm.py
@@ -0,0 +1,60 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import torch
+
+try:
+    from hammer.ops.triton.cc.addmm.triton_cc_addmm import triton_cc_addmm
+except ImportError:
+    triton_cc_addmm = None
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_addmm import triton_addmm
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_addmm
+    from generative_recommenders.ops.triton_aot.triton_addmm import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_addmm,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_addmm(
+        input: torch.Tensor,
+        mat1: torch.Tensor,
+        mat2: torch.Tensor,
+    ) -> torch.Tensor:
+        raise ImportError("AOT-T is required for the TRITON_INFERENCE addmm kernel.")
+
+
+def addmm(
+    input: torch.Tensor,
+    mat1: torch.Tensor,
+    mat2: torch.Tensor,
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        return torch.addmm(input, mat1, mat2)
+    if kernel == HammerKernel.TRITON:
+        return triton_addmm(input, mat1, mat2)
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_addmm(input, mat1, mat2)
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_addmm is None:
+            raise ImportError("hammer is required for the TRITON_CC kernel in addmm.")
+        return triton_cc_addmm(input, mat1, mat2)
+    else:
+        return torch.addmm(input, mat1, mat2)
diff --git a/recommendation_v4/generative_recommenders/ops/position.py b/recommendation_v4/generative_recommenders/ops/position.py
new file mode 100644
index 000000000..e090827e3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/position.py
@@ -0,0 +1,147 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.ops.pytorch.pt_position import (
+    pytorch_add_timestamp_positional_embeddings,
+)
+
+try:
+    from hammer.ops.triton.cc.add_timestamp_position_embeddings.triton_cc_add_timestamp_position_embeddings import (
+        triton_cc_add_timestamp_position_embeddings,
+    )
+except ImportError:
+    triton_cc_add_timestamp_position_embeddings = None
+from generative_recommenders.common import HammerKernel
+from generative_recommenders.ops.triton.triton_position import (
+    triton_add_timestamp_positional_embeddings,
+)
+
+try:
+    # @manual=//generative_recommenders/ops/triton_aot:triton_position
+    from generative_recommenders.ops.triton_aot.triton_position import (  # pyre-ignore[21]
+        aot_triton_kernel_wrapper_position,
+    )
+except ImportError:
+
+    def aot_triton_kernel_wrapper_position(
+        *args: object,
+        **kwargs: object,
+    ) -> torch.Tensor:
+        raise ImportError("AOT-T is required for the TRITON_INFERENCE position kernel.")
+
+
+torch.fx.wrap("triton_add_timestamp_positional_embeddings")
+
+
+def add_timestamp_positional_embeddings(
+    alpha: float,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    position_embeddings_weight: torch.Tensor,
+    timestamp_embeddings_weight: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    seq_lengths: torch.Tensor,
+    seq_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str = "sqrt",
+    kernel: HammerKernel = HammerKernel.PYTORCH,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # Script-mode fast path: bypass the HammerKernel ladder.
+        seq_embeddings = seq_embeddings * alpha
+        return pytorch_add_timestamp_positional_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    assert time_bucket_fn in ["sqrt", "log"]
+    seq_embeddings = seq_embeddings * alpha
+    if kernel == HammerKernel.TRITON:
+        return triton_add_timestamp_positional_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    elif kernel == HammerKernel.TRITON_INFERENCE:
+        return aot_triton_kernel_wrapper_position(
+            alpha=1.0,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            position_embeddings_weight=position_embeddings_weight.to(torch.float32),
+            timestamp_embeddings_weight=timestamp_embeddings_weight.to(torch.float32),
+            seq_offsets=seq_offsets,
+            seq_lengths=seq_lengths,
+            seq_embeddings=seq_embeddings,
+            timestamps=timestamps,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    elif kernel == HammerKernel.TRITON_CC:
+        if triton_cc_add_timestamp_position_embeddings is None:
+            raise ImportError(
+                "hammer is required for the TRITON_CC kernel in add_timestamp_positional_embeddings."
+            )
+        return triton_cc_add_timestamp_position_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    else:
+        return pytorch_add_timestamp_positional_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_attention.py
new file mode 100644
index 000000000..32575c4db
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_attention.py
@@ -0,0 +1,251 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+@torch.fx.wrap
+def _get_valid_attn_mask(
+    device: torch.device,
+    causal: bool,
+    N: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+) -> torch.Tensor:
+    ids = torch.arange(0, N, device=device).view(1, N)
+    max_ids = seq_lengths.view(-1, 1, 1)
+    if contextual_seq_len > 0:
+        ids = ids - contextual_seq_len + 1
+        ids = torch.clamp(ids, min=0)
+        max_ids = max_ids - contextual_seq_len + 1
+    if num_targets is not None:
+        max_ids = max_ids - num_targets.view(-1, 1, 1)
+        ids = torch.clamp(
+            ids,
+            max=max_ids,
+        )
+        row_ids = ids.view(-1, N, 1).expand(-1, N, N)
+        col_ids = ids.view(-1, 1, N).expand(-1, N, N)
+    else:
+        row_ids = ids.view(N, 1).expand(N, N)
+        col_ids = row_ids.t()
+        row_ids = row_ids.view(1, N, N)
+        col_ids = col_ids.view(1, N, N)
+    row_col_dist = row_ids - col_ids
+    valid_attn_mask = torch.eye(N, device=device, dtype=torch.bool).view(1, N, N)
+    if not causal:
+        row_col_dist = torch.where(row_col_dist > 0, row_col_dist, -row_col_dist)
+    valid_attn_mask = torch.logical_or(valid_attn_mask, row_col_dist > 0)
+    if max_attn_len > 0:
+        if min_full_attn_seq_len > 0:
+            valid_attn_mask = torch.logical_and(
+                valid_attn_mask,
+                torch.logical_or(
+                    row_col_dist <= max_attn_len,
+                    row_ids >= max_ids - min_full_attn_seq_len,
+                ),
+            )
+        else:
+            valid_attn_mask = torch.logical_and(
+                valid_attn_mask, row_col_dist <= max_attn_len
+            )
+    if contextual_seq_len > 0:
+        valid_attn_mask = torch.logical_or(
+            valid_attn_mask, torch.logical_and(row_ids == 0, col_ids < max_ids)
+        )
+    return valid_attn_mask
+
+
+def _pad_qkv(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    N: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    L, H, D = q.shape
+    V = v.shape[2]
+    padded_q = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=q.reshape(L, H * D),
+            offsets=[seq_offsets],
+            max_lengths=[N],
+            padding_value=0.0,
+        )
+        .view(-1, N, H, D)
+        .transpose(1, 2)
+    )  # [B, H, N, A]
+    padded_k = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=k.reshape(L, H * D),
+            offsets=[seq_offsets],
+            max_lengths=[N],
+            padding_value=0.0,
+        )
+        .view(-1, N, H, D)
+        .transpose(1, 2)
+    )  # [B, H, N, A]
+    padded_v = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=v.reshape(L, H * V),
+            offsets=[seq_offsets],
+            max_lengths=[N],
+            padding_value=0.0,
+        )
+        .view(-1, N, H, V)
+        .transpose(1, 2)
+    )  # [B, H, N, D]
+    return padded_q, padded_k, padded_v
+
+
+@torch.fx.wrap
+def pytorch_hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    causal: bool = True,
+    dropout_pr: float = 0.0,
+    training: bool = True,
+    num_targets: Optional[torch.Tensor] = None,
+    attn_scale: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    min_full_attn_seq_len: int = 0,
+) -> torch.Tensor:
+    L, H, _ = q.shape
+    V = v.shape[2]
+    q, k, v = _pad_qkv(
+        q, k, v, seq_offsets, max_seq_len
+    )  # [B, H, N, D) and [B, H, N, V]
+    qk_attn = torch.einsum("bhxa,bhya->bhxy", q, k) * alpha
+    if attn_scale is not None:
+        if attn_scale.ndim > 0:
+            attn_scale = (
+                torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=attn_scale.unsqueeze(-1),
+                    offsets=[seq_offsets],
+                    max_lengths=[max_seq_len],
+                    padding_value=0.0,
+                )
+                .unsqueeze(1)
+                .to(qk_attn.dtype)
+            )
+
+        qk_attn = F.silu(qk_attn) * attn_scale
+    else:
+        qk_attn = F.silu(qk_attn) / max_seq_len
+    valid_attn_mask = _get_valid_attn_mask(
+        device=q.device,
+        causal=causal,
+        N=max_seq_len,
+        seq_lengths=seq_offsets[1:] - seq_offsets[:-1],
+        num_targets=num_targets,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        min_full_attn_seq_len=min_full_attn_seq_len,
+    )
+    # raise NotImplementedError(valid_attn_mask[0, :, :].to(torch.int32))
+    qk_attn = qk_attn * valid_attn_mask.unsqueeze(1)
+    if dropout_pr > 0.0:
+        qk_attn = F.dropout(qk_attn, p=dropout_pr, training=training)
+    attn_dense = torch.einsum("bhxd,bhdv->bhxv", qk_attn, v)  # [B, H, N, V]
+    return torch.ops.fbgemm.dense_to_jagged(
+        attn_dense.transpose(1, 2).flatten(2, 3),  # [B, N, H, V]->[B, N, H * V]
+        [seq_offsets],
+        L,
+    )[0].view(L, H, V)
+
+
+@torch.fx.wrap
+def pytorch_cached_hstu_mha(
+    max_seq_len: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+) -> torch.Tensor:
+    L, H, D = delta_q.shape
+    _, _, V = v.shape
+    B = seq_offsets.size(0) - 1
+    delta_size = L // B
+    delta_q = delta_q.view(B, -1, H, D).transpose(1, 2)
+    full_k = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=k.reshape(-1, H * D),
+            offsets=[seq_offsets],
+            max_lengths=[max_seq_len],
+            padding_value=0.0,
+        )
+        .view(B, -1, H, D)
+        .transpose(1, 2)
+    )
+    full_v = (
+        torch.ops.fbgemm.jagged_to_padded_dense(
+            values=v.reshape(-1, H * V),
+            offsets=[seq_offsets],
+            max_lengths=[max_seq_len],
+            padding_value=0.0,
+        )
+        .view(B, -1, H, V)
+        .transpose(1, 2)
+    )
+    qk_attn = torch.einsum("bhxa,bhya->bhxy", delta_q, full_k) * alpha
+    qk_attn = F.silu(qk_attn) / max_seq_len
+    full_valid_attn_mask = _get_valid_attn_mask(
+        device=delta_q.device,
+        causal=True,
+        N=max_seq_len,
+        seq_lengths=seq_offsets[1:] - seq_offsets[:-1],
+        num_targets=num_targets,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+    )
+    seq_lengths = seq_offsets[1:] - seq_offsets[:-1]
+    mask = torch.arange(max_seq_len, device=delta_q.device).view(1, -1)
+    mask = torch.logical_and(
+        mask >= (seq_lengths - delta_size).view(-1, 1),
+        mask < seq_lengths.view(-1, 1),
+    )
+    valid_attn_mask = (
+        full_valid_attn_mask.expand(B, -1, -1)
+        .flatten(0, 1)[mask.view(-1), :]
+        .view(-1, delta_size, max_seq_len)
+    )
+    qk_attn = qk_attn * valid_attn_mask.unsqueeze(1)
+    attn_output = torch.einsum("bhxd,bhdv->bhxv", qk_attn, full_v)
+    return attn_output.transpose(1, 2).reshape(-1, H, V)
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_linear.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_linear.py
new file mode 100644
index 000000000..6ea94a565
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_linear.py
@@ -0,0 +1,130 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import torch
+import torch.nn.functional as F
+
+
+def pytorch_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    group_norm: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+) -> torch.Tensor:
+    dtype = x.dtype
+    x = x.to(torch.float32)
+    u = u.to(torch.float32)
+    if group_norm:
+        if silu_u:
+            u = F.silu(u)
+            u = u.to(torch.float32)
+        y = u * F.group_norm(
+            x.view(-1, num_heads, linear_dim),
+            num_groups=num_heads,
+            weight=weight.to(torch.float32),
+            bias=bias.to(torch.float32),
+            eps=eps,
+        ).view(-1, num_heads * linear_dim)
+        if concat_u and concat_x:
+            y = torch.cat([u, x, y], dim=1)
+    else:
+        mul_u = u
+        if mul_u_activation_type == "sigmoid":
+            mul_u = torch.sigmoid(u)
+        elif mul_u_activation_type == "silu":
+            mul_u = F.silu(u)
+        y = mul_u * F.layer_norm(
+            x,
+            normalized_shape=(x.shape[-1],),
+            weight=weight.to(torch.float32),
+            bias=bias.to(torch.float32),
+            eps=eps,
+        )
+        if concat_u:
+            if silu_u:
+                u = F.silu(u)
+            if concat_x:
+                y = torch.cat([u, x, y], dim=1)
+            else:
+                y = torch.cat([u, y], dim=1)
+        elif concat_x:
+            y = torch.cat([x, y], dim=1)
+    y = F.dropout(
+        y,
+        p=dropout_ratio,
+        training=training,
+    )
+    return y.to(dtype)
+
+
+def pytorch_hstu_compute_output(
+    attn: torch.Tensor,
+    u: torch.Tensor,
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    output_weight: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    group_norm: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+) -> torch.Tensor:
+    dtype = x.dtype
+    y = pytorch_norm_mul_dropout(
+        x=attn,
+        u=u,
+        weight=norm_weight,
+        bias=norm_bias,
+        eps=eps,
+        dropout_ratio=dropout_ratio,
+        training=training,
+        silu_u=silu_u,
+        concat_u=concat_u,
+        concat_x=concat_x,
+        mul_u_activation_type=mul_u_activation_type,
+        group_norm=group_norm,
+        num_heads=num_heads,
+        linear_dim=linear_dim,
+    )
+    return torch.addmm(x, y, output_weight.to(x.dtype)).to(dtype)
+
+
+def pytorch_swiglu(
+    x: torch.Tensor,
+    w_gate: torch.Tensor,
+    w_up: torch.Tensor,
+) -> torch.Tensor:
+    gate = F.silu(F.linear(x, w_gate))
+    up = F.linear(x, w_up)
+    return gate * up
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged.py
new file mode 100644
index 000000000..82d82f402
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged.py
@@ -0,0 +1,258 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Tuple
+
+import torch
+
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+def pytorch_jagged_dense_bmm(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+) -> torch.Tensor:
+    dtype = jagged.dtype
+    jagged = jagged.to(torch.float32)
+    dense = dense.to(torch.float32)
+    padded_jagged = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=jagged,
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    )
+    bmm_out = torch.bmm(padded_jagged, dense)
+    jagged_bmm_out = torch.ops.fbgemm.dense_to_jagged(
+        bmm_out, [seq_offsets], total_L=jagged.shape[0]
+    )[0]
+    jagged_bmm_out = jagged_bmm_out.to(dtype)
+    return jagged_bmm_out
+
+
+def pytorch_jagged_dense_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+) -> torch.Tensor:
+    dtype = jagged.dtype
+    jagged = jagged.to(torch.float32)
+    dense = dense.to(torch.float32)
+    padded_jagged = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=jagged,
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    )
+    out = padded_jagged + dense.unsqueeze(1)
+    jagged_out = torch.ops.fbgemm.dense_to_jagged(
+        out, [seq_offsets], total_L=jagged.shape[0]
+    )[0]
+    jagged_out = jagged_out.to(dtype)
+    return jagged_out
+
+
+def pytorch_jagged_dense_bmm_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    bias: torch.Tensor,
+    elementwise: bool = False,
+) -> torch.Tensor:
+    dtype = jagged.dtype
+    jagged = jagged.to(torch.float32)
+    dense = dense.to(torch.float32)
+    padded_jagged = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=jagged,
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    )
+    bmm_out = torch.bmm(padded_jagged, dense)
+
+    if elementwise:
+        jagged_out = (
+            torch.ops.fbgemm.dense_to_jagged(
+                bmm_out, [seq_offsets], total_L=jagged.shape[0]
+            )[0]
+            + bias
+        )
+    else:
+        jagged_out = torch.ops.fbgemm.dense_to_jagged(
+            bmm_out + bias.unsqueeze(1), [seq_offsets], total_L=jagged.shape[0]
+        )[0]
+
+    jagged_out = jagged_out.to(dtype)
+    return jagged_out
+
+
+@torch.fx.wrap
+def _arange(len: int, device: torch.device) -> torch.Tensor:
+    return torch.arange(len, device=device)
+
+
+def pytorch_concat_2D_dense_jagged(
+    jagged_max_seq_len: int,
+    jagged_offsets: torch.Tensor,
+    jagged_values: torch.Tensor,
+    dense_values: torch.Tensor,
+) -> torch.Tensor:
+    B, dense_size, D = dense_values.size()
+    jagged_dense = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=jagged_values,
+        offsets=[jagged_offsets],
+        max_lengths=[jagged_max_seq_len],
+        padding_value=0.0,
+    )
+    concatted_dense = torch.cat([dense_values, jagged_dense], dim=1)
+    concatted_offsets = (
+        dense_size * _arange(B + 1, device=jagged_offsets.device) + jagged_offsets
+    )
+    return torch.ops.fbgemm.dense_to_jagged(
+        concatted_dense,
+        [concatted_offsets],
+        total_L=jagged_values.shape[0] + dense_size * B,
+    )[0]
+
+
+def pytorch_concat_2D_jagged_jagged(
+    max_seq_len_left: int,
+    offsets_left: torch.Tensor,
+    values_left: torch.Tensor,
+    max_seq_len_right: int,
+    offsets_right: torch.Tensor,
+    values_right: torch.Tensor,
+    is_replace: bool = False,
+    n_prefix_from_right: int = 0,
+) -> torch.Tensor:
+    # is_replace with n_prefix_from_right != 0 is not supported yet (neither in triton)
+    if is_replace:
+        return pytorch_replace_last_n_with_jagged(
+            max_seq_len_left,
+            offsets_left,
+            values_left,
+            offsets_right,
+            values_right,
+        )
+
+    lengths_a = offsets_left[1:] - offsets_left[:-1]
+    lengths_b = offsets_right[1:] - offsets_right[:-1]
+
+    # Compute output offsets via cumsum (no dynamic shapes).
+    output_lengths = lengths_a + lengths_b
+    output_offsets = torch.nn.functional.pad(
+        torch.cumsum(output_lengths, dim=0), (1, 0)
+    )
+
+    total_len = values_left.shape[0] + values_right.shape[0]
+    positions = torch.arange(total_len, device=values_left.device)
+    batch_idx = torch.searchsorted(output_offsets[1:], positions, right=True)
+    local_pos = positions - output_offsets[batch_idx]
+
+    per_batch_lengths_a = lengths_a[batch_idx]
+
+    # Classify each output position into prefix / left / suffix.
+    is_prefix = local_pos < n_prefix_from_right
+    is_left = (local_pos >= n_prefix_from_right) & (
+        local_pos < n_prefix_from_right + per_batch_lengths_a
+    )
+
+    # Pad with a sentinel zero row so index_select works on empty tensors
+    values_left_safe = torch.nn.functional.pad(values_left, (0, 0, 0, 1))
+    values_right_safe = torch.nn.functional.pad(values_right, (0, 0, 0, 1))
+
+    left_idx = (offsets_left[batch_idx] + (local_pos - n_prefix_from_right)).clamp(
+        min=0, max=values_left.shape[0]
+    )
+    right_prefix_idx = offsets_right[batch_idx] + local_pos
+    right_suffix_idx = offsets_right[batch_idx] + (local_pos - per_batch_lengths_a)
+    right_idx = torch.where(is_prefix, right_prefix_idx, right_suffix_idx).clamp(
+        min=0, max=values_right.shape[0]
+    )
+
+    left_values = values_left_safe.index_select(0, left_idx)
+    right_values = values_right_safe.index_select(0, right_idx)
+
+    return torch.where(is_left.unsqueeze(-1), left_values, right_values)
+
+
+def pytorch_jagged_remove_first_or_last_1D(
+    values: torch.Tensor,
+    lengths: torch.Tensor,
+    offsets: torch.Tensor,
+    max_seq_len: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    values = values.view(-1, 1)
+    shrunk_lengths = lengths - 1
+    k_lengths = torch.stack([shrunk_lengths, torch.ones_like(lengths)], dim=1).view(-1)
+    q_lengths = torch.stack([torch.ones_like(lengths), shrunk_lengths], dim=1).view(-1)
+    all_indices = torch.arange(
+        start=0, end=q_lengths.numel(), device=values.device
+    ).reshape(-1, 2)
+    q_indices, k_indices = all_indices[:, 1], all_indices[:, 0]
+    values_no_first, _ = torch.ops.fbgemm.jagged_index_select(
+        values, q_lengths, q_indices
+    )
+    values_no_last, _ = torch.ops.fbgemm.jagged_index_select(
+        values, k_lengths, k_indices
+    )
+    return values_no_first.squeeze(), values_no_last.squeeze()
+
+
+@torch.fx.wrap
+def fx_apply_mask(
+    tensor: torch.Tensor, mask: torch.Tensor, fill_value: torch.Tensor
+) -> torch.Tensor:
+    tensor[mask] = fill_value
+    return tensor
+
+
+def pytorch_replace_last_n_with_jagged(
+    max_seq_len_left: int,
+    offsets_left: torch.Tensor,
+    values_left: torch.Tensor,
+    offsets_right: torch.Tensor,
+    values_right: torch.Tensor,
+) -> torch.Tensor:
+    lengths_a = offsets_left[1:] - offsets_left[:-1]
+    lengths_b = offsets_right[1:] - offsets_right[:-1]
+
+    total_len = values_left.shape[0]
+    positions = torch.arange(total_len, device=values_left.device)
+    batch_idx = torch.searchsorted(offsets_left[1:], positions, right=True)
+    local_pos = positions - offsets_left[batch_idx]
+
+    # Positions >= (lengths_a - lengths_b) within each batch are in the replace zone.
+    threshold = lengths_a[batch_idx] - lengths_b[batch_idx]
+    in_replace_zone = local_pos >= threshold
+
+    # Pad with a sentinel zero row so index_select works on empty tensors
+    values_right_safe = torch.nn.functional.pad(values_right, (0, 0, 0, 1))
+    right_idx = (offsets_right[batch_idx] + (local_pos - threshold)).clamp(
+        min=0, max=values_right.shape[0]
+    )
+    right_values = values_right_safe.index_select(0, right_idx)
+    return torch.where(in_replace_zone.unsqueeze(-1), right_values, values_left)
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged_tensors.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged_tensors.py
new file mode 100644
index 000000000..27817f7fb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_jagged_tensors.py
@@ -0,0 +1,246 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.common import fx_arange
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+def _concat_2D_jagged_jagged(
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    max_len_left: int,
+    max_len_right: int,
+    offsets_left: torch.Tensor,
+    offsets_right: torch.Tensor,
+) -> torch.Tensor:
+    max_seq_len = max_len_left + max_len_right
+    lengths_left = offsets_left[1:] - offsets_left[:-1]
+    lengths_right = offsets_right[1:] - offsets_right[:-1]
+    padded_left = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=values_left,
+        offsets=[offsets_left],
+        max_lengths=[max_len_left],
+        padding_value=0.0,
+    )
+    padded_right = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=values_right,
+        offsets=[offsets_right],
+        max_lengths=[max_len_right],
+        padding_value=0.0,
+    )
+    concatted_dense = torch.cat([padded_left, padded_right], dim=1)
+    mask = fx_arange(max_seq_len, device=offsets_left.device).view(1, -1)
+    mask = torch.logical_or(
+        mask < lengths_left.view(-1, 1),
+        torch.logical_and(
+            mask >= max_len_left,
+            mask < max_len_left + lengths_right.view(-1, 1),
+        ),
+    )
+    return concatted_dense.flatten(0, 1)[mask.view(-1), :]
+
+
+@torch.fx.wrap
+def pytorch_concat_2D_jagged(
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    max_len_left: Optional[int],
+    max_len_right: Optional[int],
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+) -> torch.Tensor:
+    if offsets_left is None:
+        assert max_len_left is not None
+        B = values_left.shape[0] // max_len_left
+        offsets_left_non_optional = max_len_left * torch.arange(
+            B + 1, device=values_left.device
+        )
+    else:
+        offsets_left_non_optional = offsets_left
+    if offsets_right is None:
+        assert max_len_right is not None
+        B = values_right.shape[0] // max_len_right
+        offsets_right_non_optional = max_len_right * torch.arange(
+            B + 1, device=values_left.device
+        )
+    else:
+        offsets_right_non_optional = offsets_right
+    max_len_left = (
+        int(
+            (offsets_left_non_optional[1:] - offsets_left_non_optional[:-1])
+            .max()
+            .item()
+        )
+        if max_len_left is None
+        else max_len_left
+    )
+    max_len_right = (
+        int(
+            (offsets_right_non_optional[1:] - offsets_right_non_optional[:-1])
+            .max()
+            .item()
+        )
+        if max_len_right is None
+        else max_len_right
+    )
+    return _concat_2D_jagged_jagged(
+        values_left=values_left,
+        values_right=values_right,
+        max_len_left=max_len_left,
+        max_len_right=max_len_right,
+        offsets_left=offsets_left_non_optional,
+        offsets_right=offsets_right_non_optional,
+    )
+
+
+def _split_2D_jagged_jagged(
+    max_seq_len: int,
+    values: torch.Tensor,
+    offsets_left: torch.Tensor,
+    offsets_right: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    offsets = offsets_left + offsets_right
+    padded_values = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=values,
+        offsets=[offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    ).flatten(0, 1)
+    lengths_left = offsets_left[1:] - offsets_left[:-1]
+    lengths_right = offsets_right[1:] - offsets_right[:-1]
+    mask = fx_arange(max_seq_len, device=values.device).view(1, -1)
+    mask_left = mask < lengths_left.view(-1, 1)
+    mask_right = torch.logical_and(
+        mask >= lengths_left.view(-1, 1),
+        mask < (lengths_left + lengths_right).view(-1, 1),
+    )
+    return padded_values[mask_left.view(-1), :], padded_values[mask_right.view(-1), :]
+
+
+@torch.fx.wrap
+def pytorch_split_2D_jagged(
+    max_seq_len: int,
+    values: torch.Tensor,
+    max_len_left: Optional[int],
+    max_len_right: Optional[int],
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if offsets_left is None:
+        assert max_len_left is not None
+        assert offsets_right is not None
+        offsets_left_non_optional = max_len_left * torch.arange(
+            offsets_right.shape[0], device=values.device
+        )
+    else:
+        offsets_left_non_optional = offsets_left
+    if offsets_right is None:
+        assert max_len_right is not None
+        assert offsets_left is not None
+        offsets_right_non_optional = max_len_right * torch.arange(
+            offsets_left.shape[0], device=values.device
+        )
+    else:
+        offsets_right_non_optional = offsets_right
+    return _split_2D_jagged_jagged(
+        max_seq_len=max_seq_len,
+        values=values,
+        offsets_left=offsets_left_non_optional,
+        offsets_right=offsets_right_non_optional,
+    )
+
+
+def pytorch_hstu_split_l2_embeddings(
+    max_seq_len: int,
+    x: torch.Tensor,
+    prefix_offsets: torch.Tensor,
+    l2_offsets: torch.Tensor,
+    contextual_seq_len: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    x_offsets = prefix_offsets + l2_offsets
+    x_lengths = x_offsets[1:] - x_offsets[:-1]
+    padded_x = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=x,
+        offsets=[x_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    ).flatten(0, 1)
+    prefix_lengths = prefix_offsets[1:] - prefix_offsets[:-1]
+    mask = fx_arange(max_seq_len, device=x_offsets.device).view(1, -1)
+    mask_prefix = torch.logical_and(
+        mask >= contextual_seq_len,
+        mask < prefix_lengths.view(-1, 1) + contextual_seq_len,
+    )
+    mask_l2 = torch.logical_or(
+        mask < contextual_seq_len,
+        torch.logical_and(
+            mask >= prefix_lengths.view(-1, 1) + contextual_seq_len,
+            mask < x_lengths.view(-1, 1),
+        ),
+    )
+    return padded_x[mask_prefix.view(-1), :], padded_x[mask_l2.view(-1), :]
+
+
+def pytorch_hstu_concat_l2_embeddings(
+    max_prefix_len: int,
+    prefix_x: torch.Tensor,
+    prefix_offsets: torch.Tensor,
+    max_l2_len: int,
+    l2_x: torch.Tensor,
+    l2_offsets: torch.Tensor,
+    contextual_seq_len: int,
+) -> torch.Tensor:
+    padded_prefix_x = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=prefix_x,
+        offsets=[prefix_offsets],
+        max_lengths=[max_prefix_len],
+        padding_value=0.0,
+    )
+    padded_l2_x = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=l2_x,
+        offsets=[l2_offsets],
+        max_lengths=[max_l2_len],
+        padding_value=0.0,
+    )
+    padded_x = torch.cat(
+        [
+            padded_l2_x[:, 0:contextual_seq_len, :],
+            padded_prefix_x,
+            padded_l2_x[:, contextual_seq_len:, :],
+        ],
+        dim=1,
+    )
+    mask = fx_arange(max_prefix_len + max_l2_len, device=prefix_x.device).view(1, -1)
+    prefix_lengths = prefix_offsets[1:] - prefix_offsets[:-1]
+    l2_lengths = l2_offsets[1:] - l2_offsets[:-1]
+    mask = torch.logical_or(
+        mask < prefix_lengths.view(-1, 1) + contextual_seq_len,
+        torch.logical_and(
+            mask >= max_prefix_len + contextual_seq_len,
+            mask < max_prefix_len + l2_lengths.view(-1, 1),
+        ),
+    )
+    return padded_x.flatten(0, 1)[mask.view(-1), :]
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_layer_norm.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_layer_norm.py
new file mode 100644
index 000000000..0666212ce
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_layer_norm.py
@@ -0,0 +1,81 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# pyre-strict
+
+
+from typing import List
+
+import torch
+
+
+def pytorch_layer_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+) -> torch.Tensor:
+    dtype = x.dtype
+    return torch.nn.functional.layer_norm(
+        x.to(torch.float32),
+        normalized_shape,
+        weight.to(torch.float32),
+        bias.to(torch.float32),
+        eps,
+    ).to(dtype)
+
+
+def pytorch_rms_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: torch.Tensor,
+    eps: float,
+    silu: bool = False,
+) -> torch.Tensor:
+    dtype = x.dtype
+    x_float = x.to(torch.float32)
+    normalized = torch.nn.functional.rms_norm(
+        x_float,
+        normalized_shape,
+        weight.to(torch.float32),
+        eps,
+    )
+    if silu:
+        normalized = torch.nn.functional.silu(normalized)
+    return normalized.to(dtype)
+
+
+def pytorch_swish_layer_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+) -> torch.Tensor:
+    dtype = x.dtype
+    x = x.to(torch.float32)
+    return (
+        x
+        * torch.sigmoid(
+            torch.nn.functional.layer_norm(
+                x,
+                normalized_shape,
+                weight.to(torch.float32),
+                bias.to(torch.float32),
+                eps,
+            )
+        )
+    ).to(dtype)
diff --git a/recommendation_v4/generative_recommenders/ops/pytorch/pt_position.py b/recommendation_v4/generative_recommenders/ops/pytorch/pt_position.py
new file mode 100644
index 000000000..fced57e4f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/pytorch/pt_position.py
@@ -0,0 +1,142 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    fx_unwrap_optional_tensor,
+    jagged_to_padded_dense,
+)
+
+try:
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
+    torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
+except OSError:
+    pass
+
+
+@torch.fx.wrap
+def torch_arange(end: int, device: torch.device) -> torch.Tensor:
+    return torch.arange(end, device=device)
+
+
+@torch.fx.wrap
+def _get_col_indices(
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    max_pos_ind: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+) -> torch.Tensor:
+    B = seq_lengths.size(0)
+    col_indices = torch.arange(max_seq_len, device=seq_lengths.device).expand(
+        B, max_seq_len
+    )
+    if num_targets is not None:
+        if interleave_targets:
+            high_inds = seq_lengths - fx_unwrap_optional_tensor(num_targets) * 2
+        else:
+            high_inds = seq_lengths - fx_unwrap_optional_tensor(num_targets)
+        col_indices = torch.clamp(col_indices, max=high_inds.view(-1, 1))
+        col_indices = high_inds.view(-1, 1) - col_indices
+    else:
+        col_indices = seq_lengths.view(-1, 1) - col_indices
+    col_indices = col_indices + max_contextual_seq_len
+    col_indices = torch.clamp(col_indices, max=max_pos_ind - 1)
+    if max_contextual_seq_len > 0:
+        col_indices[:, :max_contextual_seq_len] = torch.arange(
+            0,
+            max_contextual_seq_len,
+            device=col_indices.device,
+            dtype=col_indices.dtype,
+        ).view(1, -1)
+    return col_indices
+
+
+def pytorch_add_timestamp_positional_embeddings(
+    seq_embeddings: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    pos_embeddings: torch.Tensor,
+    ts_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str,
+) -> torch.Tensor:
+    max_pos_ind = int(pos_embeddings.size(0))
+    # position encoding
+    pos_inds = _get_col_indices(
+        max_seq_len=max_seq_len,
+        max_contextual_seq_len=max_contextual_seq_len,
+        max_pos_ind=max_pos_ind,
+        seq_lengths=seq_lengths,
+        num_targets=num_targets,
+        interleave_targets=interleave_targets,
+    )
+    B, _ = pos_inds.shape
+    # timestamp encoding
+    num_time_buckets = ts_embeddings.size(1) - 1
+    time_bucket_increments = 60.0
+    time_bucket_divisor = 1.0
+    time_delta = 0
+    timestamps = jagged_to_padded_dense(
+        values=timestamps.unsqueeze(-1),
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    ).squeeze(-1)
+    query_time = torch.gather(
+        timestamps, dim=1, index=(seq_lengths - 1).unsqueeze(1).clamp(min=0)
+    )
+    ts = query_time - timestamps
+    ts = ts + time_delta
+    ts = ts.clamp(min=1e-6) / time_bucket_increments
+    if time_bucket_fn == "log":
+        ts = torch.log(ts)
+    else:
+        ts = torch.sqrt(ts)
+    ts = (ts / time_bucket_divisor).clamp(min=0).int()
+    ts = torch.clamp(
+        ts,
+        min=0,
+        max=num_time_buckets,
+    )
+    position_embeddings = torch.index_select(
+        pos_embeddings, 0, pos_inds.reshape(-1)
+    ).view(B, max_seq_len, -1)
+    time_embeddings = torch.index_select(ts_embeddings, 0, ts.reshape(-1)).view(
+        B, max_seq_len, -1
+    )
+    padded_emb = torch.ops.fbgemm.jagged_to_padded_dense(
+        values=seq_embeddings,
+        offsets=[seq_offsets],
+        max_lengths=[max_seq_len],
+        padding_value=0.0,
+    )
+    summed = padded_emb + (time_embeddings + position_embeddings).to(
+        seq_embeddings.dtype
+    )
+    result, _ = torch.ops.fbgemm.dense_to_jagged(
+        summed, [seq_offsets], seq_embeddings.shape[0]
+    )
+    return result
diff --git a/recommendation_v4/generative_recommenders/ops/tests/fake_signature_test.py b/recommendation_v4/generative_recommenders/ops/tests/fake_signature_test.py
new file mode 100644
index 000000000..fb9047454
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/fake_signature_test.py
@@ -0,0 +1,162 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+"""
+Tests to ensure fake and real implementations of triton functions
+have the same function signatures. This is critical for PT2 compile compatibility.
+"""
+
+import inspect
+import unittest
+from typing import Any, Callable, List
+
+
+def get_custom_op_params(func: Callable[..., object]) -> List[str]:
+    """
+    Get parameter names from a function, handling custom_op decorated functions.
+
+    For maybe_register_custom_op decorated functions, inspect.signature may return
+    *args, **kwargs instead of the actual parameters. In this case, we need to
+    access the underlying schema to get the real parameter names.
+    """
+    sig = inspect.signature(func)
+    params = list(sig.parameters.keys())
+
+    if params == ["args", "kwargs"]:
+        func_any: Any = func
+        if hasattr(func_any, "_opoverload"):
+            schema = func_any._opoverload._schema
+            return [arg.name for arg in schema.arguments]
+
+    return params
+
+
+class FakeSignatureTest(unittest.TestCase):
+    """Test to ensure fake and real implementations have the same function signatures."""
+
+    def test_triton_addmm_fwd_and_fake_have_same_signature(self) -> None:
+        """Verify triton_addmm_fwd and triton_addmm_fwd_fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_addmm import (
+            triton_addmm_fwd,
+            triton_addmm_fwd_fake,
+        )
+
+        real_params = get_custom_op_params(triton_addmm_fwd)
+        fake_params = get_custom_op_params(triton_addmm_fwd_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"triton_addmm_fwd and triton_addmm_fwd_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_maybe_triton_addmm_fwd_and_fake_have_same_signature(self) -> None:
+        """Verify maybe_triton_addmm_fwd and maybe_triton_addmm_fwd_fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_addmm import (
+            maybe_triton_addmm_fwd,
+            maybe_triton_addmm_fwd_fake,
+        )
+
+        real_params = get_custom_op_params(maybe_triton_addmm_fwd)
+        fake_params = get_custom_op_params(maybe_triton_addmm_fwd_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"maybe_triton_addmm_fwd and maybe_triton_addmm_fwd_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_triton_hstu_attention_fwd_and_fake_have_same_signature(self) -> None:
+        """Verify triton_hstu_attention_fwd and _triton_hstu_attention_fwd_fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_hstu_attention import (
+            _triton_hstu_attention_fwd_fake,
+            triton_hstu_attention_fwd,
+        )
+
+        real_params = get_custom_op_params(triton_hstu_attention_fwd)
+        fake_params = get_custom_op_params(_triton_hstu_attention_fwd_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"triton_hstu_attention_fwd and _triton_hstu_attention_fwd_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_triton_hstu_attention_bwd_and_fake_have_same_signature(self) -> None:
+        """Verify triton_hstu_attention_bwd and _triton_hstu_attention_bwd_fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_hstu_attention import (
+            _triton_hstu_attention_bwd_fake,
+            triton_hstu_attention_bwd,
+        )
+
+        real_params = get_custom_op_params(triton_hstu_attention_bwd)
+        fake_params = get_custom_op_params(_triton_hstu_attention_bwd_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"triton_hstu_attention_bwd and _triton_hstu_attention_bwd_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_triton_layer_norm_mul_dropout_fwd_impl_and_fake_have_same_signature(
+        self,
+    ) -> None:
+        """Verify _triton_layer_norm_mul_dropout_fwd_impl and its fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_hstu_linear import (
+            _triton_layer_norm_mul_dropout_fwd_impl,
+            _triton_layer_norm_mul_dropout_fwd_impl_fake,
+        )
+
+        real_params = get_custom_op_params(_triton_layer_norm_mul_dropout_fwd_impl)
+        fake_params = get_custom_op_params(_triton_layer_norm_mul_dropout_fwd_impl_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"_triton_layer_norm_mul_dropout_fwd_impl and _triton_layer_norm_mul_dropout_fwd_impl_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
+
+    def test_triton_layer_norm_mul_dropout_bwd_impl_and_fake_have_same_signature(
+        self,
+    ) -> None:
+        """Verify _triton_layer_norm_mul_dropout_bwd_impl and its fake have the same arguments."""
+        from generative_recommenders.ops.triton.triton_hstu_linear import (
+            _triton_layer_norm_mul_dropout_bwd_impl,
+            _triton_layer_norm_mul_dropout_bwd_impl_fake,
+        )
+
+        real_params = get_custom_op_params(_triton_layer_norm_mul_dropout_bwd_impl)
+        fake_params = get_custom_op_params(_triton_layer_norm_mul_dropout_bwd_impl_fake)
+
+        self.assertEqual(
+            real_params,
+            fake_params,
+            f"_triton_layer_norm_mul_dropout_bwd_impl and _triton_layer_norm_mul_dropout_bwd_impl_fake have different arguments.\n"
+            f"Real: {real_params}\n"
+            f"Fake: {fake_params}",
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_test.py b/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_test.py
new file mode 100644
index 000000000..ef70b8cc2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_test.py
@@ -0,0 +1,485 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import random
+import unittest
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    generate_sparse_seq_len,
+    gpu_unavailable,
+    HammerKernel,
+    set_dev_mode,
+)
+from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+def test_attn(
+    batch_size: int,
+    heads: int,
+    max_uih_len: int,
+    max_targets: int,
+    attn_dim: int,
+    hidden_dim: int,
+    causal: bool,
+    has_multiple_targets: bool,
+    has_max_attn_len: bool,
+    dtype: torch.dtype,
+    test_backward: bool,
+    ref_kernel: HammerKernel,
+    real_kernel: HammerKernel,
+    skip_comparisons: bool = False,
+    sparsity: float = -1.0,
+    contextual_seq_len: int = 0,
+    atol: Optional[float] = None,
+    rtol: Optional[float] = None,
+    enable_tma: bool = False,
+) -> None:
+    set_dev_mode(True)
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    from generative_recommenders.ops.hstu_attention import hstu_mha
+
+    alpha = 1.0 / (attn_dim**0.5)
+    if sparsity > 0.0:
+        lengths = generate_sparse_seq_len(
+            size=batch_size,
+            max_seq_len=max_uih_len,
+            sparsity=sparsity,
+            device=torch.device("cuda"),
+        )
+    else:
+        lengths = torch.randint(
+            max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+    num_targets = torch.randint(
+        1, max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+    )
+    lengths = lengths + num_targets + contextual_seq_len
+    max_seq_len = max_uih_len + max_targets + contextual_seq_len
+    if has_max_attn_len:
+        max_attn_len = random.randint(1, max_uih_len // 5)
+    else:
+        max_attn_len = 0
+    seq_offsets = torch.zeros(
+        (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+    )
+    seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+    L = int(seq_offsets[-1].item())
+    q = (
+        torch.empty((L, heads, attn_dim), dtype=dtype, device=torch.device("cuda"))
+        .uniform_(-0.1, 0.1)
+        .requires_grad_()
+    )
+    k = (
+        torch.empty((L, heads, attn_dim), dtype=dtype, device=torch.device("cuda"))
+        .uniform_(-0.1, 0.1)
+        .requires_grad_()
+    )
+    v = (
+        torch.empty((L, heads, hidden_dim), dtype=dtype, device=torch.device("cuda"))
+        .uniform_(-0.1, 0.1)
+        .requires_grad_()
+    )
+
+    # ref implementation
+    ref_out = hstu_mha(
+        max_seq_len=max_seq_len,
+        alpha=alpha,
+        q=q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        causal=causal,
+        num_targets=num_targets if has_multiple_targets else None,
+        dropout_pr=0.0,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        kernel=ref_kernel,
+        enable_tma=enable_tma,
+    )
+    dout = torch.randn_like(ref_out)
+    ref_out.backward(dout)
+
+    if skip_comparisons:
+        return
+
+    # pyre-ignore
+    ref_dv, v.grad = v.grad.clone(), None
+    ref_dk, k.grad = k.grad.clone(), None
+    ref_dq, q.grad = q.grad.clone(), None
+
+    # triton implementation
+    q = q.detach().clone().requires_grad_()
+    k = k.detach().clone().requires_grad_()
+    v = v.detach().clone().requires_grad_()
+    dout = dout.detach().clone()
+    real_out = hstu_mha(
+        max_seq_len=max_seq_len,
+        alpha=alpha,
+        q=q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        causal=causal,
+        num_targets=num_targets if has_multiple_targets else None,
+        dropout_pr=0.0,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        kernel=real_kernel,
+        enable_tma=enable_tma,
+    )
+
+    torch.testing.assert_close(
+        ref_out,
+        real_out,
+        atol=atol,
+        rtol=rtol,
+    )
+    if test_backward:
+        real_out.backward(dout)
+        real_dq, real_dk, real_dv = q.grad.clone(), k.grad.clone(), v.grad.clone()
+        torch.testing.assert_close(ref_dv, real_dv, atol=atol, rtol=rtol)
+        torch.testing.assert_close(ref_dk, real_dk, atol=atol, rtol=rtol)
+        torch.testing.assert_close(ref_dq, real_dq, atol=atol, rtol=rtol)
+
+
+def test_delta_attn(
+    batch_size: int,
+    heads: int,
+    max_uih_len: int,
+    max_targets: int,
+    delta_size: int,
+    attn_dim: int,
+    hidden_dim: int,
+    has_multiple_targets: bool,
+    has_max_attn_len: bool,
+    dtype: torch.dtype,
+    ref_kernel: HammerKernel,
+    real_kernel: HammerKernel,
+    contextual_seq_len: int = 0,
+    atol: Optional[float] = None,
+    rtol: Optional[float] = None,
+    enable_tma: bool = False,
+) -> None:
+    set_dev_mode(True)
+    torch.backends.cudnn.allow_tf32 = True
+    torch.backends.cuda.matmul.allow_tf32 = True
+    from generative_recommenders.ops.hstu_attention import delta_hstu_mha
+
+    alpha = 1.0 / (attn_dim**0.5)
+    lengths = torch.randint(
+        max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+    )
+    num_targets = torch.randint(
+        1, delta_size + 1, size=(batch_size,), device=torch.device("cuda")
+    )
+    lengths = lengths + delta_size + contextual_seq_len
+    max_seq_len = max_uih_len + delta_size + contextual_seq_len
+    if has_max_attn_len:
+        max_attn_len = random.randint(1, max_uih_len // 5)
+    else:
+        max_attn_len = 0
+    seq_offsets = torch.zeros(
+        (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+    )
+    seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+    L = int(seq_offsets[-1].item())
+    delta_q = torch.empty(
+        (batch_size * delta_size, heads, attn_dim),
+        dtype=dtype,
+        device=torch.device("cuda"),
+    ).uniform_(-0.1, 0.1)
+    k = torch.empty(
+        (L, heads, attn_dim), dtype=dtype, device=torch.device("cuda")
+    ).uniform_(-0.1, 0.1)
+    v = torch.empty(
+        (L, heads, hidden_dim), dtype=dtype, device=torch.device("cuda")
+    ).uniform_(-0.1, 0.1)
+
+    # ref implementation
+    ref_out = delta_hstu_mha(
+        max_seq_len=max_seq_len,
+        alpha=alpha,
+        delta_q=delta_q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets if has_multiple_targets else None,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        kernel=ref_kernel,
+        enable_tma=enable_tma,
+    )
+
+    # real implementation
+    real_out = delta_hstu_mha(
+        max_seq_len=max_seq_len,
+        alpha=alpha,
+        delta_q=delta_q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets if has_multiple_targets else None,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        kernel=real_kernel,
+        enable_tma=enable_tma,
+    )
+    torch.testing.assert_close(
+        ref_out,
+        real_out,
+        atol=atol,
+        rtol=rtol,
+    )
+
+
+class HSTUAttentionTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([20, 100, 128, 256]),
+        max_targets=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_attn_triton(self, *args, **kwargs) -> None:
+        test_attn(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.just(64),
+        heads=st.just(4),
+        max_uih_len=st.sampled_from([32768]),
+        max_targets=st.sampled_from([32]),
+        attn_dim=st.just(128),
+        hidden_dim=st.just(128),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from([torch.bfloat16]),
+        has_max_attn_len=st.sampled_from([True, False]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=5,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_attn_triton_long_seqs(self, *args, **kwargs) -> None:
+        test_attn(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+            sparsity=1.0,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([100, 128, 256]),
+        max_targets=st.sampled_from([20, 512]),
+        delta_size=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([False, True]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_delta_attn_triton(self, *args, **kwargs) -> None:
+        test_delta_attn(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([20, 100, 128]),
+        max_targets=st.sampled_from([20, 512]),
+        delta_size=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64]),
+        hidden_dim=st.sampled_from([16, 32, 64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([False, True]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    def test_cache(
+        self,
+        batch_size: int,
+        heads: int,
+        max_uih_len: int,
+        max_targets: int,
+        delta_size: int,
+        attn_dim: int,
+        hidden_dim: int,
+        has_multiple_targets: bool,
+        dtype: torch.dtype,
+        has_max_attn_len: bool,
+        contextual_seq_len: int,
+    ) -> None:
+        set_dev_mode(True)
+        torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+        from generative_recommenders.ops.hstu_attention import delta_hstu_mha, hstu_mha
+
+        alpha = 1.0 / (attn_dim**0.5)
+        lengths = torch.randint(
+            max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        num_targets = torch.randint(
+            1, delta_size + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        lengths = lengths + delta_size + contextual_seq_len
+        max_seq_len = max_uih_len + delta_size + contextual_seq_len
+        if has_max_attn_len:
+            max_attn_len = random.randint(1, max_uih_len // 5)
+        else:
+            max_attn_len = 0
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+        L = int(seq_offsets[-1].item())
+        q = torch.empty(
+            (L, heads, attn_dim),
+            dtype=dtype,
+            device=torch.device("cuda"),
+        ).uniform_(-0.1, 0.1)
+        _, delta_q = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=q.view(-1, heads * attn_dim),
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=torch.ops.fbgemm.asynchronous_complete_cumsum(
+                lengths - delta_size
+            ),
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        delta_q = delta_q.view(-1, heads, attn_dim)
+        k = torch.empty(
+            (L, heads, attn_dim), dtype=dtype, device=torch.device("cuda")
+        ).uniform_(-0.1, 0.1)
+        v = torch.empty(
+            (L, heads, hidden_dim), dtype=dtype, device=torch.device("cuda")
+        ).uniform_(-0.1, 0.1)
+        prime_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            lengths - delta_size
+        )
+
+        # ref implementation
+        ref_out = hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=True,
+            num_targets=num_targets if has_multiple_targets else None,
+            dropout_pr=0.0,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.TRITON,
+        )
+        _, delta_out = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=ref_out.view(-1, heads * hidden_dim),
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=prime_offsets,
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        delta_out = delta_out.view(-1, heads, hidden_dim)
+
+        # real implementation
+        real_delta_out = delta_hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets if has_multiple_targets else None,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+        )
+        torch.testing.assert_close(
+            delta_out,
+            real_delta_out,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_tma_test.py b/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_tma_test.py
new file mode 100644
index 000000000..8bb264af6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/hstu_attention_tma_test.py
@@ -0,0 +1,270 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import random
+import unittest
+
+import torch
+from generative_recommenders.common import (
+    HammerKernel,
+    nv_gpu_unavailable,
+    set_dev_mode,
+)
+from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+from generative_recommenders.ops.tests.hstu_attention_test import (
+    test_attn,
+    test_delta_attn,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class HSTUAttentionTmaTest(unittest.TestCase):
+    @unittest.skipIf(*nv_gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([20, 100, 128, 256]),
+        max_targets=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([True, False]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_attn_triton_tma(self, *args, **kwargs) -> None:
+        test_attn(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+            enable_tma=True,
+        )
+
+    @unittest.skipIf(*nv_gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.just(64),
+        heads=st.just(4),
+        max_uih_len=st.sampled_from([32768]),
+        max_targets=st.sampled_from([32]),
+        attn_dim=st.just(128),
+        hidden_dim=st.just(128),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from([torch.bfloat16]),
+        has_max_attn_len=st.sampled_from([True, False]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=5,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_attn_triton_long_seqs_tma(self, *args, **kwargs) -> None:
+        test_attn(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+            sparsity=1.0,
+            enable_tma=True,
+        )
+
+    @unittest.skipIf(*nv_gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([100, 128, 256]),
+        max_targets=st.sampled_from([20, 512]),
+        delta_size=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([False, True]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_delta_attn_triton_tma(self, *args, **kwargs) -> None:
+        test_delta_attn(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+            enable_tma=True,
+        )
+
+    @unittest.skipIf(*nv_gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([20, 100, 128]),
+        max_targets=st.sampled_from([20, 512]),
+        delta_size=st.sampled_from([20, 512]),
+        attn_dim=st.sampled_from([16, 32, 64]),
+        hidden_dim=st.sampled_from([16, 32, 64]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        has_max_attn_len=st.sampled_from([False, True]),
+        contextual_seq_len=st.sampled_from([0, 10]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=200,
+        deadline=None,
+    )
+    def test_cache_tma(
+        self,
+        batch_size: int,
+        heads: int,
+        max_uih_len: int,
+        max_targets: int,
+        delta_size: int,
+        attn_dim: int,
+        hidden_dim: int,
+        has_multiple_targets: bool,
+        dtype: torch.dtype,
+        has_max_attn_len: bool,
+        contextual_seq_len: int,
+    ) -> None:
+        set_dev_mode(True)
+        torch.backends.cudnn.allow_tf32 = True
+        torch.backends.cuda.matmul.allow_tf32 = True
+        from generative_recommenders.ops.hstu_attention import delta_hstu_mha, hstu_mha
+
+        alpha = 1.0 / (attn_dim**0.5)
+        lengths = torch.randint(
+            max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        num_targets = torch.randint(
+            1, delta_size + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        lengths = lengths + delta_size + contextual_seq_len
+        max_seq_len = max_uih_len + delta_size + contextual_seq_len
+        if has_max_attn_len:
+            max_attn_len = random.randint(1, max_uih_len // 5)
+        else:
+            max_attn_len = 0
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+        L = int(seq_offsets[-1].item())
+        q = torch.empty(
+            (L, heads, attn_dim),
+            dtype=dtype,
+            device=torch.device("cuda"),
+        ).uniform_(-0.1, 0.1)
+        _, delta_q = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=q.view(-1, heads * attn_dim),
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=torch.ops.fbgemm.asynchronous_complete_cumsum(
+                lengths - delta_size
+            ),
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        delta_q = delta_q.view(-1, heads, attn_dim)
+        k = torch.empty(
+            (L, heads, attn_dim), dtype=dtype, device=torch.device("cuda")
+        ).uniform_(-0.1, 0.1)
+        v = torch.empty(
+            (L, heads, hidden_dim), dtype=dtype, device=torch.device("cuda")
+        ).uniform_(-0.1, 0.1)
+        prime_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(
+            lengths - delta_size
+        )
+
+        # ref implementation
+        ref_out = hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=True,
+            num_targets=num_targets if has_multiple_targets else None,
+            dropout_pr=0.0,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.TRITON,
+            enable_tma=True,
+        )
+        _, delta_out = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=ref_out.view(-1, heads * hidden_dim),
+            max_len_left=None,
+            max_len_right=delta_size,
+            offsets_left=prime_offsets,
+            offsets_right=None,
+            kernel=HammerKernel.TRITON,
+        )
+        delta_out = delta_out.view(-1, heads, hidden_dim)
+
+        # real implementation
+        real_delta_out = delta_hstu_mha(
+            max_seq_len=max_seq_len,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets if has_multiple_targets else None,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            enable_tma=True,
+        )
+        torch.testing.assert_close(
+            delta_out,
+            real_delta_out,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/hstu_compute_test.py b/recommendation_v4/generative_recommenders/ops/tests/hstu_compute_test.py
new file mode 100644
index 000000000..57f217895
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/hstu_compute_test.py
@@ -0,0 +1,503 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import random
+import unittest
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    generate_sparse_seq_len,
+    gpu_unavailable,
+    HammerKernel,
+    set_dev_mode,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class HSTUComputeTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=1000, max_value=1000),
+        D=st.integers(min_value=128, max_value=128),
+        L=st.integers(min_value=512, max_value=512),
+        concat_u=st.booleans(),
+        concat_x=st.booleans(),
+        mul_u_activation_type=st.sampled_from(["silu", "sigmoid", "none"]),
+        group_norm=st.booleans(),
+        num_heads=st.sampled_from([4]),
+        training=st.just(False),
+        recompute_y_in_backward=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+    )
+    # pyre-ignore[2]
+    def test_compute_output(self, *args, **kwargs) -> None:
+        self._test_compute_output(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            opt_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.just(1500000),
+        D=st.just(512),
+        L=st.just(512),
+        concat_u=st.sampled_from([True]),
+        concat_x=st.sampled_from([True]),
+        mul_u_activation_type=st.sampled_from(["none"]),
+        group_norm=st.sampled_from([False]),
+        num_heads=st.sampled_from([4]),
+        training=st.just(False),
+        recompute_y_in_backward=st.sampled_from([False]),
+        dtype=st.just(torch.bfloat16),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+    )
+    # pyre-ignore[2]
+    def test_long_sequences_compute_output(self, *args, **kwargs) -> None:
+        self._test_compute_output(
+            *args,
+            **kwargs,
+            test_backward=False,
+            ref_kernel=HammerKernel.TRITON,
+            opt_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+        )
+
+    def _test_compute_output(
+        self,
+        N: int,
+        D: int,
+        L: int,
+        concat_u: bool,
+        concat_x: bool,
+        mul_u_activation_type: str,
+        group_norm: bool,
+        num_heads: int,
+        training: bool,
+        recompute_y_in_backward: bool,
+        dtype: torch.dtype,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        opt_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+        atol: Optional[float] = None,
+        rtol: Optional[float] = None,
+    ) -> None:
+        from generative_recommenders.ops.hstu_compute import hstu_compute_output
+
+        torch.manual_seed(0)
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cuda.matmul.allow_tf32 = False
+        dropout_ratio = 0.3 if training else 0.0
+        attn = (
+            torch.empty((N, L), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        u = (
+            torch.empty((N, L), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_weight = (
+            torch.empty(
+                (L if not group_norm else num_heads,),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_bias = (
+            torch.empty(
+                (L if not group_norm else num_heads,),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_eps = 1e-6
+        # When group_norm=True, only concat_ux = concat_u and concat_x is supported
+        if group_norm:
+            L_mult = 3 if (concat_u and concat_x) else 1
+        else:
+            L_mult = 1
+            if concat_u:
+                L_mult += 1
+            if concat_x:
+                L_mult += 1
+        output_weight = (
+            torch.empty((L * L_mult, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+
+        # ref
+        ref_out = hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            output_weight=output_weight,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=L // num_heads,
+            training=training,
+            recompute_y_in_backward=recompute_y_in_backward,
+            kernel=ref_kernel,
+        )
+        dout = torch.randn_like(ref_out) * 0.1
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dattn, attn.grad = attn.grad.detach().clone(), None
+        ref_du, u.grad = u.grad.detach().clone(), None
+        ref_d_norm_w, norm_weight.grad = norm_weight.grad.detach().clone(), None
+        ref_d_norm_b, norm_bias.grad = norm_bias.grad.detach().clone(), None
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_d_output_w, output_weight.grad = output_weight.grad.detach().clone(), None
+
+        # opt
+        attn = attn.detach().clone().requires_grad_()
+        u = u.detach().clone().requires_grad_()
+        norm_weight = norm_weight.detach().clone().requires_grad_()
+        norm_bias = norm_bias.detach().clone().requires_grad_()
+        output_weight = output_weight.detach().clone().requires_grad_()
+        x = x.detach().clone().requires_grad_()
+        opt_out = hstu_compute_output(
+            attn=attn,
+            u=u,
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            dropout_ratio=dropout_ratio,
+            output_weight=output_weight,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=group_norm,
+            num_heads=num_heads,
+            linear_dim=L // num_heads,
+            training=training,
+            recompute_y_in_backward=recompute_y_in_backward,
+            kernel=opt_kernel,
+        )
+        torch.testing.assert_close(
+            ref_out,
+            opt_out,
+            atol=atol,
+            rtol=rtol,
+        )
+
+        if test_backward:
+            dout = dout.detach().clone()
+            opt_out.backward(dout)
+            opt_dattn, attn.grad = attn.grad.detach().clone(), None
+            opt_du, u.grad = u.grad.detach().clone(), None
+            opt_d_norm_w, norm_weight.grad = norm_weight.grad.detach().clone(), None
+            opt_d_norm_b, norm_bias.grad = norm_bias.grad.detach().clone(), None
+            opt_dx, x.grad = x.grad.detach().clone(), None
+            opt_d_output_w, output_weight.grad = (
+                output_weight.grad.detach().clone(),
+                None,
+            )
+            torch.testing.assert_close(ref_du, opt_du)
+            torch.testing.assert_close(ref_dattn, opt_dattn)
+            torch.testing.assert_close(ref_d_norm_w, opt_d_norm_w)
+            torch.testing.assert_close(ref_d_norm_b, opt_d_norm_b)
+            torch.testing.assert_close(ref_dx, opt_dx)
+            torch.testing.assert_close(ref_d_output_w, opt_d_output_w)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        heads=st.integers(1, 4),
+        max_uih_len=st.sampled_from([100, 128, 256, 1300]),
+        max_targets=st.sampled_from([20, 512]),
+        embedding_dim=st.sampled_from([16, 32, 64]),
+        attn_dim=st.sampled_from([16, 32, 64, 128]),
+        hidden_dim=st.sampled_from([16, 32, 64, 128]),
+        causal=st.sampled_from([True]),
+        has_multiple_targets=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        contextual_seq_len=st.sampled_from([0]),
+        has_max_attn_len=st.sampled_from([False, True]),
+        sort_by_length=st.sampled_from([True, False]),
+        recompute_uvqk_in_backward=st.sampled_from([True, False]),
+        recompute_normed_x_in_backward=st.sampled_from([True, False]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=150,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_preprocess_and_attention(self, *args, **kwargs) -> None:
+        self._test_hstu_preprocess_and_attention(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_hstu_preprocess_and_attention(
+        self,
+        batch_size: int,
+        heads: int,
+        max_uih_len: int,
+        max_targets: int,
+        embedding_dim: int,
+        attn_dim: int,
+        hidden_dim: int,
+        causal: bool,
+        has_multiple_targets: bool,
+        has_max_attn_len: bool,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        test_backward: bool,
+        contextual_seq_len: int,
+        sort_by_length: bool,
+        recompute_uvqk_in_backward: bool,
+        recompute_normed_x_in_backward: bool,
+        sparsity: float = -1.0,
+        atol: Optional[float] = None,
+        rtol: Optional[float] = None,
+    ) -> None:
+        set_dev_mode(True)
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cuda.matmul.allow_tf32 = False
+
+        from generative_recommenders.ops.hstu_compute import (
+            hstu_preprocess_and_attention,
+        )
+
+        alpha = 1.0 / (attn_dim**0.5)
+        if sparsity > 0.0:
+            lengths = generate_sparse_seq_len(
+                size=batch_size,
+                max_seq_len=max_uih_len,
+                sparsity=sparsity,
+                device=torch.device("cuda"),
+            )
+        else:
+            lengths = torch.randint(
+                max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+
+        num_targets = torch.randint(
+            max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        lengths = lengths + num_targets
+        max_seq_len = max_uih_len + max_targets
+        if has_max_attn_len:
+            max_attn_len = random.randint(1, max_uih_len // 5)
+        else:
+            max_attn_len = 0
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+
+        L = int(seq_offsets[-1].item())
+
+        x = (
+            torch.empty((L, embedding_dim), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_weight = (
+            torch.empty((embedding_dim,), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_bias = (
+            torch.empty(
+                (embedding_dim,),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        norm_eps = 1e-6
+        uvqk_weight = (
+            torch.empty(
+                (
+                    embedding_dim,
+                    (hidden_dim * 2 + attn_dim * 2) * heads,
+                ),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        uvqk_bias = (
+            torch.empty(
+                (hidden_dim * 2 + attn_dim * 2) * heads,
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+
+        # ref implementation
+        ref_u, ref_attn_output, _, _ = hstu_preprocess_and_attention(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            attn_alpha=alpha,
+            causal=causal,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            recompute_uvqk_in_backward=recompute_uvqk_in_backward,
+            recompute_normed_x_in_backward=recompute_normed_x_in_backward,
+            sort_by_length=sort_by_length,
+            kernel=ref_kernel,
+        )
+        ref_out = ref_u + ref_attn_output
+        dout = torch.randn_like(ref_out) * 0.01
+        ref_out.backward(dout)
+
+        # pyre-ignore
+        ref_dx, x.grad = x.grad.clone(), None
+        ref_d_norm_weight, norm_weight.grad = norm_weight.grad.clone(), None
+        ref_d_norm_bias, norm_bias.grad = norm_bias.grad.clone(), None
+        ref_d_uvqk_weight, uvqk_weight.grad = uvqk_weight.grad.clone(), None
+        ref_d_uvqk_bias, uvqk_bias.grad = uvqk_bias.grad.clone(), None
+
+        # real implementation
+        x = x.detach().clone().requires_grad_()
+        norm_weight = norm_weight.detach().clone().requires_grad_()
+        norm_bias = norm_bias.detach().clone().requires_grad_()
+        uvqk_weight = uvqk_weight.detach().clone().requires_grad_()
+        uvqk_bias = uvqk_bias.detach().clone().requires_grad_()
+        real_u, real_attn_output, _, _ = hstu_preprocess_and_attention(
+            x=x,
+            norm_weight=norm_weight,
+            norm_bias=norm_bias,
+            norm_eps=norm_eps,
+            num_heads=heads,
+            attn_dim=attn_dim,
+            hidden_dim=hidden_dim,
+            uvqk_weight=uvqk_weight,
+            uvqk_bias=uvqk_bias,
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            attn_alpha=alpha,
+            causal=causal,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            recompute_uvqk_in_backward=recompute_uvqk_in_backward,
+            recompute_normed_x_in_backward=recompute_normed_x_in_backward,
+            sort_by_length=sort_by_length,
+            kernel=real_kernel,
+        )
+        real_out = real_u + real_attn_output
+        torch.testing.assert_close(
+            ref_u,
+            real_u,
+            atol=atol,
+            rtol=rtol,
+        )
+        torch.testing.assert_close(
+            ref_attn_output,
+            real_attn_output,
+            atol=atol,
+            rtol=rtol,
+        )
+        if test_backward:
+            # real implementation
+            dout = dout.detach().clone()
+            real_out.backward(dout)
+            (
+                real_dx,
+                real_d_norm_weight,
+                real_d_norm_bias,
+                real_d_uvqk_weight,
+                real_d_uvqk_bias,
+            ) = (
+                x.grad.clone(),
+                norm_weight.grad.clone(),
+                norm_bias.grad.clone(),
+                uvqk_weight.grad.clone(),
+                uvqk_bias.grad.clone(),
+            )
+            torch.testing.assert_close(ref_dx, real_dx, atol=atol, rtol=rtol)
+            torch.testing.assert_close(
+                ref_d_norm_weight, real_d_norm_weight, atol=atol, rtol=rtol
+            )
+            torch.testing.assert_close(
+                ref_d_norm_bias, real_d_norm_bias, atol=atol, rtol=rtol
+            )
+            torch.testing.assert_close(
+                ref_d_uvqk_weight, real_d_uvqk_weight, atol=atol, rtol=rtol
+            )
+            torch.testing.assert_close(
+                ref_d_uvqk_bias, real_d_uvqk_bias, atol=atol, rtol=rtol
+            )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/jagged_tensors_test.py b/recommendation_v4/generative_recommenders/ops/tests/jagged_tensors_test.py
new file mode 100644
index 000000000..e03d68d0b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/jagged_tensors_test.py
@@ -0,0 +1,963 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    generate_sparse_seq_len,
+    gpu_unavailable,
+    HammerKernel,
+    set_dev_mode,
+)
+from generative_recommenders.ops.jagged_tensors import (
+    concat_2D_jagged,
+    concat_2D_jagged_multirow,
+    split_2D_jagged,
+    split_2D_jagged_multirow,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class JaggedTensorsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_len_a=st.integers(20, 100),
+        max_len_b=st.integers(20, 100),
+        D=st.integers(10, 30),
+        is_dense_a=st.sampled_from([True, False]),
+        is_dense_b=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_split_2D_jagged_triton(self, *args, **kwargs) -> None:
+        self._test_split_2D_jagged(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_split_2D_jagged(
+        self,
+        batch_size: int,
+        max_len_a: int,
+        max_len_b: int,
+        D: int,
+        is_dense_a: bool,
+        is_dense_b: bool,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        dtype: torch.dtype = torch.float32,
+        skip_comparisons: bool = False,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.jagged_tensors import split_2D_jagged
+
+        max_seq_len = max_len_a + max_len_b
+        if not is_dense_a:
+            lengths_a = torch.randint(
+                1, max_len_a + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_a = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_a[1:] = torch.cumsum(lengths_a, dim=0)
+            total_len_a = int(offsets_a[-1].item())
+        else:
+            offsets_a = None
+            total_len_a = batch_size * max_len_a
+            is_dense_b = False
+        if not is_dense_b:
+            lengths_b = torch.randint(
+                1, max_len_b + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_b = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_b[1:] = torch.cumsum(lengths_b, dim=0)
+            total_len_b = int(offsets_b[-1].item())
+        else:
+            offsets_b = None
+            total_len_b = batch_size * max_len_b
+        values = (
+            torch.empty(
+                (total_len_a + total_len_b, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        ref_values_a, ref_values_b = split_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values=values,
+            max_len_left=max_len_a if is_dense_a else None,
+            max_len_right=max_len_b if is_dense_b else None,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=ref_kernel,
+        )
+        d_values_a = torch.randn_like(ref_values_a)
+        d_values_b = torch.randn_like(ref_values_b)
+        ref_values_a.backward(d_values_a, retain_graph=True)
+        ref_values_b.backward(d_values_b)
+        if skip_comparisons:
+            return
+
+        assert values.grad is not None
+        ref_d_values, values.grad = values.grad.clone(), None
+
+        values = values.detach().clone().requires_grad_()
+        real_values_a, real_values_b = split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            max_len_left=max_len_a if is_dense_a else None,
+            max_len_right=max_len_b if is_dense_b else None,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(ref_values_a, real_values_a)
+        torch.testing.assert_close(ref_values_b, real_values_b)
+
+        if test_backward:
+            d_values_a = d_values_a.detach().clone()
+            d_values_b = d_values_b.detach().clone()
+            real_values_a.backward(d_values_a, retain_graph=True)
+            real_values_b.backward(d_values_b)
+            real_d_values = values.grad.clone()
+            torch.testing.assert_close(ref_d_values, real_d_values)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_len_a=st.integers(20, 100),
+        max_len_b=st.integers(20, 100),
+        D=st.integers(10, 30),
+        is_dense_a=st.sampled_from([True, False]),
+        is_dense_b=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_concat_2D_jagged_triton(self, *args, **kwargs) -> None:
+        self._test_concat_2D_jagged(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.sampled_from([130]),
+        max_len_a=st.sampled_from([32768]),
+        max_len_b=st.sampled_from([10]),
+        D=st.sampled_from([512]),
+        is_dense_a=st.sampled_from([True, False]),
+        is_dense_b=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_concat_2D_jagged_large_tensor(self, *args, **kwargs) -> None:
+        self._test_concat_2D_jagged(
+            *args,
+            **kwargs,
+            test_backward=True,
+            skip_comparisons=True,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_concat_2D_jagged(
+        self,
+        batch_size: int,
+        max_len_a: int,
+        max_len_b: int,
+        D: int,
+        is_dense_a: bool,
+        is_dense_b: bool,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        dtype: torch.dtype = torch.float32,
+        skip_comparisons: bool = False,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.jagged_tensors import concat_2D_jagged
+
+        if not is_dense_a:
+            lengths_a = torch.randint(
+                1, max_len_a + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_a = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_a[1:] = torch.cumsum(lengths_a, dim=0)
+            total_len_a = int(offsets_a[-1].item())
+        else:
+            offsets_a = None
+            total_len_a = batch_size * max_len_a
+        values_a = (
+            torch.empty(
+                (total_len_a, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if not is_dense_b:
+            lengths_b = torch.randint(
+                1, max_len_b + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_b = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_b[1:] = torch.cumsum(lengths_b, dim=0)
+            total_len_b = int(offsets_b[-1].item())
+        else:
+            offsets_b = None
+            total_len_b = batch_size * max_len_b
+        values_b = (
+            torch.empty(
+                (total_len_b, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        ref_values = concat_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values_left=values_a,
+            values_right=values_b,
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=ref_kernel,
+        )
+        dout = torch.randn_like(ref_values)
+        ref_values.backward(dout)
+        if skip_comparisons:
+            return
+
+        assert values_a.grad is not None
+        ref_d_a, values_a.grad = values_a.grad.clone(), None
+        assert values_b.grad is not None
+        ref_d_b, values_b.grad = values_b.grad.clone(), None
+
+        values_a = values_a.detach().clone().requires_grad_()
+        values_b = values_b.detach().clone().requires_grad_()
+        dout = dout.detach().clone()
+        real_values = concat_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values_left=values_a,
+            values_right=values_b,
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(ref_values, real_values)
+
+        if test_backward:
+            real_values.backward(dout)
+            real_d_a = values_a.grad.clone()
+            real_d_b = values_b.grad.clone()
+            torch.testing.assert_close(ref_d_a, real_d_a)
+            torch.testing.assert_close(ref_d_b, real_d_b)
+
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_uih_len=st.integers(20, 100),
+        max_l2_len=st.integers(10, 30),
+        contextual_seq_len=st.sampled_from([0, 10]),
+        max_targets=st.sampled_from([10, 20]),
+        D=st.integers(10, 30),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    def test_hstu_split_l2_embeddings(
+        self,
+        batch_size: int,
+        max_uih_len: int,
+        max_l2_len: int,
+        contextual_seq_len: int,
+        max_targets: int,
+        D: int,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.jagged_tensors import hstu_split_l2_embeddings
+
+        max_seq_len = max_uih_len + max_targets + contextual_seq_len
+        num_targets = torch.randint(
+            1, max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        x_lengths = torch.randint(
+            0,
+            max_uih_len + 1,
+            size=(batch_size,),
+            device=torch.device("cuda"),
+        )
+        x_lengths = num_targets + x_lengths + contextual_seq_len
+        x_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        x_offsets[1:] = torch.cumsum(x_lengths, dim=0)
+        total_len = int(x_offsets[-1].item())
+        x = (
+            torch.empty(
+                (total_len, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        prefix_lengths = x_lengths - max_l2_len - num_targets - contextual_seq_len
+        prefix_lengths = torch.clamp(prefix_lengths, min=0)
+        prefix_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(prefix_lengths)
+        l2_offsets = x_offsets - prefix_offsets
+        ref_prefix_x, ref_l2_x = hstu_split_l2_embeddings(
+            max_seq_len=max_seq_len,
+            x=x,
+            prefix_offsets=prefix_offsets,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.PYTORCH,
+        )
+        d_prefix_x = torch.randn_like(ref_prefix_x)
+        d_l2_x = torch.randn_like(ref_l2_x)
+        ref_prefix_x.backward(d_prefix_x, retain_graph=True)
+        ref_l2_x.backward(d_l2_x)
+        assert x.grad is not None
+        ref_d_x, x.grad = x.grad.clone(), None
+        x = x.detach().clone().requires_grad_()
+        real_prefix_x, real_l2_x = hstu_split_l2_embeddings(
+            max_seq_len=max_seq_len,
+            x=x,
+            prefix_offsets=prefix_offsets,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.TRITON,
+        )
+        print(ref_prefix_x.shape, real_prefix_x.shape)
+        torch.testing.assert_close(ref_prefix_x, real_prefix_x)
+        torch.testing.assert_close(ref_l2_x, real_l2_x)
+        d_prefix_x = d_prefix_x.detach().clone()
+        d_l2_x = d_l2_x.detach().clone()
+        real_prefix_x.backward(d_prefix_x, retain_graph=True)
+        real_l2_x.backward(d_l2_x)
+        real_d_x = x.grad.clone()
+        torch.testing.assert_close(ref_d_x, real_d_x)
+
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(1, 1),
+        max_prefix_len=st.integers(10, 10),
+        max_l2_len=st.integers(5, 5),
+        contextual_seq_len=st.sampled_from([3]),
+        max_targets=st.sampled_from([2]),
+        D=st.integers(10, 10),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    def test_hstu_concat_l2_embeddings(
+        self,
+        batch_size: int,
+        max_prefix_len: int,
+        max_l2_len: int,
+        contextual_seq_len: int,
+        max_targets: int,
+        D: int,
+        dtype: torch.dtype,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.jagged_tensors import hstu_concat_l2_embeddings
+
+        num_targets = torch.randint(
+            1, max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        l2_lengths = torch.randint(
+            0,
+            max_l2_len + 1,
+            size=(batch_size,),
+            device=torch.device("cuda"),
+        )
+        l2_lengths = num_targets + l2_lengths + contextual_seq_len
+        max_l2_len = max_l2_len + contextual_seq_len + max_targets
+        l2_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        l2_offsets[1:] = torch.cumsum(l2_lengths, dim=0)
+        total_l2_len = int(l2_offsets[-1].item())
+        l2_x = (
+            torch.empty(
+                (total_l2_len, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        prefix_lengths = torch.randint(
+            0,
+            max_prefix_len + 1,
+            size=(batch_size,),
+            device=torch.device("cuda"),
+        )
+        prefix_lengths = torch.randint(
+            0,
+            max_prefix_len + 1,
+            size=(batch_size,),
+            device=torch.device("cuda"),
+        )
+        prefix_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        prefix_offsets[1:] = torch.cumsum(prefix_lengths, dim=0)
+        total_prefix_len = int(prefix_offsets[-1].item())
+        prefix_x = (
+            torch.empty(
+                (total_prefix_len, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        ref_x = hstu_concat_l2_embeddings(
+            max_prefix_len=max_prefix_len,
+            prefix_x=prefix_x,
+            prefix_offsets=prefix_offsets,
+            max_l2_len=max_l2_len,
+            l2_x=l2_x,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.PYTORCH,
+        )
+        dout = torch.randn_like(ref_x)
+        ref_x.backward(dout)
+
+        assert prefix_x.grad is not None
+        ref_d_prefix_x, prefix_x.grad = prefix_x.grad.clone(), None
+        assert l2_x.grad is not None
+        ref_d_l2_x, l2_x.grad = l2_x.grad.clone(), None
+
+        prefix_x = prefix_x.detach().clone().requires_grad_()
+        l2_x = l2_x.detach().clone().requires_grad_()
+        real_x = hstu_concat_l2_embeddings(
+            max_prefix_len=max_prefix_len,
+            prefix_x=prefix_x,
+            prefix_offsets=prefix_offsets,
+            max_l2_len=max_l2_len,
+            l2_x=l2_x,
+            l2_offsets=l2_offsets,
+            contextual_seq_len=contextual_seq_len,
+            kernel=HammerKernel.TRITON,
+        )
+        torch.testing.assert_close(ref_x, real_x)
+        dout = dout.detach().clone()
+        real_x.backward(dout)
+        real_d_prefix_x = prefix_x.grad.clone()
+        real_d_l2_x = l2_x.grad.clone()
+        torch.testing.assert_close(ref_d_prefix_x, real_d_prefix_x)
+        torch.testing.assert_close(ref_d_l2_x, real_d_l2_x)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(4, 8),
+        max_seq_len=st.integers(50, 500),
+        D=st.integers(20, 200),
+        K=st.integers(30, 200),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16, torch.float16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        contiguous=st.booleans(),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_jagged_dense_bmm_broadcast_add_triton(self, *args, **kwargs) -> None:
+        self._test_jagged_dense_bmm_broadcast_add(
+            *args,
+            **kwargs,
+            test_backward=True,
+            atol=None,
+            rtol=None,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.sampled_from([130]),
+        max_seq_len=st.sampled_from([32768]),
+        D=st.sampled_from([512]),
+        K=st.sampled_from([512]),
+        dtype=st.sampled_from([torch.float32, torch.bfloat16]),
+        contiguous=st.booleans(),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+        deadline=None,
+    )
+    def test_jagged_dense_bmm_broadcast_add_triton_large_tensor(
+        self,
+        # pyre-fixme[2]: Parameter must be annotated.
+        *args,
+        **kwargs,  # pyre-ignore[2]
+    ) -> None:
+        self._test_jagged_dense_bmm_broadcast_add(
+            *args,
+            **kwargs,
+            test_backward=True,
+            atol=None,
+            rtol=None,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_jagged_dense_bmm_broadcast_add(
+        self,
+        batch_size: int,
+        max_seq_len: int,
+        D: int,
+        K: int,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        test_backward: bool,
+        contiguous: bool = True,
+        atol: Optional[float] = None,
+        rtol: Optional[float] = None,
+        sparsity: float = -1,
+    ) -> None:
+        set_dev_mode(True)
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cuda.matmul.allow_tf32 = False
+        from generative_recommenders.ops.jagged_tensors import (
+            jagged_dense_bmm_broadcast_add,
+        )
+
+        if sparsity > 0.0:
+            lengths = generate_sparse_seq_len(
+                size=batch_size,
+                max_seq_len=max_seq_len,
+                sparsity=sparsity,
+                device=torch.device("cuda"),
+            ).to(torch.int64)
+        else:
+            lengths = torch.randint(
+                max_seq_len + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+        # Test the edge case with an empty row
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        jagged_size = int(seq_offsets[-1].item())
+        jagged = (
+            torch.empty((jagged_size, D), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        dense = (
+            torch.empty((batch_size, D, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        bias = (
+            torch.empty((batch_size, K), dtype=dtype, device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if not contiguous:
+            dense = (
+                dense.transpose(1, 2)
+                .contiguous()
+                .transpose(1, 2)
+                .detach()
+                .clone()
+                .requires_grad_()
+            )
+
+        ref_out = jagged_dense_bmm_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+            kernel=ref_kernel,
+        ).to(jagged.dtype)
+        if test_backward:
+            dout = torch.randn_like(ref_out) * 0.01
+            ref_out.backward(dout)
+            # pyre-ignore
+            ref_d_jagged, jagged.grad = jagged.grad.clone(), None
+            ref_d_dense, dense.grad = dense.grad.clone(), None
+            ref_d_bias, bias.grad = bias.grad.clone(), None
+
+        jagged = jagged.detach().clone().requires_grad_()
+        dense = dense.detach().clone().requires_grad_()
+        bias = bias.detach().clone().requires_grad_()
+        real_out = jagged_dense_bmm_broadcast_add(
+            max_seq_len=max_seq_len,
+            seq_offsets=seq_offsets,
+            jagged=jagged,
+            dense=dense,
+            bias=bias,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(
+            ref_out,
+            real_out,
+            atol=atol,
+            rtol=rtol,
+        )
+        if test_backward:
+            real_out.backward(dout)  # pyre-ignore
+            real_d_jagged = jagged.grad.clone()
+            real_d_dense = dense.grad.clone()
+            real_d_bias = bias.grad.clone()
+            torch.testing.assert_close(
+                ref_d_jagged,  # pyre-ignore
+                real_d_jagged,
+                atol=atol,
+                rtol=rtol,
+            )
+            torch.testing.assert_close(
+                ref_d_dense,  # pyre-ignore
+                real_d_dense,
+                atol=atol,
+                rtol=rtol,
+            )
+            torch.testing.assert_close(
+                ref_d_bias,  # pyre-ignore
+                real_d_bias,
+                atol=atol,
+                rtol=rtol,
+            )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_len_a=st.integers(20, 100),
+        max_len_b=st.integers(20, 100),
+        D=st.integers(10, 30),
+        is_dense_a=st.sampled_from([True, False]),
+        is_dense_b=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_concat_2D_jagged_multirow_triton(self, *args, **kwargs) -> None:
+        self._test_concat_2D_jagged_multirow(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_concat_2D_jagged_multirow(
+        self,
+        batch_size: int,
+        max_len_a: int,
+        max_len_b: int,
+        D: int,
+        is_dense_a: bool,
+        is_dense_b: bool,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        dtype: torch.dtype = torch.float32,
+    ) -> None:
+        set_dev_mode(True)
+
+        if not is_dense_a:
+            lengths_a = torch.randint(
+                1, max_len_a + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_a = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_a[1:] = torch.cumsum(lengths_a, dim=0)
+            total_len_a = int(offsets_a[-1].item())
+        else:
+            offsets_a = None
+            total_len_a = batch_size * max_len_a
+        values_a = (
+            torch.empty(
+                (total_len_a, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        if not is_dense_b:
+            lengths_b = torch.randint(
+                1, max_len_b + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+            offsets_b = torch.zeros(
+                (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+            )
+            offsets_b[1:] = torch.cumsum(lengths_b, dim=0)
+            total_len_b = int(offsets_b[-1].item())
+        else:
+            offsets_b = None
+            total_len_b = batch_size * max_len_b
+        values_b = (
+            torch.empty(
+                (total_len_b, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        ref_values = concat_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values_left=values_a,
+            values_right=values_b,
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=ref_kernel,
+        )
+        dout = torch.randn_like(ref_values) * 0.1
+        ref_values.backward(dout)
+        assert values_a.grad is not None
+        ref_d_a, values_a.grad = values_a.grad.clone(), None
+        assert values_b.grad is not None
+        ref_d_b, values_b.grad = values_b.grad.clone(), None
+
+        values_a = values_a.detach().clone().requires_grad_()
+        values_b = values_b.detach().clone().requires_grad_()
+        dout = dout.detach().clone()
+
+        real_values = concat_2D_jagged_multirow(
+            max_seq_len=max_len_a + max_len_b,
+            values_left=values_a,
+            values_right=values_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(ref_values, real_values)
+        if test_backward:
+            real_values.backward(dout)
+            real_d_a = values_a.grad.clone()
+            real_d_b = values_b.grad.clone()
+            torch.testing.assert_close(ref_d_a, real_d_a)
+            torch.testing.assert_close(ref_d_b, real_d_b)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        batch_size=st.integers(2, 8),
+        max_len_a=st.integers(20, 100),
+        max_len_b=st.integers(20, 100),
+        D=st.integers(10, 30),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_split_2D_jagged_multirow_triton(self, *args, **kwargs) -> None:
+        self._test_split_2D_jagged_multirow(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_split_2D_jagged_multirow(
+        self,
+        batch_size: int,
+        max_len_a: int,
+        max_len_b: int,
+        D: int,
+        test_backward: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        dtype: torch.dtype = torch.float32,
+    ) -> None:
+        set_dev_mode(True)
+
+        lengths_a = torch.randint(
+            1, max_len_a + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        offsets_a = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        offsets_a[1:] = torch.cumsum(lengths_a, dim=0)
+
+        lengths_b = torch.randint(
+            1, max_len_b + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        offsets_b = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        offsets_b[1:] = torch.cumsum(lengths_b, dim=0)
+
+        total_len = int(offsets_a[-1].item()) + int(offsets_b[-1].item())
+        values = (
+            torch.empty(
+                (total_len, D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+
+        ref_values_a, ref_values_b = split_2D_jagged(
+            max_seq_len=max_len_a + max_len_b,
+            values=values,
+            total_len_left=int(offsets_a[-1].item()),
+            total_len_right=int(offsets_b[-1].item()),
+            max_len_left=max_len_a,
+            max_len_right=max_len_b,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=ref_kernel,
+        )
+        d_values_a = torch.randn_like(ref_values_a) * 0.1
+        d_values_b = torch.randn_like(ref_values_b) * 0.1
+        ref_values_a.backward(d_values_a, retain_graph=True)
+        ref_values_b.backward(d_values_b)
+        assert values.grad is not None
+        ref_d_values, values.grad = values.grad.clone(), None
+
+        values = values.detach().clone().requires_grad_()
+        d_values_a = d_values_a.detach().clone()
+        d_values_b = d_values_b.detach().clone()
+
+        max_len_a_actual = int((offsets_a[1:] - offsets_a[:-1]).max().item())
+        max_len_b_actual = int((offsets_b[1:] - offsets_b[:-1]).max().item())
+
+        real_values_a, real_values_b = split_2D_jagged_multirow(
+            max_seq_len=max_len_a + max_len_b,
+            values=values,
+            total_len_left=int(offsets_a[-1].item()),
+            total_len_right=int(offsets_b[-1].item()),
+            max_len_left=max_len_a_actual,
+            max_len_right=max_len_b_actual,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+            kernel=real_kernel,
+        )
+        torch.testing.assert_close(ref_values_a, real_values_a)
+        torch.testing.assert_close(ref_values_b, real_values_b)
+        if test_backward:
+            real_values_a.backward(d_values_a, retain_graph=True)
+            real_values_b.backward(d_values_b)
+            real_d_values = values.grad.clone()
+            torch.testing.assert_close(ref_d_values, real_d_values)
diff --git a/recommendation_v4/generative_recommenders/ops/tests/layer_norm_test.py b/recommendation_v4/generative_recommenders/ops/tests/layer_norm_test.py
new file mode 100644
index 000000000..62540967a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/layer_norm_test.py
@@ -0,0 +1,231 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import copy
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel, set_dev_mode
+from generative_recommenders.ops.layer_norm import (
+    layer_norm,
+    LayerNorm,
+    swish_layer_norm,
+    SwishLayerNorm,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class LayerNormTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.sampled_from([4200000]),
+        D=st.sampled_from([512]),
+        is_swish=st.sampled_from([False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+    )
+    # pyre-ignore[2]
+    def test_large_tensors(self, *args, **kwargs) -> None:
+        self._test_layernorm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=0, max_value=10000),
+        D=st.integers(min_value=32, max_value=512),
+        is_swish=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+    )
+    # pyre-ignore[2]
+    def test_ln(self, *args, **kwargs) -> None:
+        self._test_layernorm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_layernorm(
+        self,
+        N: int,
+        D: int,
+        is_swish: bool,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+    ) -> None:
+        N = N // 4 * 4
+        # enable auto-tuning to verify correctness of multi-row kernel
+        set_dev_mode(False)
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .normal_(0.0, 1.0)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((D,), device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        bias = (
+            torch.empty((D,), device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        if is_swish:
+            layer_norm_func = swish_layer_norm
+        else:
+            layer_norm_func = layer_norm
+        # ref
+        ref_out = layer_norm_func(x, weight, bias, eps=1e-6, kernel=ref_kernel)
+        dout = torch.randn_like(ref_out) * 0.05
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw, weight.grad = weight.grad.detach().clone(), None
+        ref_db, bias.grad = bias.grad.detach().clone(), None
+        # opt
+        x = x.detach().clone().requires_grad_()
+        weight = weight.detach().clone().requires_grad_()
+        bias = bias.detach().clone().requires_grad_()
+        opt_out = layer_norm_func(x, weight, bias, eps=1e-6, kernel=real_kernel)
+        dout = dout.detach().clone()
+        opt_out.backward(dout)
+        opt_dx, x.grad = x.grad.detach().clone(), None
+        opt_dw, weight.grad = weight.grad.detach().clone(), None
+        opt_db, bias.grad = bias.grad.detach().clone(), None
+        torch.testing.assert_close(ref_out, opt_out)
+        torch.testing.assert_close(ref_dx, opt_dx)
+        torch.testing.assert_close(ref_dw, opt_dw)
+        torch.testing.assert_close(ref_db, opt_db)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=32, max_value=10000),
+        D=st.integers(min_value=32, max_value=512),
+        is_swish=st.sampled_from([True, False]),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+    )
+    # pyre-ignore[2]
+    def test_modules(self, *args, **kwargs) -> None:
+        self._test_layer_norm_module(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_layer_norm_module(
+        self,
+        N: int,
+        D: int,
+        is_swish: bool,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+    ) -> None:
+        set_dev_mode(True)
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .normal_(0.0, 1.0)
+            .requires_grad_()
+        )
+        # ref
+        if is_swish:
+            ref_layer = SwishLayerNorm(
+                dim=D,
+                eps=1e-6,
+            ).to(device="cuda")
+            ref_layer._hammer_kernel = ref_kernel
+        else:
+            ref_layer = LayerNorm(
+                dim=D,
+                eps=1e-6,
+            ).to(device="cuda")
+            ref_layer._hammer_kernel = ref_kernel
+        opt_layer = copy.deepcopy(ref_layer)
+        opt_layer._hammer_kernel = real_kernel
+
+        ref_out = ref_layer(x)
+        dout = torch.randn_like(ref_out) * 0.05
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw = ref_layer.weight.grad.detach().clone()
+        ref_db = ref_layer.bias.grad.detach().clone()
+        # opt
+        x = x.detach().clone().requires_grad_()
+        opt_out = opt_layer(x)
+        dout = dout.detach().clone()
+        opt_out.backward(dout)
+        opt_dx, x.grad = x.grad.detach().clone(), None
+        opt_dw = opt_layer.weight.grad.detach().clone()
+        opt_db = opt_layer.bias.grad.detach().clone()
+        torch.testing.assert_close(ref_out, opt_out)
+        torch.testing.assert_close(
+            ref_dx,
+            opt_dx,
+        )
+        torch.testing.assert_close(
+            ref_dw,
+            opt_dw,
+        )
+        torch.testing.assert_close(
+            ref_db,
+            opt_db,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/mm_test.py b/recommendation_v4/generative_recommenders/ops/tests/mm_test.py
new file mode 100644
index 000000000..0695275e3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/mm_test.py
@@ -0,0 +1,156 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+from typing import Optional
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel
+from generative_recommenders.ops.mm import addmm
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class MMlTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        M=st.integers(min_value=100, max_value=300),
+        N=st.integers(min_value=100, max_value=300),
+        K=st.sampled_from([128, 256]),
+        broadcast=st.booleans(),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16, torch.float16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    def test_addmm(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        broadcast: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        self._test_addmm(
+            M=M,
+            N=N,
+            K=K,
+            broadcast=broadcast,
+            dtype=dtype,
+            kernel_type=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        M=st.integers(min_value=100, max_value=300),
+        N=st.sampled_from([16, 48, 128, 144, 256]),
+        K=st.sampled_from([16, 48, 128, 144, 256]),
+        broadcast=st.booleans(),
+        dtype=st.sampled_from(
+            [torch.float32, torch.bfloat16, torch.float16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    def test_addmm_tma(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        broadcast: bool,
+        dtype: torch.dtype,
+    ) -> None:
+        self._test_addmm(
+            M=M,
+            N=N,
+            K=K,
+            broadcast=broadcast,
+            dtype=dtype,
+            kernel_type=HammerKernel.TRITON,
+        )
+
+    def _test_addmm(
+        self,
+        M: int,
+        N: int,
+        K: int,
+        broadcast: bool,
+        dtype: torch.dtype,
+        kernel_type: HammerKernel,
+        atol: Optional[float] = None,
+        rtol: Optional[float] = None,
+    ) -> None:
+        # to enable more deterministic results.
+        torch.manual_seed(0)
+
+        torch.backends.cudnn.allow_tf32 = False
+        torch.backends.cuda.matmul.allow_tf32 = False
+
+        x: torch.Tensor = torch.rand((M, K), dtype=dtype, device="cuda").requires_grad_(
+            True
+        )
+        w: torch.Tensor = torch.rand((K, N), dtype=dtype, device="cuda").requires_grad_(
+            True
+        )
+
+        if broadcast:
+            y: torch.Tensor = torch.rand(
+                (N), dtype=dtype, device="cuda"
+            ).requires_grad_(True)
+        else:
+            y: torch.Tensor = torch.rand(
+                (M, N), dtype=dtype, device="cuda"
+            ).requires_grad_(True)
+
+        ref_z = addmm(y, x, w, kernel=HammerKernel.PYTORCH)
+        dz = torch.randn_like(ref_z) * 0.1
+        ref_z.backward(dz)
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw, w.grad = w.grad.detach().clone(), None
+        ref_dy, y.grad = y.grad.detach().clone(), None
+
+        x = x.detach().clone().requires_grad_(True)
+        w = w.detach().clone().requires_grad_(True)
+        y = y.detach().clone().requires_grad_(True)
+        real_z = addmm(y, x, w, kernel=kernel_type)
+
+        torch.testing.assert_close(ref_z, real_z, atol=atol, rtol=rtol)
+
+        # triton cc doesn't support backward
+        if kernel_type != HammerKernel.TRITON_CC:
+            real_z.backward(dz)
+            real_dx, x.grad = x.grad.detach().clone(), None
+            real_dw, w.grad = w.grad.detach().clone(), None
+            real_dy, y.grad = y.grad.detach().clone(), None
+
+            torch.testing.assert_close(ref_dx, real_dx, atol=atol, rtol=rtol)
+            torch.testing.assert_close(ref_dw, real_dw, atol=atol, rtol=rtol)
+            torch.testing.assert_close(ref_dy, real_dy, atol=atol, rtol=rtol)
diff --git a/recommendation_v4/generative_recommenders/ops/tests/position_test.py b/recommendation_v4/generative_recommenders/ops/tests/position_test.py
new file mode 100644
index 000000000..ab9e1b415
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/position_test.py
@@ -0,0 +1,234 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import (
+    generate_sparse_seq_len,
+    gpu_unavailable,
+    HammerKernel,
+    set_dev_mode,
+)
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class PositionEmbeddingsTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        alpha=st.sampled_from([0.5]),
+        max_uih_len=st.integers(50, 500),
+        max_contextual_seq_len=st.sampled_from([10]),
+        interleave_targets=st.sampled_from([True, False]),
+        batch_size=st.integers(16, 32),
+        D=st.integers(20, 200),
+        max_targets=st.sampled_from([10, 20]),
+        time_bucket_fn=st.sampled_from(["log"]),
+        dtype=st.sampled_from([torch.float32, torch.bfloat16, torch.float16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=20,
+        deadline=None,
+    )
+    # pyre-ignore[2]
+    def test_add_timestamp_positional_embeddings_triton(self, *args, **kwargs) -> None:
+        self._test_add_timestamp_positional_embeddings(
+            *args,
+            **kwargs,
+            test_backward=True,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore
+    @given(
+        alpha=st.sampled_from([0.5]),
+        max_uih_len=st.sampled_from([32768]),
+        max_contextual_seq_len=st.sampled_from([10]),
+        interleave_targets=st.sampled_from([False]),
+        batch_size=st.sampled_from([130]),
+        D=st.sampled_from([512]),
+        max_targets=st.sampled_from([10]),
+        time_bucket_fn=st.sampled_from(["log"]),
+        dtype=st.sampled_from([torch.bfloat16]),
+    )
+    @settings(
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+        deadline=None,
+    )
+    def test_add_timestamp_positional_embeddings_triton_large_tensor(
+        self,
+        # pyre-fixme[2]: Parameter must be annotated.
+        *args,
+        # pyre-ignore[2]
+        **kwargs,
+    ) -> None:
+        self._test_add_timestamp_positional_embeddings(
+            *args,
+            **kwargs,
+            test_backward=False,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            sparsity=1.0,
+        )
+
+    def _test_add_timestamp_positional_embeddings(
+        self,
+        alpha: float,
+        max_uih_len: int,
+        max_contextual_seq_len: int,
+        interleave_targets: bool,
+        batch_size: int,
+        D: int,
+        max_targets: int,
+        time_bucket_fn: str,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        test_backward: bool,
+        sparsity: float = -1,
+    ) -> None:
+        set_dev_mode(True)
+        from generative_recommenders.ops.position import (
+            add_timestamp_positional_embeddings,
+        )
+
+        num_targets = torch.randint(
+            max_targets + 1, size=(batch_size,), device=torch.device("cuda")
+        )
+        if sparsity > 0.0:
+            lengths = generate_sparse_seq_len(
+                size=batch_size,
+                max_seq_len=max_uih_len,
+                sparsity=sparsity,
+                device=torch.device("cuda"),
+            ).to(torch.int64)
+        else:
+            lengths = torch.randint(
+                max_uih_len + 1, size=(batch_size,), device=torch.device("cuda")
+            )
+        seq_offsets = torch.zeros(
+            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
+        )
+        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
+        max_seq_len = max_uih_len + max_targets
+
+        position_embeddings_weight = (
+            torch.empty(
+                (max_seq_len, D), dtype=torch.float32, device=torch.device("cuda")
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        num_time_buckets = 1000
+        timestamp_embeddings_weight = (
+            torch.empty(
+                (num_time_buckets, D), dtype=torch.float32, device=torch.device("cuda")
+            )
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        seq_embeddings = (
+            torch.empty(
+                (int(seq_offsets[-1].item()), D),
+                dtype=dtype,
+                device=torch.device("cuda"),
+            )
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        timestamp_deltas: torch.Tensor = torch.randint(
+            86400,
+            size=(batch_size, max_seq_len),
+            device="cuda",
+        )
+        timestamps = timestamp_deltas.cumsum(dim=1)
+        mask = torch.arange(max_seq_len, device=timestamps.device) < lengths.unsqueeze(
+            1
+        )
+        timestamps = timestamps[mask.view(batch_size, -1)]
+
+        ref_out = add_timestamp_positional_embeddings(
+            alpha=alpha,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            position_embeddings_weight=position_embeddings_weight,
+            timestamp_embeddings_weight=timestamp_embeddings_weight,
+            seq_offsets=seq_offsets,
+            seq_lengths=lengths,
+            seq_embeddings=seq_embeddings,
+            timestamps=timestamps,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+            kernel=ref_kernel,
+        )
+        dout = torch.randn_like(ref_out) * 0.01
+        ref_out.backward(dout)
+        # pyre-ignore
+        ref_d_seq_embeddings, seq_embeddings.grad = seq_embeddings.grad.clone(), None
+        ref_d_position_embeddings_weight, position_embeddings_weight.grad = (
+            position_embeddings_weight.grad.clone(),
+            None,
+        )
+        ref_d_timestamp_embeddings_weight, timestamp_embeddings_weight.grad = (
+            timestamp_embeddings_weight.grad.clone(),
+            None,
+        )
+
+        real_out = add_timestamp_positional_embeddings(
+            alpha=alpha,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            position_embeddings_weight=position_embeddings_weight,
+            timestamp_embeddings_weight=timestamp_embeddings_weight,
+            seq_offsets=seq_offsets,
+            seq_lengths=lengths,
+            seq_embeddings=seq_embeddings,
+            timestamps=timestamps,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+            kernel=real_kernel,
+        )
+
+        torch.testing.assert_close(ref_out, real_out)
+        if test_backward:
+            real_out.backward(dout)
+            real_d_seq_embeddings = seq_embeddings.grad.clone()
+            real_d_position_embeddings_weight = position_embeddings_weight.grad.clone()
+            real_d_timestamp_embeddings_weight = (
+                timestamp_embeddings_weight.grad.clone()
+            )
+            torch.testing.assert_close(ref_d_seq_embeddings, real_d_seq_embeddings)
+            torch.testing.assert_close(
+                ref_d_position_embeddings_weight,
+                real_d_position_embeddings_weight,
+                atol=5e-2 if dtype != torch.float32 else None,
+                rtol=2e-2 if dtype != torch.float32 else None,
+            )
+            torch.testing.assert_close(
+                ref_d_timestamp_embeddings_weight,
+                real_d_timestamp_embeddings_weight,
+                atol=5e-2 if dtype != torch.float32 else None,
+                rtol=2e-2 if dtype != torch.float32 else None,
+            )
diff --git a/recommendation_v4/generative_recommenders/ops/tests/rms_norm_test.py b/recommendation_v4/generative_recommenders/ops/tests/rms_norm_test.py
new file mode 100644
index 000000000..4e5c1a871
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/tests/rms_norm_test.py
@@ -0,0 +1,229 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import copy
+import unittest
+
+import torch
+from generative_recommenders.common import gpu_unavailable, HammerKernel, set_dev_mode
+from generative_recommenders.ops.layer_norm import rms_norm, RMSNorm
+from hammer.ops.triton.cc.utils import set_triton_cc_version
+from hypothesis import given, settings, strategies as st, Verbosity
+
+
+class LayerNormTest(unittest.TestCase):
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.sampled_from([2000000]),
+        D=st.sampled_from([512]),
+        dtype=st.sampled_from(
+            [torch.bfloat16]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        silu=st.booleans(),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=1,
+    )
+    # pyre-ignore[2]
+    def test_large_tensors(self, *args, **kwargs) -> None:
+        self._test_rms_norm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.TRITON,
+            real_kernel=HammerKernel.TRITON,
+            skip_comparisons=True,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=0, max_value=10000),
+        D=st.integers(min_value=32, max_value=512),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+        silu=st.booleans(),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=50,
+    )
+    # pyre-ignore[2]
+    def test_rms_norm(self, *args, **kwargs) -> None:
+        self._test_rms_norm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=4, max_value=10000),
+        D=st.sampled_from([256, 512]),
+        dtype=st.sampled_from([torch.bfloat16, torch.float16]),
+        triton_cc_version=st.sampled_from(["", "repkg"]),
+        silu=st.just(False),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=10,
+    )
+    # pyre-ignore[2]
+    def test_rms_norm_triton_cc(self, triton_cc_version: str, *args, **kwargs) -> None:
+        set_triton_cc_version(triton_cc_version)
+        self._test_rms_norm(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON_CC,
+            test_backward=False,
+        )
+
+    def _test_rms_norm(
+        self,
+        N: int,
+        D: int,
+        dtype: torch.dtype,
+        silu: bool,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+        test_backward: bool = True,
+    ) -> None:
+        N = N // 4 * 4
+        # enable auto-tuning to verify correctness of multi-row kernel
+        set_dev_mode(False)
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .normal_(0.0, 1.0)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((D,), device=torch.device("cuda"))
+            .uniform_(-1.0, 1.0)
+            .requires_grad_()
+        )
+        # ref
+        ref_out = rms_norm(x, weight, eps=1e-6, silu=silu, kernel=ref_kernel)
+        opt_x = x.detach().clone().requires_grad_()
+        opt_weight = weight.detach().clone().requires_grad_()
+        opt_out = rms_norm(opt_x, opt_weight, eps=1e-6, silu=silu, kernel=real_kernel)
+        torch.testing.assert_close(ref_out, opt_out)
+
+        if not test_backward:
+            return
+
+        dout = torch.randn_like(ref_out) * 0.05
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw, weight.grad = weight.grad.detach().clone(), None
+        # opt
+        dout = dout.detach().clone()
+        opt_out.backward(dout)
+        opt_dx, x.grad = opt_x.grad.detach().clone(), None
+        opt_dw, weight.grad = opt_weight.grad.detach().clone(), None
+        torch.testing.assert_close(ref_dx, opt_dx)
+        torch.testing.assert_close(ref_dw, opt_dw)
+
+    @unittest.skipIf(*gpu_unavailable)
+    # pyre-ignore[56]
+    @given(
+        N=st.integers(min_value=32, max_value=10000),
+        D=st.integers(min_value=32, max_value=512),
+        dtype=st.sampled_from(
+            [torch.bfloat16, torch.float32]
+            if torch.cuda.get_device_capability(torch.device("cuda"))[0] >= 8
+            else [torch.float32]
+        ),
+    )
+    @settings(
+        deadline=None,
+        verbosity=Verbosity.verbose,
+        max_examples=50,
+    )
+    # pyre-ignore[2]
+    def test_modules(self, *args, **kwargs) -> None:
+        self._test_rms_norm_module(
+            *args,
+            **kwargs,
+            ref_kernel=HammerKernel.PYTORCH,
+            real_kernel=HammerKernel.TRITON,
+        )
+
+    def _test_rms_norm_module(
+        self,
+        N: int,
+        D: int,
+        dtype: torch.dtype,
+        ref_kernel: HammerKernel,
+        real_kernel: HammerKernel,
+        skip_comparisons: bool = False,
+    ) -> None:
+        set_dev_mode(True)
+        x = (
+            torch.empty((N, D), dtype=dtype, device=torch.device("cuda"))
+            .normal_(0.0, 1.0)
+            .requires_grad_()
+        )
+        # ref
+        ref_layer = RMSNorm(
+            dim=D,
+            eps=1e-6,
+        ).to(device="cuda")
+        ref_layer._hammer_kernel = ref_kernel
+        opt_layer = copy.deepcopy(ref_layer)
+        opt_layer._hammer_kernel = real_kernel
+
+        ref_out = ref_layer(x)
+        dout = torch.randn_like(ref_out) * 0.05
+        ref_out.backward(dout)
+        if skip_comparisons:
+            return
+        # pyre-ignore[16]
+        ref_dx, x.grad = x.grad.detach().clone(), None
+        ref_dw = ref_layer.weight.grad.detach().clone()
+        # opt
+        x = x.detach().clone().requires_grad_()
+        opt_out = opt_layer(x)
+        dout = dout.detach().clone()
+        opt_out.backward(dout)
+        opt_dx, x.grad = x.grad.detach().clone(), None
+        opt_dw = opt_layer.weight.grad.detach().clone()
+        torch.testing.assert_close(ref_out.to(dtype), opt_out)
+        torch.testing.assert_close(
+            ref_dx,
+            opt_dx,
+        )
+        torch.testing.assert_close(
+            ref_dw,
+            opt_dw,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py b/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py
new file mode 100644
index 000000000..915d85742
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py
@@ -0,0 +1,1706 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+import math
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.ops.utils import is_sm100_plus, maybe_register_custom_op
+
+try:
+    # @manual=//triton:triton
+    from triton.language.extra.subtile_ops import _split_n_2D
+except ImportError:
+    _split_n_2D = None
+
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx as tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    tlx = None
+    HAS_TLX = False
+
+from generative_recommenders.common import triton_autotune, triton_cc
+
+try:
+    # @manual=//triton:triton
+    from triton.tools.tensor_descriptor import TensorDescriptor
+
+    TMA_AVAILABLE = True
+except ImportError:
+    TMA_AVAILABLE = False
+    pass
+
+
+ENABLE_FULL_TURNING_SPACE = False
+
+
+def _use_meta_ws() -> bool:
+    """Check if Meta's warp specialization is available, enabled, and on SM100+."""
+    return (
+        is_sm100_plus()
+        and hasattr(triton, "knobs")
+        and hasattr(triton.knobs, "nvidia")
+        and triton.knobs.nvidia.use_meta_ws
+    )
+
+
+def _check_tma_alignment(
+    x: torch.Tensor, w: torch.Tensor, y: torch.Tensor, min_alignment: int = 16
+) -> bool:
+    """Check if tensors meet TMA alignment requirements.
+
+    TMA (Tensor Memory Accelerator) on H100 requires:
+    1. Base addresses to be 64-byte aligned
+    2. Dimensions to be multiples of 64 for optimal performance
+    3. Contiguous inner dimensions (stride=1)
+
+    Args:
+        x: Input tensor [M, K]
+        w: Weight tensor [K, N]
+        y: Bias tensor [N] or [M, N]
+        min_alignment: Minimum alignment requirement (default: 64)
+
+    Returns:
+        True if all tensors meet TMA alignment requirements
+    """
+    _, K = x.shape
+    KB, N = w.shape
+    assert K == KB, f"incompatible dimensions {K}, {KB}"
+
+    is_y_1d = y.dim() == 1
+    NY = y.shape[0] if is_y_1d else y.shape[1]
+    assert N == NY, f"incompatible dimensions {N}, {NY}"
+
+    return (K % min_alignment == 0) and (N % min_alignment == 0)
+
+
+def _prune_persistent_autows_configs(configs, named_args, **kwargs):  # noqa
+    if not _use_meta_ws():
+        return configs
+    BROADCAST_Y = kwargs.get("BROADCAST_Y", False)
+    pruned = []
+    for c in configs:
+        BLOCK_M = c.kwargs.get("BLOCK_M", 0)
+        BLOCK_N = c.kwargs.get("BLOCK_N", 0)
+        EPILOGUE_SUBTILE = c.kwargs.get("EPILOGUE_SUBTILE", 1)
+        DP = c.kwargs.get("DATA_PARTITION_FACTOR", 1)
+        # DATA_PARTITION_FACTOR=2 is only supported with BLOCK_M=256
+        if DP == 2 and BLOCK_M != 256:
+            continue
+        if (BLOCK_N // EPILOGUE_SUBTILE) < 32:
+            continue
+        if BROADCAST_Y and (BLOCK_N // EPILOGUE_SUBTILE) < 64:
+            continue
+        pruned.append(c)
+    return pruned
+
+
+def _prune_configs_for_tlx_persistent_addmm(configs, named_args, **kwargs):  # noqa
+    M = named_args.get("M", 0)
+    N = named_args.get("N", 0)
+    BROADCAST_Y = kwargs.get("BROADCAST_Y", False)
+
+    pruned = []
+    for c in configs:
+        BLOCK_M = c.kwargs.get("BLOCK_M", 0)
+        BLOCK_N = c.kwargs.get("BLOCK_N", 0)
+        EPILOGUE_SUBTILE = c.kwargs.get("EPILOGUE_SUBTILE", 1)
+        NUM_MMA_GROUPS = c.kwargs.get("NUM_MMA_GROUPS", 1)
+        BLOCK_M_SPLIT = BLOCK_M // NUM_MMA_GROUPS
+        NUM_SMEM_BUFFERS = c.kwargs.get("NUM_SMEM_BUFFERS", 1)
+
+        # Hardware constraint: Always make MMA tile 128.
+        if BLOCK_M_SPLIT != 128:
+            continue
+
+        # BLOCK_N >= 64 required for PAIR_CTA
+        if BLOCK_N < 64:
+            continue
+
+        # Subslice loads cannot be smaller than 32
+        if (BLOCK_N // EPILOGUE_SUBTILE) < 32:
+            continue
+
+        # TMA loads must be at least 128 bytes. With BROADCAST_Y
+        # this may not be met.
+        if BROADCAST_Y and (BLOCK_N // EPILOGUE_SUBTILE) < 64:
+            continue
+
+        # Prune the support SMEM_BUFFER configurations.
+        if BROADCAST_Y:
+            if NUM_MMA_GROUPS == 1 and NUM_SMEM_BUFFERS != 5:
+                continue
+            elif NUM_MMA_GROUPS == 2 and NUM_SMEM_BUFFERS != 4:
+                continue
+        else:
+            if NUM_MMA_GROUPS == 1 and NUM_SMEM_BUFFERS != 4:
+                continue
+            elif NUM_MMA_GROUPS == 2 and NUM_SMEM_BUFFERS != 3:
+                continue
+
+        # PAIR_CTA requires even number of M tiles and even total tiles
+        num_tiles_m = math.ceil(M / BLOCK_M) if BLOCK_M > 0 else 0
+        num_tiles_n = math.ceil(N / BLOCK_N) if BLOCK_N > 0 else 0
+        total_tiles = num_tiles_m * num_tiles_n
+
+        # PAIR_CTA incompatible with MMA M=64
+        pair_cta_compatible = (
+            (num_tiles_m % 2 == 0)
+            and (total_tiles % 2 == 0)
+            and BLOCK_M == 128
+            and NUM_MMA_GROUPS == 1
+        )
+
+        c.kwargs["PAIR_CTA"] = pair_cta_compatible
+        # Set ctas_per_cga for CUDA-native cluster launch semantics (TLX way)
+        c.ctas_per_cga = (2, 1, 1) if pair_cta_compatible else None
+
+        pruned.append(c)
+    return pruned
+
+
+def get_mm_configs(pre_hook=None) -> List[triton.Config]:
+    if torch.version.hip:
+        if ENABLE_FULL_TURNING_SPACE:
+            block_m_range = [32, 64, 128, 256]
+            block_n_range = [32, 64, 128, 256]
+            block_k_range = [32, 64]
+            group_m_range = [4, 8]
+            matrix_instr_nonkdim_range = [16]
+            waves_per_eu_range = [0]
+            kpack_range = [1, 2]
+            num_warps_range = [4, 8]
+            num_stage_range = [2] if triton.__version__ >= "3.2.0" else [0]
+        else:
+            block_m_range = [256]
+            block_n_range = [256]
+            block_k_range = [32]
+            group_m_range = [8]
+            matrix_instr_nonkdim_range = [16]
+            waves_per_eu_range = [0]
+            kpack_range = [2]
+            num_warps_range = [8]
+            num_stage_range = [2] if triton.__version__ >= "3.2.0" else [0]
+
+        return [
+            triton.Config(
+                {
+                    "BLOCK_M": block_m,
+                    "BLOCK_N": block_n,
+                    "BLOCK_K": block_k,
+                    "GROUP_M": group_m,
+                    "matrix_instr_nonkdim": matrix_instr_nonkdim,
+                    "waves_per_eu": waves_per_eu,
+                    "kpack": kpack,
+                },
+                num_stages=num_stages,
+                num_warps=num_warps,
+                pre_hook=pre_hook,
+            )
+            for block_m in block_m_range
+            for block_n in block_n_range
+            for block_k in block_k_range
+            for group_m in group_m_range
+            for matrix_instr_nonkdim in matrix_instr_nonkdim_range
+            for waves_per_eu in waves_per_eu_range
+            for kpack in kpack_range
+            for num_stages in num_stage_range
+            for num_warps in num_warps_range
+        ]
+    else:
+        block_m_range = [32, 64, 128, 256]
+        block_n_range = [32, 64, 128, 256]
+        block_k_range = [32, 64]
+        group_m_range = [4, 8]
+        # WARP_SPECIALIZE only works with num_warps >=4
+        num_warps_range = [4, 8] if is_sm100_plus() else [2, 4, 8]
+        num_stage_range = [2, 3, 4, 5]
+        if ENABLE_FULL_TURNING_SPACE:
+            return [
+                triton.Config(
+                    {
+                        "BLOCK_M": block_m,
+                        "BLOCK_N": block_n,
+                        "BLOCK_K": block_k,
+                        "GROUP_M": group_m,
+                    },
+                    num_stages=num_stages,
+                    num_warps=num_warps,
+                    pre_hook=pre_hook,
+                )
+                for block_m in block_m_range
+                for block_n in block_n_range
+                for block_k in block_k_range
+                for group_m in group_m_range
+                for num_stages in num_stage_range
+                for num_warps in num_warps_range
+            ]
+        else:
+            configs = [
+                triton.Config(
+                    {
+                        "BLOCK_M": 32,
+                        "BLOCK_N": 64,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=5,
+                    num_warps=2,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 128,
+                        "BLOCK_N": 256,
+                        "BLOCK_K": 64,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=3,
+                    num_warps=8,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 64,
+                        "BLOCK_N": 256,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 128,
+                        "BLOCK_N": 128,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 128,
+                        "BLOCK_N": 64,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 64,
+                        "BLOCK_N": 128,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 128,
+                        "BLOCK_N": 32,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=4,
+                    num_warps=4,
+                    pre_hook=pre_hook,
+                ),
+                triton.Config(
+                    {
+                        "BLOCK_M": 64,
+                        "BLOCK_N": 32,
+                        "BLOCK_K": 32,
+                        "GROUP_M": 8,
+                    },
+                    num_stages=5,
+                    num_warps=2,
+                    pre_hook=pre_hook,
+                ),
+            ]
+            if is_sm100_plus():
+                configs += [
+                    triton.Config(
+                        {
+                            "BLOCK_M": 128,
+                            "BLOCK_N": 256,
+                            "BLOCK_K": 64,
+                            "GROUP_M": 8,
+                        },
+                        num_stages=3,
+                        num_warps=4,
+                        pre_hook=pre_hook,
+                    ),
+                ]
+                return [c for c in configs if c.num_warps >= 4]
+
+            return configs
+
+
+def _get_addmm_tma_ws_persistent_configs(pre_hook=None) -> List[triton.Config]:
+    """Get configs for _addmm_fwd_tma_ws_persistent (sm100+ TLX kernel).
+
+    This kernel has unique requirements (warp specialization, PAIR_CTA,
+    EPILOGUE_SUBTILE) that don't apply to the other addmm kernels.
+    """
+    if ENABLE_FULL_TURNING_SPACE:
+        block_m_range = [64, 128, 256]
+        block_n_range = [64, 128, 256]
+        block_k_range = [64, 128, 256]
+        group_m_range = [8]
+        num_warps_range = [4]
+        num_stage_range = [1]
+        epilogue_subtile_range = [1, 2, 4]
+        num_mma_groups_range = [1, 2]
+        return [
+            triton.Config(
+                {
+                    "BLOCK_M": block_m,
+                    "BLOCK_N": block_n,
+                    "BLOCK_K": block_k,
+                    "GROUP_M": group_m,
+                    "EPILOGUE_SUBTILE": epilogue_subtile,
+                    "NUM_MMA_GROUPS": num_mma_groups,
+                    "NUM_TMEM_BUFFERS": 1 if num_mma_groups == 2 else 2,
+                    "NUM_SMEM_BUFFERS": num_smem_buffers,
+                },
+                num_stages=num_stages,
+                num_warps=num_warps,
+                pre_hook=pre_hook,
+            )
+            for block_m in block_m_range
+            for block_n in block_n_range
+            for block_k in block_k_range
+            for group_m in group_m_range
+            for num_stages in num_stage_range
+            for num_warps in num_warps_range
+            for epilogue_subtile in epilogue_subtile_range
+            for num_mma_groups in num_mma_groups_range
+            for num_smem_buffers in [3, 4, 5]
+        ]
+    else:
+        configs = []
+        for block_m, block_n, block_k in [
+            (128, 256, 64),
+            (128, 128, 64),
+            (64, 128, 64),
+            (64, 256, 64),
+            (128, 64, 128),
+        ]:
+            # Note: num_smem_buffers is pruned to 1 in
+            # the pruning function.
+            for num_smem_buffers in [3, 4, 5]:
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK_M": block_m,
+                            "BLOCK_N": block_n,
+                            "BLOCK_K": block_k,
+                            "GROUP_M": 8,
+                            "EPILOGUE_SUBTILE": 1,
+                            "NUM_MMA_GROUPS": 1,
+                            "NUM_TMEM_BUFFERS": 2,
+                            "NUM_SMEM_BUFFERS": num_smem_buffers,
+                        },
+                        num_stages=1,
+                        num_warps=4,
+                        pre_hook=pre_hook,
+                    ),
+                )
+        return configs
+
+
+def get_triton_persistent_configs(pre_hook=None) -> List[triton.Config]:
+    if not _use_meta_ws():
+        configs = get_mm_configs(pre_hook=pre_hook)
+        for c in configs:
+            c.kwargs["DATA_PARTITION_FACTOR"] = 1
+            c.kwargs["EPILOGUE_SUBTILE"] = 1
+        return configs
+    # TODO: Prune configs to best configs.
+    return [
+        triton.Config(  # pyre-ignore[28]
+            {
+                "BLOCK_M": block_m,
+                "BLOCK_N": block_n,
+                "BLOCK_K": block_k,
+                "GROUP_M": 8,
+                "EPILOGUE_SUBTILE": subtile,
+                "DATA_PARTITION_FACTOR": DP,
+            },
+            num_stages=num_stages,
+            num_warps=4,
+            pre_hook=pre_hook,
+            early_tma_store_lowering=1,
+            maxRegAutoWS=255,
+        )
+        for block_m in [64, 128, 256]
+        for block_n in [64, 128, 256]
+        for block_k in [64, 128, 256]
+        for num_stages in [2, 3, 4]
+        for subtile in [1, 2, 4, 8]
+        for DP in [1, 2]
+    ]
+
+
+@triton_cc(
+    annotations={
+        "M": "i32",
+        "N": ("i32", 16),
+        "K": ("i32", 16),
+        "stride_xm": ("i32", 16),
+        "stride_xk": ("i32", 1),
+        "stride_wk": ("i32", 16),
+        "stride_wn": ("i32", 1),
+        "stride_ym": ("i32", 16),
+        "stride_yn": ("i32", 1),
+        "stride_zm": ("i32", 16),
+        "stride_zn": ("i32", 1),
+    },
+)
+@triton_autotune(
+    configs=get_mm_configs(),
+    key=["N", "K"],
+)
+@triton.jit
+def _addmm_fwd(
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    z_ptr,
+    M,
+    N,
+    K,
+    stride_xm,
+    stride_xk,
+    stride_wk,
+    stride_wn,
+    stride_ym,
+    stride_yn,
+    stride_zm,
+    stride_zn,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+):
+    pid_0, pid_1 = tl.program_id(axis=0), tl.program_id(axis=1)
+    pid = pid_0 * tl.num_programs(axis=1) + pid_1
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+    offs_n = tl.arange(0, BLOCK_N)
+    mask_m = (pid_m * BLOCK_M + offs_m)[:, None] < M
+    mask_n = (pid_n * BLOCK_N + offs_n)[None, :] < N
+    x_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_xm
+    x_ptrs = x_ptr + (offs_m[:, None] * stride_xm + offs_k[None, :] * stride_xk)
+    w_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_wn
+    w_ptrs = w_ptr + (offs_k[:, None] * stride_wk + offs_n[None, :] * stride_wn)
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        mask_k = offs_k[None, :] < K - k * BLOCK_K
+        x = tl.load(x_ptrs, mask=mask_k & mask_m, other=0.0)
+        mask_k = offs_k[:, None] < K - k * BLOCK_K
+        w = tl.load(w_ptrs, mask=mask_k & mask_n, other=0.0)
+        accumulator += tl.dot(x, w, allow_tf32=ALLOW_TF32)
+        x_ptrs += BLOCK_K * stride_xk
+        w_ptrs += BLOCK_K * stride_wk
+
+    z_mask = mask_m & mask_n
+    if BROADCAST_Y:
+        # y is a vector, broadcast to add to z
+        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
+        y_ptrs = y_ptr + stride_yn * offs_n[None, :]
+        y = tl.load(y_ptrs, mask=mask_n)
+    else:
+        y_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_ym
+        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
+        y_ptrs = y_ptr + stride_ym * offs_m[:, None] + stride_yn * offs_n[None, :]
+        y = tl.load(y_ptrs, mask=z_mask)
+    z = (accumulator + y.to(tl.float32)).to(z_ptr.dtype.element_ty)
+    z_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_zm
+    z_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_zn
+    z_ptrs = z_ptr + stride_zm * offs_m[:, None] + stride_zn * offs_n[None, :]
+    tl.store(z_ptrs, z, mask=z_mask)
+
+
+def _addmm_tma_set_block_size_hook(nargs):
+    BLOCK_M = nargs["BLOCK_M"]
+    BLOCK_N = nargs["BLOCK_N"]
+    BLOCK_K = nargs["BLOCK_K"]
+    NUM_MMA_GROUPS = nargs.get("NUM_MMA_GROUPS", 1)
+    BLOCK_M_SPLIT = BLOCK_M // NUM_MMA_GROUPS
+    PAIR_CTA = nargs.get("PAIR_CTA", False)
+    nargs["x_desc"].block_shape = [BLOCK_M_SPLIT, BLOCK_K]
+    # In PAIR_CTA mode, each CTA loads BLOCK_N // 2 of W
+    if PAIR_CTA:
+        nargs["w_desc"].block_shape = [BLOCK_K, BLOCK_N // 2]
+    else:
+        nargs["w_desc"].block_shape = [BLOCK_K, BLOCK_N]
+    EPILOGUE_SUBTILE = nargs.get("EPILOGUE_SUBTILE", 1)
+    nargs["z_desc"].block_shape = [BLOCK_M_SPLIT, BLOCK_N // EPILOGUE_SUBTILE]
+    if nargs["BROADCAST_Y"]:
+        nargs["y_desc"].block_shape = [1, BLOCK_N // EPILOGUE_SUBTILE]
+    else:
+        nargs["y_desc"].block_shape = [BLOCK_M_SPLIT, BLOCK_N // EPILOGUE_SUBTILE]
+
+
+@triton.jit
+def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS):
+    group_id = tile_id // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
+    pid_m = first_pid_m + (tile_id % group_size_m)
+    pid_n = (tile_id % num_pid_in_group) // group_size_m
+    return pid_m, pid_n
+
+
+@triton.jit
+def _addmm_persistent_tile_body(
+    x_desc,
+    w_desc,
+    y_desc,
+    z_desc,
+    tile_id,
+    num_pid_in_group,
+    num_pid_m,
+    k_tiles,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    EPILOGUE_SUBTILE: tl.constexpr,
+    INNER_WARP_SPECIALIZE: tl.constexpr,
+):
+    pid_m, pid_n = _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS)
+    offs_xm = pid_m * BLOCK_M
+    offs_wn = pid_n * BLOCK_N
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in tl.range(0, k_tiles, warp_specialize=INNER_WARP_SPECIALIZE):
+        offs_k = k * BLOCK_K
+        x = x_desc.load([offs_xm, offs_k])
+        w = w_desc.load([offs_k, offs_wn])
+        accumulator = tl.dot(x, w, accumulator, allow_tf32=ALLOW_TF32)
+
+    # Epilogue subtiling breaks the store into multiple pieces to reduce
+    # shared memory consumption and allow higher stage counts.
+    tl.static_assert(
+        EPILOGUE_SUBTILE <= 8,
+        "EPILOGUE_SUBTILE > 8 is not supported",
+    )
+    acc_subtiles = _split_n_2D(accumulator, EPILOGUE_SUBTILE)  # pyre-ignore[16]
+    slice_size: tl.constexpr = BLOCK_N // EPILOGUE_SUBTILE
+    for i in tl.static_range(EPILOGUE_SUBTILE):
+        if BROADCAST_Y:
+            y_i = y_desc.load([0, offs_wn + i * slice_size])
+        else:
+            y_i = y_desc.load([offs_xm, offs_wn + i * slice_size])
+        z_i = (acc_subtiles[i] + y_i.to(tl.float32)).to(z_desc.dtype)
+        z_desc.store([offs_xm, offs_wn + i * slice_size], z_i)
+
+
+@triton_autotune(
+    configs=get_triton_persistent_configs(pre_hook=_addmm_tma_set_block_size_hook),
+    key=["M", "N", "K", "WARP_SPECIALIZE"],
+    prune_configs_by={"early_config_prune": _prune_persistent_autows_configs},
+)
+@triton.jit
+def _addmm_fwd_tma_persistent(
+    x_desc,
+    w_desc,
+    y_desc,
+    z_desc,
+    M,
+    N,
+    K,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+    WARP_SPECIALIZE: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    EPILOGUE_SUBTILE: tl.constexpr,
+    DATA_PARTITION_FACTOR: tl.constexpr,
+    USE_META_WS: tl.constexpr,
+):
+    start_pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    k_tiles = tl.cdiv(K, BLOCK_K)
+    num_tiles = num_pid_m * num_pid_n
+
+    num_pid_in_group = GROUP_M * num_pid_n
+
+    if USE_META_WS:
+        # Some arguments are only available in FBexperimental.
+        # pyre-ignore[28]: smem_alloc_algo is FBexperimental
+        for tile_id in tl.range(
+            start_pid,
+            num_tiles,
+            NUM_SMS,
+            flatten=False,
+            warp_specialize=WARP_SPECIALIZE,
+            data_partition_factor=DATA_PARTITION_FACTOR,
+            smem_alloc_algo=1,
+        ):
+            _addmm_persistent_tile_body(
+                x_desc,
+                w_desc,
+                y_desc,
+                z_desc,
+                tile_id,
+                num_pid_in_group,
+                num_pid_m,
+                k_tiles,
+                BLOCK_M=BLOCK_M,
+                BLOCK_N=BLOCK_N,
+                BLOCK_K=BLOCK_K,
+                GROUP_M=GROUP_M,
+                ALLOW_TF32=ALLOW_TF32,
+                BROADCAST_Y=BROADCAST_Y,
+                NUM_SMS=NUM_SMS,
+                EPILOGUE_SUBTILE=EPILOGUE_SUBTILE,
+                INNER_WARP_SPECIALIZE=tl.constexpr(False),
+            )
+    else:
+        # Pure OAI Triton version.
+        for tile_id in tl.range(
+            start_pid, num_tiles, NUM_SMS, flatten=True, warp_specialize=WARP_SPECIALIZE
+        ):
+            _addmm_persistent_tile_body(
+                x_desc,
+                w_desc,
+                y_desc,
+                z_desc,
+                tile_id,
+                num_pid_in_group,
+                num_pid_m,
+                k_tiles,
+                BLOCK_M=BLOCK_M,
+                BLOCK_N=BLOCK_N,
+                BLOCK_K=BLOCK_K,
+                GROUP_M=GROUP_M,
+                ALLOW_TF32=ALLOW_TF32,
+                BROADCAST_Y=BROADCAST_Y,
+                NUM_SMS=NUM_SMS,
+                EPILOGUE_SUBTILE=EPILOGUE_SUBTILE,
+                INNER_WARP_SPECIALIZE=WARP_SPECIALIZE,
+            )
+
+
+@triton_autotune(
+    configs=get_mm_configs(pre_hook=_addmm_tma_set_block_size_hook),
+    key=["N", "K"],
+)
+@triton.jit
+def _addmm_fwd_tma_ws(
+    x_desc,
+    w_desc,
+    y_desc,
+    z_desc,
+    M,
+    N,
+    K,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+    NUM_SMEM_BUFFERS: tl.constexpr,
+):
+    x_buffers = tlx.local_alloc((BLOCK_M, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS)
+    w_buffers = tlx.local_alloc((BLOCK_K, BLOCK_N), w_desc.dtype, NUM_SMEM_BUFFERS)
+    acc_tmem_buffer = tlx.local_alloc(
+        (BLOCK_M, BLOCK_N), tl.float32, tl.constexpr(1), tlx.storage_kind.tmem
+    )
+
+    if BROADCAST_Y:
+        y_buffer = tlx.local_alloc((1, BLOCK_N), y_desc.dtype, tl.constexpr(1))
+    else:
+        y_buffer = tlx.local_alloc((BLOCK_M, BLOCK_N), y_desc.dtype, tl.constexpr(1))
+    z_buffer = tlx.local_alloc((BLOCK_M, BLOCK_N), z_desc.dtype, tl.constexpr(1))
+
+    smem_full_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1)
+    smem_empty_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1)
+    y_load_barrier = tlx.alloc_barriers(num_barriers=1, arrive_count=1)
+
+    with tlx.async_tasks():
+        # Producer task: TMA loads
+        with tlx.async_task("default"):
+            pid_0, pid_1 = tl.program_id(axis=0), tl.program_id(axis=1)
+            pid = pid_0 * tl.num_programs(axis=1) + pid_1
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            group_id = pid // num_pid_in_group
+            first_pid_m = group_id * GROUP_M
+            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+            pid_m = first_pid_m + (pid % group_size_m)
+            pid_n = (pid % num_pid_in_group) // group_size_m
+
+            offs_xm = pid_m * BLOCK_M
+            offs_wn = pid_n * BLOCK_N
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            load_phase = 0
+            for k in range(0, k_tiles):
+                buf = k % int(NUM_SMEM_BUFFERS)
+
+                # Wait for buffer to be free
+                if k >= NUM_SMEM_BUFFERS:
+                    tlx.barrier_wait(smem_empty_bars[buf], load_phase ^ 1)
+
+                offs_k = k * BLOCK_K
+                tlx.barrier_expect_bytes(
+                    smem_full_bars[buf],
+                    2 * (BLOCK_M * BLOCK_K + BLOCK_K * BLOCK_N),
+                )
+                tlx.async_descriptor_load(
+                    x_desc, x_buffers[buf], [offs_xm, offs_k], smem_full_bars[buf]
+                )
+                tlx.async_descriptor_load(
+                    w_desc, w_buffers[buf], [offs_k, offs_wn], smem_full_bars[buf]
+                )
+
+                load_phase = load_phase ^ (buf == NUM_SMEM_BUFFERS - 1)
+
+        # Consumer task: async_dot MMA
+        with tlx.async_task(num_warps=4, num_regs=232):
+            pid_0, pid_1 = tl.program_id(axis=0), tl.program_id(axis=1)
+            pid = pid_0 * tl.num_programs(axis=1) + pid_1
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            group_id = pid // num_pid_in_group
+            first_pid_m = group_id * GROUP_M
+            group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+            pid_m = first_pid_m + (pid % group_size_m)
+            pid_n = (pid % num_pid_in_group) // group_size_m
+
+            offs_xm = pid_m * BLOCK_M
+            offs_wn = pid_n * BLOCK_N
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            # Start async load of y early
+            y_buf_view = tlx.local_view(y_buffer, 0)
+            y_load_bar = tlx.local_view(y_load_barrier, 0)
+            if BROADCAST_Y:
+                tlx.barrier_expect_bytes(y_load_bar, 1 * BLOCK_N * 2)
+                tlx.async_descriptor_load(y_desc, y_buf_view, [0, offs_wn], y_load_bar)
+            else:
+                tlx.barrier_expect_bytes(y_load_bar, BLOCK_M * BLOCK_N * 2)
+                tlx.async_descriptor_load(
+                    y_desc, y_buf_view, [offs_xm, offs_wn], y_load_bar
+                )
+
+            dot_phase = 0
+            for k in range(0, k_tiles):
+                buf = k % int(NUM_SMEM_BUFFERS)
+                tlx.barrier_wait(smem_full_bars[buf], dot_phase)
+
+                tlx.async_dot(
+                    x_buffers[buf],
+                    w_buffers[buf],
+                    acc_tmem_buffer[0],
+                    use_acc=k > 0,
+                    mBarriers=[smem_empty_bars[buf]],
+                    out_dtype=tl.float32,
+                )
+
+                dot_phase = dot_phase ^ (buf == NUM_SMEM_BUFFERS - 1)
+
+            last_buf = (k_tiles - 1) % NUM_SMEM_BUFFERS
+            last_dot_phase = dot_phase ^ (last_buf == NUM_SMEM_BUFFERS - 1)
+            tlx.barrier_wait(smem_empty_bars[last_buf], last_dot_phase)
+
+            tmem_result = tlx.local_load(acc_tmem_buffer[0])
+
+            tlx.barrier_wait(y_load_bar, 0)
+            y = tlx.local_load(y_buf_view)
+
+            z = (tmem_result + y.to(tl.float32)).to(z_desc.dtype)
+            z_buf_view = tlx.local_view(z_buffer, 0)
+            tlx.local_store(z_buf_view, z)
+            tlx.async_descriptor_store(z_desc, z_buf_view, [offs_xm, offs_wn])
+            tlx.async_descriptor_store_wait(0)
+
+
+@triton_autotune(
+    configs=_get_addmm_tma_ws_persistent_configs(
+        pre_hook=_addmm_tma_set_block_size_hook
+    ),
+    key=["M", "N", "K"],
+    prune_configs_by={"early_config_prune": _prune_configs_for_tlx_persistent_addmm},
+)
+@triton.jit
+def _addmm_fwd_tma_ws_persistent(
+    x_desc,
+    w_desc,
+    y_desc,
+    z_desc,
+    M,
+    N,
+    K,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+    NUM_SMEM_BUFFERS: tl.constexpr,
+    NUM_TMEM_BUFFERS: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    EPILOGUE_SUBTILE: tl.constexpr,
+    PAIR_CTA: tl.constexpr,
+    NUM_MMA_GROUPS: tl.constexpr,
+):
+    BLOCK_M_SPLIT: tl.constexpr = BLOCK_M // NUM_MMA_GROUPS
+
+    # Allocate buffers once for all tiles
+    x_buffers = tlx.local_alloc(
+        (BLOCK_M_SPLIT, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS * NUM_MMA_GROUPS
+    )
+    # In pair CTA mode, each CTA only needs to load half of W
+    if PAIR_CTA:
+        w_buffers = tlx.local_alloc(
+            (BLOCK_K, BLOCK_N // 2), w_desc.dtype, NUM_SMEM_BUFFERS
+        )
+    else:
+        w_buffers = tlx.local_alloc((BLOCK_K, BLOCK_N), w_desc.dtype, NUM_SMEM_BUFFERS)
+    tmem_buffers = tlx.local_alloc(
+        (BLOCK_M_SPLIT, BLOCK_N),
+        tl.float32,
+        NUM_TMEM_BUFFERS * NUM_MMA_GROUPS,
+        tlx.storage_kind.tmem,
+    )
+    slice_size: tl.constexpr = BLOCK_N // EPILOGUE_SUBTILE
+
+    Y_Z_SHARED: tl.constexpr = NUM_MMA_GROUPS == 2 and not BROADCAST_Y
+    if Y_Z_SHARED:
+        NUM_Z_BUFFERS: tl.constexpr = EPILOGUE_SUBTILE * NUM_MMA_GROUPS
+    else:
+        NUM_Z_BUFFERS: tl.constexpr = NUM_MMA_GROUPS
+
+    if Y_Z_SHARED:
+        bias_storage_alias = tlx.storage_alias_spec(storage=tlx.storage_kind.smem)
+        y_buffers = tlx.local_alloc(
+            (BLOCK_M_SPLIT, slice_size),
+            y_desc.dtype,
+            NUM_Z_BUFFERS,
+            reuse=bias_storage_alias,
+        )
+        z_buffers = tlx.local_alloc(
+            (BLOCK_M_SPLIT, slice_size),
+            z_desc.dtype,
+            NUM_Z_BUFFERS,
+            reuse=bias_storage_alias,
+        )
+        # Define y and z to share a single buffer
+        bias_storage_alias.set_buffer_overlap(
+            tlx.reuse_group(
+                y_buffers,
+                z_buffers,
+                group_type=tlx.reuse_group_type.shared,
+            )
+        )
+    else:
+        if BROADCAST_Y:
+            y_buffers = tlx.local_alloc(
+                (1, slice_size), y_desc.dtype, EPILOGUE_SUBTILE * NUM_MMA_GROUPS
+            )
+        else:
+            y_buffers = tlx.local_alloc(
+                (BLOCK_M_SPLIT, slice_size),
+                y_desc.dtype,
+                EPILOGUE_SUBTILE * NUM_MMA_GROUPS,
+            )
+        z_buffers = tlx.local_alloc(
+            (BLOCK_M_SPLIT, slice_size), z_desc.dtype, NUM_Z_BUFFERS
+        )
+
+    cluster_cta_rank = tlx.cluster_cta_rank()
+    pred_cta0 = cluster_cta_rank == 0
+    if PAIR_CTA:
+        cta_bars = tlx.alloc_barriers(
+            num_barriers=NUM_SMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=2
+        )
+
+    # Barriers for producer <-> MMA (separate X and W barriers)
+    x_smem_full_bars = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=1
+    )
+    x_smem_empty_bars = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=1
+    )
+    w_smem_full_bars = tlx.alloc_barriers(num_barriers=NUM_SMEM_BUFFERS, arrive_count=1)
+    # Barriers for MMA <-> Epilogue
+    tmem_full_bars = tlx.alloc_barriers(
+        num_barriers=NUM_TMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=1
+    )
+    tmem_empty_bars = tlx.alloc_barriers(
+        num_barriers=NUM_TMEM_BUFFERS * NUM_MMA_GROUPS, arrive_count=1
+    )
+    # Barriers for producer <-> Epilogue
+    # y_load_bar: producer signals when y data is ready
+    # y_empty_bar: epilogue signals when done using y buffer
+    y_load_bars = tlx.alloc_barriers(
+        num_barriers=EPILOGUE_SUBTILE * NUM_MMA_GROUPS, arrive_count=1
+    )
+    y_empty_bars = tlx.alloc_barriers(
+        num_barriers=EPILOGUE_SUBTILE * NUM_MMA_GROUPS, arrive_count=1
+    )
+    z_load_bars = tlx.alloc_barriers(num_barriers=NUM_Z_BUFFERS, arrive_count=1)
+    z_empty_bars = tlx.alloc_barriers(num_barriers=NUM_Z_BUFFERS, arrive_count=1)
+
+    with tlx.async_tasks():
+        # Epilogue consumer: waits for Y from producer, adds bias, stores to SMEM.
+        with tlx.async_task("default"):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            if PAIR_CTA:
+                # Round up to even for proper CTA pairing
+                num_pid_m = (num_pid_m + 1) // 2 * 2
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_tiles = num_pid_m * num_pid_n
+
+            tmem_read_phase = 0
+            cur_tmem_buf = 0
+            y_load_phase = 0
+            z_load_phase = 0
+
+            z_idx = 0
+            for _ in range(start_pid, num_tiles, NUM_SMS):
+                for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                    for group_id in tl.static_range(NUM_MMA_GROUPS):
+                        buf_idx = group_id * NUM_TMEM_BUFFERS + cur_tmem_buf
+                        acc_tmem = tmem_buffers[buf_idx]
+                        if slice_id == 0:
+                            # Wait for MMA to finish computing this group
+                            tlx.barrier_wait(tmem_full_bars[buf_idx], tmem_read_phase)
+
+                        # Load result from TMEM and add bias
+                        acc_subslice = tlx.subslice(
+                            acc_tmem, slice_id * slice_size, slice_size
+                        )
+                        result = tlx.local_load(acc_subslice)
+                        if slice_id == EPILOGUE_SUBTILE - 1:
+                            # Signal MMA that this TMEM buffer is now free
+                            tlx.barrier_arrive(tmem_empty_bars[buf_idx], 1)
+
+                        y_idx = slice_id * NUM_MMA_GROUPS + group_id
+                        y_buf_view = tlx.local_view(y_buffers, y_idx)
+                        y_full = tlx.local_view(y_load_bars, y_idx)
+                        tlx.barrier_wait(y_full, y_load_phase)
+                        y = tlx.local_load(y_buf_view)
+                        # If Y and Z are not shared signal we can load the next bias.
+                        if not Y_Z_SHARED:
+                            y_empty = tlx.local_view(y_empty_bars, y_idx)
+                            tlx.barrier_arrive(y_empty, 1)
+                        z = (result + y.to(tl.float32)).to(z_desc.dtype)
+                        z_buf_view = tlx.local_view(z_buffers, z_idx)
+                        # If Y and Z are not shared wait for Z to be empty.
+                        # If there are shared this already guaranteed.
+                        if not Y_Z_SHARED:
+                            z_empty = tlx.local_view(z_empty_bars, z_idx)
+                            tlx.barrier_wait(z_empty, z_load_phase ^ 1)
+                        tlx.local_store(z_buf_view, z)
+                        z_full = tlx.local_view(z_load_bars, z_idx)
+                        tlx.barrier_arrive(z_full, 1)
+                        z_load_phase = z_load_phase ^ (z_idx == (NUM_Z_BUFFERS - 1))
+                        # pyre-ignore[58]
+                        z_idx = (z_idx + 1) % NUM_Z_BUFFERS
+
+                tmem_read_phase = tmem_read_phase ^ (
+                    cur_tmem_buf == int(NUM_TMEM_BUFFERS) - 1
+                )
+                y_load_phase = y_load_phase ^ 1
+
+                cur_tmem_buf = (cur_tmem_buf + 1) % int(NUM_TMEM_BUFFERS)
+
+        # MMA consumer: performs matrix multiplication
+        with tlx.async_task(num_warps=1, num_regs=24):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            if PAIR_CTA:
+                # Round up to even for proper CTA pairing
+                num_pid_m = (num_pid_m + 1) // 2 * 2
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            dot_phase = 0
+            tmem_write_phase = 1
+            cur_tmem_buf = 0
+            processed_k_iters = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+
+                # First K iteration (peeled): use_acc=False
+                buf = processed_k_iters % int(NUM_SMEM_BUFFERS)
+                tlx.barrier_wait(w_smem_full_bars[buf], dot_phase)
+
+                for group_id in tl.static_range(NUM_MMA_GROUPS):
+                    a_buf = group_id * NUM_SMEM_BUFFERS + buf
+                    acc_buf = group_id * NUM_TMEM_BUFFERS + cur_tmem_buf
+
+                    tlx.barrier_wait(x_smem_full_bars[a_buf], dot_phase)
+
+                    # Wait for epilogue to finish with this TMEM buffer
+                    tlx.barrier_wait(tmem_empty_bars[acc_buf], tmem_write_phase)
+
+                    if PAIR_CTA:
+                        # pyre-ignore[61]
+                        tlx.barrier_arrive(cta_bars[a_buf], 1, remote_cta_rank=0)
+                        # pyre-ignore[61]
+                        tlx.barrier_wait(
+                            # pyre-ignore[61]
+                            cta_bars[a_buf],
+                            phase=dot_phase,
+                            pred=pred_cta0,
+                        )
+
+                    tlx.async_dot(
+                        x_buffers[a_buf],
+                        w_buffers[buf],
+                        tmem_buffers[acc_buf],
+                        use_acc=False,
+                        mBarriers=[x_smem_empty_bars[a_buf]],
+                        two_ctas=PAIR_CTA,
+                        out_dtype=tl.float32,
+                    )
+
+                dot_phase = dot_phase ^ (buf == int(NUM_SMEM_BUFFERS) - 1)
+
+                # Remaining K iterations: use_acc=True
+                for k in range(1, k_tiles):
+                    buf = (processed_k_iters + k) % int(NUM_SMEM_BUFFERS)
+                    tlx.barrier_wait(w_smem_full_bars[buf], dot_phase)
+
+                    for group_id in tl.static_range(NUM_MMA_GROUPS):
+                        a_buf = group_id * NUM_SMEM_BUFFERS + buf
+                        acc_buf = group_id * NUM_TMEM_BUFFERS + cur_tmem_buf
+
+                        tlx.barrier_wait(x_smem_full_bars[a_buf], dot_phase)
+
+                        if PAIR_CTA:
+                            # pyre-ignore[61]
+                            tlx.barrier_arrive(cta_bars[a_buf], 1, remote_cta_rank=0)
+                            # pyre-ignore[61]
+                            tlx.barrier_wait(
+                                # pyre-ignore[61]
+                                cta_bars[a_buf],
+                                phase=dot_phase,
+                                # pyre-ignore[61]
+                                pred=pred_cta0,
+                            )
+
+                        tlx.async_dot(
+                            x_buffers[a_buf],
+                            w_buffers[buf],
+                            tmem_buffers[acc_buf],
+                            use_acc=True,
+                            mBarriers=[x_smem_empty_bars[a_buf]],
+                            two_ctas=PAIR_CTA,
+                            out_dtype=tl.float32,
+                        )
+
+                    dot_phase = dot_phase ^ (buf == int(NUM_SMEM_BUFFERS) - 1)
+
+                # Wait for last MMA to complete and signal epilogue
+                last_buf = (processed_k_iters + k_tiles - 1) % int(NUM_SMEM_BUFFERS)
+                last_dot_phase = dot_phase ^ (last_buf == int(NUM_SMEM_BUFFERS) - 1)
+                for group_id in tl.static_range(NUM_MMA_GROUPS):
+                    a_buf = group_id * NUM_SMEM_BUFFERS + last_buf
+                    tlx.barrier_wait(x_smem_empty_bars[a_buf], last_dot_phase)
+                    acc_buf = group_id * NUM_TMEM_BUFFERS + cur_tmem_buf
+                    # Signal epilogue that result is ready
+                    tlx.barrier_arrive(tmem_full_bars[acc_buf], 1)
+
+                tmem_write_phase = tmem_write_phase ^ (
+                    cur_tmem_buf == int(NUM_TMEM_BUFFERS) - 1
+                )
+                cur_tmem_buf = (cur_tmem_buf + 1) % int(NUM_TMEM_BUFFERS)
+                processed_k_iters += k_tiles
+
+        # Producer: TMA loads for X, W, and Y
+        with tlx.async_task(num_warps=1, num_regs=24):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            if PAIR_CTA:
+                # Round up to even for proper CTA pairing
+                num_pid_m = (num_pid_m + 1) // 2 * 2
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            load_phase = 0
+            y_load_phase = 0
+            processed_k_iters = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+                offs_xm = pid_m * BLOCK_M
+                # Full tile offset for y loading (both CTAs use same y)
+                offs_wn_full = pid_n * BLOCK_N
+                # Split W into two parts so each CTA has different offset
+                if PAIR_CTA:
+                    # pyre-ignore[61]
+                    offs_wn = pid_n * BLOCK_N + cluster_cta_rank * (BLOCK_N // 2)
+                else:
+                    offs_wn = pid_n * BLOCK_N
+
+                for k in range(0, k_tiles):
+                    buf = (processed_k_iters + k) % int(NUM_SMEM_BUFFERS)
+                    offs_k = k * BLOCK_K
+
+                    # Load X for group 0
+                    a_buf = buf  # 0 * NUM_SMEM_BUFFERS + buf
+                    tlx.barrier_wait(x_smem_empty_bars[a_buf], load_phase ^ 1)
+                    tlx.barrier_expect_bytes(
+                        x_smem_full_bars[a_buf],
+                        2 * BLOCK_M_SPLIT * BLOCK_K,
+                    )
+                    tlx.async_descriptor_load(
+                        x_desc,
+                        x_buffers[a_buf],
+                        [offs_xm, offs_k],
+                        x_smem_full_bars[a_buf],
+                    )
+
+                    # Load W (wait for last group's x_empty to know W is free)
+                    last_a_buf = (NUM_MMA_GROUPS - 1) * NUM_SMEM_BUFFERS + buf
+                    tlx.barrier_wait(x_smem_empty_bars[last_a_buf], load_phase ^ 1)
+                    if PAIR_CTA:
+                        tlx.barrier_expect_bytes(
+                            w_smem_full_bars[buf],
+                            2 * BLOCK_K * (BLOCK_N // 2),
+                        )
+                    else:
+                        tlx.barrier_expect_bytes(
+                            w_smem_full_bars[buf],
+                            2 * BLOCK_K * BLOCK_N,
+                        )
+                    tlx.async_descriptor_load(
+                        w_desc,
+                        w_buffers[buf],
+                        [offs_k, offs_wn],
+                        w_smem_full_bars[buf],
+                    )
+
+                    # Load X for remaining groups
+                    for group_id in tl.static_range(1, NUM_MMA_GROUPS):
+                        a_buf = group_id * NUM_SMEM_BUFFERS + buf
+                        tlx.barrier_wait(x_smem_empty_bars[a_buf], load_phase ^ 1)
+                        offs_xm2 = offs_xm + group_id * BLOCK_M_SPLIT
+                        tlx.barrier_expect_bytes(
+                            x_smem_full_bars[a_buf],
+                            2 * BLOCK_M_SPLIT * BLOCK_K,
+                        )
+                        tlx.async_descriptor_load(
+                            x_desc,
+                            x_buffers[a_buf],
+                            [offs_xm2, offs_k],
+                            x_smem_full_bars[a_buf],
+                        )
+
+                    load_phase = load_phase ^ (buf == int(NUM_SMEM_BUFFERS) - 1)
+
+                for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                    for group_id in tl.static_range(NUM_MMA_GROUPS):
+                        y_idx = slice_id * NUM_MMA_GROUPS + group_id
+                        y_buf_view = tlx.local_view(y_buffers, y_idx)
+                        y_bar = tlx.local_view(y_load_bars, y_idx)
+                        # If Y and Z are shared we need to wait for Z to be empty.
+                        if Y_Z_SHARED:
+                            y_empty = tlx.local_view(z_empty_bars, y_idx)
+                        else:
+                            y_empty = tlx.local_view(y_empty_bars, y_idx)
+                        tlx.barrier_wait(y_empty, y_load_phase ^ 1)
+                        if BROADCAST_Y:
+                            tlx.barrier_expect_bytes(y_bar, 1 * slice_size * 2)
+                            tlx.async_descriptor_load(
+                                y_desc,
+                                y_buf_view,
+                                [0, offs_wn_full + slice_id * slice_size],
+                                y_bar,
+                            )
+                        else:
+                            tlx.barrier_expect_bytes(
+                                y_bar, BLOCK_M_SPLIT * slice_size * 2
+                            )
+                            tlx.async_descriptor_load(
+                                y_desc,
+                                y_buf_view,
+                                [
+                                    offs_xm + group_id * BLOCK_M_SPLIT,
+                                    offs_wn_full + slice_id * slice_size,
+                                ],
+                                y_bar,
+                            )
+
+                y_load_phase = y_load_phase ^ 1
+
+                processed_k_iters += k_tiles
+
+        # TMA Store consumer. Added to simplify the barrier
+        # logic.
+        with tlx.async_task(num_warps=1, num_regs=24):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            if PAIR_CTA:
+                # Round up to even for proper CTA pairing
+                num_pid_m = (num_pid_m + 1) // 2 * 2
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            z_load_phase = 0
+
+            # Unroll the first iteration.
+            # This guraranteed safe from our grid size.
+            pid_m, pid_n = _compute_pid(
+                start_pid, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+            )
+            offs_xm = pid_m * BLOCK_M
+            offs_wn = pid_n * BLOCK_N
+            z_idx = 0
+            for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                for group_id in tl.static_range(NUM_MMA_GROUPS):
+                    # Determine the base "index" to decide if we need to wait on TMA.
+                    z_idx_unrolled = slice_id * NUM_MMA_GROUPS + group_id
+                    if z_idx_unrolled >= NUM_Z_BUFFERS:
+                        tlx.async_descriptor_store_wait(NUM_Z_BUFFERS - 1)
+                        z_empty = tlx.local_view(z_empty_bars, z_idx)
+                        tlx.barrier_arrive(z_empty, 1)
+
+                    z_full = tlx.local_view(z_load_bars, z_idx)
+                    tlx.barrier_wait(z_full, z_load_phase)
+                    z_buf_view = tlx.local_view(z_buffers, z_idx)
+                    tlx.fence_async_shared()
+                    tlx.async_descriptor_store(
+                        z_desc,
+                        z_buf_view,
+                        [
+                            offs_xm + group_id * BLOCK_M_SPLIT,
+                            offs_wn + slice_id * slice_size,
+                        ],
+                    )
+
+                    z_load_phase = z_load_phase ^ (z_idx == (NUM_Z_BUFFERS - 1))
+                    # pyre-ignore[58]
+                    z_idx = (z_idx + 1) % NUM_Z_BUFFERS
+
+            for tile_id in range(start_pid + NUM_SMS, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+                offs_xm = pid_m * BLOCK_M
+                offs_wn = pid_n * BLOCK_N
+                for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                    for group_id in tl.static_range(NUM_MMA_GROUPS):
+                        # Wait on prior store to finish.
+                        tlx.async_descriptor_store_wait(NUM_Z_BUFFERS - 1)
+                        z_empty = tlx.local_view(z_empty_bars, z_idx)
+                        tlx.barrier_arrive(z_empty, 1)
+                        # Wait for the next load to be ready
+                        z_full = tlx.local_view(z_load_bars, z_idx)
+                        tlx.barrier_wait(z_full, z_load_phase)
+                        z_buf_view = tlx.local_view(z_buffers, z_idx)
+                        tlx.async_descriptor_store(
+                            z_desc,
+                            z_buf_view,
+                            [
+                                offs_xm + group_id * BLOCK_M_SPLIT,
+                                offs_wn + slice_id * slice_size,
+                            ],
+                        )
+                        z_load_phase = z_load_phase ^ (z_idx == (NUM_Z_BUFFERS - 1))
+                        # pyre-ignore[58]
+                        z_idx = (z_idx + 1) % NUM_Z_BUFFERS
+
+            # Wait for the last store.
+            tlx.async_descriptor_store_wait(0)
+
+
+@torch.fx.wrap
+def triton_addmm_fwd_tma_persistent(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+    warp_specialize: bool | None = None,
+) -> torch.Tensor:
+    _meta_ws = _use_meta_ws()
+    if warp_specialize is None:
+        warp_specialize = _meta_ws
+
+    M, K = x.shape
+    _, N = w.shape
+
+    is_y_1d = y.dim() == 1
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    # A dummy block value that will be overwritten when we have the real block size
+    dummy_block = [1, 1]
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    x_desc = TensorDescriptor(x, x.shape, x.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    w_desc = TensorDescriptor(w, w.shape, w.stride(), dummy_block)
+    y = y.reshape(1, -1) if is_y_1d else y
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    y_desc = TensorDescriptor(y, y.shape, y.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    z_desc = TensorDescriptor(z, z.shape, z.stride(), dummy_block)
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+
+    def grid(meta):
+        BLOCK_M = meta["BLOCK_M"]
+        BLOCK_N = meta["BLOCK_N"]
+        return (
+            min(
+                NUM_SMS,
+                triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N),
+            ),
+        )
+
+    _addmm_fwd_tma_persistent[grid](
+        x_desc,
+        w_desc,
+        y_desc,
+        z_desc,
+        M,
+        N,
+        K,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BROADCAST_Y=is_y_1d,
+        WARP_SPECIALIZE=warp_specialize,
+        NUM_SMS=NUM_SMS,
+        USE_META_WS=_meta_ws,
+    )
+    return z
+
+
+@torch.fx.wrap
+def triton_addmm_fwd_tma_ws_tlx(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    M, K = x.shape
+    _, N = w.shape
+
+    is_y_1d = y.dim() == 1
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    # A dummy block value that will be overwritten when we have the real block size
+    dummy_block = [1, 1]
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    x_desc = TensorDescriptor(x, x.shape, x.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    w_desc = TensorDescriptor(w, w.shape, w.stride(), dummy_block)
+    y = y.reshape(1, -1) if is_y_1d else y
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    y_desc = TensorDescriptor(y, y.shape, y.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    z_desc = TensorDescriptor(z, z.shape, z.stride(), dummy_block)
+
+    def grid(meta):
+        BLOCK_M = meta["BLOCK_M"]
+        BLOCK_N = meta["BLOCK_N"]
+        return (
+            triton.cdiv(M, BLOCK_M),
+            triton.cdiv(N, BLOCK_N),
+        )
+
+    _addmm_fwd_tma_ws[grid](
+        x_desc,
+        w_desc,
+        y_desc,
+        z_desc,
+        M,
+        N,
+        K,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BROADCAST_Y=is_y_1d,
+        NUM_SMEM_BUFFERS=2,  # Double buffering
+    )
+    return z
+
+
+@torch.fx.wrap
+def triton_addmm_fwd_tma_ws_persistent_tlx(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    M, K = x.shape
+    _, N = w.shape
+
+    is_y_1d = y.dim() == 1
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+
+    # A dummy block value that will be overwritten by the hook
+    dummy_block = [1, 1]
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    x_desc = TensorDescriptor(x, x.shape, x.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    w_desc = TensorDescriptor(w, w.shape, w.stride(), dummy_block)
+    y = y.reshape(1, -1) if is_y_1d else y
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    y_desc = TensorDescriptor(y, y.shape, y.stride(), dummy_block)
+    # pyre-ignore[6]: In call `TensorDescriptor.__init__`, for 2nd positional
+    # argument, expected `List[int]` but got `Size`
+    z_desc = TensorDescriptor(z, z.shape, z.stride(), dummy_block)
+
+    def grid(meta):
+        BLOCK_M = meta["BLOCK_M"]
+        BLOCK_N = meta["BLOCK_N"]
+        num_pid_m = triton.cdiv(M, BLOCK_M)
+        num_pid_n = triton.cdiv(N, BLOCK_N)
+        # Round up num_pid_m to even for PAIR_CTA cluster compatibility
+        num_pid_m = (num_pid_m + 1) // 2 * 2
+        total_tiles = num_pid_m * num_pid_n
+        grid_size = min(NUM_SMS, total_tiles)
+        # Ensure grid is even for cluster compatibility
+        if grid_size % 2 == 1:
+            grid_size = min(grid_size + 1, NUM_SMS)
+            # If rounding up exceeds NUM_SMS and NUM_SMS is odd, round down instead
+            if grid_size % 2 == 1:
+                grid_size = grid_size - 1
+        return (grid_size,)
+
+    _addmm_fwd_tma_ws_persistent[grid](
+        x_desc,
+        w_desc,
+        y_desc,
+        z_desc,
+        M,
+        N,
+        K,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BROADCAST_Y=is_y_1d,
+        NUM_SMS=NUM_SMS,
+    )
+    return z
+
+
+@maybe_register_custom_op("generative_recommenders::triton_addmm_fwd", mutates_args=())
+def triton_addmm_fwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    M, K = x.shape
+    KB, N = w.shape
+    assert K == KB, f"incompatible dimensions {K}, {KB}"
+
+    is_y_1d = y.dim() == 1
+    NY = y.shape[0] if is_y_1d else y.shape[1]
+    assert N == NY, f"incompatible dimensions {N}, {NY}"
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(M, meta["BLOCK_M"]),
+        triton.cdiv(N, meta["BLOCK_N"]),
+    )
+
+    _addmm_fwd[grid](
+        x,
+        w,
+        y,
+        z,
+        M,
+        N,
+        K,
+        x.stride(0),
+        x.stride(1),
+        w.stride(0),
+        w.stride(1),
+        y.stride(0) if not is_y_1d else 0,
+        y.stride(1) if not is_y_1d else y.stride(0),
+        z.stride(0),
+        z.stride(1),
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BROADCAST_Y=is_y_1d,
+    )
+    return z
+
+
+@triton_addmm_fwd.register_fake
+def triton_addmm_fwd_fake(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    """Fake implementation for FakeTensor tracing."""
+    M, _ = x.shape
+    _, N = w.shape
+    return torch.empty((M, N), device=x.device, dtype=x.dtype)
+
+
+def triton_addmm_bwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    dz: torch.Tensor,
+    is_y_1d: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if is_y_1d:
+        dy = torch.sum(dz, dim=0)
+    else:
+        dy = dz
+    dw = torch.mm(x.t(), dz)
+    dx = torch.mm(dz, w.t())
+
+    return dx, dw, dy
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::maybe_triton_addmm_fwd", mutates_args=()
+)
+def maybe_triton_addmm_fwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: Optional[torch.Tensor],
+) -> torch.Tensor:
+    # triton addmm is slower than torch (cublas) on AMD/Blackwell.
+    # Default to pytorch addmm on AMD/Blackwell for now.
+    if y is None:
+        return torch.mm(x, w)
+    if is_sm100_plus() or torch.version.hip is not None:
+        return torch.addmm(y, x, w)
+    else:
+        return triton_addmm_fwd(x=x, w=w, y=y)
+
+
+@maybe_triton_addmm_fwd.register_fake
+def maybe_triton_addmm_fwd_fake(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    """Fake implementation for FakeTensor tracing."""
+    M, _ = x.shape
+    _, N = w.shape
+    return torch.empty((M, N), device=x.device, dtype=x.dtype)
+
+
+class _AddMmFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        w: torch.Tensor,
+        y: torch.Tensor,
+    ) -> torch.Tensor:
+        ctx.save_for_backward(x, w)
+        ctx.is_y_1d = y.dim() == 1
+        if is_sm100_plus() and TMA_AVAILABLE and _check_tma_alignment(x, w, y):
+            if x.dtype == torch.float32 or HAS_TLX == False:
+                return triton_addmm_fwd_tma_persistent(x, w, y, warp_specialize=True)
+            else:
+                return triton_addmm_fwd_tma_ws_persistent_tlx(
+                    x, w, y
+                )  # tlx.async_dot doesn't support fp32 inputs because of WGMMA requirements
+        else:
+            return triton_addmm_fwd(x, w, y)
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dz: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        (x, w) = ctx.saved_tensors
+        return triton_addmm_bwd(x, w, dz, ctx.is_y_1d)
+
+
+def triton_addmm(
+    input: torch.Tensor,
+    mat1: torch.Tensor,
+    mat2: torch.Tensor,
+) -> torch.Tensor:
+    return _AddMmFunction.apply(mat1, mat2, input)
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_attention_utils.py b/recommendation_v4/generative_recommenders/ops/triton/triton_attention_utils.py
new file mode 100644
index 000000000..61fd614f3
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_attention_utils.py
@@ -0,0 +1,64 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+
+
+@triton.jit
+def acc_dq(
+    dq_ptrs_trans,
+    start_m,
+    stride_dqm,
+    k,
+    dqk_trans,
+    alpha,
+    mask_m,
+    MAX_SEQ_LEN,
+    LOCK,
+    BLOCK_M: tl.constexpr,
+    ATOMIC_ADD: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+):
+    if ATOMIC_ADD:
+        lock_id = start_m // BLOCK_M
+        stride_lock = tl.cdiv(MAX_SEQ_LEN, BLOCK_M)
+        lock = LOCK + tl.program_id(0) * stride_lock + lock_id
+        tl.debug_barrier()  # add a barrier to force sync
+        while tl.atomic_cas(lock, 0, 1) == 1:
+            pass
+    dq_trans = tl.load(
+        dq_ptrs_trans + start_m * stride_dqm,
+        mask=mask_m[None, :],
+        other=0.0,
+        eviction_policy="evict_last",
+    )
+    dq_trans += tl.dot(tl.trans(k), dqk_trans, allow_tf32=ALLOW_TF32) * alpha
+    dq_trans = dq_trans.to(k.dtype)
+    tl.store(
+        dq_ptrs_trans + start_m * stride_dqm,
+        dq_trans,
+        mask=mask_m[None, :],
+        eviction_policy="evict_last",
+    )
+    if ATOMIC_ADD:
+        tl.atomic_xchg(lock, 0)  # pyre-ignore [61]
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
new file mode 100644
index 000000000..ac667e139
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
@@ -0,0 +1,3134 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/usr/bin/env python3
+
+# pyre-unsafe
+
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.ops.utils import (
+    copy_if_different_ptr,
+    maybe_register_custom_op,
+)
+
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx as tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    # suppress type checking errors
+    tlx = None
+
+    HAS_TLX = False
+
+from generative_recommenders.common import (
+    autotune_max_seq_len,
+    prev_power_of_2,
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from triton.language.extra.libdevice import (  # @manual=//triton:triton
+    fast_dividef,
+    fast_expf,
+)
+
+try:
+    # @manual=//triton:triton
+    from triton.tools.tensor_descriptor import TensorDescriptor
+
+    tensor_descriptor_tma = True
+except ImportError:
+    tensor_descriptor_tma = False
+
+try:
+    from generative_recommenders.ops.triton.fb.triton_attention_utils import acc_dq
+except ImportError:
+    from generative_recommenders.ops.triton.triton_attention_utils import acc_dq
+
+
+def _host_descriptor_pre_hook(nargs):
+    if not tensor_descriptor_tma:
+        return
+
+    if not isinstance(nargs["Q"], TensorDescriptor):
+        return
+    BLOCK_M = nargs["BLOCK_M"]
+    BLOCK_N = nargs["BLOCK_N"]
+    BLOCK_D_Q = nargs["BLOCK_D_Q"]
+    BLOCK_D_V = nargs["BLOCK_D_V"]
+    if "USE_TLX" in nargs and nargs["USE_TLX"]:
+        BLOCK_M = BLOCK_M // nargs["NUM_MMA_GROUPS"]
+    nargs["Q"].block_shape = [BLOCK_M, BLOCK_D_Q]
+    nargs["V"].block_shape = [BLOCK_N, BLOCK_D_V]
+    nargs["K"].block_shape = [BLOCK_N, BLOCK_D_Q]
+
+
+# pyre-ignore[2]
+def _early_config_prune(
+    configs: List[triton.Config],
+    named_args,
+    **kwargs,
+) -> List[triton.Config]:
+    """Filter autotune configs that are incompatible with the current call.
+
+    The TLX (warp-specialized) variant of ``_hstu_attn_fwd`` calls
+    ``tlx.async_descriptor_load(Q, ...)`` which requires Q/K/V to be real TMA
+    tensor descriptors (``tl.tensor_descriptor_base``). They are only
+    constructed by the host wrapper when ``ENABLE_TMA=True`` AND the host
+    ``TensorDescriptor`` API is importable. If the kernel is invoked without
+    those preconditions, raw tensors flow into the TLX path and the
+    ``isinstance(desc, tl.tensor_descriptor_base)`` assert in
+    ``triton/language/extra/tlx/mem_ops.py`` fires at compile time.
+
+    We make autotuning robust to that mismatch by dropping any config with
+    ``USE_TLX=True`` whenever ENABLE_TMA is not set or TMA host descriptors
+    are unavailable. This is purely defensive: if the caller threads
+    ``enable_tma=True`` (see ``_should_enable_tma`` below) the TLX configs
+    remain eligible.
+    """
+    enable_tma = kwargs.get("ENABLE_TMA", None)
+    if enable_tma is None:
+        enable_tma = named_args.get("ENABLE_TMA", False)
+    if enable_tma and tensor_descriptor_tma:
+        return configs
+    pruned = [c for c in configs if not c.kwargs.get("USE_TLX", False)]
+    # Safety: never return an empty config list.
+    return pruned if pruned else configs
+
+
+def _should_enable_tma() -> bool:
+    """Return True iff the TMA / TLX fast path can be safely enabled.
+
+    Conditions:
+      * The host ``triton.tools.tensor_descriptor.TensorDescriptor`` API is
+        importable (``tensor_descriptor_tma``).
+      * CUDA is available and the device is Hopper (compute capability 9),
+        which is the only architecture for which TLX configs are emitted in
+        ``_get_fw_configs``.
+    """
+    if not tensor_descriptor_tma:
+        return False
+    if not torch.cuda.is_available():
+        return False
+    try:
+        device_capability = torch.cuda.get_device_capability()[0]
+    except (RuntimeError, AssertionError):
+        return False
+    return device_capability == 9
+
+
+def _get_fw_configs() -> List[triton.Config]:  # noqa: C901
+    configs = []
+    if torch.version.hip:
+        for BLOCK_M in [32, 64, 128]:
+            for BLOCK_N in [32, 64]:
+                for num_stages in [1, 2]:
+                    for num_warps in [4, 8]:
+                        for matrix_instr_nonkdim in [16, 32]:
+                            configs.append(
+                                triton.Config(
+                                    {
+                                        "BLOCK_M": BLOCK_M,
+                                        "BLOCK_N": BLOCK_N,
+                                        "matrix_instr_nonkdim": matrix_instr_nonkdim,
+                                        "waves_per_eu": 0,
+                                        "kpack": 2,
+                                    },
+                                    num_stages=num_stages,
+                                    num_warps=num_warps,
+                                )
+                            )
+    else:
+        configs = [
+            triton.Config(
+                {"BLOCK_M": 16, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 128},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 128},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 64, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=2,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 32},
+                num_stages=4,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 64},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 64},
+                num_stages=4,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 128},
+                num_stages=4,
+                num_warps=4,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 128, "BLOCK_N": 128},
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_host_descriptor_pre_hook,
+            ),
+        ]
+
+        # Add 'USE_TLX' : False, 'NUM_BUFFERS': 1, 'NUM_MMA_WARPS_PER_GROUP': 1, 'NUM_MMA_GROUPS': 1 to non-TLX configs
+        for config in configs:
+            if not config.kwargs.get("USE_TLX", False):
+                config.kwargs["USE_TLX"] = False
+                config.kwargs["NUM_BUFFERS"] = 1
+                config.kwargs["NUM_MMA_WARPS_PER_GROUP"] = 1
+                config.kwargs["NUM_MMA_GROUPS"] = 1
+
+        # Add TLX configs if TLX is available
+        if HAS_TLX:
+            try:
+                device_capability = torch.cuda.get_device_capability()[0]
+            except (RuntimeError, AssertionError):
+                # No CUDA device available
+                device_capability = None
+
+            if device_capability == 9:
+                # H100 configs
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK_M": 128,
+                            "BLOCK_N": 64,
+                            "USE_TLX": True,
+                            "NUM_BUFFERS": 2,
+                            "NUM_MMA_WARPS_PER_GROUP": 4,
+                            "NUM_MMA_GROUPS": 2,
+                        },
+                        num_stages=0,
+                        num_warps=4,
+                        pre_hook=_host_descriptor_pre_hook,
+                    ),
+                )
+
+    return configs
+
+
+@triton.jit
+def _hstu_attn_fwd_one_block(  # noqa: C901
+    start_n,
+    seq_len,
+    offs_m,
+    offs_n,
+    q,
+    K,
+    V,
+    K_block_ptr,
+    V_block_ptr,
+    offset_kh,
+    offset_vh,
+    seq_start,
+    n_targets,
+    alpha,
+    MAX_SEQ_LEN,
+    contextual_seq_len,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+):
+    start_n = tl.multiple_of(start_n, BLOCK_N)
+    # -- compute qk ----
+    k = None
+    qk = None
+    if ENABLE_TMA:
+        k = K.load(
+            [(seq_start + start_n).to(tl.int32), offset_kh.to(tl.int32)],
+        )
+        # tma can only be loaded in one order, use trans afterwards
+        qk = tl.dot(q, tl.trans(k), allow_tf32=ALLOW_TF32) * alpha
+    else:
+        k = tl.load(K_block_ptr, boundary_check=(1,), padding_option="zero")
+        qk = tl.dot(q, k, allow_tf32=ALLOW_TF32) * alpha
+    invalid_mask = offs_m[:, None] == offs_n[None, :]
+    max_ids = seq_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        offs_m = offs_m - contextual_seq_len + 1
+        offs_m = tl.where(
+            offs_m > 0,
+            offs_m,
+            0,
+        )
+        offs_n = offs_n - contextual_seq_len + 1
+        offs_n = tl.where(
+            offs_n > 0,
+            offs_n,
+            0,
+        )
+        max_ids = max_ids - contextual_seq_len + 1
+    if HAS_MULTIPLE_TARGETS:
+        max_ids = max_ids - n_targets
+        offs_m = tl.where(
+            offs_m < max_ids,
+            offs_m,
+            max_ids,
+        )
+        offs_n = tl.where(
+            offs_n < max_ids,
+            offs_n,
+            max_ids,
+        )
+    offs_m_minus_n = offs_m[:, None] - offs_n[None, :]
+    invalid_mask = invalid_mask or (offs_m_minus_n > 0)
+    if HAS_MAX_ATTN_LEN:
+        invalid_mask = invalid_mask and offs_m_minus_n <= max_attn_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        invalid_mask = invalid_mask or (
+            offs_m[:, None] == 0 and offs_n[None, :] < max_ids
+        )
+    scale = tl.where(invalid_mask, (1.0 / MAX_SEQ_LEN), 0.0)
+    silu = fast_dividef(qk, 1.0 + fast_expf(-qk)) * scale
+    v = None
+    if ENABLE_TMA:
+        v = V.load(
+            [(seq_start + start_n).to(tl.int32), offset_vh.to(tl.int32)],
+        )
+    else:
+        v = tl.load(V_block_ptr, boundary_check=(0,), padding_option="zero")
+    silu = silu.to(v.dtype)
+    return tl.dot(silu, v, allow_tf32=ALLOW_TF32)
+
+
+@triton.jit
+def _hstu_attn_fwd_compute(  # noqa C901
+    Q,
+    K,
+    V,
+    H,
+    DimQ,
+    DimV,
+    workspace_ptr,
+    seq_offsets,
+    num_targets,
+    Out,
+    stride_qm,
+    stride_qh,
+    stride_kn,
+    stride_kh,
+    stride_vn,
+    stride_vh,
+    stride_om,
+    stride_oh,
+    alpha,
+    MAX_SEQ_LEN,
+    DeltaSize,
+    contextual_seq_len,
+    max_attn_len,
+    off_z,
+    off_h,
+    pid,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    TMA_DESC_SIZE: tl.constexpr,
+):
+    seq_start = tl.load(seq_offsets + off_z).to(tl.int64)
+    off_h = off_h.to(tl.int64)
+    off_z = off_z.to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_z + 1)
+    seq_len = (seq_end - seq_start).to(tl.int32)
+
+    if IS_DELTA_Q:
+        start_m_delta = pid * BLOCK_M
+        start_m = (start_m_delta + seq_len - DeltaSize).to(tl.int32)
+    else:
+        start_m_delta = 0
+        start_m = pid * BLOCK_M
+    if start_m < seq_len:
+        if HAS_MULTIPLE_TARGETS:
+            n_targets = tl.load(num_targets + off_z).to(tl.int32)
+        else:
+            n_targets = None
+
+        # initialize offsets
+        offs_m = start_m + tl.arange(0, BLOCK_M)
+        offs_n = tl.arange(0, BLOCK_N)
+        Q_block_ptr = None
+        K_block_ptr = None
+        V_block_ptr = None
+        if not ENABLE_TMA:
+            if IS_DELTA_Q:
+                Q_block_ptr = tl.make_block_ptr(
+                    base=Q + off_h * stride_qh + off_z * DeltaSize * stride_qm,
+                    shape=(DeltaSize, BLOCK_D_Q),
+                    strides=(stride_qm, 1),
+                    offsets=(start_m_delta, 0),
+                    block_shape=(BLOCK_M, BLOCK_D_Q),
+                    order=(1, 0),
+                )
+            else:
+                Q_block_ptr = tl.make_block_ptr(
+                    base=Q + off_h * stride_qh + seq_start * stride_qm,
+                    shape=(seq_len, BLOCK_D_Q),
+                    strides=(stride_qm, 1),
+                    offsets=(start_m, 0),
+                    block_shape=(BLOCK_M, BLOCK_D_Q),
+                    order=(1, 0),
+                )
+            q = tl.load(Q_block_ptr, boundary_check=(0,), padding_option="zero")
+
+            K_block_ptr = tl.make_block_ptr(
+                base=K + off_h * stride_kh + seq_start * stride_kn,
+                shape=(BLOCK_D_Q, seq_len),
+                strides=(1, stride_kn),
+                offsets=(0, 0),
+                block_shape=(BLOCK_D_Q, BLOCK_N),
+                order=(0, 1),
+            )
+            V_block_ptr = tl.make_block_ptr(
+                base=V + off_h * stride_vh + seq_start * stride_vn,
+                shape=(seq_len, BLOCK_D_V),
+                strides=(stride_vn, 1),
+                offsets=(0, 0),
+                block_shape=(BLOCK_N, BLOCK_D_V),
+                order=(1, 0),
+            )
+        else:
+            if IS_DELTA_Q:
+                q = Q.load(
+                    [
+                        (off_z * DeltaSize + start_m_delta).to(tl.int32),
+                        (off_h * stride_qh).to(tl.int32),
+                    ]
+                )
+            else:
+                q = Q.load(
+                    [
+                        (seq_start + start_m).to(tl.int32),
+                        (off_h * stride_qh).to(tl.int32),
+                    ]
+                )
+
+        acc = tl.zeros([BLOCK_M, BLOCK_D_V], dtype=tl.float32)
+        if HAS_MULTIPLE_TARGETS:
+            uih_end = seq_len - n_targets
+        else:
+            uih_end = seq_len
+        if HAS_CONTEXTUAL_SEQ_LEN is True and start_m < contextual_seq_len:
+            # uih_end must be larger than start_m
+            low = 0
+            high = seq_len
+        else:
+            low = 0
+            high = start_m + BLOCK_M
+            if HAS_MAX_ATTN_LEN:
+                if start_m > uih_end:
+                    low = uih_end - max_attn_len
+                else:
+                    low = start_m - max_attn_len
+                if HAS_CONTEXTUAL_SEQ_LEN:
+                    low = low if low > contextual_seq_len else 0
+                else:
+                    low = low if low > 0 else 0
+            if HAS_MULTIPLE_TARGETS:
+                uih_end = (uih_end + BLOCK_N - 1) // BLOCK_N * BLOCK_N
+                if uih_end < start_m:
+                    high = seq_len - n_targets
+
+        if low > 0:
+            if not ENABLE_TMA:
+                K_block_ptr = tl.advance(K_block_ptr, (0, low))
+                V_block_ptr = tl.advance(V_block_ptr, (low, 0))
+        end_n = low
+        for start_n in range(low, high, BLOCK_N):
+            acc += _hstu_attn_fwd_one_block(
+                start_n=start_n,
+                seq_len=seq_len,
+                offs_m=offs_m,
+                offs_n=offs_n + start_n,
+                q=q,
+                K=K,
+                V=V,
+                K_block_ptr=K_block_ptr,
+                V_block_ptr=V_block_ptr,
+                offset_kh=off_h * stride_kh,
+                offset_vh=off_h * stride_vh,
+                seq_start=seq_start,
+                n_targets=n_targets if HAS_MULTIPLE_TARGETS else None,
+                alpha=alpha,
+                MAX_SEQ_LEN=MAX_SEQ_LEN,
+                contextual_seq_len=contextual_seq_len,
+                max_attn_len=max_attn_len,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                ALLOW_TF32=ALLOW_TF32,
+                BLOCK_D_Q=BLOCK_D_Q,
+                BLOCK_D_V=BLOCK_D_V,
+                BLOCK_N=BLOCK_N,
+                ENABLE_TMA=ENABLE_TMA,
+            )
+            if not ENABLE_TMA:
+                K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+                V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+            end_n += BLOCK_N
+
+        if HAS_MULTIPLE_TARGETS:
+            # pyre-ignore[61]
+            if uih_end < start_m:
+                low_delta = start_m
+                high_delta = start_m + BLOCK_M
+                offset = (low_delta - end_n).to(tl.int32)
+                if not ENABLE_TMA:
+                    K_block_ptr = tl.advance(K_block_ptr, (0, offset))
+                    V_block_ptr = tl.advance(V_block_ptr, (offset, 0))
+                for start_delta in tl.range(
+                    low_delta, high_delta, BLOCK_N, num_stages=0
+                ):
+                    acc += _hstu_attn_fwd_one_block(
+                        start_n=start_delta,
+                        seq_len=seq_len,
+                        offs_m=offs_m,
+                        offs_n=offs_n + start_delta,
+                        q=q,
+                        K=K,
+                        V=V,
+                        K_block_ptr=K_block_ptr,
+                        V_block_ptr=V_block_ptr,
+                        offset_kh=off_h * stride_kh,
+                        offset_vh=off_h * stride_vh,
+                        seq_start=seq_start,
+                        n_targets=n_targets if HAS_MULTIPLE_TARGETS else None,
+                        alpha=alpha,
+                        MAX_SEQ_LEN=MAX_SEQ_LEN,
+                        contextual_seq_len=contextual_seq_len,
+                        max_attn_len=max_attn_len,
+                        HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                        HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                        HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                        ALLOW_TF32=ALLOW_TF32,
+                        BLOCK_D_Q=BLOCK_D_Q,
+                        BLOCK_D_V=BLOCK_D_V,
+                        BLOCK_N=BLOCK_N,
+                        ENABLE_TMA=ENABLE_TMA,
+                    )
+                    if not ENABLE_TMA:
+                        K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
+                        V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
+        # Don't use TMA in Jagged case since we don't want to overwrite
+        # the output of another sequence
+        if IS_DELTA_Q:
+            start_m_delta = pid * BLOCK_M
+            offs_m_delta = start_m_delta + tl.arange(0, BLOCK_M)
+            offs_v_d = tl.arange(0, BLOCK_D_V)
+            off_o = Out + off_z * DeltaSize * stride_om + off_h * stride_oh
+            out_ptrs = off_o + offs_m_delta[:, None] * stride_om + offs_v_d[None, :]
+            tl.store(out_ptrs, acc, mask=(offs_m_delta < DeltaSize)[:, None])
+        else:
+            # rematerialize offsets to save registers
+            start_m = pid * BLOCK_M
+            offs_m = start_m + tl.arange(0, BLOCK_M)
+            offs_v_d = tl.arange(0, BLOCK_D_V)
+            off_o = Out + seq_start * stride_om + off_h * stride_oh
+            out_ptrs = off_o + offs_m[:, None] * stride_om + offs_v_d[None, :]
+            tl.store(out_ptrs, acc, mask=(offs_m < seq_len)[:, None])
+
+
+@triton.jit
+def _hstu_attn_fwd_compute_main_loop_tlx(  # noqa C901
+    low,
+    high,
+    seq_len,
+    offs_m,
+    offs_n,
+    acc,
+    q_tiles,
+    k_tiles,
+    v_tiles,
+    q_fulls,
+    k_fulls,
+    v_fulls,
+    k_empties,
+    v_empties,
+    v_dtype,
+    n_targets,
+    alpha,
+    end_n,
+    loop_trip_cnt,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    cid: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,
+    MAX_SEQ_LEN: tl.constexpr,
+    WAIT_FOR_Q: tl.constexpr,
+):
+    if WAIT_FOR_Q:
+        # wait for the Q buffer to be populated by the producer
+        q_full = tlx.local_view(q_fulls, cid)
+        tlx.barrier_wait(q_full, 0)
+
+    q_tile = tlx.local_view(q_tiles, cid)
+
+    for start in tl.range(low + BLOCK_N, high, BLOCK_N, num_stages=0):
+        buf_id = loop_trip_cnt % NUM_BUFFERS
+        # buffers in a row share the same phase
+        kv_phase = (loop_trip_cnt // NUM_BUFFERS) % 2
+
+        start_n = tl.multiple_of(start, BLOCK_N)
+        offs_n_start = offs_n
+        offs_n = offs_n_start + start_n
+
+        # wait for the K buffer to be populated by the producer
+        k_full = tlx.local_view(k_fulls, buf_id)
+        tlx.barrier_wait(k_full, kv_phase)
+        k_tile = tlx.local_view(k_tiles, buf_id)
+
+        # tma can only be loaded in one order, use trans afterwards
+        k_tile = tlx.local_trans(k_tile)
+        # second
+        qk = tlx.async_dot(q_tile, k_tile)
+        # wait for the MMA using to complete
+        qk = tlx.async_dot_wait(0, qk)
+        # release the K buffer
+        k_empty = tlx.local_view(k_empties, buf_id)
+        tlx.barrier_arrive(k_empty, 1)
+
+        qk = qk * alpha
+
+        invalid_mask = offs_m[:, None] == offs_n[None, :]
+        max_ids = seq_len
+        if HAS_MULTIPLE_TARGETS:
+            max_ids = max_ids - n_targets
+            offs_m = tl.where(
+                offs_m < max_ids,
+                offs_m,
+                max_ids,
+            )
+            offs_n = tl.where(
+                offs_n < max_ids,
+                offs_n,
+                max_ids,
+            )
+        offs_m_minus_n = offs_m[:, None] - offs_n[None, :]
+        invalid_mask = invalid_mask or (offs_m_minus_n > 0)
+        if HAS_MAX_ATTN_LEN:
+            invalid_mask = invalid_mask and offs_m_minus_n <= max_attn_len
+        if HAS_CONTEXTUAL_SEQ_LEN:
+            invalid_mask = invalid_mask or (
+                offs_m[:, None] == 0 and offs_n[None, :] < max_ids
+            )
+        scale = tl.where(invalid_mask, (1.0 / MAX_SEQ_LEN), 0.0)
+        silu = fast_dividef(qk, 1.0 + fast_expf(-qk)) * scale
+        silu = silu.to(v_dtype)
+
+        # wait for the V buffer to be populated by the producer
+        v_full = tlx.local_view(v_fulls, buf_id)
+        tlx.barrier_wait(v_full, kv_phase)
+        v_tile = tlx.local_view(v_tiles, buf_id)
+        acc = tlx.async_dot(silu, v_tile, acc)
+        # wait for the MMA using to complete
+        acc = tlx.async_dot_wait(0, acc)
+        # release the V buffer
+        v_empty = tlx.local_view(v_empties, buf_id)
+        tlx.barrier_arrive(v_empty, 1)
+
+        end_n += BLOCK_N
+
+        # increment loop trip counts
+        loop_trip_cnt += 1
+
+    return acc, end_n, loop_trip_cnt
+
+
+@triton.jit
+def _hstu_attn_fwd_compute_main_loop_tlx_pipelined(  # noqa C901
+    low,
+    high,
+    seq_len,
+    offs_m,
+    offs_n,
+    acc,
+    q_tiles,
+    k_tiles,
+    v_tiles,
+    q_fulls,
+    k_fulls,
+    v_fulls,
+    k_empties,
+    v_empties,
+    v_dtype,
+    n_targets,
+    alpha,
+    end_n,
+    loop_trip_cnt,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    cid: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,
+    MAX_SEQ_LEN: tl.constexpr,
+    WAIT_FOR_Q: tl.constexpr,
+):
+    if WAIT_FOR_Q:
+        # wait for the Q buffer to be populated by the producer
+        q_full = tlx.local_view(q_fulls, cid)
+        tlx.barrier_wait(q_full, 0)
+    q_tile = tlx.local_view(q_tiles, cid)
+
+    # wait for the K buffer to be populated by the producer
+    k_buf_id = loop_trip_cnt % NUM_BUFFERS
+    # buffers in a row share the same phase
+    k_phase = (loop_trip_cnt // NUM_BUFFERS) % 2
+
+    k_full = tlx.local_view(k_fulls, k_buf_id)
+    tlx.barrier_wait(k_full, k_phase)
+    k_tile = tlx.local_view(k_tiles, k_buf_id)
+
+    # tma can only be loaded in one order, use trans afterwards
+    k_tile = tlx.local_trans(k_tile)
+
+    # Pingpong
+    if cid == 0:
+        # Consumer 0 waits for Consumer 1 to reach synchronization point at barrier 9.
+        tlx.named_barrier_wait(9, 256)
+    else:
+        # Consumer 1 signals its arrival at barrier 9.
+        tlx.named_barrier_arrive(9, 256)
+        # Then waits at barrier 10 until Consumer 0 finishes issuing its async_dot.
+        tlx.named_barrier_wait(10, 256)
+
+    qk = tlx.async_dot(q_tile, k_tile)
+
+    if cid == 0:
+        # After issuing async_dot, Consumer 0 signals barrier 10 to unblock Consumer 1.
+        tlx.named_barrier_arrive(10, 256)
+
+    # wait for the MMA using to complete
+    qk = tlx.async_dot_wait(0, qk)
+    # release the K buffer
+    k_empty = tlx.local_view(k_empties, k_buf_id)
+    tlx.barrier_arrive(k_empty, 1)
+
+    qk = qk * alpha
+
+    start_n = tl.multiple_of(low, BLOCK_N)
+    offs_n_start = offs_n
+    offs_n = offs_n_start + start_n
+
+    invalid_mask = offs_m[:, None] == offs_n[None, :]
+    max_ids = seq_len
+    if HAS_MULTIPLE_TARGETS:
+        max_ids = max_ids - n_targets
+        offs_m = tl.where(
+            offs_m < max_ids,
+            offs_m,
+            max_ids,
+        )
+        offs_n = tl.where(
+            offs_n < max_ids,
+            offs_n,
+            max_ids,
+        )
+    offs_m_minus_n = offs_m[:, None] - offs_n[None, :]
+    invalid_mask = invalid_mask or (offs_m_minus_n > 0)
+    if HAS_MAX_ATTN_LEN:
+        invalid_mask = invalid_mask and offs_m_minus_n <= max_attn_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        invalid_mask = invalid_mask or (
+            offs_m[:, None] == 0 and offs_n[None, :] < max_ids
+        )
+    scale = tl.where(invalid_mask, (1.0 / MAX_SEQ_LEN), 0.0)
+    silu = fast_dividef(qk, 1.0 + fast_expf(-qk)) * scale
+    silu = silu.to(v_dtype)
+
+    loop_trip_cnt += 1
+
+    for start in tl.range(low + BLOCK_N, high, BLOCK_N, num_stages=0):
+        start_n = tl.multiple_of(start, BLOCK_N)
+        offs_n = offs_n_start + start_n
+
+        k_buf_id = loop_trip_cnt % NUM_BUFFERS
+        # buffers in a row share the same phase
+        k_phase = k_phase ^ (k_buf_id == 0)
+
+        # wait for the K buffer to be populated by the producer
+        k_full = tlx.local_view(k_fulls, k_buf_id)
+        tlx.barrier_wait(k_full, k_phase)
+        k_tile = tlx.local_view(k_tiles, k_buf_id)
+
+        # tma can only be loaded in one order, use trans afterwards
+        k_tile = tlx.local_trans(k_tile)
+
+        qk = tlx.async_dot(q_tile, k_tile)
+        # wait for the MMA using to complete
+        prev_silu = silu
+
+        v_buf_id = (loop_trip_cnt - 1) % NUM_BUFFERS
+        # v_phase = v_phase ^ (v_buf_id == 0)
+        v_phase = ((loop_trip_cnt - 1) // NUM_BUFFERS) % 2
+        v_full = tlx.local_view(v_fulls, v_buf_id)
+        tlx.barrier_wait(v_full, v_phase)
+        v_tile = tlx.local_view(v_tiles, v_buf_id)
+        acc = tlx.async_dot(prev_silu, v_tile, acc)
+        qk = tlx.async_dot_wait(1, qk)
+
+        # release the K buffer
+        k_empty = tlx.local_view(k_empties, k_buf_id)
+        tlx.barrier_arrive(k_empty, 1)
+
+        qk = qk * alpha
+        invalid_mask = offs_m[:, None] == offs_n[None, :]
+        max_ids = seq_len
+        if HAS_MULTIPLE_TARGETS:
+            max_ids = max_ids - n_targets
+            offs_m = tl.where(
+                offs_m < max_ids,
+                offs_m,
+                max_ids,
+            )
+            offs_n = tl.where(
+                offs_n < max_ids,
+                offs_n,
+                max_ids,
+            )
+        offs_m_minus_n = offs_m[:, None] - offs_n[None, :]
+        invalid_mask = invalid_mask or (offs_m_minus_n > 0)
+        if HAS_MAX_ATTN_LEN:
+            invalid_mask = invalid_mask and offs_m_minus_n <= max_attn_len
+        if HAS_CONTEXTUAL_SEQ_LEN:
+            invalid_mask = invalid_mask or (
+                offs_m[:, None] == 0 and offs_n[None, :] < max_ids
+            )
+        scale = tl.where(invalid_mask, (1.0 / MAX_SEQ_LEN), 0.0)
+        silu = fast_dividef(qk, 1.0 + fast_expf(-qk)) * scale
+        silu = silu.to(v_dtype)
+
+        acc = tlx.async_dot_wait(0, acc)
+        # release the V buffer
+        v_empty = tlx.local_view(v_empties, v_buf_id)
+        tlx.barrier_arrive(v_empty, 1)
+
+        end_n += BLOCK_N
+
+        # increment loop trip counts
+        loop_trip_cnt += 1
+        # v_buf_id = loop_trip_cnt % NUM_BUFFERS
+        # v_phase = (loop_trip_cnt // NUM_BUFFERS) % 2
+
+    # wait for the V buffer to be populated by the producer
+    v_buf_id = (loop_trip_cnt - 1) % NUM_BUFFERS
+    v_phase = ((loop_trip_cnt - 1) // NUM_BUFFERS) % 2
+    v_full = tlx.local_view(v_fulls, v_buf_id)
+    # tlx.barrier_wait(v_full, v_buf_id)
+    v_tile = tlx.local_view(v_tiles, v_buf_id)
+    tlx.barrier_wait(v_full, v_phase)
+    acc = tlx.async_dot(silu, v_tile, acc)
+    acc = tlx.async_dot_wait(0, acc)
+    # release the V buffer
+    v_empty = tlx.local_view(v_empties, v_buf_id)
+    tlx.barrier_arrive(v_empty, 1)
+
+    return acc, end_n, loop_trip_cnt
+
+
+@triton.jit
+def _hstu_attn_fwd_load_K_or_V(
+    K,
+    k_tiles,
+    k_empties,
+    k_fulls,
+    buf_id,
+    k_phase,
+    start_n,
+    seq_start,
+    offset_kh,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    # wait for the K buffer to be released by the consumer
+    k_empty = tlx.local_view(k_empties, buf_id)
+    tlx.barrier_wait(k_empty, k_phase)
+    # load K
+    k_full = tlx.local_view(k_fulls, buf_id)
+    k_tile = tlx.local_view(k_tiles, buf_id)
+    tlx.barrier_expect_bytes(k_full, 2 * BLOCK_N * BLOCK_D_Q)  # float16
+    tlx.async_descriptor_load(
+        K,
+        k_tile,
+        [(seq_start + start_n).to(tl.int32), offset_kh.to(tl.int32)],
+        k_full,
+    )
+
+
+@triton.jit
+def _hstu_attn_fwd_load_Q(
+    Q,
+    q_tiles,
+    q_fulls,
+    cid,
+    off_z,
+    off_h,
+    stride_qh,
+    start_m,
+    seq_start,
+    DeltaSize,
+    IS_DELTA_Q: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+):
+    q_full = tlx.local_view(q_fulls, cid)
+    tlx.barrier_expect_bytes(q_full, 2 * BLOCK_M * BLOCK_D_Q)  # float16
+    q_tile = tlx.local_view(q_tiles, cid)
+    seq_offset = start_m + cid * BLOCK_M
+    if IS_DELTA_Q:
+        tlx.async_descriptor_load(
+            Q,
+            q_tile,
+            [
+                (off_z * DeltaSize + start_m).to(tl.int32),
+                (off_h * stride_qh).to(tl.int32),
+            ],
+            q_full,
+        )
+    else:
+        tlx.async_descriptor_load(
+            Q,
+            q_tile,
+            [
+                (seq_start + seq_offset).to(tl.int32),
+                (off_h * stride_qh).to(tl.int32),
+            ],
+            q_full,
+        )
+
+
+@triton.jit
+def _hstu_attn_fwd_caculate_range(
+    seq_len,
+    start_m,
+    n_targets,
+    contextual_seq_len,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    if HAS_MULTIPLE_TARGETS:
+        uih_end = seq_len - n_targets
+    else:
+        uih_end = seq_len
+
+    if HAS_CONTEXTUAL_SEQ_LEN is True and start_m < contextual_seq_len:
+        # uih_end must be larger than start_m
+        low = 0
+        high = seq_len
+    else:
+        low = 0
+        high = start_m + BLOCK_M
+        if HAS_MAX_ATTN_LEN:
+            if start_m > uih_end:
+                low = uih_end - max_attn_len
+            else:
+                low = start_m - max_attn_len
+            if HAS_CONTEXTUAL_SEQ_LEN:
+                low = low if low > contextual_seq_len else 0
+            else:
+                low = low if low > 0 else 0
+        if HAS_MULTIPLE_TARGETS:
+            uih_end = (uih_end + BLOCK_N - 1) // BLOCK_N * BLOCK_N
+            if uih_end < start_m:
+                high = seq_len - n_targets
+
+    return low, high, uih_end
+
+
+@triton.jit
+def _hstu_attn_fwd_load_Q_K_V(
+    Q,
+    K,
+    V,
+    q_tiles,
+    k_tiles,
+    v_tiles,
+    q_fulls,
+    k_fulls,
+    v_fulls,
+    k_empties,
+    v_empties,
+    stride_qh,
+    stride_kh,
+    stride_vh,
+    contextual_seq_len,
+    max_attn_len,
+    DeltaSize,
+    off_z,
+    off_h,
+    start_m,
+    seq_start,
+    seq_len,
+    n_targets,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,
+    NUM_MMA_GROUPS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+):
+    # load q: it will stay in SRAM throughout
+    BLOCK_M_SPLIT: tl.constexpr = BLOCK_M // NUM_MMA_GROUPS
+
+    _hstu_attn_fwd_load_Q(
+        Q=Q,
+        q_tiles=q_tiles,
+        q_fulls=q_fulls,
+        cid=0,
+        off_z=off_z,
+        off_h=off_h,
+        stride_qh=stride_qh,
+        start_m=start_m,
+        seq_start=seq_start,
+        DeltaSize=DeltaSize,
+        IS_DELTA_Q=IS_DELTA_Q,
+        BLOCK_D_Q=BLOCK_D_Q,
+        BLOCK_M=BLOCK_M_SPLIT,
+    )
+
+    off_h = off_h.to(tl.int64)
+    off_z = off_z.to(tl.int64)
+    offset_kh = off_h * stride_kh
+    offset_vh = off_h * stride_vh
+
+    low, high, uih_end = _hstu_attn_fwd_caculate_range(
+        seq_len,
+        start_m,
+        n_targets,
+        contextual_seq_len,
+        max_attn_len,
+        HAS_MULTIPLE_TARGETS,
+        HAS_CONTEXTUAL_SEQ_LEN,
+        HAS_MAX_ATTN_LEN,
+        BLOCK_M,
+        BLOCK_N,
+    )
+
+    kv_phase = 0
+    loop_trip_cnt = 0
+
+    # pyre-ignore[58]
+    buf_id = loop_trip_cnt % NUM_BUFFERS
+    # buffers in a row share the same phase
+    kv_phase = kv_phase ^ (buf_id == 0)
+
+    start_n = tl.multiple_of(low, BLOCK_N)
+
+    _hstu_attn_fwd_load_K_or_V(
+        K,
+        k_tiles,
+        k_empties,
+        k_fulls,
+        buf_id,
+        kv_phase,
+        start_n,
+        seq_start,
+        offset_kh,
+        BLOCK_D_Q,
+        BLOCK_N,
+    )
+
+    for cid in tl.range(1, NUM_MMA_GROUPS, loop_unroll_factor=NUM_MMA_GROUPS - 1):
+        _hstu_attn_fwd_load_Q(
+            Q,
+            q_tiles,
+            q_fulls,
+            cid,
+            off_z,
+            off_h,
+            stride_qh,
+            start_m,
+            seq_start,
+            DeltaSize,
+            IS_DELTA_Q,
+            BLOCK_D_Q,
+            BLOCK_M_SPLIT,
+        )
+
+    _hstu_attn_fwd_load_K_or_V(
+        V,
+        v_tiles,
+        v_empties,
+        v_fulls,
+        buf_id,
+        kv_phase,
+        start_n,
+        seq_start,
+        offset_vh,
+        BLOCK_D_V,
+        BLOCK_N,
+    )
+
+    loop_trip_cnt += 1
+
+    for start in range(low + BLOCK_N, high, BLOCK_N):
+        # pyre-ignore[58]
+        buf_id = loop_trip_cnt % NUM_BUFFERS
+        # buffers in a row share the same phase
+        kv_phase = kv_phase ^ (buf_id == 0)
+
+        start_n = tl.multiple_of(start, BLOCK_N)
+
+        _hstu_attn_fwd_load_K_or_V(
+            K,
+            k_tiles,
+            k_empties,
+            k_fulls,
+            buf_id,
+            kv_phase,
+            start_n,
+            seq_start,
+            offset_kh,
+            BLOCK_D_Q,
+            BLOCK_N,
+        )
+
+        _hstu_attn_fwd_load_K_or_V(
+            V,
+            v_tiles,
+            v_empties,
+            v_fulls,
+            buf_id,
+            kv_phase,
+            start_n,
+            seq_start,
+            offset_vh,
+            BLOCK_D_V,
+            BLOCK_N,
+        )
+
+        # increment loop trip counts
+        loop_trip_cnt += 1
+
+    # pyre-ignore[61]
+    if uih_end < start_m:
+        low_delta = start_m
+        high_delta = start_m + BLOCK_M
+        for start_delta in tl.range(low_delta, high_delta, BLOCK_N, num_stages=0):
+            # pyre-ignore[58]
+            buf_id = loop_trip_cnt % NUM_BUFFERS
+            # buffers in a row share the same phase
+            kv_phase = kv_phase ^ (buf_id == 0)
+
+            start_n = tl.multiple_of(start_delta, BLOCK_N)
+
+            _hstu_attn_fwd_load_K_or_V(
+                K,
+                k_tiles,
+                k_empties,
+                k_fulls,
+                buf_id,
+                kv_phase,
+                start_n,
+                seq_start,
+                offset_kh,
+                BLOCK_D_Q,
+                BLOCK_N,
+            )
+
+            _hstu_attn_fwd_load_K_or_V(
+                V,
+                v_tiles,
+                v_empties,
+                v_fulls,
+                buf_id,
+                kv_phase,
+                start_n,
+                seq_start,
+                offset_vh,
+                BLOCK_D_V,
+                BLOCK_N,
+            )
+
+            # increment loop trip counts
+            loop_trip_cnt += 1
+
+
+@triton.jit
+def _hstu_attn_fwd_compute_tlx(  # noqa C901
+    Q,
+    K,
+    V,
+    H,
+    DimQ,
+    DimV,
+    seq_offsets,
+    num_targets,
+    Out,
+    stride_qh,
+    stride_kh,
+    stride_vh,
+    stride_om,
+    stride_oh,
+    alpha,
+    MAX_SEQ_LEN,
+    DeltaSize,
+    contextual_seq_len,
+    max_attn_len,
+    off_z,
+    off_h,
+    pid,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,  #
+    NUM_MMA_WARPS_PER_GROUP: tl.constexpr,  #
+    NUM_MMA_GROUPS: tl.constexpr,  #
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+):
+    seq_start = tl.load(seq_offsets + off_z).to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_z + 1)
+    seq_len = (seq_end - seq_start).to(tl.int32)
+
+    if IS_DELTA_Q:
+        start_m = pid * BLOCK_M
+        start_m = (start_m + seq_len - DeltaSize).to(tl.int32)
+    else:
+        start_m = pid * BLOCK_M
+
+    if start_m >= seq_len:
+        return
+
+    if HAS_MULTIPLE_TARGETS:
+        n_targets = tl.load(num_targets + off_z).to(tl.int32)
+    else:
+        n_targets = None
+
+    BLOCK_M_SPLIT: tl.constexpr = BLOCK_M // NUM_MMA_GROUPS
+    # allocate buffers
+    q_tiles = tlx.local_alloc(
+        (BLOCK_M_SPLIT, BLOCK_D_Q), tlx.dtype_of(Q), NUM_MMA_GROUPS
+    )
+    k_tiles = tlx.local_alloc((BLOCK_N, BLOCK_D_Q), tlx.dtype_of(K), NUM_BUFFERS)
+    v_tiles = tlx.local_alloc((BLOCK_N, BLOCK_D_V), tlx.dtype_of(V), NUM_BUFFERS)
+
+    # allocate barriers
+    q_fulls = tlx.alloc_barriers(num_barriers=NUM_MMA_GROUPS, arrive_count=1)
+    k_empties = tlx.alloc_barriers(
+        num_barriers=NUM_BUFFERS, arrive_count=NUM_MMA_GROUPS
+    )
+    k_fulls = tlx.alloc_barriers(num_barriers=NUM_BUFFERS, arrive_count=1)
+    v_empties = tlx.alloc_barriers(
+        num_barriers=NUM_BUFFERS, arrive_count=NUM_MMA_GROUPS
+    )
+    v_fulls = tlx.alloc_barriers(num_barriers=NUM_BUFFERS, arrive_count=1)
+
+    with tlx.async_tasks():
+        # producer group
+        with tlx.async_task("default"):
+            _hstu_attn_fwd_load_Q_K_V(
+                Q=Q,
+                K=K,
+                V=V,
+                q_tiles=q_tiles,
+                k_tiles=k_tiles,
+                v_tiles=v_tiles,
+                q_fulls=q_fulls,
+                k_fulls=k_fulls,
+                v_fulls=v_fulls,
+                k_empties=k_empties,
+                v_empties=v_empties,
+                stride_qh=stride_qh,
+                stride_kh=stride_kh,
+                stride_vh=stride_vh,
+                contextual_seq_len=contextual_seq_len,
+                max_attn_len=max_attn_len,
+                DeltaSize=DeltaSize,
+                off_z=off_z,
+                off_h=off_h,
+                start_m=start_m,
+                seq_start=seq_start,
+                seq_len=seq_len,
+                n_targets=n_targets,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                IS_DELTA_Q=IS_DELTA_Q,
+                BLOCK_D_Q=BLOCK_D_Q,
+                BLOCK_D_V=BLOCK_D_V,
+                BLOCK_M=BLOCK_M,
+                BLOCK_N=BLOCK_N,
+                NUM_BUFFERS=NUM_BUFFERS,
+                NUM_MMA_GROUPS=NUM_MMA_GROUPS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            )
+
+        # consumer groups
+        with tlx.async_task(
+            num_warps=NUM_MMA_WARPS_PER_GROUP, registers=232, replicate=NUM_MMA_GROUPS
+        ):
+            cid = tlx.async_task_replica_id()
+            acc = tl.zeros([BLOCK_M_SPLIT, BLOCK_D_V], dtype=tl.float32)
+            # initialize offsets
+            offs_m = start_m + tl.arange(0, BLOCK_M_SPLIT) + cid * BLOCK_M_SPLIT
+            offs_n = tl.arange(0, BLOCK_N)
+
+            low, high, uih_end = _hstu_attn_fwd_caculate_range(
+                seq_len,
+                start_m,
+                n_targets,
+                contextual_seq_len,
+                max_attn_len,
+                HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN,
+                BLOCK_M,
+                BLOCK_N,
+            )
+
+            end_n = low
+            loop_trip_cnt = 0
+
+            acc, end_n, loop_trip_cnt = _hstu_attn_fwd_compute_main_loop_tlx_pipelined(
+                low=low,
+                high=high,
+                seq_len=seq_len,
+                offs_m=offs_m,
+                offs_n=offs_n,
+                acc=acc,
+                q_tiles=q_tiles,
+                k_tiles=k_tiles,
+                v_tiles=v_tiles,
+                q_fulls=q_fulls,
+                k_fulls=k_fulls,
+                v_fulls=v_fulls,
+                k_empties=k_empties,
+                v_empties=v_empties,
+                v_dtype=tlx.dtype_of(V),
+                n_targets=n_targets,
+                alpha=alpha,
+                end_n=end_n,
+                loop_trip_cnt=loop_trip_cnt,
+                max_attn_len=max_attn_len,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                cid=cid,
+                BLOCK_N=BLOCK_N,
+                NUM_BUFFERS=NUM_BUFFERS,
+                MAX_SEQ_LEN=MAX_SEQ_LEN,
+                WAIT_FOR_Q=1,
+            )
+
+            # pyre-ignore[61]
+            if uih_end < start_m:
+                low_delta = start_m
+                high_delta = start_m + BLOCK_M
+                acc, end_n, loop_trip_cnt = _hstu_attn_fwd_compute_main_loop_tlx(
+                    low=low_delta,
+                    high=high_delta,
+                    seq_len=seq_len,
+                    offs_m=offs_m,
+                    offs_n=offs_n,
+                    acc=acc,
+                    q_tiles=q_tiles,
+                    k_tiles=k_tiles,
+                    v_tiles=v_tiles,
+                    q_fulls=q_fulls,
+                    k_fulls=k_fulls,
+                    v_fulls=v_fulls,
+                    k_empties=k_empties,
+                    v_empties=v_empties,
+                    v_dtype=tlx.dtype_of(V),
+                    n_targets=n_targets,
+                    alpha=alpha,
+                    end_n=end_n,
+                    loop_trip_cnt=loop_trip_cnt,
+                    max_attn_len=max_attn_len,
+                    HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                    HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                    HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                    cid=cid,
+                    BLOCK_N=BLOCK_N,
+                    NUM_BUFFERS=NUM_BUFFERS,
+                    MAX_SEQ_LEN=MAX_SEQ_LEN,
+                    WAIT_FOR_Q=0,
+                )
+
+            # Don't use TMA in Jagged case since we don't want to overwrite
+            # the output of another sequence
+            if IS_DELTA_Q:
+                start_m_delta = pid * BLOCK_M + cid * BLOCK_M_SPLIT
+                offs_m_delta = start_m_delta + tl.arange(0, BLOCK_M_SPLIT)
+                offs_v_d = tl.arange(0, BLOCK_D_V)
+                off_o = Out + off_z * DeltaSize * stride_om + off_h * stride_oh
+                out_ptrs = off_o + offs_m_delta[:, None] * stride_om + offs_v_d[None, :]
+                tl.store(out_ptrs, acc, mask=(offs_m_delta < DeltaSize)[:, None])
+            else:
+                # rematerialize offsets to save registers
+                start_m = pid * BLOCK_M + cid * BLOCK_M_SPLIT
+                offs_m = start_m + tl.arange(0, BLOCK_M_SPLIT)
+                offs_v_d = tl.arange(0, BLOCK_D_V)
+                off_o = Out + seq_start * stride_om + off_h * stride_oh
+                out_ptrs = off_o + offs_m[:, None] * stride_om + offs_v_d[None, :]
+                tl.store(out_ptrs, acc, mask=(offs_m < seq_len)[:, None])
+
+
+@triton_autotune(
+    configs=_get_fw_configs(),
+    key=[
+        "AUTOTUNE_Z",
+        "H",
+        "AUTOTUNE_MAX_SEQ_LEN",
+        "DimQ",
+        "DimV",
+        "DeltaSize",
+        "IS_DELTA_Q",
+    ],
+    prune_configs_by={"early_config_prune": _early_config_prune},
+)
+@triton.jit
+def _hstu_attn_fwd(  # noqa C901
+    Q,
+    K,
+    V,
+    workspace_ptr,
+    sort_by_length_indices,
+    seq_offsets,
+    num_targets,
+    Out,
+    stride_qm,
+    stride_qh,
+    stride_kn,
+    stride_kh,
+    stride_vn,
+    stride_vh,
+    stride_om,
+    stride_oh,
+    alpha,
+    Z,
+    AUTOTUNE_Z,
+    H,
+    MAX_SEQ_LEN,
+    AUTOTUNE_MAX_SEQ_LEN,  # Quantized MAX_SEQ_LEN used as an autotuning key
+    DimQ,
+    DimV,
+    DeltaSize,
+    contextual_seq_len,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    USE_TLX: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,  #
+    NUM_MMA_WARPS_PER_GROUP: tl.constexpr,  #
+    NUM_MMA_GROUPS: tl.constexpr,  #
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    HAS_SORT_BY_LENGTH_INDICES: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    TMA_DESC_SIZE: tl.constexpr,
+):
+    off_hz = tl.program_id(1)
+    off_z = off_hz // H
+    if HAS_SORT_BY_LENGTH_INDICES:
+        off_z = tl.load(sort_by_length_indices + off_z)
+    off_h = off_hz % H
+    pid = tl.program_id(0)
+    if USE_TLX:
+        _hstu_attn_fwd_compute_tlx(
+            Q=Q,
+            K=K,
+            V=V,
+            H=H,
+            DimQ=DimQ,
+            DimV=DimV,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            Out=Out,
+            stride_qh=stride_qh,
+            stride_kh=stride_kh,
+            stride_vh=stride_vh,
+            stride_om=stride_om,
+            stride_oh=stride_oh,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            DeltaSize=DeltaSize,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            off_z=off_z,
+            off_h=off_h,
+            pid=pid,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            IS_DELTA_Q=IS_DELTA_Q,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            NUM_BUFFERS=NUM_BUFFERS,
+            NUM_MMA_WARPS_PER_GROUP=NUM_MMA_WARPS_PER_GROUP,
+            NUM_MMA_GROUPS=NUM_MMA_GROUPS,
+        )
+    else:
+        _hstu_attn_fwd_compute(
+            Q=Q,
+            K=K,
+            V=V,
+            H=H,
+            DimQ=DimQ,
+            DimV=DimV,
+            workspace_ptr=workspace_ptr,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            Out=Out,
+            stride_qm=stride_qm,
+            stride_qh=stride_qh,
+            stride_kn=stride_kn,
+            stride_kh=stride_kh,
+            stride_vn=stride_vn,
+            stride_vh=stride_vh,
+            stride_om=stride_om,
+            stride_oh=stride_oh,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            DeltaSize=DeltaSize,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            off_z=off_z,
+            off_h=off_h,
+            pid=pid,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            IS_DELTA_Q=IS_DELTA_Q,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            ENABLE_TMA=ENABLE_TMA,
+            TMA_DESC_SIZE=TMA_DESC_SIZE,
+        )
+
+
+@triton_autotune(
+    configs=_get_fw_configs(),
+    key=[
+        "AUTOTUNE_Z",
+        "H",
+        "AUTOTUNE_MAX_SEQ_LEN",
+        "DimQ",
+        "DimV",
+        "DeltaSize",
+        "IS_DELTA_Q",
+    ],
+    prune_configs_by={"early_config_prune": _early_config_prune},
+)
+@triton.jit
+def _hstu_attn_fwd_persistent(  # noqa C901
+    Q,
+    K,
+    V,
+    workspace_ptr,
+    sort_by_length_indices,
+    seq_offsets,
+    num_targets,
+    Out,
+    stride_qm,
+    stride_qh,
+    stride_kn,
+    stride_kh,
+    stride_vn,
+    stride_vh,
+    stride_om,
+    stride_oh,
+    alpha,
+    Z,
+    AUTOTUNE_Z,
+    H,
+    MAX_SEQ_LEN,
+    AUTOTUNE_MAX_SEQ_LEN,  # Quantized MAX_SEQ_LEN used as an autotuning key
+    DimQ,
+    DimV,
+    DeltaSize,
+    contextual_seq_len,
+    max_attn_len,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    IS_DELTA_Q: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    USE_TLX: tl.constexpr,
+    NUM_BUFFERS: tl.constexpr,  #
+    NUM_MMA_WARPS_PER_GROUP: tl.constexpr,  #
+    NUM_MMA_GROUPS: tl.constexpr,  #
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    HAS_SORT_BY_LENGTH_INDICES: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    TMA_DESC_SIZE: tl.constexpr,
+):
+    n_tile_num = tl.cdiv(MAX_SEQ_LEN, BLOCK_M)
+    prog_id = tl.program_id(0)
+    num_progs = tl.num_programs(0)
+
+    total_tiles = n_tile_num * Z * H
+
+    tiles_per_sm = total_tiles // num_progs
+    if prog_id < total_tiles % num_progs:
+        tiles_per_sm += 1
+
+    tile_idx = prog_id
+    for _ in range(0, tiles_per_sm):
+        pid = (total_tiles - tile_idx - 1) // (Z * H)
+        off_hz = (total_tiles - tile_idx - 1) % (Z * H)
+        off_z = off_hz // H
+        off_h = off_hz % H
+        _hstu_attn_fwd_compute(
+            Q=Q,
+            K=K,
+            V=V,
+            H=H,
+            DimQ=DimQ,
+            DimV=DimV,
+            workspace_ptr=workspace_ptr,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            Out=Out,
+            stride_qm=stride_qm,
+            stride_qh=stride_qh,
+            stride_kn=stride_kn,
+            stride_kh=stride_kh,
+            stride_vn=stride_vn,
+            stride_vh=stride_vh,
+            stride_om=stride_om,
+            stride_oh=stride_oh,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            DeltaSize=DeltaSize,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            off_z=off_z,
+            off_h=off_h,
+            pid=pid,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            IS_DELTA_Q=IS_DELTA_Q,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            ENABLE_TMA=ENABLE_TMA,
+            TMA_DESC_SIZE=TMA_DESC_SIZE,
+        )
+        tile_idx += num_progs
+
+
+@triton.jit
+def _hstu_attn_bwd_one_block(  # noqa C901
+    start_m,
+    offs_n,
+    offs_m,
+    q_ptrs_trans,
+    dq_ptrs_trans,
+    do_ptrs,
+    device_desc_q,
+    device_desc_do,
+    dk,
+    dv,
+    k,
+    v,
+    pos_offs_n,
+    seq_len,
+    max_ids,
+    contextual_seq_len,
+    max_attn_len,
+    LOCK,
+    off_h,
+    stride_qh,
+    stride_doh,
+    stride_qm,
+    stride_dom,
+    stride_dqm,
+    alpha,
+    MAX_SEQ_LEN,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    ATOMIC_ADD: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+):
+    pos_offs_m = offs_m + start_m
+    mask_m = pos_offs_m < seq_len
+    invalid_mask_trans = pos_offs_m[None, :] == offs_n[:, None]
+    # recompute qk and silu
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        pos_offs_m = pos_offs_m - contextual_seq_len + 1
+        pos_offs_m = tl.where(
+            pos_offs_m > 0,
+            pos_offs_m,
+            0,
+        )
+    if HAS_MULTIPLE_TARGETS:
+        pos_offs_m = tl.where(
+            pos_offs_m < max_ids,
+            pos_offs_m,
+            max_ids,
+        )
+    if ENABLE_TMA:
+        q = device_desc_q.load(
+            [start_m, (off_h * stride_qh).to(tl.int32)],
+        )
+        q_trans = tl.trans(q)
+    else:
+        q_trans = tl.load(
+            q_ptrs_trans + start_m * stride_qm,
+            mask=mask_m[None, :],
+            other=0.0,
+        )
+    qk_trans = tl.dot(k, q_trans, allow_tf32=ALLOW_TF32) * alpha
+    sig_trans = fast_dividef(1.0, 1.0 + tl.exp(-qk_trans))
+    silu_trans = qk_trans * sig_trans * (1.0 / MAX_SEQ_LEN)
+    pos_offs_m_minus_n = pos_offs_m[None, :] - pos_offs_n[:, None]
+    invalid_mask_trans = invalid_mask_trans or (pos_offs_m_minus_n > 0)
+    if HAS_MAX_ATTN_LEN:
+        invalid_mask_trans = invalid_mask_trans and pos_offs_m_minus_n <= max_attn_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        invalid_mask_trans = invalid_mask_trans or (
+            pos_offs_m[None, :] == 0 and pos_offs_n[:, None] < max_ids
+        )
+    silu_trans = tl.where(invalid_mask_trans, silu_trans, 0)
+    silu_trans = silu_trans.to(k.dtype)
+    # compute dv
+    if ENABLE_TMA:
+        do = device_desc_do.load(
+            [start_m, (off_h * stride_doh).to(tl.int32)],
+        )
+    else:
+        do = tl.load(
+            do_ptrs + start_m * stride_dom,
+            mask=mask_m[:, None],
+            other=0.0,
+        )
+    dv += tl.dot(silu_trans, do, allow_tf32=ALLOW_TF32)
+
+    # compute dk and dq
+    dqk_trans = tl.dot(v, tl.trans(do), allow_tf32=ALLOW_TF32)
+    dqk_trans = (
+        dqk_trans * sig_trans * (1 + qk_trans * (1 - sig_trans)) * (1.0 / MAX_SEQ_LEN)
+    )
+    dqk_trans = tl.where(invalid_mask_trans, dqk_trans, 0)
+    dqk_trans = dqk_trans.to(k.dtype)
+
+    # Note: the factor `alpha` is delayed until the end of the function to reduce the cost
+    dk += tl.dot(dqk_trans, tl.trans(q_trans), allow_tf32=ALLOW_TF32)
+    acc_dq(
+        dq_ptrs_trans=dq_ptrs_trans,
+        start_m=start_m,
+        stride_dqm=stride_dqm,
+        k=k,
+        dqk_trans=dqk_trans,
+        alpha=alpha,
+        mask_m=mask_m,
+        MAX_SEQ_LEN=MAX_SEQ_LEN,
+        LOCK=LOCK,
+        BLOCK_M=BLOCK_M,
+        ATOMIC_ADD=ATOMIC_ADD,
+        ALLOW_TF32=ALLOW_TF32,
+    )
+    return dk, dv
+
+
+@triton.jit
+def _hstu_attn_bwd_one_col_block(  # noqa C901
+    start_n,
+    seq_len,
+    n_targets,
+    contextual_seq_len,
+    max_attn_len,
+    Q,
+    K,
+    V,
+    DOut,
+    DQ,
+    DK,
+    DV,
+    device_desc_q,
+    device_desc_k,
+    device_desc_v,
+    device_desc_do,
+    device_desc_dk,
+    device_desc_dv,
+    LOCK,
+    off_h,
+    stride_qh,
+    stride_kh,
+    stride_vh,
+    stride_doh,
+    stride_dkh,
+    stride_dvh,
+    stride_qm,
+    stride_kn,
+    stride_vn,
+    stride_dom,
+    stride_dqm,
+    stride_dkn,
+    stride_dvn,
+    alpha,
+    MAX_SEQ_LEN,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    UNROLL: tl.constexpr,
+    ATOMIC_ADD: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+):
+    if HAS_MULTIPLE_TARGETS:
+        low = start_n
+        if HAS_MAX_ATTN_LEN:
+            high = start_n + max_attn_len + BLOCK_N
+            high = high if high + n_targets < seq_len else seq_len
+        else:
+            high = seq_len
+    else:
+        low = start_n
+        if HAS_MAX_ATTN_LEN:
+            high = start_n + max_attn_len + BLOCK_N
+            high = high if high < seq_len else seq_len
+        else:
+            high = seq_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        contextual_block_end = tl.cdiv(contextual_seq_len, BLOCK_M) * BLOCK_M
+        if low < contextual_block_end:
+            low = contextual_block_end
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_qk_d = tl.arange(0, BLOCK_D_Q)
+    offs_v_d = tl.arange(0, BLOCK_D_V)
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+
+    dq_ptrs_trans = DQ + (offs_m[None, :] * stride_dqm + offs_qk_d[:, None])
+    dv = tl.zeros([BLOCK_N, BLOCK_D_V], dtype=tl.float32)
+    dk = tl.zeros([BLOCK_N, BLOCK_D_Q], dtype=tl.float32)
+    if ENABLE_TMA:
+        q_ptrs_trans = None
+        do_ptrs = None
+        k = device_desc_k.load(
+            [start_n, (off_h * stride_kh).to(tl.int32)],
+        )
+        v = device_desc_v.load(
+            [start_n, (off_h * stride_vh).to(tl.int32)],
+        )
+    else:
+        mask_n = offs_n < seq_len
+        q_ptrs_trans = Q + (offs_m[None, :] * stride_qm + offs_qk_d[:, None])
+        do_ptrs = DOut + (offs_m[:, None] * stride_dom + offs_v_d[None, :])
+        k_ptrs = K + (offs_n[:, None] * stride_kn + offs_qk_d[None, :])
+        v_ptrs = V + (offs_n[:, None] * stride_vn + offs_v_d[None, :])
+        k = tl.load(k_ptrs, mask=mask_n[:, None], other=0.0)
+        v = tl.load(v_ptrs, mask=mask_n[:, None], other=0.0)
+    max_ids = seq_len
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        pos_offs_n = offs_n - contextual_seq_len + 1
+        pos_offs_n = tl.where(
+            pos_offs_n > 0,
+            pos_offs_n,
+            0,
+        )
+        max_ids = max_ids - contextual_seq_len + 1
+    else:
+        pos_offs_n = offs_n
+    if HAS_MULTIPLE_TARGETS:
+        max_ids = max_ids - n_targets
+        pos_offs_n = tl.where(
+            pos_offs_n < max_ids,
+            pos_offs_n,
+            max_ids,
+        )
+    # loop over rows
+    if HAS_CONTEXTUAL_SEQ_LEN:
+        for start_m in range(0, contextual_seq_len, BLOCK_M):
+            start_m = tl.multiple_of(start_m, BLOCK_M)
+            dk, dv = _hstu_attn_bwd_one_block(
+                start_m=start_m,
+                offs_n=offs_n,
+                offs_m=offs_m,
+                q_ptrs_trans=q_ptrs_trans,
+                dq_ptrs_trans=dq_ptrs_trans,
+                do_ptrs=do_ptrs,
+                device_desc_q=device_desc_q,
+                device_desc_do=device_desc_do,
+                dk=dk,
+                dv=dv,
+                k=k,
+                v=v,
+                pos_offs_n=pos_offs_n,
+                seq_len=seq_len,
+                max_ids=max_ids,
+                contextual_seq_len=contextual_seq_len,
+                max_attn_len=max_attn_len,
+                LOCK=LOCK,
+                off_h=off_h,
+                stride_qh=stride_qh,
+                stride_doh=stride_doh,
+                stride_qm=stride_qm,
+                stride_dom=stride_dom,
+                stride_dqm=stride_dqm,
+                alpha=alpha,
+                MAX_SEQ_LEN=MAX_SEQ_LEN,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                ALLOW_TF32=ALLOW_TF32,
+                BLOCK_M=BLOCK_M,
+                ATOMIC_ADD=ATOMIC_ADD,
+                ENABLE_TMA=ENABLE_TMA,
+                BLOCK_D_Q=BLOCK_D_Q,
+                BLOCK_D_V=BLOCK_D_V,
+            )
+    for start_m in tl.range(low, high, BLOCK_M, loop_unroll_factor=UNROLL):
+        start_m = tl.multiple_of(start_m, BLOCK_M)
+        dk, dv = _hstu_attn_bwd_one_block(
+            start_m=start_m,
+            offs_n=offs_n,
+            offs_m=offs_m,
+            q_ptrs_trans=q_ptrs_trans,
+            dq_ptrs_trans=dq_ptrs_trans,
+            do_ptrs=do_ptrs,
+            device_desc_q=device_desc_q,
+            device_desc_do=device_desc_do,
+            dk=dk,
+            dv=dv,
+            k=k,
+            v=v,
+            pos_offs_n=pos_offs_n,
+            seq_len=seq_len,
+            max_ids=max_ids,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            LOCK=LOCK,
+            off_h=off_h,
+            stride_qh=stride_qh,
+            stride_doh=stride_doh,
+            stride_qm=stride_qm,
+            stride_dom=stride_dom,
+            stride_dqm=stride_dqm,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_M=BLOCK_M,
+            ATOMIC_ADD=ATOMIC_ADD,
+            ENABLE_TMA=ENABLE_TMA,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+        )
+    # write-back
+    dk = dk * alpha
+    if ENABLE_TMA:
+        device_desc_dv.store(
+            [start_n, (off_h * stride_dvh).to(tl.int32)],
+            dv.to(k.dtype),
+        )
+        device_desc_dk.store(
+            [start_n, (off_h * stride_dkh).to(tl.int32)],
+            dk.to(k.dtype),
+        )
+    else:
+        dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_v_d[None, :])
+        dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_qk_d[None, :])
+        tl.store(dv_ptrs, dv.to(k.dtype), mask=mask_n[:, None])  # pyre-ignore[61]
+        tl.store(dk_ptrs, dk.to(k.dtype), mask=mask_n[:, None])  # pyre-ignore[61]
+
+
+def _bwd_pre_hook(nargs):
+    nargs["DQ"].zero_()
+    if nargs["SEQUENCE_PARALLEL"] is True:
+        nargs["LOCK"].zero_()
+
+
+def _get_bw_configs() -> List[triton.Config]:
+    if torch.version.hip:
+        configs = []
+        for BLOCK_M in [32, 64]:
+            for BLOCK_N in [32, 64, 128]:
+                for num_stages in [1, 2]:
+                    for num_warps in [4, 8]:
+                        for matrix_instr_nonkdim in [16, 32]:
+                            for waves_per_eu in [0, 2, 4]:
+                                for sp in [True, False]:
+                                    configs.append(
+                                        triton.Config(
+                                            {
+                                                "BLOCK_M": BLOCK_M,
+                                                "BLOCK_N": BLOCK_N,
+                                                "matrix_instr_nonkdim": matrix_instr_nonkdim,
+                                                "waves_per_eu": waves_per_eu,
+                                                "SEQUENCE_PARALLEL": sp,
+                                                "UNROLL": 1,
+                                            },
+                                            num_stages=num_stages,
+                                            num_warps=num_warps,
+                                            pre_hook=_bwd_pre_hook,
+                                        )
+                                    )
+        return configs
+
+    configs = [
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=2,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 16, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=2,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=1,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=1,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=1,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=1,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=2,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+            num_stages=3,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False, "UNROLL": 4},
+            num_stages=2,
+            num_warps=8,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 16, "BLOCK_N": 32, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=2,
+            num_warps=2,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=1,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 32, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=1,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+            num_stages=2,
+            num_warps=4,
+            pre_hook=_bwd_pre_hook,
+        ),
+    ]
+    if torch.cuda.is_available() and torch.version.cuda < "12.8":
+        configs += [
+            triton.Config(
+                {"BLOCK_M": 16, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+                num_stages=1,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+                num_stages=1,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": False, "UNROLL": 1},
+                num_stages=1,
+                num_warps=8,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+                num_stages=1,
+                num_warps=8,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+                num_stages=3,
+                num_warps=8,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+                num_stages=1,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {"BLOCK_M": 32, "BLOCK_N": 64, "SEQUENCE_PARALLEL": True, "UNROLL": 1},
+                num_stages=2,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            triton.Config(
+                {
+                    "BLOCK_M": 32,
+                    "BLOCK_N": 128,
+                    "SEQUENCE_PARALLEL": False,
+                    "UNROLL": 2,
+                },
+                num_stages=2,
+                num_warps=8,
+                pre_hook=_bwd_pre_hook,
+            ),
+        ]
+    else:
+        print("WARNING: temporarily disabled some autotune configs for CUDA 12.8+")
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bw_configs(),
+    key=[
+        "AUTOTUNE_Z",
+        "H",
+        "AUTOTUNE_MAX_SEQ_LEN",
+        "DimQ",
+        "DimV",
+    ],
+)
+@triton.jit
+def _hstu_attn_bwd(  # noqa C901
+    Q,
+    K,
+    V,
+    tma_workspace_ptr,
+    sort_by_length_indices,
+    seq_offsets,
+    num_targets,
+    DOut,
+    DQ,
+    DK,
+    DV,
+    LOCK,
+    stride_qm,
+    stride_qh,
+    stride_kn,
+    stride_kh,
+    stride_vn,
+    stride_vh,
+    stride_dom,
+    stride_doh,
+    stride_dqm,
+    stride_dqh,
+    stride_dkn,
+    stride_dkh,
+    stride_dvn,
+    stride_dvh,
+    alpha,
+    contextual_seq_len,
+    max_attn_len,
+    Z,
+    AUTOTUNE_Z,
+    H,
+    MAX_SEQ_LEN,
+    AUTOTUNE_MAX_SEQ_LEN,  # Quantized MAX_SEQ_LEN used as an autotuning key
+    DimQ,
+    DimV,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    HAS_CONTEXTUAL_SEQ_LEN: tl.constexpr,
+    HAS_MAX_ATTN_LEN: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_D_Q: tl.constexpr,
+    BLOCK_D_V: tl.constexpr,
+    SEQUENCE_PARALLEL: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    UNROLL: tl.constexpr,
+    HAS_SORT_BY_LENGTH_INDICES: tl.constexpr,
+    ENABLE_TMA: tl.constexpr,
+    TMA_DESC_SIZE: tl.constexpr,
+    ENABLE_BUFFER_OPS_ASSUMES: tl.constexpr,
+):
+    off_hz = tl.program_id(0)
+    off_z = off_hz // H
+    if HAS_SORT_BY_LENGTH_INDICES:
+        off_z = tl.load(sort_by_length_indices + off_z)
+    off_h = off_hz % H
+    off_h = off_h.to(tl.int64)
+    seq_start = tl.load(seq_offsets + off_z).to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_z + 1)
+    seq_len = (seq_end - seq_start).to(tl.int32)
+    if HAS_MULTIPLE_TARGETS:
+        n_targets = tl.load(num_targets + off_z).to(tl.int32)
+    else:
+        n_targets = None
+    if ENABLE_BUFFER_OPS_ASSUMES:
+        tl.assume(off_hz >= 0)
+        tl.assume(off_z >= 0)
+        tl.assume(off_h >= 0)
+        tl.assume(seq_start >= 0)
+        tl.assume(stride_qm >= 0)
+        tl.assume(stride_qh >= 0)
+        tl.assume(stride_kn >= 0)
+        tl.assume(stride_kh >= 0)
+        tl.assume(stride_vn >= 0)
+        tl.assume(stride_vh >= 0)
+        tl.assume(stride_dom >= 0)
+        tl.assume(stride_doh >= 0)
+        tl.assume(stride_dqm >= 0)
+        tl.assume(stride_dqh >= 0)
+        tl.assume(stride_dkn >= 0)
+        tl.assume(stride_dkh >= 0)
+        tl.assume(stride_dvn >= 0)
+        tl.assume(stride_dvh >= 0)
+
+    # offset pointers for batch/head
+    Q = Q + seq_start * stride_qm
+    K = K + seq_start * stride_kn
+    V = V + seq_start * stride_vn
+    DOut = DOut + seq_start * stride_dom
+    DQ = DQ + seq_start * stride_dqm + off_h * stride_dqh
+    DK = DK + seq_start * stride_dkn
+    DV = DV + seq_start * stride_dvn
+    device_desc_q = None
+    device_desc_k = None
+    device_desc_v = None
+    device_desc_do = None
+    device_desc_dk = None
+    device_desc_dv = None
+    if ENABLE_TMA:
+        device_desc_q = tl.make_tensor_descriptor(
+            Q,
+            shape=[seq_len, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=[BLOCK_M, BLOCK_D_Q],
+        )
+        device_desc_do = tl.make_tensor_descriptor(
+            DOut,
+            shape=[seq_len, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=[BLOCK_M, BLOCK_D_V],
+        )
+        device_desc_k = tl.make_tensor_descriptor(
+            K,
+            shape=[seq_len, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=[BLOCK_N, BLOCK_D_Q],
+        )
+        device_desc_dk = tl.make_tensor_descriptor(
+            DK,
+            shape=[seq_len, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=[BLOCK_N, BLOCK_D_Q],
+        )
+        device_desc_v = tl.make_tensor_descriptor(
+            V,
+            shape=[seq_len, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=[BLOCK_N, BLOCK_D_V],
+        )
+        device_desc_dv = tl.make_tensor_descriptor(
+            DV,
+            shape=[seq_len, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=[BLOCK_N, BLOCK_D_V],
+        )
+    else:
+        Q += off_h * stride_qh
+        K += off_h * stride_kh
+        V += off_h * stride_vh
+        DOut += off_h * stride_doh
+        DK += off_h * stride_dkh
+        DV += off_h * stride_dvh
+    if SEQUENCE_PARALLEL:
+        start_n = tl.program_id(1) * BLOCK_N
+        if start_n >= seq_len:
+            return
+        _hstu_attn_bwd_one_col_block(
+            start_n=start_n,
+            seq_len=seq_len,
+            n_targets=n_targets,
+            contextual_seq_len=contextual_seq_len,
+            max_attn_len=max_attn_len,
+            Q=Q,
+            K=K,
+            V=V,
+            DOut=DOut,
+            DQ=DQ,
+            DK=DK,
+            DV=DV,
+            device_desc_q=device_desc_q,
+            device_desc_k=device_desc_k,
+            device_desc_v=device_desc_v,
+            device_desc_do=device_desc_do,
+            device_desc_dk=device_desc_dk,
+            device_desc_dv=device_desc_dv,
+            LOCK=LOCK,
+            off_h=off_h,
+            stride_qh=stride_qh,
+            stride_kh=stride_kh,
+            stride_vh=stride_vh,
+            stride_doh=stride_doh,
+            stride_dkh=stride_dkh,
+            stride_dvh=stride_dvh,
+            stride_qm=stride_qm,
+            stride_kn=stride_kn,
+            stride_vn=stride_vn,
+            stride_dom=stride_dom,
+            stride_dqm=stride_dqm,
+            stride_dkn=stride_dkn,
+            stride_dvn=stride_dvn,
+            alpha=alpha,
+            MAX_SEQ_LEN=MAX_SEQ_LEN,
+            HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+            HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+            HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+            ALLOW_TF32=ALLOW_TF32,
+            BLOCK_D_Q=BLOCK_D_Q,
+            BLOCK_D_V=BLOCK_D_V,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+            UNROLL=UNROLL,
+            ATOMIC_ADD=True,
+            ENABLE_TMA=ENABLE_TMA,
+        )
+    else:
+        for start_n in range(0, seq_len, BLOCK_N):
+            _hstu_attn_bwd_one_col_block(
+                start_n=start_n,
+                seq_len=seq_len,
+                n_targets=n_targets,
+                contextual_seq_len=contextual_seq_len,
+                max_attn_len=max_attn_len,
+                Q=Q,
+                K=K,
+                V=V,
+                DOut=DOut,
+                DQ=DQ,
+                DK=DK,
+                DV=DV,
+                device_desc_q=device_desc_q,
+                device_desc_k=device_desc_k,
+                device_desc_v=device_desc_v,
+                device_desc_do=device_desc_do,
+                device_desc_dk=device_desc_dk,
+                device_desc_dv=device_desc_dv,
+                LOCK=LOCK,
+                off_h=off_h,
+                stride_qh=stride_qh,
+                stride_kh=stride_kh,
+                stride_vh=stride_vh,
+                stride_doh=stride_doh,
+                stride_dkh=stride_dkh,
+                stride_dvh=stride_dvh,
+                stride_qm=stride_qm,
+                stride_kn=stride_kn,
+                stride_vn=stride_vn,
+                stride_dom=stride_dom,
+                stride_dqm=stride_dqm,
+                stride_dkn=stride_dkn,
+                stride_dvn=stride_dvn,
+                alpha=alpha,
+                MAX_SEQ_LEN=MAX_SEQ_LEN,
+                HAS_MULTIPLE_TARGETS=HAS_MULTIPLE_TARGETS,
+                HAS_CONTEXTUAL_SEQ_LEN=HAS_CONTEXTUAL_SEQ_LEN,
+                HAS_MAX_ATTN_LEN=HAS_MAX_ATTN_LEN,
+                ALLOW_TF32=ALLOW_TF32,
+                BLOCK_D_Q=BLOCK_D_Q,
+                BLOCK_D_V=BLOCK_D_V,
+                BLOCK_M=BLOCK_M,
+                BLOCK_N=BLOCK_N,
+                UNROLL=UNROLL,
+                ATOMIC_ADD=False,
+                ENABLE_TMA=ENABLE_TMA,
+            )
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::triton_hstu_attention_fwd", mutates_args=()
+)
+def triton_hstu_attention_fwd(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    sort_by_length_indices: Optional[torch.Tensor],
+    enable_tma: bool,
+    num_softmax_heads: int,
+) -> torch.Tensor:
+    Z = seq_offsets.numel() - 1
+    AUTOTUNE_Z = prev_power_of_2(Z)
+    L, H, DimQ = q.shape
+    _, _, DimV = v.shape
+    out = torch.empty_like(v)
+    has_multiple_targets = num_targets is not None
+    has_contextual_seq_len = contextual_seq_len > 0
+    has_max_attn_len = max_attn_len > 0
+    has_sort_by_length_indices = sort_by_length_indices is not None
+    if L == 0:
+        return out
+
+    TMA_DESC_SIZE = 128
+    workspace = None
+    desc_q = q
+    desc_k = k
+    desc_v = v
+
+    if enable_tma and tensor_descriptor_tma:
+        dummy_block = [1, 1]
+        desc_q = TensorDescriptor(
+            q,
+            shape=[L, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=dummy_block,
+        )
+        desc_v = TensorDescriptor(
+            v,
+            shape=[L, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=dummy_block,
+        )
+        desc_k = TensorDescriptor(
+            k,
+            shape=[L, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=dummy_block,
+        )
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        assert align == TMA_DESC_SIZE
+        return torch.empty(size, dtype=torch.int8, device="cuda")
+
+    # pyre-ignore [6]
+    triton.set_allocator(alloc_fn)
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(N, meta["BLOCK_M"]),
+        Z * H,
+    )
+
+    _hstu_attn_fwd[grid](
+        Q=desc_q,
+        K=desc_k,
+        V=desc_v,
+        workspace_ptr=workspace,
+        sort_by_length_indices=sort_by_length_indices,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        Out=out,
+        stride_qm=q.stride(0),
+        stride_qh=q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_om=out.stride(0),
+        stride_oh=out.stride(1),
+        alpha=alpha,
+        Z=Z,
+        AUTOTUNE_Z=AUTOTUNE_Z,
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        DeltaSize=0,
+        contextual_seq_len=contextual_seq_len,
+        max_attn_len=max_attn_len,
+        HAS_MULTIPLE_TARGETS=has_multiple_targets,
+        IS_DELTA_Q=False,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_CONTEXTUAL_SEQ_LEN=has_contextual_seq_len,
+        HAS_MAX_ATTN_LEN=has_max_attn_len,
+        HAS_SORT_BY_LENGTH_INDICES=has_sort_by_length_indices,
+        ENABLE_TMA=enable_tma,
+        TMA_DESC_SIZE=TMA_DESC_SIZE,
+    )
+    return out
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::triton_hstu_attention_bwd",
+    mutates_args=("dq", "dk", "dv"),
+)
+def triton_hstu_attention_bwd(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    dq: torch.Tensor,
+    dk: torch.Tensor,
+    dv: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    N: int,
+    alpha: float,
+    max_attn_len: int,
+    contextual_seq_len: int,
+    sort_by_length_indices: Optional[torch.Tensor],
+    enable_tma: bool,
+    num_softmax_heads: int,
+) -> None:
+    orig_dq, orig_dk, orig_dv = dq, dk, dv
+    dout = switch_to_contiguous_if_needed(dout)
+    dq = switch_to_contiguous_if_needed(dq)
+    dk = switch_to_contiguous_if_needed(dk)
+    dv = switch_to_contiguous_if_needed(dv)
+    if dout.shape[0] == 0:
+        orig_dq.zero_()
+        orig_dk.zero_()
+        orig_dv.zero_()
+        return
+    Z = seq_offsets.numel() - 1
+    _, H, DimQ = q.shape
+    _, _, DimV = v.shape
+    grid = lambda meta: (  # noqa E731
+        Z * H,
+        (triton.cdiv(N, meta["BLOCK_N"]) if meta["SEQUENCE_PARALLEL"] else 1),
+    )
+    # The minimum size of BLOCK_M used in `_get_bw_configs`.
+    # TODO (linjianma): avoid hardcoding the value.
+    MIN_BLOCK_M = 16
+    lock = torch.empty(
+        (Z * H, triton.cdiv(N, MIN_BLOCK_M)),
+        dtype=torch.int32,
+        device=q.device,
+    )
+    AUTOTUNE_Z = prev_power_of_2(Z)
+    TMA_DESC_SIZE = 128
+    tma_workspace = None
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        assert align == TMA_DESC_SIZE
+        return torch.empty(size, dtype=torch.int8, device="cuda")
+
+    # pyre-ignore [6]
+    triton.set_allocator(alloc_fn)
+
+    # Enable BufferOps on AMD
+    ENABLE_BUFFER_OPS_ASSUMES = torch.version.hip is not None
+    _hstu_attn_bwd[grid](
+        Q=q,
+        K=k,
+        V=v,
+        tma_workspace_ptr=tma_workspace,
+        sort_by_length_indices=sort_by_length_indices,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        DOut=dout,
+        DQ=dq,
+        DK=dk,
+        DV=dv,
+        LOCK=lock,
+        stride_qm=q.stride(0),
+        stride_qh=q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_dom=dout.stride(0),
+        stride_doh=dout.stride(1),
+        stride_dqm=dq.stride(0),
+        stride_dqh=dq.stride(1),
+        stride_dkn=dk.stride(0),
+        stride_dkh=dk.stride(1),
+        stride_dvn=dv.stride(0),
+        stride_dvh=dv.stride(1),
+        alpha=alpha,
+        contextual_seq_len=contextual_seq_len,
+        max_attn_len=max_attn_len,
+        Z=Z,
+        AUTOTUNE_Z=AUTOTUNE_Z,
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        HAS_MULTIPLE_TARGETS=num_targets is not None,
+        HAS_CONTEXTUAL_SEQ_LEN=contextual_seq_len > 0,
+        HAS_MAX_ATTN_LEN=max_attn_len > 0,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_SORT_BY_LENGTH_INDICES=sort_by_length_indices is not None,
+        ENABLE_TMA=enable_tma,
+        TMA_DESC_SIZE=TMA_DESC_SIZE,
+        ENABLE_BUFFER_OPS_ASSUMES=ENABLE_BUFFER_OPS_ASSUMES,
+    )
+
+    copy_if_different_ptr(orig_dq, dq)
+    copy_if_different_ptr(orig_dk, dk)
+    copy_if_different_ptr(orig_dv, dv)
+
+
+@triton_hstu_attention_fwd.register_fake
+def _triton_hstu_attention_fwd_fake(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    sort_by_length_indices: Optional[torch.Tensor],
+    enable_tma: bool,
+    num_softmax_heads: int,
+) -> torch.Tensor:
+    L, H, _ = q.shape
+    _, _, DimV = v.shape
+    out = torch.empty((L, H, DimV), dtype=v.dtype, device=v.device)
+    return out
+
+
+@triton_hstu_attention_bwd.register_fake
+def _triton_hstu_attention_bwd_fake(
+    dout: torch.Tensor,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    dq: torch.Tensor,
+    dk: torch.Tensor,
+    dv: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    N: int,
+    alpha: float,
+    max_attn_len: int,
+    contextual_seq_len: int,
+    sort_by_length_indices: Optional[torch.Tensor],
+    enable_tma: bool,
+    num_softmax_heads: int,
+) -> None:
+    return None
+
+
+class _AttentionFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        N: int,
+        alpha: float,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        num_targets: Optional[torch.Tensor],
+        max_attn_len: int,
+        contextual_seq_len: int,
+        sort_by_length: bool,
+        enable_tma: bool,
+    ) -> torch.Tensor:
+        sort_by_length_indices = None
+        if sort_by_length:
+            seq_lengths = seq_offsets[1:] - seq_offsets[:-1]
+            _, sort_by_length_indices = torch.sort(
+                seq_lengths, descending=True, stable=False
+            )
+        saved_tensors = [q, k, v, seq_offsets]
+        if num_targets is not None:
+            saved_tensors.append(num_targets)
+        if sort_by_length_indices is not None:
+            saved_tensors.append(sort_by_length_indices)
+        ctx.save_for_backward(*saved_tensors)
+        ctx.alpha = alpha
+        ctx.has_multiple_targets = num_targets is not None
+        ctx.max_attn_len = max_attn_len
+        ctx.N = N
+        ctx.contextual_seq_len = contextual_seq_len
+        ctx.sort_by_length = sort_by_length
+        ctx.enable_tma = enable_tma
+        return triton_hstu_attention_fwd(
+            N=N,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length_indices=sort_by_length_indices,
+            enable_tma=enable_tma,
+            num_softmax_heads=0,
+        )
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dout: torch.Tensor
+    ) -> Tuple[
+        None,
+        None,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        with torch.inference_mode():
+            q, k, v, seq_offsets = ctx.saved_tensors[:4]
+            idx = 4
+            if ctx.has_multiple_targets:
+                num_targets = ctx.saved_tensors[idx]
+                idx += 1
+            else:
+                num_targets = None
+            if ctx.sort_by_length:
+                sort_by_length_indices = ctx.saved_tensors[idx]
+            else:
+                sort_by_length_indices = None
+
+            dq = torch.empty_like(q)
+            dk = torch.empty_like(k)
+            dv = torch.empty_like(v)
+            triton_hstu_attention_bwd(
+                dout=dout,
+                q=q,
+                k=k,
+                v=v,
+                dq=dq,
+                dk=dk,
+                dv=dv,
+                seq_offsets=seq_offsets,
+                num_targets=num_targets,
+                N=ctx.N,
+                alpha=ctx.alpha,
+                max_attn_len=ctx.max_attn_len,
+                contextual_seq_len=ctx.contextual_seq_len,
+                sort_by_length_indices=sort_by_length_indices,
+                enable_tma=ctx.enable_tma,
+                num_softmax_heads=0,
+            )
+            return (
+                None,
+                None,
+                dq,
+                dk,
+                dv,
+                None,
+                None,
+                None,
+                None,
+                None,
+                None,
+            )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_hstu_mha(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    sort_by_length: bool = False,
+    enable_tma: bool = False,
+) -> torch.Tensor:
+    return _AttentionFunction.apply(
+        N,
+        alpha,
+        q,
+        k,
+        v,
+        seq_offsets,
+        num_targets,
+        max_attn_len,
+        contextual_seq_len,
+        sort_by_length,
+        enable_tma,
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_cached_hstu_mha(
+    N: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor] = None,
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    enable_tma: bool = False,
+) -> torch.Tensor:
+    Z = seq_offsets.size(0) - 1
+    AUTOTUNE_Z = prev_power_of_2(Z)
+    DELTA_L, H, DimQ = delta_q.shape
+    DeltaSize = DELTA_L // Z
+    L, _, DimV = v.shape
+    out = torch.empty((DELTA_L, H, DimV), dtype=delta_q.dtype, device=delta_q.device)
+
+    TMA_DESC_SIZE = 128
+    desc_q = delta_q
+    desc_k = k
+    desc_v = v
+
+    if enable_tma and tensor_descriptor_tma:
+        dummy_block = [1, 1]
+        desc_q = TensorDescriptor(
+            delta_q,
+            shape=[DELTA_L, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=dummy_block,
+        )
+        desc_v = TensorDescriptor(
+            v,
+            shape=[L, H * DimV],
+            strides=[H * DimV, 1],
+            block_shape=dummy_block,
+        )
+        desc_k = TensorDescriptor(
+            k,
+            shape=[L, H * DimQ],
+            strides=[H * DimQ, 1],
+            block_shape=dummy_block,
+        )
+
+    def alloc_fn(size: int, align: int, stream: Optional[int]):
+        assert align == TMA_DESC_SIZE
+        return torch.empty(size, dtype=torch.int8, device="cuda")
+
+    # pyre-ignore [6]
+    triton.set_allocator(alloc_fn)
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(DeltaSize, meta["BLOCK_M"]),
+        Z * H,
+    )
+
+    has_contextual_seq_len = contextual_seq_len > 0
+    has_max_attn_len = max_attn_len > 0
+    _hstu_attn_fwd[grid](
+        Q=desc_q,
+        K=desc_k,
+        V=desc_v,
+        workspace_ptr=None,
+        sort_by_length_indices=None,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        Out=out,
+        stride_qm=delta_q.stride(0),
+        stride_qh=delta_q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_om=out.stride(0),
+        stride_oh=out.stride(1),
+        alpha=alpha,
+        contextual_seq_len=contextual_seq_len,
+        max_attn_len=max_attn_len,
+        Z=Z,
+        AUTOTUNE_Z=AUTOTUNE_Z,
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        DeltaSize=DeltaSize,
+        HAS_MULTIPLE_TARGETS=num_targets is not None,
+        IS_DELTA_Q=True,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_CONTEXTUAL_SEQ_LEN=has_contextual_seq_len,
+        HAS_MAX_ATTN_LEN=has_max_attn_len,
+        HAS_SORT_BY_LENGTH_INDICES=False,
+        ENABLE_TMA=enable_tma,
+        TMA_DESC_SIZE=TMA_DESC_SIZE,
+    )
+    return out
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py
new file mode 100644
index 000000000..ff04dde40
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py
@@ -0,0 +1,3042 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from generative_recommenders.ops.triton.triton_addmm import maybe_triton_addmm_fwd
+from generative_recommenders.ops.utils import maybe_register_custom_op
+
+
+def _get_layer_norm_mul_dropout_fwd_multirow_configs() -> List[triton.Config]:
+    """Generate autotune configs for multi-row LayerNorm multiplication with dropout kernels."""
+    configs = []
+    for BLOCK_N in [1, 2, 4, 8, 16]:
+        for num_warps in [1, 2, 4]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+from generative_recommenders.ops.utils import is_sm100_plus
+
+# @manual=//triton:triton
+from triton.language.extra import libdevice
+
+try:
+    # @manual=//triton:triton
+    from triton.language.extra.libdevice import fast_dividef
+except ImportError:
+    try:
+        # @manual=//triton:triton
+        from triton.language.extra.cuda.libdevice import fast_dividef
+    except ImportError:
+        # pyre-ignore: Undefined import [21]
+        # @manual=//triton:triton
+        from triton.language.math import fast_dividef
+
+
+COMPUTE_OUTPUT_LN_FAST_DROPOUT = False
+
+
+def set_compute_output_ln_fast_dropout(value: bool) -> None:
+    global COMPUTE_OUTPUT_LN_FAST_DROPOUT
+    COMPUTE_OUTPUT_LN_FAST_DROPOUT = value
+
+
+FUSE_OUTPUT_LN_RNG_BLACKWELL = False
+
+
+# Only impact B200 training when CONCAT_UX is False
+def set_fuse_output_ln_rng_blackwell(value: bool) -> None:
+    global FUSE_OUTPUT_LN_RNG_BLACKWELL
+    FUSE_OUTPUT_LN_RNG_BLACKWELL = value
+
+
+@triton.jit
+def rand3x(seed, offsets, n_rounds: tl.constexpr = 10):  # pyre-ignore [9]
+    i1, i2, i3, _ = tl.randint4x(seed, offsets, n_rounds)
+    u1 = tl.uint_to_uniform_float(i1)
+    u2 = tl.uint_to_uniform_float(i2)
+    u3 = tl.uint_to_uniform_float(i3)
+    return u1, u2, u3
+
+
+@triton.jit
+def _generate_random_mask(
+    MASK_BUFFER,
+    N,
+    dropout_ratio,
+    seed,
+    D: tl.constexpr,
+    STRIDE: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    NUM_MASKS: tl.constexpr,
+):
+    """Generate bit-packed dropout masks for (N, D) tensors. Outputs int8.
+
+    Processes 4 rows per program using rand4x. Mask j occupies bit j.
+    Extraction: y = val & 1, x = val & 2, u = val & 4.
+    """
+    pid = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    start_row = pid.to(tl.int64) * 4
+
+    base_ptr = MASK_BUFFER + start_row * STRIDE + cols
+    row0_mask = (start_row < N) & col_mask
+    row1_mask = ((start_row + 1) < N) & col_mask
+    row2_mask = ((start_row + 2) < N) & col_mask
+    row3_mask = ((start_row + 3) < N) & col_mask
+
+    # Each pid uses NUM_MASKS consecutive BLOCK_D chunks for Philox offsets
+    rand_offset = pid * (NUM_MASKS * BLOCK_D) + cols
+
+    packed0 = tl.zeros([BLOCK_D], dtype=tl.int8)
+    packed1 = tl.zeros([BLOCK_D], dtype=tl.int8)
+    packed2 = tl.zeros([BLOCK_D], dtype=tl.int8)
+    packed3 = tl.zeros([BLOCK_D], dtype=tl.int8)
+
+    for j in tl.static_range(NUM_MASKS):
+        r0, r1, r2, r3 = tl.rand4x(seed, rand_offset)
+        packed0 |= (r0 > dropout_ratio).to(tl.int8) << j
+        packed1 |= (r1 > dropout_ratio).to(tl.int8) << j
+        packed2 |= (r2 > dropout_ratio).to(tl.int8) << j
+        packed3 |= (r3 > dropout_ratio).to(tl.int8) << j
+        rand_offset += BLOCK_D
+
+    tl.store(base_ptr, packed0, mask=row0_mask)
+    tl.store(base_ptr + STRIDE, packed1, mask=row1_mask)
+    tl.store(base_ptr + 2 * STRIDE, packed2, mask=row2_mask)
+    tl.store(base_ptr + 3 * STRIDE, packed3, mask=row3_mask)
+
+
+@triton_autotune(
+    configs=_get_layer_norm_mul_dropout_fwd_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def _ln_mul_dropout_fwd_rng(
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    RANDOM_MASK,
+    N,
+    D,
+    eps,
+    dropout_ratio,
+    stride_x,
+    stride_u,
+    stride_y,
+    stride_mask,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_U: tl.constexpr,
+    CONCAT_X: tl.constexpr,
+    MUL_U_ACTIVATION_TYPE: tl.constexpr,
+):
+    block_id = tl.program_id(0)
+    start_row = block_id * BLOCK_N
+
+    # Create block pointers for X, U, and Y
+    X_block_ptr = tl.make_block_ptr(
+        base=X,
+        shape=(N, D),
+        strides=(stride_x, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    U_block_ptr = tl.make_block_ptr(
+        base=U,
+        shape=(N, D),
+        strides=(stride_u, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    # Load data blocks
+    x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+    u_block = tl.load(U_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    rows = start_row + tl.arange(0, BLOCK_N)
+    row_mask = rows < N
+    # Pre-compute 2D mask for reuse in dropout and masked operations
+    mask_2d = row_mask[:, None] & col_mask[None, :]
+
+    # Pre-compute inv_D to replace divisions with multiplications (optimization)
+    inv_D = 1.0 / D
+
+    mean = tl.sum(x_block, axis=1) * inv_D
+    tl.store(Mean + rows, mean, mask=row_mask)
+    mean = tl.expand_dims(mean, 1)
+
+    x_mean = x_block - mean
+    x_mean = tl.where(mask_2d, x_mean, 0.0)
+    _var = x_mean * x_mean
+    var = tl.sum(_var, axis=1) * inv_D
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Rstd + rows, rstd, mask=row_mask)
+    rstd = tl.expand_dims(rstd, 1)
+
+    y = x_mean * rstd
+    w = tl.load(W + cols, mask=col_mask).to(tl.float32)
+    b = tl.load(B + cols, mask=col_mask).to(tl.float32)
+    y = y * w[None, :] + b[None, :]
+
+    # Pre-compute sigmoid once to avoid redundant computation
+    sigmoid_u_block = tl.sigmoid(u_block)
+    silu_u_block = u_block * sigmoid_u_block
+
+    if MUL_U_ACTIVATION_TYPE == "silu":
+        y = y * silu_u_block
+    elif MUL_U_ACTIVATION_TYPE == "sigmoid":
+        y = y * sigmoid_u_block
+    else:
+        y = y * u_block
+
+    if CONCAT_U and SILU_U:
+        # pyre-fixme[16]
+        u_block = silu_u_block
+
+    if TRAINING:
+        # Reuse rows (as int64 for pointer arithmetic) and pre-computed mask_2d
+        row_offsets_i64 = rows.to(tl.int64)
+        # Pre-compute loop-invariant values
+        dropout_scale = 1.0 / (1.0 - dropout_ratio)
+        offsets = row_offsets_i64[:, None] * stride_mask + cols[None, :]
+
+        if CONCAT_U or CONCAT_X:
+            # All 2+ mask cases use compressed int8 format - load once
+            compressed = tl.load(RANDOM_MASK + offsets, mask=mask_2d, other=0).to(
+                tl.int32
+            )
+            # Bit 0 is always y_mask
+            y_keep = (compressed & 1) != 0
+
+            if CONCAT_U and CONCAT_X:
+                # 3-mask: (u_mask << 2) | (x_mask << 1) | y_mask
+                x_keep = (compressed & 2) != 0
+                u_keep = (compressed & 4) != 0
+                u_block = tl.where(u_keep, u_block * dropout_scale, 0.0)
+                x_block = tl.where(x_keep, x_block * dropout_scale, 0.0)
+            elif CONCAT_U:
+                # 2-mask: (u_mask << 1) | y_mask
+                u_keep = (compressed & 2) != 0
+                u_block = tl.where(u_keep, u_block * dropout_scale, 0.0)
+            else:  # CONCAT_X
+                # 2-mask: (x_mask << 1) | y_mask
+                x_keep = (compressed & 2) != 0
+                x_block = tl.where(x_keep, x_block * dropout_scale, 0.0)
+
+            y = tl.where(y_keep, y * dropout_scale, 0.0)
+        else:
+            # 1-mask: y_mask at bit 0
+            y_keep = tl.load(RANDOM_MASK + offsets, mask=mask_2d, other=True)
+            y = tl.where(y_keep, y * dropout_scale, 0.0)
+
+    if CONCAT_U and CONCAT_X:
+        Y_block_ptr_u = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 3 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        Y_block_ptr_x = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 3 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, D),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        Y_block_ptr_y = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 3 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 2 * D),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        tl.store(Y_block_ptr_u, u_block.to(Y.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(Y_block_ptr_x, x_block.to(Y.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(Y_block_ptr_y, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+    elif CONCAT_U:
+        Y_block_ptr_u = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 2 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+        Y_block_ptr_y = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 2 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, D),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+        tl.store(Y_block_ptr_u, u_block.to(Y.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(Y_block_ptr_y, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+    elif CONCAT_X:
+        Y_block_ptr_x = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 2 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+        Y_block_ptr_y = tl.make_block_ptr(
+            base=Y,
+            shape=(N, 2 * D),
+            strides=(stride_y, 1),
+            offsets=(start_row, D),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+        tl.store(Y_block_ptr_x, x_block.to(Y.dtype.element_ty), boundary_check=(0, 1))
+        tl.store(Y_block_ptr_y, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+    else:
+        Y_block_ptr = tl.make_block_ptr(
+            base=Y,
+            shape=(N, D),
+            strides=(stride_y, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        tl.store(Y_block_ptr, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.jit
+def _ln_mul_dropout_fwd(
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    D,
+    eps,
+    seed,
+    dropout_ratio,
+    stride_x,
+    stride_u,
+    stride_y,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_U: tl.constexpr,
+    CONCAT_X: tl.constexpr,
+    MUL_U_ACTIVATION_TYPE: tl.constexpr,
+    FAST_DROPOUT: tl.constexpr,
+):
+    row = tl.program_id(0)
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    Y += row.to(tl.int64) * stride_y
+    cols = tl.arange(0, BLOCK_D)
+
+    # Compute mean
+    mean = 0.0
+    x = tl.load(X + cols, mask=cols < D, other=0.0).to(tl.float32)
+    mean = tl.sum(x, axis=0) / D
+
+    # Compute variance
+    _var = tl.zeros([BLOCK_D], dtype=tl.float32)
+    x_mean = tl.where(cols < D, x - mean, 0.0)
+    _var += x_mean * x_mean
+    var = tl.sum(_var, axis=0) / D
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Mean + row, mean)
+    tl.store(Rstd + row, rstd)
+
+    # Normalize and apply linear transformation
+    mask = cols < D
+    y = x_mean * rstd
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    b = tl.load(B + cols, mask=mask).to(tl.float32)
+    y = y * w + b
+    u = tl.load(U + cols, mask=cols < D, other=0.0).to(tl.float32)
+    sigmoid_u = tl.sigmoid(u)
+    silu_u = u * sigmoid_u
+
+    if MUL_U_ACTIVATION_TYPE == "silu":
+        y = y * silu_u
+    elif MUL_U_ACTIVATION_TYPE == "sigmoid":
+        y = y * sigmoid_u
+    else:
+        y = y * u
+
+    if CONCAT_U and SILU_U:
+        u = silu_u
+
+    if TRAINING:
+        random_offsets = 3 * row * BLOCK_D + cols
+        if CONCAT_U and CONCAT_X:
+            # apply dropout on u
+            if FAST_DROPOUT:
+                random_u, random_x, random_y = rand3x(seed, random_offsets)
+            else:
+                random_u = tl.rand(seed, random_offsets)
+            u_keep = random_u > dropout_ratio
+            u = tl.where(u_keep, u / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on x
+            if not FAST_DROPOUT:
+                random_x = tl.rand(seed, random_offsets + D)
+            x_keep = random_x > dropout_ratio  # pyre-ignore [61]
+            x = tl.where(x_keep, x / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on y
+            if not FAST_DROPOUT:
+                random_y = tl.rand(seed, random_offsets + 2 * D)
+            y_keep = random_y > dropout_ratio  # pyre-ignore [61]
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+        elif CONCAT_U:
+            # apply dropout on u
+            if FAST_DROPOUT:
+                random_u, random_y, _ = rand3x(seed, random_offsets)
+            else:
+                random_u = tl.rand(seed, random_offsets)
+            u_keep = random_u > dropout_ratio
+            u = tl.where(u_keep, u / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on y
+            if not FAST_DROPOUT:
+                random_y = tl.rand(seed, random_offsets + D)
+            y_keep = random_y > dropout_ratio  # pyre-ignore [61]
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+        elif CONCAT_X:
+            # apply dropout on x
+            if FAST_DROPOUT:
+                random_x, random_y, _ = rand3x(seed, random_offsets)
+            else:
+                random_x = tl.rand(seed, random_offsets)
+            x_keep = random_x > dropout_ratio
+            x = tl.where(x_keep, x / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on y
+            if not FAST_DROPOUT:
+                random_y = tl.rand(seed, random_offsets + D)
+            y_keep = random_y > dropout_ratio  # pyre-ignore [61]
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+        else:
+            random = tl.rand(seed, random_offsets)
+            y_keep = random > dropout_ratio
+            # write-back
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+
+    # Write output
+    if CONCAT_U and CONCAT_X:
+        tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + D + cols, x.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + 2 * D + cols, y.to(Y.dtype.element_ty), mask=mask)
+    elif CONCAT_U:
+        tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+    elif CONCAT_X:
+        tl.store(Y + cols, x.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+    else:
+        tl.store(Y + cols, y.to(Y.dtype.element_ty), mask=mask)
+
+
+@triton.jit
+def _ln_mul_dropout_bwd_dx_du_rng(
+    DX,
+    DU,
+    DY,
+    DW,
+    DB,
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    RANDOM_MASK,
+    stride_dx,
+    stride_du,
+    stride_dy,
+    stride_x,
+    stride_u,
+    stride_y,
+    stride_mask,
+    D,
+    eps,
+    dropout_ratio,
+    N,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_U: tl.constexpr,
+    CONCAT_X: tl.constexpr,
+    MUL_U_ACTIVATION_TYPE: tl.constexpr,
+    COMPUTE_Y: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    rows_per_tile = N // tile_num
+    if pid < N % tile_num:
+        rows_per_tile += 1
+
+    if rows_per_tile == 0:
+        return
+
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+
+    row = pid
+    # Pre-compute row and pid as int64 once for initial pointer setup
+    row_i64 = row.to(tl.int64)
+    pid_i64 = pid.to(tl.int64)
+    X += row_i64 * stride_x
+    U += row_i64 * stride_u
+    if COMPUTE_Y:
+        Y += row_i64 * stride_y
+    DY += row_i64 * stride_dy
+    DX += row_i64 * stride_dx
+    DU += row_i64 * stride_du
+    DW = DW + pid_i64 * D + cols
+    DB = DB + pid_i64 * D + cols
+
+    # Pre-compute mask pointer offset (all cases use stride_mask for (N, D) shape)
+    RANDOM_MASK += row_i64 * stride_mask
+
+    partial_dw = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    partial_db = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    b = tl.load(B + cols, mask=mask).to(tl.float32)
+
+    dropout_scale = 0.0
+    if TRAINING:
+        dropout_scale = 1.0 / (1.0 - dropout_ratio)
+
+    # Pre-compute inv_D to replace divisions with multiplications (optimization)
+    inv_D = 1.0 / D
+
+    # Pre-compute tile_num as int64 to avoid repeated conversion in the loop
+    tile_num_i64 = tile_num.to(tl.int64)
+    for _ in range(0, rows_per_tile):
+        # Load data to SRAM
+        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+        if CONCAT_U and CONCAT_X:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + 2 * D + cols, mask=mask, other=0).to(tl.float32)
+        elif CONCAT_U:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+        elif CONCAT_X:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+        else:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+        if TRAINING:
+            if CONCAT_U or CONCAT_X:
+                # All 2+ mask cases use compressed int8 format - load once
+                compressed = tl.load(RANDOM_MASK + cols, mask=mask, other=0).to(
+                    tl.int32
+                )
+                dy_keep = (compressed & 1) != 0  # Bit 0 always y_mask
+
+                if CONCAT_U and CONCAT_X:
+                    # Format: (u_mask << 2) | (x_mask << 1) | y_mask
+                    dx_keep = (compressed & 2) != 0
+                    du_keep = (compressed & 4) != 0
+                    du = tl.where(du_keep, du * dropout_scale, 0.0)
+                    dx = tl.where(dx_keep, dx * dropout_scale, 0.0)
+                elif CONCAT_U:
+                    # Format: (u_mask << 1) | y_mask
+                    du_keep = (compressed & 2) != 0
+                    du = tl.where(du_keep, du * dropout_scale, 0.0)
+                else:  # CONCAT_X
+                    # Format: (x_mask << 1) | y_mask
+                    dx_keep = (compressed & 2) != 0
+                    dx = tl.where(dx_keep, dx * dropout_scale, 0.0)
+                dy = tl.where(dy_keep, dy * dropout_scale, 0.0)
+            else:
+                # 1-mask: y_mask at bit 0
+                dy_keep = tl.load(RANDOM_MASK + cols, mask=mask, other=True)
+                dy = tl.where(dy_keep, dy * dropout_scale, 0.0)
+
+        mean = tl.load(Mean + row)
+        rstd = tl.load(Rstd + row)
+
+        # Compute dx
+        xhat = (x - mean) * rstd
+        u = tl.load(U + cols, mask=mask, other=0).to(tl.float32)
+        ln = xhat * w + b
+        du_y = dy * ln
+        mul_u = u
+        sig_u = tl.sigmoid(u)
+
+        # Pre-compute commonly used expressions to avoid redundant computation
+        silu_u = u * sig_u  # silu(u) - used multiple times
+        dsig_u = sig_u * (1.0 - sig_u)  # sigmoid derivative - used multiple times
+        dsilu_u = sig_u + silu_u * (
+            1.0 - sig_u
+        )  # silu derivative - used multiple times
+
+        if MUL_U_ACTIVATION_TYPE == "silu":
+            mul_u = silu_u
+            du_y = dy * ln * dsilu_u
+            dy = dy * silu_u
+        elif MUL_U_ACTIVATION_TYPE == "sigmoid":
+            mul_u = sig_u
+            du_y = dy * ln * dsig_u
+            dy = dy * sig_u
+        else:
+            dy = dy * u
+
+        du_u = du
+        if CONCAT_U and SILU_U:
+            du_u *= dsilu_u
+            u = silu_u
+
+        du = du_y + du_u
+
+        tl.store(DU + cols, du.to(DU.dtype.element_ty), mask=mask)
+
+        wdy = w * dy
+        if COMPUTE_Y:
+            y = ln * mul_u
+            if TRAINING:
+                if CONCAT_U:
+                    u = tl.where(
+                        du_keep,  # pyre-ignore [61]
+                        u * dropout_scale,
+                        0.0,
+                    )
+                if CONCAT_X:
+                    x = tl.where(
+                        dx_keep,  # pyre-ignore [61]
+                        x * dropout_scale,
+                        0.0,
+                    )
+                y = tl.where(
+                    dy_keep,  # pyre-ignore [61]
+                    y * dropout_scale,
+                    0.0,
+                )
+            if CONCAT_U and CONCAT_X:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + 2 * D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            elif CONCAT_U:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            elif CONCAT_X:
+                tl.store(Y + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            else:
+                tl.store(Y + cols, y.to(Y.dtype.element_ty), mask=mask)
+            Y += tile_num_i64 * stride_y
+
+        # Note: xhat and wdy are already 0 outside valid range due to masked loads,
+        # so no additional tl.where masking is needed before reduction
+        c1 = tl.sum(xhat * wdy, axis=0) * inv_D
+        c2 = tl.sum(wdy, axis=0) * inv_D
+        dx += (wdy - (xhat * c1 + c2)) * rstd
+        # Write dx
+        tl.store(DX + cols, dx, mask=mask)
+
+        # Accumulate partial sums for dw/db
+        partial_dw += dy * xhat
+        partial_db += dy
+        X += tile_num_i64 * stride_x
+        U += tile_num_i64 * stride_u
+        DY += tile_num_i64 * stride_dy
+        DX += tile_num_i64 * stride_dx
+        DU += tile_num_i64 * stride_du
+        # Increment mask pointer
+        RANDOM_MASK += tile_num_i64 * stride_mask
+        row += tile_num
+    tl.store(DW, partial_dw, mask=mask)
+    tl.store(DB, partial_db, mask=mask)
+
+
+@triton.jit
+def _ln_mul_dropout_bwd_dx_du(
+    DX,
+    DU,
+    DY,
+    DW,
+    DB,
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_du,
+    stride_dy,
+    stride_x,
+    stride_u,
+    stride_y,
+    D,
+    eps,
+    seed,
+    dropout_ratio,
+    N,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_U: tl.constexpr,
+    CONCAT_X: tl.constexpr,
+    MUL_U_ACTIVATION_TYPE: tl.constexpr,
+    COMPUTE_Y: tl.constexpr,
+    FAST_DROPOUT: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    rows_per_tile = N // tile_num
+    if pid < N % tile_num:
+        rows_per_tile += 1
+
+    if rows_per_tile == 0:
+        return
+
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+
+    row = pid
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    if COMPUTE_Y:
+        Y += row.to(tl.int64) * stride_y
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+    DU += row.to(tl.int64) * stride_du
+    DW = DW + pid * D + cols
+    DB = DB + pid * D + cols
+
+    partial_dw = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    partial_db = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    b = tl.load(B + cols, mask=mask).to(tl.float32)
+    for _idx in range(0, rows_per_tile):
+        # Load data to SRAM
+        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+        if CONCAT_U and CONCAT_X:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + 2 * D + cols, mask=mask, other=0).to(tl.float32)
+        elif CONCAT_U:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+        elif CONCAT_X:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+        else:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+        if TRAINING:
+            random_offsets = 3 * row * BLOCK_D + cols
+            if CONCAT_U and CONCAT_X:
+                # apply dropout on du
+                if FAST_DROPOUT:
+                    random_du, random_dx, random_dy = rand3x(seed, random_offsets)
+                else:
+                    random_du = tl.rand(seed, random_offsets)
+                du_keep = random_du > dropout_ratio
+                du = tl.where(du_keep, du / (1.0 - dropout_ratio), 0.0)
+                # apply dropout on dx
+                if not FAST_DROPOUT:
+                    random_dx = tl.rand(seed, random_offsets + D)
+                dx_keep = random_dx > dropout_ratio  # pyre-ignore [61]
+                dx = tl.where(dx_keep, dx / (1.0 - dropout_ratio), 0.0)
+                # apply dropout on dy
+                if not FAST_DROPOUT:
+                    random_dy = tl.rand(seed, random_offsets + 2 * D)
+                dy_keep = random_dy > dropout_ratio  # pyre-ignore [61]
+                dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+            elif CONCAT_U:
+                # apply dropout on du
+                if FAST_DROPOUT:
+                    random_du, _, random_dy = rand3x(seed, random_offsets)
+                else:
+                    random_du = tl.rand(seed, random_offsets)
+                du_keep = random_du > dropout_ratio
+                du = tl.where(du_keep, du / (1.0 - dropout_ratio), 0.0)
+                # apply dropout on dy
+                if not FAST_DROPOUT:
+                    random_dy = tl.rand(seed, random_offsets + D)
+                dy_keep = random_dy > dropout_ratio  # pyre-ignore [61]
+                dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+            elif CONCAT_X:
+                # apply dropout on dx
+                if FAST_DROPOUT:
+                    _, random_dx, random_dy = rand3x(seed, random_offsets)
+                else:
+                    random_dx = tl.rand(seed, random_offsets)
+                dx_keep = random_dx > dropout_ratio  # pyre-ignore [61]
+                dx = tl.where(dx_keep, dx / (1.0 - dropout_ratio), 0.0)
+                # apply dropout on dy
+                if not FAST_DROPOUT:
+                    random_dy = tl.rand(seed, random_offsets + D)
+                dy_keep = random_dy > dropout_ratio  # pyre-ignore [61]
+                dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+            else:
+                random = tl.rand(seed, random_offsets)
+                dy_keep = random > dropout_ratio
+                # write-back
+                dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+
+        mean = tl.load(Mean + row)
+        rstd = tl.load(Rstd + row)
+
+        # Compute dx
+        xhat = (x - mean) * rstd
+        u = tl.load(U + cols, mask=mask, other=0).to(tl.float32)
+        ln = xhat * w + b
+        du_y = dy * ln
+        mul_u = u
+        sig_u = tl.sigmoid(u)
+
+        if MUL_U_ACTIVATION_TYPE == "silu":
+            mul_u = u * sig_u
+            du_y = dy * ln * (sig_u + u * sig_u * (1.0 - sig_u))
+            dy = dy * u * sig_u
+        elif MUL_U_ACTIVATION_TYPE == "sigmoid":
+            mul_u = sig_u
+            du_y = dy * ln * (sig_u * (1.0 - sig_u))
+            dy = dy * sig_u
+        else:
+            dy = dy * u
+
+        du_u = du
+        if CONCAT_U:
+            if SILU_U:
+                du_u *= sig_u + u * sig_u * (1.0 - sig_u)
+                u = u * sig_u
+
+        du = du_y + du_u
+
+        tl.store(DU + cols, du.to(DU.dtype.element_ty), mask=mask)
+        wdy = w * dy
+        if COMPUTE_Y:
+            y = ln * mul_u
+            if TRAINING:
+                if CONCAT_U:
+                    u = tl.where(
+                        du_keep,  # pyre-ignore [61]
+                        u / (1.0 - dropout_ratio),
+                        0.0,
+                    )
+                if CONCAT_X:
+                    x = tl.where(
+                        dx_keep,  # pyre-ignore [61]
+                        x / (1.0 - dropout_ratio),
+                        0.0,
+                    )
+                y = tl.where(
+                    dy_keep,  # pyre-ignore [61]
+                    y / (1.0 - dropout_ratio),
+                    0.0,
+                )
+            if CONCAT_U and CONCAT_X:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + 2 * D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            elif CONCAT_U:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            elif CONCAT_X:
+                tl.store(Y + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            else:
+                tl.store(Y + cols, y.to(Y.dtype.element_ty), mask=mask)
+            Y += tile_num.to(tl.int64) * stride_y
+
+        xhat = tl.where(mask, xhat, 0.0)
+        wdy = tl.where(mask, wdy, 0.0)
+        c1 = tl.sum(xhat * wdy, axis=0) / D
+        c2 = tl.sum(wdy, axis=0) / D
+        dx += (wdy - (xhat * c1 + c2)) * rstd
+        # Write dx
+        tl.store(DX + cols, dx, mask=mask)
+
+        # Accumulate partial sums for dw/db
+        partial_dw += dy * xhat
+        partial_db += dy
+        X += tile_num.to(tl.int64) * stride_x
+        U += tile_num.to(tl.int64) * stride_u
+        DY += tile_num.to(tl.int64) * stride_dy
+        DX += tile_num.to(tl.int64) * stride_dx
+        DU += tile_num.to(tl.int64) * stride_du
+        row += tile_num
+    tl.store(DW, partial_dw, mask=mask)
+    tl.store(DB, partial_db, mask=mask)
+
+
+def _get_bwd_dwdb_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [32, 64, 128, 256]:
+        for num_warps in [8, 16] + ([] if torch.ops.hip else [32]):
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bwd_dwdb_configs(),
+    key=["D"],
+)
+@triton.jit
+def _ln_mul_dropout_bwd_dwdb(
+    DW,
+    DB,
+    FINAL_DW,
+    FINAL_DB,
+    N,
+    D,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    pid = tl.program_id(0).to(tl.int64)
+    cols = pid * BLOCK_D + tl.arange(0, BLOCK_D)
+    dw = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+    db = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+        mask = (rows[:, None] < N) & (cols[None, :] < D)
+        offs = rows[:, None] * D + cols[None, :]
+        dw += tl.load(DW + offs, mask=mask, other=0.0)
+        db += tl.load(DB + offs, mask=mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    sum_db = tl.sum(db, axis=0)
+    tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D)
+    tl.store(FINAL_DB + cols, sum_db.to(FINAL_DB.dtype.element_ty), mask=cols < D)
+
+
+def _create_dropout_mask(
+    N: int,
+    D: int,
+    BLOCK_D: int,
+    concat_u: bool,
+    concat_x: bool,
+    dropout_ratio: float,
+    seed: int,
+    device: torch.device,
+) -> torch.Tensor:
+    """Create dropout mask tensor for layer norm mul dropout.
+
+    Args:
+        N: Number of rows
+        D: Feature dimension
+        BLOCK_D: Block size for D dimension
+        concat_u: Whether to concatenate u
+        concat_x: Whether to concatenate x
+        dropout_ratio: Dropout ratio
+        seed: Random seed
+        device: Device to create tensor on
+
+    Returns:
+        random_mask: (N, D) int8 tensor. Mask j at bit j.
+
+    Bit layout: y = val & 1, x = val & 2, u = val & 4.
+    """
+    num_masks = 1 + int(concat_u) + int(concat_x)
+    # Torch uses 1 byte for bool internally, same as int8, so always use int8.
+    random_mask = torch.empty([N, D], dtype=torch.int8, device=device)
+    _generate_random_mask[(triton.cdiv(N, 4),)](
+        random_mask,
+        N,
+        dropout_ratio,
+        seed,
+        D,  # pyre-ignore[6]
+        random_mask.stride(0),  # pyre-ignore[6]
+        BLOCK_D,  # pyre-fixme[6]: Triton constexpr param
+        num_masks,  # pyre-ignore[6]: NUM_MASKS constexpr
+    )
+    return random_mask
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::_triton_layer_norm_mul_dropout_fwd_impl", mutates_args=()
+)
+def _triton_layer_norm_mul_dropout_fwd_impl(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    seed: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Internal implementation that returns only tensors for custom_op compatibility.
+
+    Returns (y, mean, rstd, random_mask) where random_mask is empty when not used.
+    """
+    N, D = x.shape
+
+    if concat_u and concat_x:
+        y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+    elif concat_u:
+        y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+    elif concat_x:
+        y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty_like(x)
+    mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+    rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+    if N == 0:
+        return y, mean, rstd, torch.empty(0, dtype=x.dtype, device=x.device)
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D: int = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BLOCK_D:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+    num_warps: int = min(max(BLOCK_D // 256, 1), 8)
+    random_mask: torch.Tensor = torch.empty(0, dtype=x.dtype, device=x.device)
+    # Benchmark shows separating RNG from ln_mul_dropout kernel only benefits on
+    # blackwell when CONCAT_UX is enabled. (fused RNG kernel can benefit from rand3x fast
+    # dropout)
+    # Extended to support concat_u + concat_x for mask reuse optimization
+    if not FUSE_OUTPUT_LN_RNG_BLACKWELL and is_sm100_plus() and training:
+        random_mask = _create_dropout_mask(
+            N=N,
+            D=D,
+            BLOCK_D=BLOCK_D,
+            concat_u=concat_u,
+            concat_x=concat_x,
+            dropout_ratio=dropout_ratio,
+            seed=seed,
+            device=x.device,
+        )
+
+        def grid(META):
+            return (triton.cdiv(N, META["BLOCK_N"]),)
+
+        # pyre-ignore[28]
+        _ln_mul_dropout_fwd_rng[grid](
+            x,
+            u,
+            y,
+            weight,
+            bias,
+            mean,
+            rstd,
+            random_mask,
+            N,
+            D,
+            eps,
+            dropout_ratio,
+            x.stride(0),
+            u.stride(0),
+            y.stride(0),
+            random_mask.stride(0),
+            SILU_U=silu_u,
+            BLOCK_D=BLOCK_D,
+            TRAINING=training,
+            CONCAT_U=concat_u,
+            CONCAT_X=concat_x,
+            MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+        )
+
+    else:
+        # Default path: fused RNG generation
+        # Mask cannot be saved with fused RNG - it's generated inline in the kernel
+        # pyre-ignore[28]
+        _ln_mul_dropout_fwd[(N,)](
+            x,
+            u,
+            y,
+            weight,
+            bias,
+            mean,
+            rstd,
+            D,
+            eps,
+            seed,
+            dropout_ratio,
+            x.stride(0),
+            u.stride(0),
+            y.stride(0),
+            SILU_U=silu_u,
+            BLOCK_D=BLOCK_D,
+            TRAINING=training,
+            CONCAT_U=concat_u,
+            CONCAT_X=concat_x,
+            MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+            FAST_DROPOUT=COMPUTE_OUTPUT_LN_FAST_DROPOUT,
+            num_warps=num_warps,
+        )
+    return y, mean, rstd, random_mask
+
+
+@_triton_layer_norm_mul_dropout_fwd_impl.register_fake
+def _triton_layer_norm_mul_dropout_fwd_impl_fake(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    seed: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Fake implementation for FakeTensor tracing."""
+    N, D = x.shape
+    if concat_u and concat_x:
+        y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+    elif concat_u:
+        y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+    elif concat_x:
+        y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty_like(x)
+    mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+    rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+    random_mask = torch.empty(0, dtype=x.dtype, device=x.device)
+    return y, mean, rstd, random_mask
+
+
+def triton_layer_norm_mul_dropout_fwd(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    seed: Optional[int] = None,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, Optional[torch.Tensor]
+]:  # y, mean, rstd, BLOCK_D, num_warps, seed, random_mask
+    """Forward pass for layer norm + mul + dropout.
+
+    Args:
+        x: Input tensor of shape (N, D)
+        u: Second input tensor of shape (N, D)
+        weight: Layer norm weight of shape (D,)
+        bias: Layer norm bias of shape (D,)
+        eps: Layer norm epsilon
+        dropout_ratio: Dropout probability
+        training: Whether in training mode
+        silu_u: Whether to apply SiLU to u before concatenation
+        concat_u: Whether to concatenate u to output
+        concat_x: Whether to concatenate x to output
+        mul_u_activation_type: Activation type for u multiplication
+        seed: Random seed for dropout
+
+    Returns:
+        Tuple of (y, mean, rstd, BLOCK_D, num_warps, seed, random_mask)
+        - random_mask is None when using fused RNG path (non-SM100+)
+        - random_mask is always returned when using separate RNG path (SM100+)
+          for reuse in backward pass (avoids redundant mask generation)
+    """
+    assert x.dim() == 2
+    x = switch_to_contiguous_if_needed(x)
+    N, D = x.shape
+    assert weight.dim() == 1
+    assert bias.dim() == 1
+    assert weight.numel() == D
+    assert bias.numel() == D
+
+    if N == 0:
+        D = x.shape[1]
+        if concat_u and concat_x:
+            y = torch.empty((0, 3 * D), dtype=x.dtype, device=x.device)
+        elif concat_u or concat_x:
+            y = torch.empty((0, 2 * D), dtype=x.dtype, device=x.device)
+        else:
+            y = torch.empty_like(x)
+        return (
+            y,
+            torch.empty((N,), dtype=torch.float32, device=x.device),
+            torch.empty((N,), dtype=torch.float32, device=x.device),
+            0,
+            0,
+            0,
+            None,
+        )
+
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D: int = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BLOCK_D:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+    if seed is None and training:
+        # pyre-ignore[9]: torch.randint with dtype=int64 always returns int
+        seed = torch.randint(low=0, high=2**62, size=(1,), dtype=torch.int64).item()
+    num_warps: int = min(max(BLOCK_D // 256, 1), 8)
+
+    # Call internal implementation
+    y, mean, rstd, random_mask_tensor = _triton_layer_norm_mul_dropout_fwd_impl(
+        x,
+        u,
+        weight,
+        bias,
+        eps,
+        dropout_ratio,
+        training,
+        silu_u,
+        concat_u,
+        concat_x,
+        mul_u_activation_type,
+        seed if seed is not None else 0,
+    )
+
+    # Convert empty tensor back to None
+    random_mask: Optional[torch.Tensor] = (
+        random_mask_tensor if random_mask_tensor.numel() > 0 else None
+    )
+
+    return y, mean, rstd, BLOCK_D, num_warps, seed, random_mask  # pyre-ignore[7]
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::_triton_layer_norm_mul_dropout_bwd_impl", mutates_args=()
+)
+def _triton_layer_norm_mul_dropout_bwd_impl(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: int,
+    silu_u: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    compute_y: bool,
+    random_mask: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Internal implementation that returns only tensors for custom_op compatibility.
+
+    When compute_y is False, y is returned as an empty tensor.
+    random_mask with numel() == 0 means no mask (fused RNG path).
+    """
+    N, D = x.shape
+    if compute_y:
+        if concat_u and concat_x:
+            y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+        elif concat_u:
+            y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+        elif concat_x:
+            y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+        else:
+            y = torch.empty_like(x)
+    else:
+        y = torch.empty(0, dtype=x.dtype, device=x.device)
+
+    if N == 0:
+        return (
+            torch.zeros_like(x),
+            torch.zeros_like(u),
+            torch.zeros((D,), dtype=weight.dtype, device=x.device),
+            torch.zeros((D,), dtype=weight.dtype, device=x.device),
+            y,
+        )
+    dx = torch.empty_like(x)
+    du = torch.empty_like(u)
+    sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+    tile_num = max(1, min(sms * 64, N // 4))
+    _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+    _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+    dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+
+    # Use separated RNG when random_mask is provided (from forward pass on SM100+ path)
+    has_random_mask = random_mask.numel() > 0
+    if has_random_mask:
+        # pyre-ignore[28]
+        _ln_mul_dropout_bwd_dx_du_rng[(tile_num,)](
+            dx,
+            du,
+            dy,
+            _dweight,
+            _dbias,
+            x,
+            u,
+            y if compute_y else None,
+            weight,
+            bias,
+            mean,
+            rstd,
+            random_mask,
+            dx.stride(0),
+            du.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            u.stride(0),
+            y.stride(0) if compute_y else 0,
+            random_mask.stride(0),
+            D,
+            eps,
+            dropout_ratio,
+            N=N,
+            SILU_U=silu_u,
+            BLOCK_D=BLOCK_D,
+            TRAINING=training,
+            CONCAT_U=concat_u,
+            CONCAT_X=concat_x,
+            MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+            COMPUTE_Y=compute_y,
+            num_warps=num_warps,
+        )
+
+    else:
+        # pyre-ignore[28]
+        _ln_mul_dropout_bwd_dx_du[(tile_num,)](
+            dx,
+            du,
+            dy,
+            _dweight,
+            _dbias,
+            x,
+            u,
+            y if compute_y else None,
+            weight,
+            bias,
+            mean,
+            rstd,
+            dx.stride(0),
+            du.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            u.stride(0),
+            y.stride(0) if compute_y else 0,
+            D,
+            eps,
+            seed,
+            dropout_ratio,
+            N=N,
+            SILU_U=silu_u,
+            BLOCK_D=BLOCK_D,
+            TRAINING=training,
+            CONCAT_U=concat_u,
+            CONCAT_X=concat_x,
+            MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+            COMPUTE_Y=compute_y,
+            FAST_DROPOUT=COMPUTE_OUTPUT_LN_FAST_DROPOUT,
+            num_warps=num_warps,
+        )
+
+    def grid(META):
+        return (triton.cdiv(D, META["BLOCK_D"]),)
+
+    blocks = triton.next_power_of_2(sms * 4)
+    BLOCK_D_bwd = triton.next_power_of_2(triton.cdiv(D, blocks))
+    BLOCK_D_bwd = min(max(BLOCK_D_bwd, 4), 128)
+    _ln_mul_dropout_bwd_dwdb[grid](
+        _dweight,
+        _dbias,
+        dweight,
+        dbias,
+        tile_num,
+        D,
+        BLOCK_D=BLOCK_D_bwd,
+    )
+    return dx, du, dweight, dbias, y
+
+
+@_triton_layer_norm_mul_dropout_bwd_impl.register_fake
+def _triton_layer_norm_mul_dropout_bwd_impl_fake(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: int,
+    silu_u: bool,
+    concat_u: bool,
+    concat_x: bool,
+    mul_u_activation_type: str,
+    compute_y: bool,
+    random_mask: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Fake implementation for FakeTensor tracing."""
+    N, D = x.shape
+    dx = torch.empty_like(x)
+    du = torch.empty_like(u)
+    dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    if compute_y:
+        if concat_u and concat_x:
+            y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+        elif concat_u:
+            y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+        elif concat_x:
+            y = torch.empty((N, 2 * D), dtype=x.dtype, device=x.device)
+        else:
+            y = torch.empty_like(x)
+    else:
+        y = torch.empty(0, dtype=x.dtype, device=x.device)
+    return dx, du, dweight, dbias, y
+
+
+def triton_layer_norm_mul_dropout_bwd(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: Optional[int] = None,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    compute_y: bool = False,
+    random_mask: Optional[torch.Tensor] = None,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]
+]:
+    N, D = x.shape
+
+    # Use empty tensor as sentinel for no random_mask
+    random_mask_tensor = (
+        random_mask
+        if random_mask is not None
+        else torch.empty(0, dtype=x.dtype, device=x.device)
+    )
+
+    dx, du, dweight, dbias, y_tensor = _triton_layer_norm_mul_dropout_bwd_impl(
+        dy,
+        x,
+        u,
+        weight,
+        bias,
+        mean,
+        rstd,
+        BLOCK_D,
+        num_warps,
+        eps,
+        training,
+        dropout_ratio,
+        seed if seed is not None else 0,
+        silu_u,
+        concat_u,
+        concat_x,
+        mul_u_activation_type,
+        compute_y,
+        random_mask_tensor,
+    )
+
+    # Convert empty tensor back to None
+    y: Optional[torch.Tensor] = y_tensor if compute_y else None
+    return dx, du, dweight, dbias, y
+
+
+class LayerNormMulDropoutFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        u: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+        dropout_ratio: float,
+        training: bool,
+        silu_u: bool = False,
+        concat_ux: bool = False,
+        seed: Optional[int] = None,
+    ) -> torch.Tensor:
+        if dropout_ratio == 0.0:
+            # skip dropout computation if dropout ratio is 0
+            training = False
+        # skipping supporting concat_u and concat_x separately here because seems like this code path is only used in v1 of hstu_linear
+        concat_u, concat_x = concat_ux, concat_ux
+
+        # Call forward function which generates and returns random_mask
+        # On SM100+ path, random_mask is always returned for backward reuse
+        # On fused RNG path, random_mask is None (mask generated inline in kernel)
+        y, mean, rstd, BLOCK_D, num_warps, returned_seed, random_mask = (
+            triton_layer_norm_mul_dropout_fwd(
+                x=x,
+                u=u,
+                weight=weight,
+                bias=bias,
+                eps=eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                silu_u=silu_u,
+                concat_u=concat_u,
+                concat_x=concat_x,
+                seed=seed,
+            )
+        )
+
+        # Save tensors for backward pass
+        # When random_mask is generated (SM100+ path), always save it for reuse
+        # in backward pass. This avoids redundant _generate_random_mask execution.
+        if random_mask is not None:
+            ctx.save_for_backward(x, u, weight, bias, mean, rstd, random_mask)
+            ctx.has_random_mask = True
+        else:
+            ctx.save_for_backward(x, u, weight, bias, mean, rstd)
+            ctx.has_random_mask = False
+
+        ctx.BLOCK_D = BLOCK_D
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        ctx.seed = returned_seed
+        ctx.training = training
+        ctx.concat_ux = concat_ux
+        ctx.silu_u = silu_u
+        ctx.dropout_ratio = dropout_ratio
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        # Extract saved tensors including optional random mask
+        if ctx.has_random_mask:
+            x, u, weight, bias, mean, rstd, random_mask = ctx.saved_tensors
+        else:
+            x, u, weight, bias, mean, rstd = ctx.saved_tensors
+            random_mask = None
+
+        dx, du, dweight, dbias, _ = triton_layer_norm_mul_dropout_bwd(
+            dy=dy,
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            mean=mean,
+            rstd=rstd,
+            BLOCK_D=ctx.BLOCK_D,
+            num_warps=ctx.num_warps,
+            eps=ctx.eps,
+            training=ctx.training,
+            dropout_ratio=ctx.dropout_ratio,
+            seed=ctx.seed,
+            silu_u=ctx.silu_u,
+            concat_u=ctx.concat_ux,
+            concat_x=ctx.concat_ux,
+            compute_y=False,
+            random_mask=random_mask,  # Pass saved mask to backward
+        )
+        return dx, du, dweight, dbias, None, None, None, None, None, None
+
+
+@triton.jit
+def _group_norm_mul_dropout_fwd(
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    D,
+    Heads,
+    eps,
+    seed,
+    dropout_ratio,
+    stride_x,
+    stride_u,
+    stride_y,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_UX: tl.constexpr,
+):
+    row = tl.program_id(0)
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    Y += row.to(tl.int64) * stride_y
+    cols = tl.arange(0, BLOCK_D)
+    heads = tl.arange(0, BLOCK_H)
+    offsets = heads[:, None] * D + cols[None, :]
+    mask_h = heads < Heads
+    mask_c = cols < D
+    mask = mask_c[None, :] & mask_h[:, None]
+
+    # Compute mean
+    mean = 0.0
+    x = tl.load(X + offsets, mask=mask, other=0.0).to(tl.float32)
+    mean = tl.sum(x, axis=1) / D
+    mean = tl.ravel(mean)
+
+    # Compute variance
+    _var = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32)
+    x_mean = tl.where(mask, x - mean[:, None], 0.0)
+    _var += x_mean * x_mean
+    var = tl.sum(_var, axis=1) / D
+    var = tl.ravel(var)
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Mean + row * Heads + heads, mean, mask=mask_h)
+    tl.store(Rstd + row * Heads + heads, rstd, mask=mask_h)
+
+    # Normalize and apply linear transformation
+    y = x_mean * rstd[:, None]  # pyre-ignore [16]
+    w = tl.load(W + heads, mask=mask_h).to(tl.float32)
+    b = tl.load(B + heads, mask=mask_h).to(tl.float32)
+    y = y * w[:, None] + b[:, None]
+    u = tl.load(U + offsets, mask=mask, other=0.0).to(tl.float32)
+    if SILU_U:
+        u = u * tl.sigmoid(u)
+    y = y * u
+
+    if TRAINING:
+        if CONCAT_UX:
+            random_offsets = row * 3 * D * Heads + offsets
+            # apply dropout on u
+            random_u = tl.rand(seed, random_offsets)
+            u_keep = random_u > dropout_ratio
+            u = tl.where(u_keep, u / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on x
+            random_x = tl.rand(seed, random_offsets + Heads * D)
+            x_keep = random_x > dropout_ratio
+            x = tl.where(x_keep, x / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on y
+            random_y = tl.rand(seed, random_offsets + 2 * Heads * D)
+            y_keep = random_y > dropout_ratio
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+        else:
+            random_offsets = row * D * Heads + offsets
+            random = tl.rand(seed, random_offsets)
+            y_keep = random > dropout_ratio
+            # write-back
+            y = tl.where(y_keep, y / (1.0 - dropout_ratio), 0.0)
+
+    # Write output
+    if CONCAT_UX:
+        tl.store(Y + offsets, u.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + Heads * D + offsets, x.to(Y.dtype.element_ty), mask=mask)
+        tl.store(Y + 2 * Heads * D + offsets, y.to(Y.dtype.element_ty), mask=mask)
+    else:
+        tl.store(Y + offsets, y.to(Y.dtype.element_ty), mask=mask)
+
+
+@triton.jit
+def _group_norm_mul_dropout_bwd_dx_du(
+    DX,
+    DU,
+    DY,
+    DW,
+    DB,
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_du,
+    stride_dy,
+    stride_x,
+    stride_u,
+    stride_y,
+    D,
+    Heads,
+    eps,
+    seed,
+    dropout_ratio,
+    SILU_U: tl.constexpr,
+    GROUP_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_H: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_UX: tl.constexpr,
+    COMPUTE_Y: tl.constexpr,
+):
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_D)
+    off_heads = tl.arange(0, BLOCK_H)
+    mask_c = cols < D
+    mask_h = off_heads < Heads
+    mask = mask_c[None, :] & mask_h[:, None]
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+    DU += row.to(tl.int64) * stride_du
+    offsets = off_heads[:, None] * D + cols[None, :]
+
+    # Load data to SRAM
+    x = tl.load(X + offsets, mask=mask, other=0).to(tl.float32)
+    if CONCAT_UX:
+        du = tl.load(DY + offsets, mask=mask, other=0).to(tl.float32)
+        dx = tl.load(DY + Heads * D + offsets, mask=mask, other=0).to(tl.float32)
+        dy = tl.load(DY + 2 * Heads * D + offsets, mask=mask, other=0).to(tl.float32)
+    else:
+        du = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32)
+        dx = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32)
+        dy = tl.load(DY + offsets, mask=mask, other=0).to(tl.float32)
+    if TRAINING:
+        if CONCAT_UX:
+            random_offsets = row * 3 * D * Heads + offsets
+            # apply dropout on du
+            random_du = tl.rand(seed, random_offsets)
+            du_keep = random_du > dropout_ratio
+            du = tl.where(du_keep, du / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on dx
+            random_dx = tl.rand(seed, random_offsets + Heads * D)
+            dx_keep = random_dx > dropout_ratio
+            dx = tl.where(dx_keep, dx / (1.0 - dropout_ratio), 0.0)
+            # apply dropout on dy
+            random_dy = tl.rand(seed, random_offsets + 2 * Heads * D)
+            dy_keep = random_dy > dropout_ratio
+            dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+        else:
+            random_offsets = row * D * Heads + offsets
+            random = tl.rand(seed, random_offsets)
+            dy_keep = random > dropout_ratio
+            # write-back
+            dy = tl.where(dy_keep, dy / (1.0 - dropout_ratio), 0.0)
+
+    mean = tl.load(Mean + row * Heads + off_heads)
+    rstd = tl.load(Rstd + row * Heads + off_heads)
+
+    # Compute dx
+    xhat = (x - mean[:, None]) * rstd[:, None]
+    w = tl.load(W + off_heads, mask=mask_h).to(tl.float32)
+    b = tl.load(B + off_heads, mask=mask_h).to(tl.float32)
+    u = tl.load(U + offsets, mask=mask, other=0).to(tl.float32)
+    ln = xhat * w[:, None] + b[:, None]
+    du += dy * ln
+    if SILU_U:
+        sig_u = tl.sigmoid(u)
+        silu_u = u * sig_u
+        du = du * sig_u * (1 + u - silu_u)
+        u = silu_u
+    tl.store(DU + offsets, du.to(DU.dtype.element_ty), mask=mask)
+    dy = dy * u
+    wdy = w[:, None] * dy
+    if COMPUTE_Y:
+        Y += row.to(tl.int64) * stride_y
+        y = ln * u
+        if TRAINING:
+            if CONCAT_UX:
+                u = tl.where(
+                    du_keep,  # pyre-ignore [61]
+                    u / (1.0 - dropout_ratio),
+                    0.0,
+                )
+                x = tl.where(
+                    dx_keep,  # pyre-ignore [61]
+                    x / (1.0 - dropout_ratio),
+                    0.0,
+                )
+                y = tl.where(
+                    dy_keep,  # pyre-ignore [61]
+                    y / (1.0 - dropout_ratio),
+                    0.0,
+                )
+            else:
+                y = tl.where(
+                    dy_keep,  # pyre-ignore [61]
+                    y / (1.0 - dropout_ratio),
+                    0.0,
+                )
+        if CONCAT_UX:
+            tl.store(Y + offsets, u.to(Y.dtype.element_ty), mask=mask)
+            tl.store(Y + Heads * D + offsets, x.to(Y.dtype.element_ty), mask=mask)
+            tl.store(Y + 2 * Heads * D + offsets, y.to(Y.dtype.element_ty), mask=mask)
+        else:
+            tl.store(Y + offsets, y.to(Y.dtype.element_ty), mask=mask)
+
+    xhat = tl.where(mask, xhat, 0.0)
+    wdy = tl.where(mask, wdy, 0.0)
+    c1 = tl.sum(xhat * wdy, axis=1) / D
+    c2 = tl.sum(wdy, axis=1) / D
+    dx += (wdy - (xhat * c1[:, None] + c2[:, None])) * rstd[:, None]
+    # Write dx
+    tl.store(DX + offsets, dx, mask=mask)
+
+    # Offset locks and weights/biases gradient pointer for parallel reduction
+    lock_id = row % GROUP_N
+    DW = DW + lock_id * Heads + off_heads
+    DB = DB + lock_id * Heads + off_heads
+    # Accumulate partial sums for dw/db
+    partial_dw = tl.sum(dy * xhat, axis=1)
+    partial_dw = tl.ravel(partial_dw)
+    partial_db = tl.sum(dy, axis=1)
+    partial_db = tl.ravel(partial_db)
+    tl.atomic_add(
+        DW,
+        partial_dw,
+        mask=mask_h,
+        sem="relaxed",
+    )
+    tl.atomic_add(
+        DB,
+        partial_db,
+        mask=mask_h,
+        sem="relaxed",
+    )
+
+
+def triton_group_norm_mul_dropout_fwd(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+    seed: Optional[int] = None,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, int
+]:  # y, mean, rstd, BLOCK_D, BLOCK_H, num_warps, seed
+    assert x.dim() == 2
+    assert x.shape == u.shape
+    assert x.shape[1] == num_heads * linear_dim
+    x = switch_to_contiguous_if_needed(x)
+    u = switch_to_contiguous_if_needed(u)
+    N, _ = x.shape
+    assert weight.dim() == 1
+    assert bias.dim() == 1
+    assert weight.numel() == num_heads
+    assert bias.numel() == num_heads
+
+    if concat_ux:
+        y = torch.empty((N, 3 * num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty((N, num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    mean = torch.empty((N * num_heads,), dtype=torch.float32, device=x.device)
+    rstd = torch.empty((N * num_heads,), dtype=torch.float32, device=x.device)
+    if N == 0:
+        return y, mean, rstd, 0, 0, 0, 0
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D: int = triton.next_power_of_2(linear_dim)
+    BLOCK_H: int = triton.next_power_of_2(num_heads)
+    if BLOCK_D * BLOCK_H > MAX_FUSED_SIZE:
+        raise RuntimeError(
+            "This group norm doesn't support num_heads * linear_dim >= 64KB."
+        )
+
+    if seed is None:
+        seed = torch.randint(low=0, high=2**62, size=(1,), dtype=torch.int64).item()
+    num_warps: int = min(max(BLOCK_D * BLOCK_H // 256, 1), 8)
+    # pyre-ignore[28]
+    _group_norm_mul_dropout_fwd[(N,)](
+        x,
+        u,
+        y,
+        weight,
+        bias,
+        mean,
+        rstd,
+        linear_dim,
+        num_heads,
+        eps,
+        seed,
+        dropout_ratio,
+        x.stride(0),
+        u.stride(0),
+        y.stride(0),
+        SILU_U=silu_u,
+        BLOCK_D=BLOCK_D,
+        BLOCK_H=BLOCK_H,
+        TRAINING=training,
+        CONCAT_UX=concat_ux,
+        num_warps=num_warps,
+    )
+    return y, mean, rstd, BLOCK_D, BLOCK_H, num_warps, seed  # pyre-ignore [7]
+
+
+def triton_group_norm_mul_dropout_bwd(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    BLOCK_H: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: Optional[int] = None,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+    compute_y: bool = False,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]
+]:
+    y = None
+    N, dim = x.shape
+    if compute_y:
+        if concat_ux:
+            y = torch.empty(
+                (N, 3 * num_heads * linear_dim), dtype=x.dtype, device=x.device
+            )
+        else:
+            y = torch.empty((N, num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    if N == 0:
+        return (
+            torch.zeros_like(x),
+            torch.zeros_like(u),
+            torch.zeros_like(weight),
+            torch.zeros_like(bias),
+            y,
+        )
+    dx = torch.empty_like(x)
+    du = torch.empty_like(u)
+    if dim <= 1024:
+        GROUP_N = 256 * 8
+    elif dim <= 4096:
+        GROUP_N = 128 * 8
+    elif dim <= 8192:
+        GROUP_N = 96 * 8
+    else:
+        GROUP_N = 64 * 8
+    GROUP_N = N if GROUP_N > N else GROUP_N
+    _dweight = torch.zeros((GROUP_N, num_heads), dtype=torch.float32, device=x.device)
+    _dbias = torch.zeros((GROUP_N, num_heads), dtype=torch.float32, device=x.device)
+    dweight = torch.empty((num_heads,), dtype=weight.dtype, device=x.device)
+    dbias = torch.empty((num_heads,), dtype=weight.dtype, device=x.device)
+    # pyre-ignore[28]
+    _group_norm_mul_dropout_bwd_dx_du[(N,)](
+        dx,
+        du,
+        dy,
+        _dweight,
+        _dbias,
+        x,
+        u,
+        y,
+        weight,
+        bias,
+        mean,
+        rstd,
+        dx.stride(0),
+        du.stride(0),
+        dy.stride(0),
+        x.stride(0),
+        u.stride(0),
+        y.stride(0) if compute_y else 0,  # pyre-ignore [16]
+        linear_dim,
+        num_heads,
+        eps,
+        seed,
+        dropout_ratio,
+        SILU_U=silu_u,
+        GROUP_N=GROUP_N,
+        BLOCK_D=BLOCK_D,
+        BLOCK_H=BLOCK_H,
+        TRAINING=training,
+        CONCAT_UX=concat_ux,
+        COMPUTE_Y=compute_y,
+        num_warps=num_warps,
+    )
+    _group_norm_bwd_dwdb[(num_heads,)](
+        _dweight,
+        _dbias,
+        dweight,
+        dbias,
+        GROUP_N,
+    )
+    return dx, du, dweight, dbias, y
+
+
+def _get_bwd_dwdb_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [32, 64, 128, 256]:
+        for num_warps in [8, 16] + ([] if torch.ops.hip else [32]):
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bwd_dwdb_configs(),
+    key=[],
+)
+@triton.jit
+def _group_norm_bwd_dwdb(
+    DW,
+    DB,
+    FINAL_DW,
+    FINAL_DB,
+    N,
+    BLOCK_N: tl.constexpr,
+):
+    col = tl.program_id(0)
+    num_heads = tl.num_programs(0)
+    dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    db = tl.zeros((BLOCK_N,), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        mask = rows < N
+        offs = rows * num_heads + col
+        dw += tl.load(DW + offs, mask=mask, other=0.0)
+        db += tl.load(DB + offs, mask=mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    sum_db = tl.sum(db, axis=0)
+    tl.store(FINAL_DW + col, sum_dw.to(FINAL_DW.dtype.element_ty))
+    tl.store(FINAL_DB + col, sum_db.to(FINAL_DB.dtype.element_ty))
+
+
+class GroupNormMulDropoutFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        u: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+        dropout_ratio: float,
+        training: bool,
+        silu_u: bool = False,
+        concat_ux: bool = False,
+        num_heads: int = 1,
+        linear_dim: int = -1,
+        seed: Optional[int] = None,
+    ) -> torch.Tensor:
+        y, mean, rstd, BLOCK_D, BLOCK_H, num_warps, seed = (
+            triton_group_norm_mul_dropout_fwd(
+                x=x,
+                u=u,
+                weight=weight,
+                bias=bias,
+                eps=eps,
+                dropout_ratio=dropout_ratio,
+                training=training,
+                silu_u=silu_u,
+                concat_ux=concat_ux,
+                num_heads=num_heads,
+                linear_dim=linear_dim,
+                seed=seed,
+            )
+        )
+        ctx.save_for_backward(x, u, weight, bias, mean, rstd)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.BLOCK_H = BLOCK_H
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        ctx.seed = seed
+        ctx.training = training
+        ctx.silu_u = silu_u
+        ctx.concat_ux = concat_ux
+        ctx.dropout_ratio = dropout_ratio
+        ctx.num_heads = num_heads
+        ctx.linear_dim = linear_dim
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        x, u, weight, bias, mean, rstd = ctx.saved_tensors
+        dx, du, dweight, dbias, _ = triton_group_norm_mul_dropout_bwd(
+            dy=dy,
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            mean=mean,
+            rstd=rstd,
+            BLOCK_D=ctx.BLOCK_D,
+            BLOCK_H=ctx.BLOCK_H,
+            num_warps=ctx.num_warps,
+            eps=ctx.eps,
+            training=ctx.training,
+            dropout_ratio=ctx.dropout_ratio,
+            seed=ctx.seed,
+            silu_u=ctx.silu_u,
+            concat_ux=ctx.concat_ux,
+            num_heads=ctx.num_heads,
+            linear_dim=ctx.linear_dim,
+            compute_y=False,
+        )
+        return (
+            dx,
+            du,
+            dweight,
+            dbias,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+class HSTUComputeOutputFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        attn: torch.Tensor,
+        u: torch.Tensor,
+        x: torch.Tensor,
+        norm_weight: torch.Tensor,
+        norm_bias: torch.Tensor,
+        output_weight: torch.Tensor,
+        eps: float,
+        dropout_ratio: float,
+        training: bool,
+        silu_u: bool = False,
+        concat_u: bool = False,
+        concat_x: bool = False,
+        mul_u_activation_type: str = "none",
+        group_norm: bool = False,
+        num_heads: int = 1,
+        linear_dim: int = -1,
+        seed: Optional[int] = None,
+        recompute_y_in_backward: bool = False,
+    ) -> torch.Tensor:
+        if dropout_ratio == 0.0:
+            training = False
+
+        if group_norm:
+            y, mean, rstd, BLOCK_D, BLOCK_H, num_warps, seed = (
+                triton_group_norm_mul_dropout_fwd(
+                    x=attn,
+                    u=u,
+                    weight=norm_weight,
+                    bias=norm_bias,
+                    eps=eps,
+                    dropout_ratio=dropout_ratio,
+                    training=training,
+                    silu_u=silu_u,
+                    concat_ux=concat_u and concat_x,
+                    num_heads=num_heads,
+                    linear_dim=linear_dim,
+                    seed=seed,
+                )
+            )
+            ctx.BLOCK_H = BLOCK_H
+            random_mask = None
+        else:
+            y, mean, rstd, BLOCK_D, num_warps, seed, random_mask = (
+                triton_layer_norm_mul_dropout_fwd(
+                    x=attn,
+                    u=u,
+                    weight=norm_weight,
+                    bias=norm_bias,
+                    eps=eps,
+                    dropout_ratio=dropout_ratio,
+                    training=training,
+                    silu_u=silu_u,
+                    concat_u=concat_u,
+                    concat_x=concat_x,
+                    seed=seed,
+                )
+            )
+
+        out = maybe_triton_addmm_fwd(x=y, w=output_weight, y=x)
+
+        saved_tensors = [attn, u, norm_weight, norm_bias, mean, rstd, output_weight]
+        if not recompute_y_in_backward:
+            saved_tensors.append(y)
+        # Save random_mask for reuse in backward pass (avoids regenerating mask)
+        # When random_mask is available (SM100+ path), always save it.
+        if random_mask is not None:
+            saved_tensors.append(random_mask)
+            ctx.has_random_mask = True
+        else:
+            ctx.has_random_mask = False
+        ctx.save_for_backward(*saved_tensors)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        ctx.seed = seed
+        ctx.training = training
+        ctx.concat_u = concat_u
+        ctx.concat_x = concat_x
+        ctx.dropout_ratio = dropout_ratio
+        ctx.num_heads = num_heads
+        ctx.linear_dim = linear_dim
+        ctx.group_norm = group_norm
+        ctx.recompute_y_in_backward = recompute_y_in_backward
+        ctx.silu_u = silu_u
+        ctx.mul_u_activation_type = mul_u_activation_type
+        return out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dout: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,  # dattn
+        torch.Tensor,  # du
+        torch.Tensor,  # dx
+        torch.Tensor,  # d_norm_weight
+        torch.Tensor,  # d_norm_bias
+        torch.Tensor,  # d_output_weight
+        None,  # eps
+        None,  # dropout_ratio
+        None,  # training
+        None,  # silu_u
+        None,  # concat_u
+        None,  # concat_x
+        None,  # mul_u_activation_type
+        None,  # group_norm
+        None,  # num_heads
+        None,  # linear_dim
+        None,  # seed
+        None,  # recompute_y_in_backward
+    ]:
+        attn, u, norm_weight, norm_bias, mean, rstd, output_weight = ctx.saved_tensors[
+            :7
+        ]
+        # Extract optional saved tensors based on flags
+        next_idx = 7
+        if not ctx.recompute_y_in_backward:
+            saved_y = ctx.saved_tensors[next_idx]
+            next_idx += 1
+        else:
+            saved_y = None
+        if ctx.has_random_mask:
+            random_mask = ctx.saved_tensors[next_idx]
+        else:
+            random_mask = None
+        dy = torch.mm(dout, output_weight.t())
+
+        if ctx.group_norm:
+            dattn, du, d_norm_weight, d_norm_bias, y = (
+                triton_group_norm_mul_dropout_bwd(
+                    dy=dy,
+                    x=attn,
+                    u=u,
+                    weight=norm_weight,
+                    bias=norm_bias,
+                    mean=mean,
+                    rstd=rstd,
+                    BLOCK_D=ctx.BLOCK_D,
+                    BLOCK_H=ctx.BLOCK_H,
+                    num_warps=ctx.num_warps,
+                    eps=ctx.eps,
+                    training=ctx.training,
+                    dropout_ratio=ctx.dropout_ratio,
+                    seed=ctx.seed,
+                    silu_u=ctx.silu_u,
+                    concat_ux=ctx.concat_u and ctx.concat_x,
+                    num_heads=ctx.num_heads,
+                    linear_dim=ctx.linear_dim,
+                    compute_y=ctx.recompute_y_in_backward,
+                )
+            )
+        else:
+            dattn, du, d_norm_weight, d_norm_bias, y = (
+                triton_layer_norm_mul_dropout_bwd(
+                    dy=dy,
+                    x=attn,
+                    u=u,
+                    weight=norm_weight,
+                    bias=norm_bias,
+                    mean=mean,
+                    rstd=rstd,
+                    BLOCK_D=ctx.BLOCK_D,
+                    num_warps=ctx.num_warps,
+                    eps=ctx.eps,
+                    training=ctx.training,
+                    dropout_ratio=ctx.dropout_ratio,
+                    seed=ctx.seed,
+                    silu_u=ctx.silu_u,
+                    concat_u=ctx.concat_u,
+                    concat_x=ctx.concat_x,
+                    mul_u_activation_type=ctx.mul_u_activation_type,
+                    compute_y=ctx.recompute_y_in_backward,
+                    random_mask=random_mask,
+                )
+            )
+        if not ctx.recompute_y_in_backward:
+            y = saved_y
+        d_output_weight = torch.mm(y.t(), dout)
+        return (
+            dattn,
+            du,
+            dout,
+            d_norm_weight,
+            d_norm_bias,
+            d_output_weight,
+            None,  # eps
+            None,  # dropout_ratio
+            None,  # training
+            None,  # silu_u
+            None,  # concat_u
+            None,  # concat_x
+            None,  # mul_u_activation_type
+            None,  # group_norm
+            None,  # num_heads
+            None,  # linear_dim
+            None,  # seed
+            None,  # recompute_y_in_backward
+        )
+
+
+@triton.jit
+def _helion_ln_mul_dropout_fwd(
+    x,
+    weight,
+    bias,
+    u,
+    y,
+    mean,
+    rstd,
+    eps,
+    seed,
+    dropout_ratio,
+    D: tl.constexpr,
+    stride_x: tl.constexpr,
+    stride_u: tl.constexpr,
+    stride_y: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    CONCAT_UX: tl.constexpr,
+    SILU_U: tl.constexpr,
+    TRAINING: tl.constexpr,
+):
+    row = tl.program_id(0)
+    x += row.to(tl.int64) * stride_x
+    u += row.to(tl.int64) * stride_u
+    y += row.to(tl.int64) * stride_y
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+
+    # Load input
+    x_val = tl.load(x + cols, mask=mask, other=0.0).to(tl.float32)
+
+    # Precompute inverse of D for faster computation
+    inv_D = 1.0 / D
+
+    # Compute mean
+    mean_val = tl.sum(x_val, axis=0) * inv_D
+
+    # Center the data
+    x_mean = tl.where(mask, x_val - mean_val, 0.0)
+
+    # Compute variance
+    var = tl.sum(x_mean * x_mean, axis=0) * inv_D
+
+    # Compute reciprocal standard deviation
+    # pyre-fixme[16]
+    rstd_val = libdevice.rsqrt(var + eps)
+
+    # Normalize
+    y_norm = x_mean * rstd_val
+
+    # Apply weight and bias
+    w = tl.load(weight + cols, mask=mask, other=0.0).to(tl.float32)
+    b = tl.load(bias + cols, mask=mask, other=0.0).to(tl.float32)
+    y_ln = y_norm * w + b
+
+    # Load u and optionally apply SiLU activation
+    u_val = tl.load(u + cols, mask=mask, other=0.0).to(tl.float32)
+    if SILU_U:
+        u_processed = u_val * tl.sigmoid(u_val)
+    else:
+        u_processed = u_val
+
+    y_out = y_ln * u_processed
+
+    if TRAINING:
+        # Compute dropout scale
+        # pyre-fixme[16]
+        dropout_scale = fast_dividef(1.0, 1.0 - dropout_ratio)
+
+        if CONCAT_UX:
+            # Generate dropout masks
+            random_offsets = 3 * row * BLOCK_D + cols
+            random_u, random_x, random_y = rand3x(seed, random_offsets)
+
+            u_keep = random_u > dropout_ratio
+            x_keep = random_x > dropout_ratio
+            y_keep = random_y > dropout_ratio
+
+            # Apply dropout to u, x, y
+            u_output = tl.where(u_keep, u_processed * dropout_scale, 0.0)
+            x_output = tl.where(x_keep, x_val * dropout_scale, 0.0)
+            y_output = tl.where(y_keep, y_out * dropout_scale, 0.0)
+        else:
+            # Generate dropout mask for y
+            random_offsets = row * BLOCK_D + cols
+            random_y = tl.rand(seed, random_offsets)
+            y_keep = random_y > dropout_ratio
+
+            # Apply dropout to y
+            y_output = tl.where(y_keep, y_out * dropout_scale, 0.0)
+    else:
+        if CONCAT_UX:
+            u_output = u_processed
+            x_output = x_val
+        y_output = y_out
+
+    # Store outputs
+    if CONCAT_UX:
+        tl.store(y + cols, u_output.to(y.dtype.element_ty), mask=mask)
+        tl.store(y + D + cols, x_output.to(y.dtype.element_ty), mask=mask)
+        tl.store(y + 2 * D + cols, y_output.to(y.dtype.element_ty), mask=mask)
+    else:
+        tl.store(y + cols, y_output.to(y.dtype.element_ty), mask=mask)
+
+    # Store mean and rstd
+    tl.store(mean + row, mean_val)
+    tl.store(rstd + row, rstd_val)
+
+
+def helion_layer_norm_mul_dropout_fwd(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    seed: Optional[int] = None,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, int, int, int
+]:  # y, mean, rstd, BLOCK_D, num_warps, seed
+    N, D = x.shape
+
+    if seed is None:
+        seed = torch.randint(low=0, high=2**62, size=(1,), dtype=torch.int64).item()
+
+    if concat_ux:
+        y = torch.empty([N, 3 * D], dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty([N, D], dtype=x.dtype, device=x.device)
+    mean = torch.empty([N], dtype=torch.float32, device=x.device)
+    rstd = torch.empty([N], dtype=torch.float32, device=x.device)
+
+    BLOCK_D = triton.next_power_of_2(D)
+    # pyre-ignore[28]
+    _helion_ln_mul_dropout_fwd[(N,)](
+        x,
+        weight,
+        bias,
+        u,
+        y,
+        mean,
+        rstd,
+        eps,
+        seed,
+        dropout_ratio,
+        D,
+        x.stride(0),
+        u.stride(0),
+        y.stride(0),
+        BLOCK_D,
+        CONCAT_UX=concat_ux,
+        SILU_U=silu_u,
+        TRAINING=training,
+        num_warps=1,
+    )
+
+    return y, mean, rstd, BLOCK_D, 1, seed  # pyre-ignore [7]
+
+
+@triton.jit
+def _helion_ln_mul_dropout_bwd_dx_du(
+    DX,
+    DU,
+    DY,
+    DW,
+    DB,
+    X,
+    U,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_du,
+    stride_dy,
+    stride_x,
+    stride_u,
+    stride_y,
+    D: tl.constexpr,
+    eps,
+    seed,
+    dropout_ratio,
+    N,
+    SILU_U: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    TRAINING: tl.constexpr,
+    CONCAT_UX: tl.constexpr,
+    COMPUTE_Y: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    rows_per_tile = N // tile_num
+    if pid < N % tile_num:
+        rows_per_tile += 1
+
+    if rows_per_tile == 0:
+        return
+
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+
+    # precompute inverse of D
+    inv_D: tl.constexpr = 1.0 / D
+
+    row = pid
+    X += row.to(tl.int64) * stride_x
+    U += row.to(tl.int64) * stride_u
+    if COMPUTE_Y:
+        Y += row.to(tl.int64) * stride_y
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+    DU += row.to(tl.int64) * stride_du
+    DW = DW + pid * D + cols
+    DB = DB + pid * D + cols
+
+    partial_dw = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    partial_db = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    b = tl.load(B + cols, mask=mask).to(tl.float32)
+
+    for _idx in range(0, rows_per_tile):
+        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+        if CONCAT_UX:
+            du = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+            dx = tl.load(DY + D + cols, mask=mask, other=0).to(tl.float32)
+            dy = tl.load(DY + 2 * D + cols, mask=mask, other=0).to(tl.float32)
+        else:
+            du = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dx = tl.zeros([BLOCK_D], dtype=tl.float32)
+            dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+
+        if TRAINING:
+            # pyre-fixme[16]
+            dropout_scale = fast_dividef(1.0, 1.0 - dropout_ratio)
+            if CONCAT_UX:
+                random_offsets = 3 * row * BLOCK_D + cols
+                # apply dropout on du
+                random_du, random_dx, random_dy = rand3x(seed, random_offsets)
+                du_keep = random_du > dropout_ratio
+                du = tl.where(du_keep, du * dropout_scale, 0.0)
+                # apply dropout on dx
+                dx_keep = random_dx > dropout_ratio
+                dx = tl.where(dx_keep, dx * dropout_scale, 0.0)
+                # apply dropout on dy
+                dy_keep = random_dy > dropout_ratio
+                dy = tl.where(dy_keep, dy * dropout_scale, 0.0)
+            else:
+                random_offsets = row * BLOCK_D + cols
+                random = tl.rand(seed, random_offsets)
+                dy_keep = random > dropout_ratio
+                dy = tl.where(dy_keep, dy * dropout_scale, 0.0)
+
+        mean = tl.load(Mean + row)
+        rstd = tl.load(Rstd + row)
+
+        # Compute dx
+        xhat = (x - mean) * rstd
+        u = tl.load(U + cols, mask=mask, other=0).to(tl.float32)
+        ln = xhat * w + b
+        du += dy * ln
+
+        if SILU_U:
+            sig_u = tl.sigmoid(u)
+            silu_u = u * sig_u
+            du = du * sig_u * (1 + u - silu_u)
+            u = silu_u
+
+        tl.store(DU + cols, du.to(DU.dtype.element_ty), mask=mask)
+        dy = dy * u
+        wdy = w * dy
+
+        if COMPUTE_Y:
+            y = ln * u
+            if TRAINING:
+                # pyre-fixme[16]
+                dropout_scale_y = fast_dividef(1.0, 1.0 - dropout_ratio)
+                if CONCAT_UX:
+                    u = tl.where(du_keep, u * dropout_scale_y, 0.0)  # pyre-ignore [61]
+                    x = tl.where(dx_keep, x * dropout_scale_y, 0.0)  # pyre-ignore [61]
+                    y = tl.where(dy_keep, y * dropout_scale_y, 0.0)  # pyre-ignore [61]
+                else:
+                    y = tl.where(dy_keep, y * dropout_scale_y, 0.0)  # pyre-ignore [61]
+            if CONCAT_UX:
+                tl.store(Y + cols, u.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + D + cols, x.to(Y.dtype.element_ty), mask=mask)
+                tl.store(Y + 2 * D + cols, y.to(Y.dtype.element_ty), mask=mask)
+            else:
+                tl.store(Y + cols, y.to(Y.dtype.element_ty), mask=mask)
+            Y += tile_num.to(tl.int64) * stride_y
+
+        xhat = tl.where(mask, xhat, 0.0)
+        wdy = tl.where(mask, wdy, 0.0)
+        # multiply by inv_D
+        c1 = tl.sum(xhat * wdy, axis=0) * inv_D
+        c2 = tl.sum(wdy, axis=0) * inv_D
+        dx += (wdy - (xhat * c1 + c2)) * rstd
+
+        # Write dx
+        tl.store(DX + cols, dx, mask=mask)
+
+        # Accumulate partial sums for dw/db
+        partial_dw += dy * xhat
+        partial_db += dy
+
+        X += tile_num.to(tl.int64) * stride_x
+        U += tile_num.to(tl.int64) * stride_u
+        DY += tile_num.to(tl.int64) * stride_dy
+        DX += tile_num.to(tl.int64) * stride_dx
+        DU += tile_num.to(tl.int64) * stride_du
+        row += tile_num
+
+    tl.store(DW, partial_dw, mask=mask)
+    tl.store(DB, partial_db, mask=mask)
+
+
+@triton_autotune(
+    configs=_get_bwd_dwdb_configs(),
+    key=["D"],
+)
+@triton.jit
+def _helion_ln_mul_dropout_bwd_dwdb(
+    DW,
+    DB,
+    FINAL_DW,
+    FINAL_DB,
+    N,
+    D,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    cols = pid * BLOCK_D + tl.arange(0, BLOCK_D)
+    dw = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+    db = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+        off_mask = (rows[:, None] < N) & (cols[None, :] < D)
+        offs = rows[:, None] * D + cols[None, :]
+        dw += tl.load(DW + offs, mask=off_mask, other=0.0)
+        db += tl.load(DB + offs, mask=off_mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    sum_db = tl.sum(db, axis=0)
+    tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D)
+    tl.store(FINAL_DB + cols, sum_db.to(FINAL_DB.dtype.element_ty), mask=cols < D)
+
+
+def helion_layer_norm_mul_dropout_bwd(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    BLOCK_D: int,
+    num_warps: int,
+    eps: float,
+    training: bool,
+    dropout_ratio: float,
+    seed: Optional[int] = None,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    compute_y: bool = False,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]
+]:
+    y = None
+    N, D = x.shape
+    if compute_y:
+        if concat_ux:
+            y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+        else:
+            y = torch.empty_like(x)
+    if N == 0:
+        return (
+            torch.zeros_like(x),
+            torch.zeros_like(u),
+            torch.zeros((D,), dtype=weight.dtype, device=x.device),
+            torch.zeros((D,), dtype=weight.dtype, device=x.device),
+            y,
+        )
+    dx = torch.empty_like(x)
+    du = torch.empty_like(u)
+    sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+    tile_num = max(1, min(sms * 64, N // 4))
+    _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+    _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+    dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+
+    # pyre-ignore[28]
+    _helion_ln_mul_dropout_bwd_dx_du[(tile_num,)](
+        dx,
+        du,
+        dy,
+        _dweight,
+        _dbias,
+        x,
+        u,
+        y,
+        weight,
+        bias,
+        mean,
+        rstd,
+        dx.stride(0),
+        du.stride(0),
+        dy.stride(0),
+        x.stride(0),
+        u.stride(0),
+        y.stride(0) if compute_y else 0,  # pyre-ignore [16]
+        D,
+        eps,
+        seed,
+        dropout_ratio,
+        N=N,
+        SILU_U=silu_u,
+        BLOCK_D=BLOCK_D,
+        TRAINING=training,
+        CONCAT_UX=concat_ux,
+        COMPUTE_Y=compute_y,
+        num_warps=num_warps,
+    )
+
+    blocks = triton.next_power_of_2(sms * 4)
+    BLOCK_D_DWDB = triton.next_power_of_2(triton.cdiv(D, blocks))
+    BLOCK_D_DWDB = min(max(BLOCK_D_DWDB, 4), 128)
+    _helion_ln_mul_dropout_bwd_dwdb[(triton.cdiv(D, BLOCK_D_DWDB),)](
+        _dweight,
+        _dbias,
+        dweight,
+        dbias,
+        tile_num,
+        D,
+        BLOCK_D=BLOCK_D_DWDB,
+    )
+    return dx, du, dweight, dbias, y
+
+
+class HelionLayerNormMulDropoutFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        u: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+        dropout_ratio: float,
+        training: bool,
+        silu_u: bool = False,
+        concat_ux: bool = False,
+        seed: Optional[int] = None,
+    ) -> torch.Tensor:
+        if dropout_ratio == 0.0:
+            # skip dropout computation if dropout ratio is 0
+            training = False
+        y, mean, rstd, BLOCK_D, num_warps, seed = helion_layer_norm_mul_dropout_fwd(
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            silu_u=silu_u,
+            concat_ux=concat_ux,
+            seed=seed,
+        )
+        ctx.save_for_backward(x, u, weight, bias, mean, rstd)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        ctx.seed = seed
+        ctx.training = training
+        ctx.silu_u = silu_u
+        ctx.concat_ux = concat_ux
+        ctx.dropout_ratio = dropout_ratio
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        x, u, weight, bias, mean, rstd = ctx.saved_tensors
+        dx, du, dweight, dbias, _ = helion_layer_norm_mul_dropout_bwd(
+            dy=dy,
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            mean=mean,
+            rstd=rstd,
+            BLOCK_D=ctx.BLOCK_D,
+            num_warps=ctx.num_warps,
+            eps=ctx.eps,
+            training=ctx.training,
+            dropout_ratio=ctx.dropout_ratio,
+            seed=ctx.seed,
+            silu_u=ctx.silu_u,
+            concat_ux=ctx.concat_ux,
+            compute_y=False,
+        )
+        return dx, du, dweight, dbias, None, None, None, None, None, None
+
+
+@torch.fx.wrap
+def helion_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_ux: bool = False,
+    seed: Optional[int] = None,
+) -> torch.Tensor:
+    return HelionLayerNormMulDropoutFunction.apply(
+        x, u, weight, bias, eps, dropout_ratio, training, silu_u, concat_ux, seed
+    )
+
+
+@torch.fx.wrap
+def triton_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    group_norm: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+    seed: Optional[int] = None,
+) -> torch.Tensor:
+    if group_norm:
+        return GroupNormMulDropoutFunction.apply(
+            x,
+            u,
+            weight,
+            bias,
+            eps,
+            dropout_ratio,
+            training,
+            silu_u,
+            concat_u and concat_x,
+            num_heads,
+            linear_dim,
+            seed,
+        )
+    else:
+        return LayerNormMulDropoutFunction.apply(
+            x,
+            u,
+            weight,
+            bias,
+            eps,
+            dropout_ratio,
+            training,
+            silu_u,
+            concat_u and concat_x,
+            seed,
+        )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_hstu_compute_output(
+    attn: torch.Tensor,
+    u: torch.Tensor,
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    output_weight: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool = False,
+    concat_u: bool = False,
+    concat_x: bool = False,
+    mul_u_activation_type: str = "none",
+    group_norm: bool = False,
+    num_heads: int = 1,
+    linear_dim: int = -1,
+    seed: Optional[int] = None,
+    recompute_y_in_backward: bool = False,
+) -> torch.Tensor:
+    return HSTUComputeOutputFunction.apply(
+        attn,
+        u,
+        x,
+        norm_weight,
+        norm_bias,
+        output_weight,
+        eps,
+        dropout_ratio,
+        training,
+        silu_u,
+        concat_u,
+        concat_x,
+        mul_u_activation_type,
+        group_norm,
+        num_heads,
+        linear_dim,
+        seed,
+        recompute_y_in_backward,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py
new file mode 100644
index 000000000..bda97ff96
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_preprocess_and_attention.py
@@ -0,0 +1,342 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.ops.triton.triton_addmm import (
+    maybe_triton_addmm_fwd,
+    triton_addmm_bwd,
+    triton_addmm_fwd,
+)
+from generative_recommenders.ops.triton.triton_hstu_attention import (
+    _should_enable_tma,
+    triton_hstu_attention_bwd,
+    triton_hstu_attention_fwd,
+)
+from generative_recommenders.ops.triton.triton_layer_norm import (
+    compute_BLOCK_D,
+    triton_weighted_layer_norm_bwd,
+    triton_weighted_layer_norm_fwd,
+)
+from torch.nn import functional as F
+
+
+class _HSTUPreprocessAndAttentionFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore [14]
+    def forward(
+        ctx,  # pyre-ignore [2]
+        x: torch.Tensor,
+        norm_weight: torch.Tensor,
+        norm_bias: torch.Tensor,
+        norm_eps: float,
+        num_heads: int,
+        attn_dim: int,
+        hidden_dim: int,
+        uvqk_weight: torch.Tensor,
+        uvqk_bias: torch.Tensor,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        attn_alpha: float,
+        num_targets: Optional[torch.Tensor],
+        max_attn_len: int,
+        contextual_seq_len: int,
+        recompute_uvqk_in_backward: bool,
+        recompute_normed_x_in_backward: bool,
+        sort_by_length: bool,
+        enable_tma: bool,
+        num_softmax_heads: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert num_softmax_heads == 0, "Softmax attention is not supported"
+        normed_x, x_mean, x_rstd = triton_weighted_layer_norm_fwd(
+            x=x,
+            weight=norm_weight,
+            bias=norm_bias,
+            eps=norm_eps,
+        )
+        BLOCK_D = compute_BLOCK_D(x)
+        uvqk = maybe_triton_addmm_fwd(
+            x=normed_x, w=uvqk_weight, y=uvqk_bias
+        ).contiguous()
+        u, v, q, k = uvqk.split(
+            [
+                hidden_dim * num_heads,
+                hidden_dim * num_heads,
+                attn_dim * num_heads,
+                attn_dim * num_heads,
+            ],
+            dim=1,
+        )
+        q = q.view(-1, num_heads, attn_dim)
+        k = k.view(-1, num_heads, attn_dim)
+        v = v.view(-1, num_heads, hidden_dim)
+        silu_u = F.silu(u)
+        sort_by_length_indices = None
+        if sort_by_length:
+            seq_lengths = seq_offsets[1:] - seq_offsets[:-1]
+            _, sort_by_length_indices = torch.sort(
+                seq_lengths, descending=True, stable=False
+            )
+        out = triton_hstu_attention_fwd(
+            N=max_seq_len,
+            alpha=attn_alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            sort_by_length_indices=sort_by_length_indices,
+            enable_tma=enable_tma,
+            num_softmax_heads=num_softmax_heads,
+        )
+        # update ctx
+        saved_tensors = [
+            x,
+            norm_weight,
+            norm_bias,
+            x_mean,
+            x_rstd,
+            uvqk_weight,
+            seq_offsets,
+        ]
+        if num_targets is not None:
+            saved_tensors.append(num_targets)
+        if not recompute_normed_x_in_backward:
+            saved_tensors.append(normed_x)
+        if recompute_uvqk_in_backward:
+            saved_tensors.append(uvqk_bias)
+        else:
+            saved_tensors.append(uvqk)
+        if sort_by_length:
+            saved_tensors.append(sort_by_length_indices)
+        ctx.save_for_backward(*saved_tensors)
+        ctx.attn_alpha = attn_alpha
+        ctx.has_multiple_targets = num_targets is not None
+        ctx.max_seq_len = max_seq_len
+        ctx.max_attn_len = max_attn_len
+        ctx.recompute_normed_x_in_backward = recompute_normed_x_in_backward
+        ctx.recompute_uvqk_in_backward = recompute_uvqk_in_backward
+        ctx.hidden_dim = hidden_dim
+        ctx.attn_dim = attn_dim
+        ctx.num_heads = num_heads
+        ctx.uvqk_bias_1d = uvqk_bias.dim() == 1
+        ctx.norm_eps = norm_eps
+        ctx.norm_BLOCK_D = BLOCK_D
+        ctx.contextual_seq_len = contextual_seq_len
+        ctx.sort_by_length = sort_by_length
+        ctx.enable_tma = enable_tma
+        ctx.num_softmax_heads = num_softmax_heads
+        return silu_u, out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx,  # pyre-ignore[2]
+        dsilu_u: torch.Tensor,
+        dout: torch.Tensor,
+    ) -> Tuple[
+        torch.Tensor,  # d_x
+        torch.Tensor,  # d_norm_weight
+        torch.Tensor,  # d_norm_bias
+        None,
+        None,
+        None,
+        None,
+        torch.Tensor,  # d_uvqk_weight
+        torch.Tensor,  # d_uvqk_bias
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        x, norm_weight, norm_bias, x_mean, x_rstd, uvqk_weight, seq_offsets = (
+            ctx.saved_tensors[:7]
+        )
+        idx = 7
+        if ctx.has_multiple_targets:
+            num_targets = ctx.saved_tensors[idx]
+            idx += 1
+        else:
+            num_targets = None
+        if ctx.recompute_normed_x_in_backward:
+            normed_x, _, _ = triton_weighted_layer_norm_fwd(
+                x=x,
+                weight=norm_weight,
+                bias=norm_bias,
+                eps=ctx.norm_eps,
+                mean=x_mean,
+                rstd=x_rstd,
+            )
+        else:
+            normed_x = ctx.saved_tensors[idx]
+            idx += 1
+        if ctx.recompute_uvqk_in_backward:
+            uvqk_bias = ctx.saved_tensors[idx]
+            uvqk = maybe_triton_addmm_fwd(x=normed_x, w=uvqk_weight, y=uvqk_bias)
+            idx += 1
+        else:
+            uvqk = ctx.saved_tensors[idx]
+            idx += 1
+        if ctx.sort_by_length:
+            sort_by_length_indices = ctx.saved_tensors[idx]
+        else:
+            sort_by_length_indices = None
+
+        duvqk = torch.empty_like(uvqk)
+        du, dv, dq, dk = duvqk.split(
+            [
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+            ],
+            dim=1,
+        )
+        u, v, q, k = uvqk.split(
+            [
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.hidden_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+                ctx.attn_dim * ctx.num_heads,
+            ],
+            dim=1,
+        )
+        q = q.view(-1, ctx.num_heads, ctx.attn_dim)
+        k = k.view(-1, ctx.num_heads, ctx.attn_dim)
+        v = v.view(-1, ctx.num_heads, ctx.hidden_dim)
+        dq = dq.view(-1, ctx.num_heads, ctx.attn_dim)
+        dk = dk.view(-1, ctx.num_heads, ctx.attn_dim)
+        dv = dv.view(-1, ctx.num_heads, ctx.hidden_dim)
+        # Note: the operation below updates duvqk in place
+        triton_hstu_attention_bwd(
+            dout=dout,
+            q=q,
+            k=k,
+            v=v,
+            dq=dq,
+            dk=dk,
+            dv=dv,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            N=ctx.max_seq_len,
+            max_attn_len=ctx.max_attn_len,
+            alpha=ctx.attn_alpha,
+            contextual_seq_len=ctx.contextual_seq_len,
+            sort_by_length_indices=sort_by_length_indices,
+            enable_tma=ctx.enable_tma,
+            num_softmax_heads=ctx.num_softmax_heads,
+        )
+        torch.ops.aten.silu_backward(dsilu_u, u, grad_input=du)
+        d_normed_x, d_uvqk_weight, d_uvqk_bias = triton_addmm_bwd(
+            x=normed_x,
+            w=uvqk_weight,
+            dz=duvqk,
+            is_y_1d=ctx.uvqk_bias_1d,
+        )
+        d_x, d_norm_weight, d_norm_bias = triton_weighted_layer_norm_bwd(
+            dy=d_normed_x,
+            x=x,
+            weight=norm_weight,
+            bias=norm_bias,
+            mean=x_mean,
+            rstd=x_rstd,
+            learnable=True,
+            eps=ctx.norm_eps,
+            BLOCK_D=ctx.norm_BLOCK_D,
+        )
+        # pyre-ignore[7]
+        return (
+            d_x,
+            d_norm_weight,
+            d_norm_bias,
+            None,
+            None,
+            None,
+            None,
+            d_uvqk_weight,
+            d_uvqk_bias,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+def triton_hstu_preprocess_and_attention(
+    x: torch.Tensor,
+    norm_weight: torch.Tensor,
+    norm_bias: torch.Tensor,
+    norm_eps: float,
+    num_heads: int,
+    attn_dim: int,
+    hidden_dim: int,
+    uvqk_weight: torch.Tensor,
+    uvqk_bias: torch.Tensor,
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    attn_alpha: float,
+    num_targets: Optional[torch.Tensor],
+    max_attn_len: int = 0,
+    contextual_seq_len: int = 0,
+    recompute_uvqk_in_backward: bool = False,
+    recompute_normed_x_in_backward: bool = False,
+    sort_by_length: bool = False,
+    enable_tma: Optional[bool] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # When the caller does not specify enable_tma, auto-detect whether the
+    # TMA / TLX fast path is safe on this device. Resolving here (vs inside
+    # the autograd Function.forward) keeps a concrete bool flowing through
+    # ctx.save_for_backward / ctx attributes.
+    if enable_tma is None:
+        enable_tma = _should_enable_tma()
+    return _HSTUPreprocessAndAttentionFunction.apply(
+        x,
+        norm_weight,
+        norm_bias,
+        norm_eps,
+        num_heads,
+        attn_dim,
+        hidden_dim,
+        uvqk_weight,
+        uvqk_bias,
+        max_seq_len,
+        seq_offsets,
+        attn_alpha,
+        num_targets,
+        max_attn_len,
+        contextual_seq_len,
+        recompute_uvqk_in_backward,
+        recompute_normed_x_in_backward,
+        sort_by_length,
+        enable_tma,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
new file mode 100644
index 000000000..7a4e82cf4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
@@ -0,0 +1,2533 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+import os
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    autotune_max_seq_len,
+    fine_grained_autotune_max_seq_len,
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from generative_recommenders.ops.utils import is_sm100_plus, is_sm90
+from torch._inductor.runtime import triton_helpers
+
+try:
+    torch.ops.load_library(
+        "//generative_recommenders/fb/ultra/ops/hopper/jagged_dense_bmm_add:jagged_dense_bmm_add"
+    )
+except OSError:
+    pass
+
+CUDA_JAGGED_DENSE_BMM_FWD = False
+CUDA_JAGGED_DENSE_BMM_BWD = False
+
+SPLIT_2D_JAGGED_KERNEL = None
+GLN_MUL_DROPOUT_KERNEL = None
+CONCAT_2D_JAGGED_KERNEL = None
+
+
+def set_cuda_jagged_dense_bmm_fwd(value: bool) -> None:
+    global CUDA_JAGGED_DENSE_BMM_FWD
+    CUDA_JAGGED_DENSE_BMM_FWD = value
+
+
+def get_cuda_jagged_dense_bmm_fwd() -> bool:
+    # currently only supports H100
+    return CUDA_JAGGED_DENSE_BMM_FWD and is_sm90()
+
+
+def set_cuda_jagged_dense_bmm_bwd(value: bool) -> None:
+    global CUDA_JAGGED_DENSE_BMM_BWD
+    CUDA_JAGGED_DENSE_BMM_BWD = value
+
+
+def get_cuda_jagged_dense_bmm_bwd() -> bool:
+    # currently only supports H100
+    return CUDA_JAGGED_DENSE_BMM_BWD and is_sm90()
+
+
+def set_split_2d_jagged_kernel(value: Optional[str]) -> None:
+    global SPLIT_2D_JAGGED_KERNEL
+    SPLIT_2D_JAGGED_KERNEL = value
+
+
+def get_split_2d_jagged_kernel() -> Optional[str]:
+    # only override during training
+    if torch.is_grad_enabled():
+        return SPLIT_2D_JAGGED_KERNEL
+    return None
+
+
+def set_concat_2d_jagged_kernel(value: Optional[str]) -> None:
+    global CONCAT_2D_JAGGED_KERNEL
+    CONCAT_2D_JAGGED_KERNEL = value
+
+
+def get_concat_2d_jagged_kernel() -> Optional[str]:
+    # only override during training
+    if torch.is_grad_enabled():
+        return CONCAT_2D_JAGGED_KERNEL
+    return None
+
+
+def _should_use_multirow() -> bool:
+    """Check if multirow kernel should be used based on current hardware.
+
+    Can be overridden via the JAGGED_USE_MULTIROW_MI350 environment variable:
+      JAGGED_USE_MULTIROW_MI350=1  -> force multirow on
+      JAGGED_USE_MULTIROW_MI350=0  -> force multirow off
+      unset                  -> auto-detect based on hardware (SM100+ or MI350)
+    """
+    env = os.environ.get("JAGGED_USE_MULTIROW_MI350")
+    if env is not None:
+        return env == "1"
+    return is_sm100_plus()
+
+
+def set_gln_mul_dropout_kernel(value: Optional[str]) -> None:
+    global GLN_MUL_DROPOUT_KERNEL
+    GLN_MUL_DROPOUT_KERNEL = value
+
+
+def get_gln_mul_dropout_kernel() -> Optional[str]:
+    # only override during training
+    return GLN_MUL_DROPOUT_KERNEL
+
+
+def _triton_concat_2D_jagged_internal(
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    values_out: torch.Tensor,
+    max_seq_len: int,
+    B: int,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    D: int,
+    dense_size: int,
+    stride_dense_batch: int,
+    n_prefix: int,
+    is_dense_a: bool,
+    is_dense_b: bool,
+    is_replace: bool,
+    BLOCK_D: int,
+) -> None:
+    use_multirow = _should_use_multirow()
+    if n_prefix != 0:
+        if use_multirow:
+
+            def grid(meta):
+                return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+            concat_2D_jagged_jagged_w_prefix_multirow[grid](
+                OffsetsA=offsets_a,
+                ValuesA=values_a,
+                OffsetsB=offsets_b,
+                ValuesB=values_b,
+                Out=values_out,
+                D=D,
+                stride_ad=values_a.stride(-2),
+                stride_bd=values_b.stride(-2),
+                stride_od=values_out.stride(0),
+                n_prefix_from_B=n_prefix,
+                BLOCK_D=BLOCK_D,
+            )
+        else:
+            concat_2D_jagged_jagged_w_prefix[(max_seq_len, B)](
+                OffsetsA=offsets_a,
+                ValuesA=values_a,
+                OffsetsB=offsets_b,
+                ValuesB=values_b,
+                Out=values_out,
+                D=D,
+                stride_ad=values_a.stride(-2),
+                stride_bd=values_b.stride(-2),
+                stride_od=values_out.stride(0),
+                n_prefix_from_B=n_prefix,
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+            )
+    else:
+        if use_multirow:
+
+            def grid(meta):
+                return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+            concat_2D_jagged_multirow[grid](
+                OffsetsA=offsets_a,
+                ValuesA=values_a,
+                OffsetsB=offsets_b,
+                ValuesB=values_b,
+                DenseSize=dense_size,
+                Out=values_out,
+                D=D,
+                stride_ad=values_a.stride(-2),
+                stride_bd=values_b.stride(-2),
+                stride_dense_batch=stride_dense_batch,
+                stride_od=values_out.stride(0),
+                IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+                IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+                IS_REPLACE=is_replace,  # pyre-ignore[6]
+            )
+        else:
+            concat_2D_jagged[(max_seq_len, B)](
+                OffsetsA=offsets_a,
+                ValuesA=values_a,
+                OffsetsB=offsets_b,
+                ValuesB=values_b,
+                DenseSize=dense_size,
+                Out=values_out,
+                D=D,
+                stride_ad=values_a.stride(-2),
+                stride_bd=values_b.stride(-2),
+                stride_dense_batch=stride_dense_batch,
+                stride_od=values_out.stride(0),
+                IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+                IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+                IS_REPLACE=is_replace,  # pyre-ignore[6]
+            )
+
+
+def _get_split_concat_2d_jagged_multirow_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [1, 2, 4, 8]:
+        for num_warps in [1, 2, 4]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+def _get_split_concat_2d_jagged_multirow_configs_wrapper() -> List[triton.Config]:
+    # Use extended config space only when JAGGED_USE_MULTIROW_MI350 is explicitly set,
+    # otherwise fall back to the existing configs to avoid breaking autotune.
+    if os.environ.get("JAGGED_USE_MULTIROW_MI350") is not None:
+        configs = []
+        # Extended config space for MI350 tuning
+        # - BLOCK_N: number of rows processed per block
+        # - num_warps: number of warps (AMD wavefront = 64 threads)
+        # - num_stages: software pipeline depth for memory latency hiding
+        #   NOTE: num_stages=0 is invalid for AMD GPUs, start from 1
+        # - waves_per_eu: AMD-specific, controls occupancy (waves per execution unit)
+        for BLOCK_N in [1, 2, 4, 8, 16, 32]:
+            for num_warps in [1, 2, 4, 8, 16, 32]:
+                for num_stages in [1, 2, 3, 4]:
+                    for waves_per_eu in [0, 1, 2, 4]:
+                        configs.append(
+                            triton.Config(
+                                {"BLOCK_N": BLOCK_N, "waves_per_eu": waves_per_eu},
+                                num_warps=num_warps,
+                                num_stages=num_stages,
+                            )
+                        )
+        return configs
+    return _get_split_concat_2d_jagged_multirow_configs()
+
+
+def _get_bmm_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_M in [64, 128]:
+        for BLOCK_N in [64, 128, 256]:
+            for BLOCK_K in [32, 64]:
+                for num_stages in [3, 5]:
+                    for num_warps in [4, 8]:
+                        configs.append(
+                            triton.Config(
+                                {
+                                    "BLOCK_M": BLOCK_M,
+                                    "BLOCK_N": BLOCK_N,
+                                    "BLOCK_K": BLOCK_K,
+                                },
+                                num_stages=num_stages,
+                                num_warps=num_warps,
+                            )
+                        )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bmm_configs(),
+    key=["AUTOTUNE_MAX_SEQ_LEN", "N", "K", "ELEMENTWISE", "HAS_BIAS"],
+)
+@triton.jit
+def jagged_dense_bmm_broadcast_add_kernel(
+    seq_offsets,
+    Jagged,
+    Dense,
+    Bias,
+    Out,
+    AUTOTUNE_MAX_SEQ_LEN,
+    N,
+    K,
+    stride_jm,
+    stride_db,
+    stride_dk,
+    stride_dn,
+    stride_bias_b,
+    stride_om,
+    HAS_BIAS: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    ELEMENTWISE: tl.constexpr,
+):
+    """
+    Computing bmm Out = Jagged x Dense + Bias
+    M is the jagged dimension
+    Jagged has shape (sum_B(M_i), K), Dense has shape (B, K, N), Bias has shape (B, N), and Out has shape (sum_B(M_i), N)
+    """
+
+    off_n = tl.program_id(0)
+    off_m = tl.program_id(1).to(tl.int64)
+    off_b = tl.program_id(2)
+
+    seq_start = tl.load(seq_offsets + off_b).to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+    start_m = off_m * BLOCK_M
+    start_n = off_n * BLOCK_N
+    if start_m >= seq_len:
+        return
+
+    Jagged += (seq_start + start_m) * stride_jm
+    Dense += off_b.to(tl.int64) * stride_db
+    Out += seq_start * stride_om
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_K)
+    jg_ptrs = Jagged + offs_m[:, None] * stride_jm + offs_k[None, :]
+    dn_ptrs = Dense + offs_k[:, None] * stride_dk + offs_n[None, :] * stride_dn
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in range(0, K, BLOCK_K):
+        jg = tl.load(
+            jg_ptrs,
+            # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+            mask=(offs_m[:, None] < (seq_len - start_m)) & ((k + offs_k)[None, :] < K),
+            other=0.0,
+        )
+        dn = tl.load(
+            dn_ptrs,
+            mask=((k + offs_k)[:, None] < K) & (offs_n[None, :] < N),
+            other=0.0,
+        )
+        accumulator += tl.dot(jg, dn, allow_tf32=ALLOW_TF32)
+        jg_ptrs += BLOCK_K
+        dn_ptrs += BLOCK_K * stride_dk
+
+    if HAS_BIAS:
+        if ELEMENTWISE:
+            Bias += (seq_start + start_m) * stride_bias_b
+            bias_ptrs = Bias + offs_m[:, None] * stride_bias_b + offs_n[None, :]
+            bias = tl.load(
+                bias_ptrs,
+                mask=(offs_m[:, None] < (seq_len - start_m)) & (offs_n[None, :] < N),
+                other=0.0,
+            )
+            accumulator += bias.to(tl.float32)
+        else:
+            bias_ptrs = Bias + off_b.to(tl.int64) * stride_bias_b + offs_n
+            bias = tl.load(bias_ptrs, mask=offs_n < N)
+            accumulator += bias[None, :].to(tl.float32)
+
+    out = accumulator.to(Out.dtype.element_ty)
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    Out += start_m * stride_om
+    out_ptrs = Out + offs_m[:, None] * stride_om + offs_n[None, :]
+    tl.store(
+        out_ptrs,
+        out,
+        mask=(offs_m[:, None] < (seq_len - start_m)) & (offs_n[None, :] < N),
+    )
+
+
+def _get_bmm_reduce_sum_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_M in [64, 128]:
+        for BLOCK_N in [64, 128]:
+            for BLOCK_K in [64, 128]:
+                for num_stages in [3, 4]:
+                    for num_warps in [4, 8]:
+                        configs.append(
+                            triton.Config(
+                                {
+                                    "BLOCK_M": BLOCK_M,
+                                    "BLOCK_N": BLOCK_N,
+                                    "BLOCK_K": BLOCK_K,
+                                },
+                                num_stages=num_stages,
+                                num_warps=num_warps,
+                            )
+                        )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bmm_reduce_sum_configs(),
+    key=["M", "N", "AUTOTUNE_MAX_SEQ_LEN"],
+)
+@triton.jit
+def _jagged_jagged_bmm_reduce_sum(
+    seq_offsets,
+    JaggedA,
+    JaggedB,
+    Out,
+    ReduceOut,
+    M,
+    N,
+    AUTOTUNE_MAX_SEQ_LEN,
+    stride_ak,
+    stride_bk,
+    stride_ob,
+    stride_om,
+    stride_on,
+    stride_orb,
+    stride_orn,
+    REDUCE_JAGGEDB: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+):
+    """
+    Computing bmm Out = Jagged x Jagged
+    K is the jagged dimension
+    JaggedA has shape (sum_B(K_i), M), JaggedB has shape (sum_B(K_i), N), and Out has shape (B, M, N)
+    """
+
+    off_m = tl.program_id(0).to(tl.int64)
+    off_n = tl.program_id(1)
+    off_b = tl.program_id(2)
+
+    seq_start = tl.load(seq_offsets + off_b).to(tl.int64)
+    seq_end = tl.load(seq_offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+
+    start_m = off_m * BLOCK_M
+    start_n = off_n * BLOCK_N
+
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    Out += off_b.to(tl.int64) * stride_ob
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    Out += start_m * stride_om
+    out_ptrs = Out + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
+    if REDUCE_JAGGEDB:
+        out_reduce_ptrs = (
+            ReduceOut + off_b.to(tl.int64) * stride_orb + offs_n * stride_orn
+        )
+        acc_reduce = tl.zeros((BLOCK_N,), dtype=tl.float32)
+    if seq_len == 0:
+        out = accumulator.to(Out.dtype.element_ty)
+        tl.store(
+            out_ptrs,
+            out,
+            mask=(offs_m[:, None] < (M - start_m)) & (offs_n[None, :] < N),
+        )
+        if REDUCE_JAGGEDB:
+            if off_m == 0:
+                tl.store(
+                    out_reduce_ptrs,  # pyre-ignore [61]
+                    acc_reduce.to(ReduceOut.dtype.element_ty),
+                    mask=(offs_n < N),
+                )
+        return
+
+    JaggedA += seq_start * stride_ak
+    JaggedB += seq_start * stride_bk
+    offs_k = tl.arange(0, BLOCK_K)
+    jg_a_ptrs = JaggedA + offs_k[None, :] * stride_ak + (start_m + offs_m)[:, None]
+    jg_b_ptrs = JaggedB + offs_k[:, None] * stride_bk + offs_n[None, :]
+
+    for k in range(0, seq_len, BLOCK_K):
+        jg_a = tl.load(
+            jg_a_ptrs,
+            # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+            mask=(offs_m[:, None] < (M - start_m)) & ((k + offs_k)[None, :] < seq_len),
+            other=0.0,
+        )
+        jg_b = tl.load(
+            jg_b_ptrs,
+            mask=(offs_n[None, :] < N) & ((k + offs_k)[:, None] < seq_len),
+            other=0.0,
+        )
+
+        accumulator += tl.dot(jg_a, jg_b, allow_tf32=ALLOW_TF32)
+        if REDUCE_JAGGEDB:
+            if off_m == 0:
+                acc_reduce += tl.sum(jg_b.to(tl.float32), axis=0)
+
+        jg_a_ptrs += BLOCK_K * stride_ak
+        jg_b_ptrs += BLOCK_K * stride_bk
+
+    out = accumulator.to(Out.dtype.element_ty)
+    tl.store(
+        out_ptrs,
+        out,
+        mask=(offs_m[:, None] < (M - start_m)) & (offs_n[None, :] < N),
+    )
+    if REDUCE_JAGGEDB:
+        if off_m == 0:
+            tl.store(
+                out_reduce_ptrs,  # pyre-ignore [61]
+                acc_reduce.to(ReduceOut.dtype.element_ty),
+                mask=(offs_n < N),
+            )
+
+
+class _JaggedDenseBmmFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        jagged: torch.Tensor,
+        dense: torch.Tensor,
+    ):
+        jagged = switch_to_contiguous_if_needed(jagged)
+        L, D = jagged.shape
+        B, _, K = dense.shape
+        bmm_out = torch.empty((L, K), dtype=jagged.dtype, device=jagged.device)
+
+        grid = lambda meta: (  # noqa E731
+            triton.cdiv(K, meta["BLOCK_N"]),
+            triton.cdiv(max_seq_len, meta["BLOCK_M"]),
+            B,
+        )
+
+        jagged_dense_bmm_broadcast_add_kernel[grid](
+            seq_offsets=seq_offsets,
+            Jagged=jagged,
+            Dense=dense,
+            Bias=0,
+            Out=bmm_out,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+            N=K,
+            K=D,
+            stride_jm=jagged.stride(0),
+            stride_db=dense.stride(0),
+            stride_dk=dense.stride(1),
+            stride_dn=dense.stride(2),
+            stride_bias_b=0,
+            stride_om=bmm_out.stride(0),
+            HAS_BIAS=False,
+            ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+            ELEMENTWISE=False,
+        )
+
+        ctx.save_for_backward(seq_offsets, jagged, dense)
+        ctx.B = B
+        ctx.max_seq_len = max_seq_len
+        ctx.K = K
+        ctx.D = D
+        return bmm_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_bmm_out: torch.Tensor
+    ) -> Tuple[None, None, torch.Tensor, torch.Tensor]:
+        seq_offsets, jagged, dense = ctx.saved_tensors
+        d_jagged = torch.empty_like(jagged)
+        d_dense = torch.empty_like(dense)
+
+        grid = lambda meta: (  # noqa E731
+            triton.cdiv(ctx.D, meta["BLOCK_N"]),
+            triton.cdiv(ctx.max_seq_len, meta["BLOCK_M"]),
+            ctx.B,
+        )
+        jagged_dense_bmm_broadcast_add_kernel[grid](
+            seq_offsets=seq_offsets,
+            Jagged=d_bmm_out,
+            Dense=dense,
+            Bias=None,
+            Out=d_jagged,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(ctx.max_seq_len),
+            N=ctx.D,
+            K=ctx.K,
+            stride_jm=d_bmm_out.stride(0),
+            stride_db=dense.stride(0),
+            stride_dk=dense.stride(2),
+            stride_dn=dense.stride(1),
+            stride_bias_b=0,
+            stride_om=d_jagged.stride(0),
+            HAS_BIAS=False,
+            ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+            ELEMENTWISE=False,
+        )
+
+        grid = lambda meta: (  # noqa E731
+            triton.cdiv(ctx.D, meta["BLOCK_M"]),
+            triton.cdiv(ctx.K, meta["BLOCK_N"]),
+            ctx.B,
+        )
+        _jagged_jagged_bmm_reduce_sum[grid](
+            seq_offsets=seq_offsets,
+            JaggedA=jagged,
+            JaggedB=d_bmm_out,
+            Out=d_dense,
+            ReduceOut=None,
+            M=ctx.D,
+            N=ctx.K,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(ctx.max_seq_len),
+            stride_ak=jagged.stride(0),
+            stride_bk=d_bmm_out.stride(0),
+            stride_ob=d_dense.stride(0),
+            stride_om=d_dense.stride(1),
+            stride_on=d_dense.stride(2),
+            stride_orb=0,
+            stride_orn=0,
+            REDUCE_JAGGEDB=False,
+            ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        )
+
+        return None, None, d_jagged, d_dense
+
+
+def _get_jagged_dense_broadcast_add_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [16, 32, 64]:
+        for num_stages in [1, 2]:
+            for num_warps in [2, 4, 8]:
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK_N": BLOCK_N,
+                        },
+                        num_stages=num_stages,
+                        num_warps=num_warps,
+                    )
+                )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_jagged_dense_broadcast_add_configs(),
+    key=["AUTOTUNE_MAX_SEQ_LEN"],
+)
+@triton.jit
+def jagged_dense_broadcast_add_kernel(
+    seq_offsets,
+    Jagged,
+    Dense,
+    Out,
+    AUTOTUNE_MAX_SEQ_LEN,
+    D,
+    stride_jn,
+    stride_db,
+    stride_on,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    """
+    Computing Out = Jagged + Dense
+    JaggedA has shape (sum_B(N_i), D), Dense has shape (B, D), and Out has shape (sum_B(N_i), D)
+    """
+
+    off_b = tl.program_id(0)
+    off_n = tl.program_id(1)
+    seq_start = tl.load(seq_offsets + off_b)
+    seq_end = tl.load(seq_offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+    start_n = off_n * BLOCK_N
+    if start_n >= seq_len:
+        return
+    Jagged += seq_start * stride_jn
+    Dense += off_b * stride_db
+    Out += seq_start * stride_on
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+    jagged_ptrs = Jagged + offs_n[:, None] * stride_jn + offs_d[None, :]
+    dense_ptrs = Dense + offs_d
+    out_ptrs = Out + offs_n[:, None] * stride_jn + offs_d[None, :]
+    for d in range(0, D, BLOCK_D):
+        jg = tl.load(
+            jagged_ptrs,
+            # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+            mask=(offs_n[:, None] < seq_len) & ((d + offs_d)[None, :] < D),
+        )
+        dn = tl.load(dense_ptrs, mask=d + offs_d < D)
+        out = jg + dn[None, :]
+        tl.store(
+            out_ptrs,
+            out,
+            mask=(offs_n[:, None] < seq_len) & ((d + offs_d)[None, :] < D),
+        )
+        dense_ptrs += BLOCK_D
+        jagged_ptrs += BLOCK_D
+        out_ptrs += BLOCK_D
+
+
+@triton.jit
+def jagged_reduce_sum(
+    seq_offsets,
+    Jagged,
+    Out,
+    D,
+    stride_jn,
+    stride_ob,
+    BLOCK_D: tl.constexpr,
+):
+    """
+    Computing Out = Jagged + Dense
+    JaggedA has shape (sum_B(N_i), D), Dense has shape (B, D), and Out has shape (sum_B(N_i), D)
+    """
+    off_b = tl.program_id(0)
+    off_d = tl.program_id(1) * BLOCK_D
+    seq_start = tl.load(seq_offsets + off_b)
+    seq_end = tl.load(seq_offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+    Jagged += seq_start * stride_jn
+    Out += off_b * stride_ob
+    offs_d = off_d + tl.arange(0, BLOCK_D)
+    jagged_ptrs = Jagged + offs_d
+    out_ptrs = Out + offs_d
+    accumulator = tl.zeros((BLOCK_D,), dtype=tl.float32)
+    for _ in range(0, seq_len):
+        jg = tl.load(
+            jagged_ptrs,
+            mask=offs_d < D,
+        )
+        accumulator += jg
+        jagged_ptrs += stride_jn
+    out = accumulator.to(Out.dtype.element_ty)
+    tl.store(
+        out_ptrs,
+        out,
+        mask=offs_d < D,
+    )
+
+
+class _JaggedDenseBroadcastAddFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        jagged: torch.Tensor,
+        dense: torch.Tensor,
+    ):
+        jagged = switch_to_contiguous_if_needed(jagged)
+        dense = switch_to_contiguous_if_needed(dense)
+        L, D = jagged.shape
+        B, _ = dense.shape
+        out = torch.empty_like(jagged)
+
+        grid = lambda meta: (  # noqa E731
+            B,
+            triton.cdiv(max_seq_len, meta["BLOCK_N"]),
+        )
+        BLOCK_D = triton.next_power_of_2(D) if D < 64 else 64
+        jagged_dense_broadcast_add_kernel[grid](
+            seq_offsets=seq_offsets,
+            Jagged=jagged,
+            Dense=dense,
+            Out=out,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+            D=D,
+            stride_jn=jagged.stride(0),
+            stride_db=dense.stride(0),
+            stride_on=out.stride(0),
+            BLOCK_D=BLOCK_D,
+        )
+
+        ctx.save_for_backward(seq_offsets)
+        ctx.max_seq_len = max_seq_len
+        ctx.B = B
+        ctx.D = D
+        return out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, None, torch.Tensor, torch.Tensor]:
+        seq_offsets = ctx.saved_tensors[0]
+        d_dense = torch.empty((ctx.B, ctx.D), device=d_out.device, dtype=d_out.dtype)
+        BLOCK_D = triton.next_power_of_2(ctx.D) if ctx.D < 64 else 64
+        jagged_reduce_sum[(ctx.B, triton.cdiv(ctx.D, BLOCK_D))](
+            seq_offsets=seq_offsets,
+            Jagged=d_out,
+            Out=d_dense,
+            D=ctx.D,
+            stride_jn=d_out.stride(0),
+            stride_ob=d_dense.stride(0),
+            BLOCK_D=BLOCK_D,
+        )
+        return None, None, d_out, d_dense
+
+
+def triton_jagged_dense_bmm_add_fwd(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    bias: torch.Tensor,
+    elementwise: bool = False,
+) -> Tuple[torch.Tensor, int, int, int]:
+    jagged = switch_to_contiguous_if_needed(jagged)
+    bias = switch_to_contiguous_if_needed(bias)
+    L, K = jagged.shape
+    B, _, N = dense.shape
+    out = torch.empty((L, N), dtype=jagged.dtype, device=jagged.device)
+
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(N, meta["BLOCK_N"]),
+        triton.cdiv(max_seq_len, meta["BLOCK_M"]),
+        B,
+    )
+
+    jagged_dense_bmm_broadcast_add_kernel[grid](
+        seq_offsets=seq_offsets,
+        Jagged=jagged,
+        Dense=dense,
+        Bias=bias,
+        Out=out,
+        AUTOTUNE_MAX_SEQ_LEN=fine_grained_autotune_max_seq_len(max_seq_len),
+        N=N,
+        K=K,
+        stride_jm=jagged.stride(0),
+        stride_db=dense.stride(0),
+        stride_dk=dense.stride(1),
+        stride_dn=dense.stride(2),
+        stride_bias_b=bias.stride(0),
+        stride_om=out.stride(0),
+        HAS_BIAS=True,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        ELEMENTWISE=elementwise,
+    )
+
+    return out, B, K, N
+
+
+def triton_jagged_dense_bmm_add_bwd_jagged(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    d_jagged: torch.Tensor,
+    dense: torch.Tensor,
+    d_out: torch.Tensor,
+    K: int,
+    B: int,
+    N: int,
+) -> torch.Tensor:
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(K, meta["BLOCK_N"]),
+        triton.cdiv(max_seq_len, meta["BLOCK_M"]),
+        B,
+    )
+    jagged_dense_bmm_broadcast_add_kernel[grid](
+        seq_offsets=seq_offsets,
+        Jagged=d_out,
+        Dense=dense,
+        Bias=None,
+        Out=d_jagged,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+        N=K,
+        K=N,
+        stride_jm=d_out.stride(0),
+        stride_db=dense.stride(0),
+        stride_dk=dense.stride(2),
+        stride_dn=dense.stride(1),
+        stride_bias_b=0,
+        stride_om=d_jagged.stride(0),
+        HAS_BIAS=False,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+        ELEMENTWISE=False,
+    )
+
+    return d_jagged
+
+
+def triton_jagged_dense_bmm_add_bwd_dense_bias(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    d_dense: torch.Tensor,
+    B: int,
+    K: int,
+    N: int,
+    d_out: torch.Tensor,
+    elementwise: bool,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    d_bias = torch.empty((B, N), device=d_out.device, dtype=d_out.dtype)
+
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(K, meta["BLOCK_M"]),
+        triton.cdiv(N, meta["BLOCK_N"]),
+        B,
+    )
+
+    if elementwise:
+        d_bias = d_out
+        reduce_out = None
+        stride_orb = 0
+        stride_orn = 0
+        reduce_jaggedb = False
+    else:
+        reduce_out = d_bias
+        stride_orb = d_bias.stride(0)
+        stride_orn = d_bias.stride(1)
+        reduce_jaggedb = True
+
+    _jagged_jagged_bmm_reduce_sum[grid](
+        seq_offsets=seq_offsets,
+        JaggedA=jagged,
+        JaggedB=d_out,
+        Out=d_dense,
+        ReduceOut=reduce_out,
+        M=K,
+        N=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+        stride_ak=jagged.stride(0),
+        stride_bk=d_out.stride(0),
+        stride_ob=d_dense.stride(0),
+        stride_om=d_dense.stride(1),
+        stride_on=d_dense.stride(2),
+        stride_orb=stride_orb,
+        stride_orn=stride_orn,
+        REDUCE_JAGGEDB=reduce_jaggedb,
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+    )
+
+    return d_dense, d_bias
+
+
+class _JaggedDenseBmmAddFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        seq_offsets: torch.Tensor,
+        jagged: torch.Tensor,
+        dense: torch.Tensor,
+        bias: torch.Tensor,
+        elementwise: bool = False,
+    ):
+        if get_cuda_jagged_dense_bmm_fwd():
+            jagged = switch_to_contiguous_if_needed(jagged)
+            bias = switch_to_contiguous_if_needed(bias)
+            # Ensure bias has same dtype as jagged (required by CUDA kernel)
+            bias = bias.to(jagged.dtype)
+            # Ensure seq_offsets is int64 (required by CUDA kernel)
+            seq_offsets = seq_offsets.to(torch.int64)
+            _, K = jagged.shape
+            B, _, N = dense.shape
+            out = torch.ops.jagged_dense_bmm_broadcast_add.jagged_dense_bmm_broadcast_add_fwd(
+                max_seq_len, seq_offsets, jagged, dense, bias, elementwise
+            )
+        else:
+            out, B, K, N = triton_jagged_dense_bmm_add_fwd(
+                max_seq_len, seq_offsets, jagged, dense, bias, elementwise
+            )
+
+        ctx.save_for_backward(seq_offsets, jagged, dense)
+        ctx.B = B
+        ctx.max_seq_len = max_seq_len
+        ctx.K = K
+        ctx.N = N
+        ctx.elementwise = elementwise
+        return out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, None, torch.Tensor, torch.Tensor, torch.Tensor, None]:
+        seq_offsets, jagged, dense = ctx.saved_tensors
+        if get_cuda_jagged_dense_bmm_bwd():
+            d_jagged, d_dense, d_bias = (
+                torch.ops.jagged_dense_bmm_broadcast_add.jagged_dense_bmm_broadcast_add_bwd(
+                    ctx.max_seq_len,
+                    d_out,
+                    seq_offsets.to(torch.int64),
+                    jagged,
+                    dense,
+                    ctx.elementwise,
+                )
+            )
+        else:
+            d_jagged = triton_jagged_dense_bmm_add_bwd_jagged(
+                ctx.max_seq_len,
+                seq_offsets,
+                torch.empty_like(jagged),
+                dense,
+                d_out,
+                ctx.K,
+                ctx.B,
+                ctx.N,
+            )
+            d_dense, d_bias = triton_jagged_dense_bmm_add_bwd_dense_bias(
+                ctx.max_seq_len,
+                seq_offsets,
+                jagged,
+                torch.empty_like(dense),
+                ctx.B,
+                ctx.K,
+                ctx.N,
+                d_out,
+                ctx.elementwise,
+            )
+
+        return None, None, d_jagged, d_dense, d_bias, None
+
+
+@triton.jit
+def concat_2D_jagged_w_prefix(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    DenseSize,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_dense_batch,
+    stride_od,
+    n_prefix_from_B,  # nonzero is not supported when IS_REPLACE=True
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_n = tl.program_id(0)
+    if IS_DENSE_A:
+        seq_start_a = off_z * DenseSize
+        seq_len_a = DenseSize
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    elif IS_DENSE_B:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = off_z * DenseSize
+        seq_len_b = DenseSize
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+
+    if IS_REPLACE:
+        seq_len = seq_len_a
+    else:
+        seq_len = seq_len_a + seq_len_b
+    if off_n >= seq_len:
+        return
+
+    offs_d = tl.arange(0, BLOCK_D)
+    if IS_REPLACE:
+        out_seq_start = seq_start_a + off_n
+        out_seq_b_start = seq_len_a - seq_len_b
+    else:
+        out_seq_start = seq_start_a + seq_start_b + off_n
+        out_seq_b_start = seq_len_a + n_prefix_from_B
+
+    out_ptrs = Out + out_seq_start.to(tl.int64) * stride_od + offs_d
+    if off_n < out_seq_b_start and off_n >= n_prefix_from_B:
+        off_a = off_n - n_prefix_from_B
+        if IS_DENSE_A:
+            in_ptrs = (
+                ValuesA
+                + off_a.to(tl.int64) * stride_ad
+                + off_z.to(tl.int64) * stride_dense_batch
+                + offs_d
+            )
+        else:
+            in_ptrs = ValuesA + (off_a + seq_start_a).to(tl.int64) * stride_ad + offs_d
+    else:
+        off_b = off_n - out_seq_b_start + n_prefix_from_B
+        if off_n < n_prefix_from_B:
+            off_b += out_seq_b_start - n_prefix_from_B
+        if IS_DENSE_B:
+            in_ptrs = (
+                ValuesB
+                + off_b.to(tl.int64) * stride_bd
+                + off_z.to(tl.int64) * stride_dense_batch
+                + offs_d
+            )
+        else:
+            in_ptrs = ValuesB + (off_b + seq_start_b).to(tl.int64) * stride_bd + offs_d
+    v = tl.load(in_ptrs, mask=offs_d < D)
+    tl.store(out_ptrs, v, mask=offs_d < D)
+
+
+@triton.jit
+def concat_2D_jagged(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    DenseSize,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_dense_batch,
+    stride_od,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    concat_2D_jagged_w_prefix(
+        OffsetsA,
+        ValuesA,
+        OffsetsB,
+        ValuesB,
+        DenseSize,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        stride_dense_batch,
+        stride_od,
+        0,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        IS_REPLACE,
+    )
+
+
+@triton.jit
+def concat_2D_jagged_jagged_w_prefix(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    BLOCK_D: tl.constexpr,
+):
+    concat_2D_jagged_w_prefix(
+        OffsetsA,
+        ValuesA,
+        OffsetsB,
+        ValuesB,
+        0,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        0,
+        stride_od,
+        n_prefix_from_B,
+        IS_DENSE_A=False,
+        IS_DENSE_B=False,
+        BLOCK_D=BLOCK_D,
+        IS_REPLACE=False,
+    )
+
+
+@triton.jit
+def split_2D_jagged_w_prefix(
+    JaggedIn,
+    DenseSize,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_n = tl.program_id(0)
+    if IS_DENSE_A:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_start_a = off_z * DenseSize
+        seq_len_a = DenseSize
+        seq_len_b = seq_end_b - seq_start_b
+    elif IS_DENSE_B:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = off_z * DenseSize
+        seq_len_b = DenseSize
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    if IS_REPLACE:
+        seq_len = seq_len_a
+    else:
+        seq_len = seq_len_a + seq_len_b
+    if off_n >= seq_len:
+        return
+
+    if IS_REPLACE:
+        seq_start = seq_start_a
+        out_seq_b_start = seq_len_a - seq_len_b
+    else:
+        seq_start = seq_start_a + seq_start_b
+        out_seq_b_start = seq_len_a + n_prefix_to_B
+
+    offs_d = tl.arange(0, BLOCK_D)
+    in_ptrs = JaggedIn + (seq_start + off_n).to(tl.int64) * stride_id + offs_d
+    if off_n < out_seq_b_start and off_n >= n_prefix_to_B:
+        off_a = off_n - n_prefix_to_B
+        out_ptrs = OutA + (off_a + seq_start_a).to(tl.int64) * stride_ad + offs_d
+    else:
+        off_b = off_n - out_seq_b_start + n_prefix_to_B
+        if off_n < n_prefix_to_B:
+            off_b += out_seq_b_start - n_prefix_to_B
+        out_ptrs = OutB + (off_b + seq_start_b).to(tl.int64) * stride_bd + offs_d
+    v = tl.load(in_ptrs, mask=offs_d < D)
+    tl.store(out_ptrs, v, mask=offs_d < D)
+
+
+@triton.jit
+def split_2D_jagged(
+    JaggedIn,
+    DenseSize,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    split_2D_jagged_w_prefix(
+        JaggedIn,
+        DenseSize,
+        OffsetsA,
+        OffsetsB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        0,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        IS_REPLACE,
+    )
+
+
+@triton.jit
+def split_2D_jagged_jagged_w_prefix(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    BLOCK_D: tl.constexpr,
+):
+    split_2D_jagged_w_prefix(
+        JaggedIn,
+        0,
+        OffsetsA,
+        OffsetsB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        n_prefix_to_B,
+        IS_DENSE_A=False,
+        IS_DENSE_B=False,
+        BLOCK_D=BLOCK_D,
+        IS_REPLACE=False,
+    )
+
+
+def _triton_split_2D_jagged_internal(
+    jagged_in: torch.Tensor,
+    max_seq_len: int,
+    B: int,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    out_a: torch.Tensor,
+    out_b: torch.Tensor,
+    D: int,
+    dense_size: int,
+    n_prefix: int,
+    is_dense_a: bool,
+    is_dense_b: bool,
+    is_replace: bool,
+    BLOCK_D: int,
+) -> None:
+    use_multirow = _should_use_multirow()
+    if n_prefix != 0:
+        if use_multirow:
+
+            def grid(meta):
+                return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+            split_2D_jagged_jagged_w_prefix_multirow[grid](
+                JaggedIn=jagged_in,
+                OffsetsA=offsets_a,
+                OffsetsB=offsets_b,
+                OutA=out_a,
+                OutB=out_b,
+                D=D,
+                stride_id=jagged_in.stride(0),
+                stride_ad=out_a.stride(0),
+                stride_bd=out_b.stride(0),
+                n_prefix_to_B=n_prefix,
+                BLOCK_D=BLOCK_D,
+            )
+        else:
+            split_2D_jagged_jagged_w_prefix[(max_seq_len, B)](
+                JaggedIn=jagged_in,
+                OffsetsA=offsets_a,
+                OffsetsB=offsets_b,
+                OutA=out_a,
+                OutB=out_b,
+                D=D,
+                stride_id=jagged_in.stride(0),
+                stride_ad=out_a.stride(0),
+                stride_bd=out_b.stride(0),
+                n_prefix_to_B=n_prefix,
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+            )
+    else:
+        if use_multirow:
+
+            def grid(meta):
+                return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+            split_2D_jagged_multirow[grid](
+                JaggedIn=jagged_in,
+                DenseSize=dense_size,
+                OffsetsA=offsets_a,
+                OffsetsB=offsets_b,
+                OutA=out_a,
+                OutB=out_b,
+                D=D,
+                stride_id=jagged_in.stride(0),
+                stride_ad=out_a.stride(0),
+                stride_bd=out_b.stride(0),
+                IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+                IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+                IS_REPLACE=is_replace,  # pyre-ignore[6]
+            )
+        else:
+            split_2D_jagged[(max_seq_len, B)](
+                JaggedIn=jagged_in,
+                DenseSize=dense_size,
+                OffsetsA=offsets_a,
+                OffsetsB=offsets_b,
+                OutA=out_a,
+                OutB=out_b,
+                D=D,
+                stride_id=jagged_in.stride(0),
+                stride_ad=out_a.stride(0),
+                stride_bd=out_b.stride(0),
+                IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+                IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+                BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+                IS_REPLACE=is_replace,  # pyre-ignore[6]
+            )
+
+
+class _Concat2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values_a: torch.Tensor,
+        values_b: torch.Tensor,
+        offsets_a: Optional[torch.Tensor] = None,
+        offsets_b: Optional[torch.Tensor] = None,
+        is_replace: bool = False,
+        n_prefix_from_right: int = 0,
+    ):
+        values_a = switch_to_contiguous_if_needed(values_a)
+        values_b = switch_to_contiguous_if_needed(values_b)
+        is_dense_a = offsets_a is None
+        is_dense_b = offsets_b is None
+        dense_size: int = 0
+        if is_dense_a:
+            assert offsets_b is not None
+            B, dense_size, D = values_a.shape
+            seq_len_a = dense_size * B
+            seq_len_b, _ = values_b.shape
+            device = values_b.device
+            dtype = values_b.dtype
+            stride_dense_batch = values_a.stride(0)
+        elif is_dense_b:
+            assert offsets_a is not None
+            B, dense_size, D = values_b.shape
+            seq_len_a, _ = values_a.shape
+            seq_len_b = dense_size * B
+            device = values_a.device
+            dtype = values_a.dtype
+            stride_dense_batch = values_b.stride(0)
+        else:
+            assert offsets_a is not None and offsets_b is not None
+            B = offsets_a.shape[0] - 1
+            seq_len_a, D = values_a.shape
+            seq_len_b, _ = values_b.shape
+            device = values_a.device
+            dtype = values_a.dtype
+            stride_dense_batch = 0
+
+        BLOCK_D = triton.next_power_of_2(D)
+        if is_replace:
+            values_out = torch.empty_like(values_a)
+        else:
+            values_out = torch.empty(
+                (seq_len_a + seq_len_b, D), device=device, dtype=dtype
+            )
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=values_out,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            D=D,
+            dense_size=dense_size,
+            stride_dense_batch=stride_dense_batch,
+            n_prefix=n_prefix_from_right,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            is_replace=is_replace,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.seq_len_a = seq_len_a
+        ctx.seq_len_b = seq_len_b
+        ctx.is_dense_a = is_dense_a
+        ctx.is_dense_b = is_dense_b
+        ctx.dense_size = dense_size
+        ctx.is_replace = is_replace
+        ctx.n_prefix_from_right = n_prefix_from_right
+        return values_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, torch.Tensor, torch.Tensor, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        is_dense_a, is_dense_b, is_replace = (
+            ctx.is_dense_a,
+            ctx.is_dense_b,
+            ctx.is_replace,
+        )
+        dense_size = ctx.dense_size
+        if is_dense_a:
+            B = offsets_b.shape[0] - 1
+        else:
+            B = offsets_a.shape[0] - 1
+        _, D = d_out.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        values_a = torch.zeros(
+            (ctx.seq_len_a, D), device=d_out.device, dtype=d_out.dtype
+        )
+        values_b = torch.empty(
+            (ctx.seq_len_b, D), device=d_out.device, dtype=d_out.dtype
+        )
+        _triton_split_2D_jagged_internal(
+            jagged_in=d_out,
+            max_seq_len=ctx.max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            out_a=values_a,
+            out_b=values_b,
+            D=D,
+            dense_size=dense_size,
+            n_prefix=ctx.n_prefix_from_right,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            is_replace=is_replace,
+            BLOCK_D=BLOCK_D,
+        )
+
+        if is_dense_a:
+            values_a = values_a.reshape((B, dense_size, D))
+        elif is_dense_b:
+            values_b = values_b.reshape((B, dense_size, D))
+        return None, values_a, values_b, None, None, None, None
+
+
+class _HelionConcat2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values_a: torch.Tensor,
+        values_b: torch.Tensor,
+        offsets_a: Optional[torch.Tensor] = None,
+        offsets_b: Optional[torch.Tensor] = None,
+    ):
+        values_a = switch_to_contiguous_if_needed(values_a)
+        values_b = switch_to_contiguous_if_needed(values_b)
+
+        assert offsets_a is not None and offsets_b is not None
+        B = offsets_a.shape[0] - 1
+        seq_len_a, D = values_a.shape
+        seq_len_b, _ = values_b.shape
+        device = values_a.device
+        dtype = values_a.dtype
+
+        BLOCK_D = triton.next_power_of_2(D)
+        values_out = torch.empty((seq_len_a + seq_len_b, D), device=device, dtype=dtype)
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=values_out,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            D=D,
+            dense_size=0,
+            stride_dense_batch=0,
+            n_prefix=0,
+            is_dense_a=False,
+            is_dense_b=False,
+            is_replace=False,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.seq_len_a = seq_len_a
+        ctx.seq_len_b = seq_len_b
+        return values_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, torch.Tensor, torch.Tensor, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        d_out = switch_to_contiguous_if_needed(d_out)
+        values_a, values_b = _helion_split_2D_jagged_impl(
+            values=d_out,
+            max_seq_len=ctx.max_seq_len,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            dense_size=0,
+            total_len_a=ctx.seq_len_a,
+            total_len_b=ctx.seq_len_b,
+        )
+
+        return None, values_a, values_b, None, None, None, None
+
+
+class _Split2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        values: torch.Tensor,
+        max_seq_len: int,
+        offsets_a: Optional[torch.Tensor] = None,
+        offsets_b: Optional[torch.Tensor] = None,
+        dense_size: int = 0,
+        n_prefix_to_right: int = 0,
+        seq_len_a: Optional[int] = None,
+        seq_len_b: Optional[int] = None,
+        total_seq_len: Optional[int] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        values = switch_to_contiguous_if_needed(values)
+        is_dense_a: bool = offsets_a is None
+        is_dense_b: bool = offsets_b is None
+        if is_dense_a:
+            L, _ = values.shape
+            assert offsets_b is not None
+            B = offsets_b.shape[0] - 1
+            seq_len_a = dense_size * B
+            seq_len_b = L - seq_len_a
+            offsets_a = offsets_b.new_empty(0)
+        elif is_dense_b:
+            L, _ = values.shape
+            assert offsets_a is not None
+            B = offsets_a.shape[0] - 1
+            seq_len_b = dense_size * B
+            seq_len_a = L - seq_len_b
+            offsets_b = offsets_a.new_empty(0)
+        else:
+            assert offsets_a is not None and offsets_b is not None
+            B = offsets_a.shape[0] - 1
+
+            # Select the last offset item using torch.index_select instead of
+            # "int(offsets_a[-1].item())" so that it won't cause "Cannot cast
+            # FakeTensor to python number" error for AOTI.
+            if torch.compiler.is_compiling():
+                offsets_b_last_idx = torch.tensor(offsets_b.size(0) - 1).to(
+                    offsets_b.device, non_blocking=True
+                )
+                if seq_len_b is None:
+                    seq_len_b = offsets_b.index_select(dim=0, index=offsets_b_last_idx)
+                if seq_len_a is None and total_seq_len is None:
+                    offsets_a_last_idx = torch.tensor(offsets_a.size(0) - 1).to(
+                        offsets_a.device, non_blocking=True
+                    )
+                    seq_len_a = offsets_a.index_select(dim=0, index=offsets_a_last_idx)
+            else:
+                if seq_len_b is None:
+                    seq_len_b = int(offsets_b[-1].item())
+                if seq_len_a is None and total_seq_len is None:
+                    seq_len_a = int(offsets_a[-1].item())
+        _, D = values.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        # pyre-ignore[6] Incompatible parameter type
+        values_b = torch.empty((seq_len_b, D), device=values.device, dtype=values.dtype)
+        if seq_len_a is None:
+            # Derive seq_len_a from total_seq_len and values_b.size(0).
+            # values_b.size(0) is a SymInt (from the torch.empty above),
+            # so this is SymInt arithmetic — no new unbacked SymInt.
+            assert total_seq_len is not None
+            seq_len_a = total_seq_len - values_b.size(0)
+        # pyre-ignore[6] Incompatible parameter type
+        values_a = torch.empty((seq_len_a, D), device=values.device, dtype=values.dtype)
+        _triton_split_2D_jagged_internal(
+            jagged_in=values,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            out_a=values_a,
+            out_b=values_b,
+            D=D,
+            dense_size=dense_size,
+            n_prefix=n_prefix_to_right,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            is_replace=False,
+            BLOCK_D=BLOCK_D,
+        )
+        if is_dense_a:
+            values_a = values_a.reshape(B, dense_size, D)
+        if is_dense_b:
+            values_b = values_b.reshape(B, dense_size, D)
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.seq_len_a = seq_len_a
+        ctx.seq_len_b = seq_len_b
+        ctx.is_dense_a = is_dense_a
+        ctx.is_dense_b = is_dense_b
+        ctx.dense_size = dense_size
+        ctx.B = B
+        ctx.D = D
+        ctx.n_prefix_to_right = n_prefix_to_right
+        return values_a, values_b
+
+    @staticmethod
+    def backward(
+        ctx, *d_values
+    ) -> Tuple[torch.Tensor, None, None, None, None, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        is_dense_a, is_dense_b = ctx.is_dense_a, ctx.is_dense_b
+        values_a, values_b = d_values
+        if is_dense_a:
+            stride_dense_batch = values_a.stride(0)
+        elif is_dense_b:
+            stride_dense_batch = values_b.stride(0)
+        else:
+            stride_dense_batch = 0
+
+        BLOCK_D = triton.next_power_of_2(ctx.D)
+        dvalues = torch.empty(
+            (ctx.seq_len_a + ctx.seq_len_b, ctx.D),
+            device=values_a.device,
+            dtype=values_b.dtype,
+        )
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=dvalues,
+            max_seq_len=ctx.max_seq_len,
+            B=ctx.B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            D=ctx.D,
+            dense_size=ctx.dense_size,
+            stride_dense_batch=stride_dense_batch,
+            n_prefix=ctx.n_prefix_to_right,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            is_replace=False,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return dvalues, None, None, None, None, None, None, None, None
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_jagged_dense_bmm_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+    bias: torch.Tensor,
+    elementwise: bool = False,
+) -> torch.Tensor:
+    """
+    Computing bmm Out = Jagged x Dense + Bias
+    M is the jagged dimension
+    Jagged has shape (sum_B(M_i), K), Dense has shape (B, K, N), Bias has shape (B, N) or (sum_B(M_i), N) depending on Elementwise, and Out has shape (sum_B(M_i), N)
+    """
+    return _JaggedDenseBmmAddFunction.apply(
+        max_seq_len, seq_offsets, jagged, dense, bias, elementwise
+    )
+
+
+@torch.fx.wrap
+def triton_concat_2D_jagged(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    is_replace: bool = False,
+    n_prefix_from_right: int = 0,
+) -> torch.Tensor:
+    return _Concat2DJaggedFunction.apply(
+        max_seq_len,
+        values_a,
+        values_b,
+        offsets_a,
+        offsets_b,
+        is_replace,
+        n_prefix_from_right,
+    )
+
+
+@torch.fx.wrap
+def triton_concat_2D_jagged_jagged(
+    max_seq_len_left: int,
+    offsets_left: torch.Tensor,
+    values_left: torch.Tensor,
+    max_seq_len_right: int,
+    offsets_right: torch.Tensor,
+    values_right: torch.Tensor,
+    is_replace: bool,
+    n_prefix_from_right: int,
+) -> torch.Tensor:
+    return triton_concat_2D_jagged(
+        max_seq_len=max_seq_len_left + max_seq_len_right,
+        values_a=values_left,
+        values_b=values_right,
+        offsets_a=offsets_left,
+        offsets_b=offsets_right,
+        is_replace=is_replace,
+        n_prefix_from_right=n_prefix_from_right,
+    )
+
+
+@torch.fx.wrap
+def helion_concat_2D_jagged(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    return _HelionConcat2DJaggedFunction.apply(
+        max_seq_len,
+        values_a,
+        values_b,
+        offsets_a,
+        offsets_b,
+    )
+
+
+@torch.fx.wrap
+def triton_concat_2D_dense_jagged(
+    jagged_max_seq_len: int,
+    jagged_offsets: torch.Tensor,
+    jagged_values: torch.Tensor,
+    dense_values: torch.Tensor,
+) -> torch.Tensor:
+    B, dense_size, D = dense_values.size()
+    max_seq_len = jagged_max_seq_len + dense_size
+    return triton_concat_2D_jagged(
+        max_seq_len=max_seq_len,
+        values_a=dense_values,
+        values_b=jagged_values,
+        offsets_a=None,
+        offsets_b=jagged_offsets,
+    )
+
+
+def triton_jagged_dense_bmm(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+) -> torch.Tensor:
+    return _JaggedDenseBmmFunction.apply(max_seq_len, seq_offsets, jagged, dense)
+
+
+@torch.jit.unused
+def triton_split_2D_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    dense_size: int = 0,
+    n_prefix_to_right: int = 0,
+    seq_len_a: Optional[int] = None,
+    seq_len_b: Optional[int] = None,
+    total_seq_len: Optional[int] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return _Split2DJaggedFunction.apply(
+        values,
+        max_seq_len,
+        offsets_a,
+        offsets_b,
+        dense_size,
+        n_prefix_to_right,
+        seq_len_a,
+        seq_len_b,
+        total_seq_len,
+    )
+
+
+@torch.jit.unused
+def helion_split_2D_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    dense_size: int = 0,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return _HelionSplit2DJaggedFunction.apply(
+        values,
+        max_seq_len,
+        offsets_a,
+        offsets_b,
+        dense_size,
+    )
+
+
+@triton.jit
+def concat_2D_jagged_w_prefix_multirow(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    DenseSize,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_dense_batch,
+    stride_od,
+    n_prefix_from_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_block_n = tl.program_id(0)
+
+    if IS_DENSE_A:
+        seq_start_a = off_z * DenseSize
+        seq_len_a = DenseSize
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    elif IS_DENSE_B:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = off_z * DenseSize
+        seq_len_b = DenseSize
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+
+    if IS_REPLACE:
+        seq_len = seq_len_a
+        out_seq_start = seq_start_a
+        out_seq_b_start = seq_len_a - seq_len_b
+    else:
+        seq_len = seq_len_a + seq_len_b
+        out_seq_start = seq_start_a + seq_start_b
+        out_seq_b_start = seq_len_a + n_prefix_from_B
+
+    start_n = off_block_n * BLOCK_N
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+    if start_n >= seq_len:
+        return
+    valid_mask = offs_n < seq_len
+
+    out_ptrs = (
+        Out
+        + (out_seq_start + offs_n[:, None]).to(tl.int64) * stride_od
+        + offs_d[None, :]
+    )
+
+    to_a_mask = (offs_n < out_seq_b_start) & (offs_n >= n_prefix_from_B) & valid_mask
+    to_b_mask = ~to_a_mask & valid_mask
+
+    off_a = offs_n - n_prefix_from_B
+    if IS_DENSE_A:
+        in_a_ptrs = (
+            ValuesA
+            + off_a[:, None].to(tl.int64) * stride_ad
+            + off_z.to(tl.int64) * stride_dense_batch
+            + offs_d[None, :]
+        )
+    else:
+        in_a_ptrs = (
+            ValuesA
+            + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad
+            + offs_d[None, :]
+        )
+
+    v_a = tl.load(in_a_ptrs, mask=to_a_mask[:, None] & (offs_d[None, :] < D), other=0.0)
+    tl.store(out_ptrs, v_a, mask=to_a_mask[:, None] & (offs_d[None, :] < D))
+
+    prefix_mask = offs_n < n_prefix_from_B
+
+    off_b = tl.where(prefix_mask, offs_n, offs_n - out_seq_b_start + n_prefix_from_B)
+    if IS_DENSE_B:
+        in_b_ptrs = (
+            ValuesB
+            + off_b[:, None].to(tl.int64) * stride_bd
+            + off_z.to(tl.int64) * stride_dense_batch
+            + offs_d[None, :]
+        )
+    else:
+        in_b_ptrs = (
+            ValuesB
+            + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd
+            + offs_d[None, :]
+        )
+
+    v_b = tl.load(in_b_ptrs, mask=to_b_mask[:, None] & (offs_d[None, :] < D), other=0.0)
+    tl.store(out_ptrs, v_b, mask=to_b_mask[:, None] & (offs_d[None, :] < D))
+
+
+@triton_autotune(
+    configs=_get_split_concat_2d_jagged_multirow_configs_wrapper(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def concat_2D_jagged_multirow(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    DenseSize,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_dense_batch,
+    stride_od,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    concat_2D_jagged_w_prefix_multirow(
+        OffsetsA,
+        ValuesA,
+        OffsetsB,
+        ValuesB,
+        DenseSize,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        stride_dense_batch,
+        stride_od,
+        0,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        BLOCK_N,
+        IS_REPLACE,
+    )
+
+
+@triton_autotune(
+    configs=_get_split_concat_2d_jagged_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def concat_2D_jagged_jagged_w_prefix_multirow(
+    OffsetsA,
+    ValuesA,
+    OffsetsB,
+    ValuesB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    concat_2D_jagged_w_prefix_multirow(
+        OffsetsA,
+        ValuesA,
+        OffsetsB,
+        ValuesB,
+        0,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        0,
+        stride_od,
+        n_prefix_from_B,
+        IS_DENSE_A=False,
+        IS_DENSE_B=False,
+        BLOCK_D=BLOCK_D,
+        BLOCK_N=BLOCK_N,
+        IS_REPLACE=False,
+    )
+
+
+@triton.jit
+def split_2D_jagged_w_prefix_multirow(
+    JaggedIn,
+    DenseSize,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_block_n = tl.program_id(0)
+
+    if IS_DENSE_A:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_start_a = off_z * DenseSize
+        seq_len_a = DenseSize
+        seq_len_b = seq_end_b - seq_start_b
+    elif IS_DENSE_B:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = off_z * DenseSize
+        seq_len_b = DenseSize
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+
+    if IS_REPLACE:
+        seq_len = seq_len_a
+    else:
+        seq_len = seq_len_a + seq_len_b
+
+    if IS_REPLACE:
+        seq_start = seq_start_a
+        out_seq_b_start = seq_len_a - seq_len_b
+    else:
+        seq_start = seq_start_a + seq_start_b
+        out_seq_b_start = seq_len_a + n_prefix_to_B
+
+    start_n = off_block_n * BLOCK_N
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+    if start_n >= seq_len:
+        return
+    valid_mask = offs_n < seq_len
+
+    in_ptrs = (
+        JaggedIn
+        + (seq_start + offs_n[:, None]).to(tl.int64) * stride_id
+        + offs_d[None, :]
+    )
+
+    v = tl.load(in_ptrs, mask=valid_mask[:, None] & (offs_d[None, :] < D), other=0.0)
+
+    to_a_mask = (offs_n < out_seq_b_start) & (offs_n >= n_prefix_to_B) & valid_mask
+    to_b_mask = ~to_a_mask & valid_mask
+
+    off_a = offs_n - n_prefix_to_B
+    out_a_ptrs = (
+        OutA + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad + offs_d[None, :]
+    )
+    tl.store(out_a_ptrs, v, mask=to_a_mask[:, None] & (offs_d[None, :] < D))
+
+    prefix_mask = offs_n < n_prefix_to_B
+
+    off_b = tl.where(prefix_mask, offs_n, offs_n - out_seq_b_start + n_prefix_to_B)
+    out_b_ptrs = (
+        OutB + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd + offs_d[None, :]
+    )
+    tl.store(out_b_ptrs, v, mask=to_b_mask[:, None] & (offs_d[None, :] < D))
+
+
+@triton_autotune(
+    configs=_get_split_concat_2d_jagged_multirow_configs_wrapper(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def split_2D_jagged_multirow(
+    JaggedIn,
+    DenseSize,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    IS_REPLACE: tl.constexpr,
+):
+    split_2D_jagged_w_prefix_multirow(
+        JaggedIn,
+        DenseSize,
+        OffsetsA,
+        OffsetsB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        0,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        BLOCK_N,
+        IS_REPLACE,
+    )
+
+
+@triton_autotune(
+    configs=_get_split_concat_2d_jagged_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def split_2D_jagged_jagged_w_prefix_multirow(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    split_2D_jagged_w_prefix_multirow(
+        JaggedIn,
+        0,
+        OffsetsA,
+        OffsetsB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        n_prefix_to_B,
+        IS_DENSE_A=False,
+        IS_DENSE_B=False,
+        BLOCK_D=BLOCK_D,
+        BLOCK_N=BLOCK_N,
+        IS_REPLACE=False,
+    )
+
+
+def triton_jagged_dense_broadcast_add(
+    max_seq_len: int,
+    seq_offsets: torch.Tensor,
+    jagged: torch.Tensor,
+    dense: torch.Tensor,
+) -> torch.Tensor:
+    return _JaggedDenseBroadcastAddFunction.apply(
+        max_seq_len, seq_offsets, jagged, dense
+    )
+
+
+@triton.jit
+def _helion_split_2d_jagged_kernel(
+    offsets_a,
+    offsets_b,
+    values_flat,
+    out_a_flat,
+    out_b_flat,
+    max_seq_len,
+    D: tl.constexpr,
+    _BLOCK_SIZE_0: tl.constexpr,
+    _BLOCK_SIZE_1: tl.constexpr,
+) -> None:
+    # Get program ID and decompose to batch and sequence block coordinates
+    program_id = tl.program_id(0)
+    flat_program_id = program_id
+    batch_id = triton_helpers.div_floor_integer(
+        flat_program_id,
+        triton_helpers.div_floor_integer(
+            -1 + _BLOCK_SIZE_0 + max_seq_len, _BLOCK_SIZE_0
+        ),
+    )
+    seq_block_id = triton_helpers.remainder_integer(  # noqa: F841
+        flat_program_id,
+        triton_helpers.div_floor_integer(
+            -1 + _BLOCK_SIZE_0 + max_seq_len, _BLOCK_SIZE_0
+        ),
+    )
+    # Load output boundaries for part A
+    out_a_start = tl.load(offsets_a + batch_id * 1, None, eviction_policy="evict_last")
+    batch_id_plus_1 = 1 + triton_helpers.div_floor_integer(
+        flat_program_id,
+        triton_helpers.div_floor_integer(
+            -1 + _BLOCK_SIZE_0 + max_seq_len, _BLOCK_SIZE_0
+        ),
+    )
+    out_a_end = tl.load(
+        offsets_a + batch_id_plus_1 * 1, None, eviction_policy="evict_last"
+    )
+    len_a = out_a_end - out_a_start
+    # Load output boundaries for part B
+    out_b_start = tl.load(offsets_b + batch_id * 1, None)
+    out_b_end = tl.load(
+        offsets_b + batch_id_plus_1 * 1, None, eviction_policy="evict_last"
+    )
+    len_b = out_b_end - out_b_start
+    # Compute input start and total length for this batch
+    input_start = out_a_start + out_b_start
+    total_len = len_a + len_b
+    # Calculate sequence offset for this block
+    seq_offset = _BLOCK_SIZE_0 * triton_helpers.remainder_integer(
+        flat_program_id,
+        triton_helpers.div_floor_integer(
+            -1 + _BLOCK_SIZE_0 + max_seq_len, _BLOCK_SIZE_0
+        ),
+    )
+    has_work = total_len > seq_offset
+    if has_work:
+        # Generate row indices for this sequence block
+        seq_range = tl.arange(0, _BLOCK_SIZE_0)
+        seq_offset_i32 = tl.cast(seq_offset, tl.int32)
+        row_indices = seq_range + seq_offset_i32
+
+        # Create masks for valid rows and parts A/B
+        total_len_i32 = tl.cast(total_len[None], tl.int32)
+        len_a_i32 = tl.cast(len_a[None], tl.int32)
+        valid_mask = row_indices < total_len_i32
+        is_part_a = row_indices < len_a_i32
+        is_part_b = (row_indices >= len_a_i32) & valid_mask
+
+        # Extract scalar values once
+        input_start_i32 = tl.cast(input_start[None, None], tl.int32)
+        out_a_start_i32 = tl.cast(out_a_start[None, None], tl.int32)
+        out_b_start_i32 = tl.cast(out_b_start[None, None], tl.int32)
+
+        # Process features in smaller tiles
+        for feature_offset in tl.range(
+            0,
+            D,
+            _BLOCK_SIZE_1,
+            loop_unroll_factor=1,
+            num_stages=4,
+            disallow_acc_multi_buffer=True,
+            flatten=True,
+        ):
+            feature_indices = feature_offset + tl.arange(0, _BLOCK_SIZE_1).to(tl.int32)
+
+            # Compute D constant and feature mask once per feature iteration
+            D_const = tl.full([], tl.cast(D, tl.int32), tl.int32)
+            D_i32 = tl.cast(D, tl.int32)
+            feature_mask = feature_indices < D_i32
+
+            # Compute indices for part A
+            row_subscript = row_indices[:, None]
+            input_row_a = input_start_i32 + row_subscript
+            input_idx_a = (
+                tl.cast(input_row_a * D_const, tl.int32) + feature_indices[None, :]
+            )
+
+            out_a_row = out_a_start_i32 + row_subscript
+            out_a_idx = (
+                tl.cast(out_a_row * D_const, tl.int32) + feature_indices[None, :]
+            )
+
+            mask_a = is_part_a[:, None] & valid_mask[:, None] & feature_mask[None, :]
+
+            # Load and store part A data
+            slice_a = tl.load(
+                values_flat + input_idx_a * 1,
+                mask_a,
+                other=0,
+                eviction_policy="evict_first",
+            )
+            tl.store(out_a_flat + out_a_idx * 1, slice_a, mask_a)
+
+            # Compute indices for part B
+            input_idx_b = (
+                tl.cast((input_start_i32 + row_subscript) * D_const, tl.int32)
+                + feature_indices[None, :]
+            )
+
+            row_minus_len_a = row_subscript - len_a_i32
+            out_b_row = out_b_start_i32 + row_minus_len_a
+            out_b_idx = (
+                tl.cast(out_b_row * D_const, tl.int32) + feature_indices[None, :]
+            )
+
+            mask_b = is_part_b[:, None] & feature_mask[None, :]
+
+            # Load and store part B data
+            slice_b = tl.load(
+                values_flat + input_idx_b * 1,
+                mask_b,
+                other=0,
+                eviction_policy="evict_first",
+            )
+            tl.store(out_b_flat + out_b_idx * 1, slice_b, mask_b)
+
+
+class _HelionSplit2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        values: torch.Tensor,
+        max_seq_len: int,
+        offsets_a: torch.Tensor,
+        offsets_b: torch.Tensor,
+        dense_size: int = 0,  # noqa: F841
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        values = switch_to_contiguous_if_needed(values)
+        B = offsets_a.shape[0] - 1
+        D = values.size(1)
+
+        # TODO: maybe check if torch.compiler.is_compiling() and use index_select instead
+        seq_len_a = int(offsets_a[-1].item())
+        seq_len_b = int(offsets_b[-1].item())
+
+        values_a, values_b = _helion_split_2D_jagged_impl(
+            values=values,
+            max_seq_len=max_seq_len,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            dense_size=dense_size,
+            total_len_a=seq_len_a,
+            total_len_b=seq_len_b,
+        )
+
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.seq_len_a = seq_len_a
+        ctx.seq_len_b = seq_len_b
+        ctx.dense_size = dense_size
+        ctx.B = B
+        ctx.D = D
+        return values_a, values_b
+
+    @staticmethod
+    def backward(ctx, *d_values) -> Tuple[torch.Tensor, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        values_a, values_b = d_values
+        BLOCK_D = triton.next_power_of_2(ctx.D)
+
+        dvalues = torch.empty(
+            (ctx.seq_len_a + ctx.seq_len_b, ctx.D),
+            device=values_a.device,
+            dtype=values_a.dtype,
+        )
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=dvalues,
+            max_seq_len=ctx.max_seq_len,
+            B=ctx.B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            D=ctx.D,
+            dense_size=0,
+            stride_dense_batch=0,
+            n_prefix=0,
+            is_dense_a=False,
+            is_dense_b=False,
+            is_replace=False,
+            BLOCK_D=BLOCK_D,
+        )
+        return dvalues, None, None, None, None
+
+
+def _helion_split_2D_jagged_impl(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: torch.Tensor,
+    offsets_b: torch.Tensor,
+    dense_size: int = 0,  # noqa: F841
+    total_len_a: Optional[int] = None,
+    total_len_b: Optional[int] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    D = values.size(1)
+
+    # Select dtype-specific optimal parameters
+    if values.dtype == torch.float32:
+        # FP32-optimized parameters
+        block_size_0 = 64
+        block_size_1 = 64
+        num_warps = 4
+        num_stages = 4
+    else:
+        # BF16/FP16-optimized parameters
+        block_size_0 = 128
+        block_size_1 = triton.next_power_of_2(D)
+        num_warps = 32
+        num_stages = 7
+
+    return _helion_split_2d_jagged(
+        values,
+        max_seq_len,
+        offsets_a,
+        offsets_b,
+        dense_size,
+        block_size_0=block_size_0,
+        block_size_1=block_size_1,
+        num_warps=num_warps,
+        num_stages=num_stages,
+        total_len_a=total_len_a,
+        total_len_b=total_len_b,
+    )
+
+
+def _helion_split_2d_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: torch.Tensor,
+    offsets_b: torch.Tensor,
+    dense_size: int,  # noqa: F841
+    block_size_0: int = 64,
+    block_size_1: int = 64,
+    num_warps: int = 4,
+    num_stages: int = 4,
+    total_len_a: Optional[int] = None,
+    total_len_b: Optional[int] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    values = values.contiguous()
+    num_batches = offsets_a.size(0) - 1
+    D = values.size(1)
+    num_seq_blocks = (max_seq_len + block_size_0 - 1) // block_size_0
+    if total_len_a is None:
+        total_len_a = int(offsets_a[-1].item())
+    if total_len_b is None:
+        total_len_b = int(offsets_b[-1].item())
+    out_a = torch.empty([total_len_a, D], dtype=values.dtype, device=values.device)
+    out_b = torch.empty([total_len_b, D], dtype=values.dtype, device=values.device)
+    values_flat = values.view(-1)
+    out_a_flat = out_a.view(-1)
+    out_b_flat = out_b.view(-1)
+    total_programs = num_batches * num_seq_blocks
+
+    # pyre-ignore[28]
+    _helion_split_2d_jagged_kernel[(total_programs,)](
+        offsets_a,
+        offsets_b,
+        values_flat,
+        out_a_flat,
+        out_b_flat,
+        max_seq_len,
+        D,
+        block_size_0,
+        block_size_1,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    return (out_a, out_b)
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py
new file mode 100644
index 000000000..3488e308a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py
@@ -0,0 +1,1067 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+from typing import Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from generative_recommenders.ops.utils import is_sm100_plus
+
+
+def _triton_concat_2D_jagged_internal(
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    values_out: torch.Tensor,
+    max_seq_len: int,
+    B: int,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    max_len_a: Optional[int],
+    max_len_b: Optional[int],
+    D: int,
+    n_prefix_from_B: int,
+    is_dense_a: bool,
+    is_dense_b: bool,
+    BLOCK_D: int,
+) -> None:
+    if is_sm100_plus():
+
+        def grid(meta):
+            return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+        concat_2D_jagged_multirow[grid](
+            ValuesA=values_a,
+            ValuesB=values_b,
+            OffsetsA=offsets_a,
+            OffsetsB=offsets_b,
+            MaxLenA=max_len_a,
+            MaxLenB=max_len_b,
+            Out=values_out,
+            D=D,
+            stride_ad=values_a.stride(-2),
+            stride_bd=values_b.stride(-2),
+            stride_od=values_out.stride(-2),
+            n_prefix_from_B=n_prefix_from_B,
+            IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+            IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+            BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+        )
+    else:
+        _concat_2D_jagged[(max_seq_len, B)](
+            ValuesA=values_a,
+            ValuesB=values_b,
+            OffsetsA=offsets_a,
+            OffsetsB=offsets_b,
+            MaxLenA=max_len_a,
+            MaxLenB=max_len_b,
+            Out=values_out,
+            D=D,
+            stride_ad=values_a.stride(-2),
+            stride_bd=values_b.stride(-2),
+            stride_od=values_out.stride(-2),
+            n_prefix_from_B=n_prefix_from_B,
+            IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+            IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+            BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+        )
+
+
+def _triton_split_2D_jagged_internal(
+    jagged_in: torch.Tensor,
+    max_seq_len: int,
+    B: int,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    max_len_a: Optional[int],
+    max_len_b: Optional[int],
+    out_a: torch.Tensor,
+    out_b: torch.Tensor,
+    D: int,
+    n_prefix_to_B: int,
+    is_dense_a: bool,
+    is_dense_b: bool,
+    BLOCK_D: int,
+) -> None:
+    if is_sm100_plus():
+
+        def grid(meta):
+            return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+        split_2D_jagged_multirow[grid](
+            JaggedIn=jagged_in,
+            OffsetsA=offsets_a,
+            OffsetsB=offsets_b,
+            MaxLenA=max_len_a,
+            MaxLenB=max_len_b,
+            OutA=out_a,
+            OutB=out_b,
+            D=D,
+            stride_id=jagged_in.stride(0),
+            stride_ad=out_a.stride(0),
+            stride_bd=out_b.stride(0),
+            n_prefix_to_B=n_prefix_to_B,
+            IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+            IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+            BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+        )
+    else:
+        _split_2D_jagged[(max_seq_len, B)](
+            JaggedIn=jagged_in,
+            OffsetsA=offsets_a,
+            OffsetsB=offsets_b,
+            MaxLenA=max_len_a,
+            MaxLenB=max_len_b,
+            OutA=out_a,
+            OutB=out_b,
+            D=D,
+            stride_id=jagged_in.stride(0),
+            stride_ad=out_a.stride(0),
+            stride_bd=out_b.stride(0),
+            n_prefix_to_B=n_prefix_to_B,
+            IS_DENSE_A=is_dense_a,  # pyre-ignore[6]
+            IS_DENSE_B=is_dense_b,  # pyre-ignore[6]
+            BLOCK_D=BLOCK_D,  # pyre-ignore[6]
+        )
+
+
+def _get_concat_split_2d_jagged_multirow_configs():
+    configs = []
+    for BLOCK_N in [1, 2, 4, 8]:
+        for num_warps in [1, 2, 4]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton.jit
+def _concat_2D_jagged_multirow(
+    ValuesA,
+    ValuesB,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    block_n = tl.program_id(0)
+
+    if IS_DENSE_A:
+        seq_start_a = off_z * MaxLenA
+        seq_len_a = MaxLenA
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+    if IS_DENSE_B:
+        seq_start_b = off_z * MaxLenB
+        seq_len_b = MaxLenB
+    else:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    seq_len = seq_len_a + seq_len_b
+
+    start_n = block_n * BLOCK_N
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+
+    valid_mask = offs_n < seq_len
+
+    out_seq_start = seq_start_a + seq_start_b + offs_n
+    out_ptrs = Out + out_seq_start[:, None].to(tl.int64) * stride_od + offs_d[None, :]
+
+    from_prefix_b_mask = (offs_n < n_prefix_from_B) & valid_mask
+    from_a_mask = (
+        (offs_n >= n_prefix_from_B)
+        & (offs_n < seq_len_a + n_prefix_from_B)
+        & valid_mask
+    )
+    from_suffix_b_mask = (offs_n >= seq_len_a + n_prefix_from_B) & valid_mask
+
+    in_b1_ptrs = (
+        ValuesB
+        + (offs_n[:, None] + seq_start_b).to(tl.int64) * stride_bd
+        + offs_d[None, :]
+    )
+    v_b1 = tl.load(
+        in_b1_ptrs, mask=from_prefix_b_mask[:, None] & (offs_d[None, :] < D), other=0.0
+    )
+    tl.store(out_ptrs, v_b1, mask=from_prefix_b_mask[:, None] & (offs_d[None, :] < D))
+
+    off_a = offs_n - n_prefix_from_B
+    in_a_ptrs = (
+        ValuesA
+        + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad
+        + offs_d[None, :]
+    )
+    v_a = tl.load(
+        in_a_ptrs, mask=from_a_mask[:, None] & (offs_d[None, :] < D), other=0.0
+    )
+    tl.store(out_ptrs, v_a, mask=from_a_mask[:, None] & (offs_d[None, :] < D))
+
+    off_b = offs_n - seq_len_a
+    in_b2_ptrs = (
+        ValuesB
+        + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd
+        + offs_d[None, :]
+    )
+    v_b2 = tl.load(
+        in_b2_ptrs, mask=from_suffix_b_mask[:, None] & (offs_d[None, :] < D), other=0.0
+    )
+    tl.store(out_ptrs, v_b2, mask=from_suffix_b_mask[:, None] & (offs_d[None, :] < D))
+
+
+@triton_autotune(
+    configs=_get_concat_split_2d_jagged_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def concat_2D_jagged_multirow(
+    ValuesA,
+    ValuesB,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    _concat_2D_jagged_multirow(
+        ValuesA,
+        ValuesB,
+        OffsetsA,
+        OffsetsB,
+        MaxLenA,
+        MaxLenB,
+        Out,
+        D,
+        stride_ad,
+        stride_bd,
+        stride_od,
+        n_prefix_from_B,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        BLOCK_N,
+    )
+
+
+@triton.jit
+def _split_2D_jagged_multirow(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    block_n = tl.program_id(0)
+
+    if IS_DENSE_A:
+        seq_start_a = off_z * MaxLenA
+        seq_len_a = MaxLenA
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+    if IS_DENSE_B:
+        seq_start_b = off_z * MaxLenB
+        seq_len_b = MaxLenB
+    else:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    seq_len = seq_len_a + seq_len_b
+    seq_start = seq_start_a + seq_start_b
+
+    start_n = block_n * BLOCK_N
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+
+    valid_mask = offs_n < seq_len
+
+    in_ptrs = (
+        JaggedIn
+        + (seq_start + offs_n[:, None]).to(tl.int64) * stride_id
+        + offs_d[None, :]
+    )
+
+    v = tl.load(in_ptrs, mask=valid_mask[:, None] & (offs_d[None, :] < D), other=0.0)
+
+    to_prefix_b_mask = (offs_n < n_prefix_to_B) & valid_mask
+    to_a_mask = (
+        (offs_n >= n_prefix_to_B) & (offs_n < seq_len_a + n_prefix_to_B) & valid_mask
+    )
+    to_suffix_b_mask = (offs_n >= seq_len_a + n_prefix_to_B) & valid_mask
+
+    out_b1_ptrs = (
+        OutB
+        + (offs_n[:, None] + seq_start_b).to(tl.int64) * stride_bd
+        + offs_d[None, :]
+    )
+    tl.store(out_b1_ptrs, v, mask=to_prefix_b_mask[:, None] & (offs_d[None, :] < D))
+
+    off_a = offs_n - n_prefix_to_B
+    out_a_ptrs = (
+        OutA + (off_a[:, None] + seq_start_a).to(tl.int64) * stride_ad + offs_d[None, :]
+    )
+    tl.store(out_a_ptrs, v, mask=to_a_mask[:, None] & (offs_d[None, :] < D))
+
+    off_b = offs_n - seq_len_a
+    out_b2_ptrs = (
+        OutB + (off_b[:, None] + seq_start_b).to(tl.int64) * stride_bd + offs_d[None, :]
+    )
+    tl.store(out_b2_ptrs, v, mask=to_suffix_b_mask[:, None] & (offs_d[None, :] < D))
+
+
+@triton_autotune(
+    configs=_get_concat_split_2d_jagged_multirow_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def split_2D_jagged_multirow(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    _split_2D_jagged_multirow(
+        JaggedIn,
+        OffsetsA,
+        OffsetsB,
+        MaxLenA,
+        MaxLenB,
+        OutA,
+        OutB,
+        D,
+        stride_id,
+        stride_ad,
+        stride_bd,
+        n_prefix_to_B,
+        IS_DENSE_A,
+        IS_DENSE_B,
+        BLOCK_D,
+        BLOCK_N,
+    )
+
+
+@triton.jit
+def _concat_2D_jagged(
+    ValuesA,
+    ValuesB,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    Out,
+    D,
+    stride_ad,
+    stride_bd,
+    stride_od,
+    n_prefix_from_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_n = tl.program_id(0)
+    if IS_DENSE_A:
+        seq_start_a = off_z * MaxLenA
+        seq_len_a = MaxLenA
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+    if IS_DENSE_B:
+        seq_start_b = off_z * MaxLenB
+        seq_len_b = MaxLenB
+    else:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    seq_len = seq_len_a + seq_len_b
+    if off_n >= seq_len:
+        return
+    offs_d = tl.arange(0, BLOCK_D)
+    out_seq_start = seq_start_a + seq_start_b + off_n
+    out_ptrs = Out + out_seq_start.to(tl.int64) * stride_od + offs_d
+    if off_n < n_prefix_from_B:
+        in_ptrs = ValuesB + (off_n + seq_start_b).to(tl.int64) * stride_bd + offs_d
+    elif off_n < seq_len_a + n_prefix_from_B:
+        in_ptrs = (
+            ValuesA
+            + (off_n - n_prefix_from_B + seq_start_a).to(tl.int64) * stride_ad
+            + offs_d
+        )
+    else:
+        in_ptrs = (
+            ValuesB
+            + (off_n - seq_len_a + seq_start_b).to(tl.int64) * stride_bd
+            + offs_d
+        )
+    v = tl.load(in_ptrs, mask=offs_d < D)
+    tl.store(out_ptrs, v, mask=offs_d < D)
+
+
+@triton.jit
+def _split_2D_jagged(
+    JaggedIn,
+    OffsetsA,
+    OffsetsB,
+    MaxLenA,
+    MaxLenB,
+    OutA,
+    OutB,
+    D,
+    stride_id,
+    stride_ad,
+    stride_bd,
+    n_prefix_to_B,
+    IS_DENSE_A: tl.constexpr,
+    IS_DENSE_B: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    off_z = tl.program_id(1)
+    off_n = tl.program_id(0)
+    if IS_DENSE_A:
+        seq_start_a = off_z * MaxLenA
+        seq_len_a = MaxLenA
+    else:
+        seq_start_a = tl.load(OffsetsA + off_z)
+        seq_end_a = tl.load(OffsetsA + off_z + 1)
+        seq_len_a = seq_end_a - seq_start_a
+    if IS_DENSE_B:
+        seq_start_b = off_z * MaxLenB
+        seq_len_b = MaxLenB
+    else:
+        seq_start_b = tl.load(OffsetsB + off_z)
+        seq_end_b = tl.load(OffsetsB + off_z + 1)
+        seq_len_b = seq_end_b - seq_start_b
+    seq_len = seq_len_a + seq_len_b
+    if off_n >= seq_len:
+        return
+    seq_start = seq_start_a + seq_start_b
+    offs_d = tl.arange(0, BLOCK_D)
+    in_ptrs = JaggedIn + (seq_start + off_n).to(tl.int64) * stride_id + offs_d
+    if off_n < n_prefix_to_B:
+        out_ptrs = OutB + (off_n + seq_start_b).to(tl.int64) * stride_bd + offs_d
+    elif off_n < seq_len_a + n_prefix_to_B:
+        out_ptrs = (
+            OutA
+            + (off_n - n_prefix_to_B + seq_start_a).to(tl.int64) * stride_ad
+            + offs_d
+        )
+    else:
+        out_ptrs = (
+            OutB + (off_n - seq_len_a + seq_start_b).to(tl.int64) * stride_bd + offs_d
+        )
+    v = tl.load(in_ptrs, mask=offs_d < D)
+    tl.store(out_ptrs, v, mask=offs_d < D)
+
+
+class _Concat2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values_a: torch.Tensor,
+        values_b: torch.Tensor,
+        max_len_a: Optional[int],
+        max_len_b: Optional[int],
+        offsets_a: Optional[torch.Tensor],
+        offsets_b: Optional[torch.Tensor],
+        n_prefix_from_B: int,
+    ):
+        values_a = switch_to_contiguous_if_needed(values_a)
+        values_b = switch_to_contiguous_if_needed(values_b)
+        is_dense_a = offsets_a is None
+        is_dense_b = offsets_b is None
+        total_len_a, D = values_a.shape
+        total_len_b, _ = values_b.shape
+        if is_dense_a:
+            assert max_len_a is not None
+            B = total_len_a // max_len_a
+        else:
+            assert offsets_a is not None
+            B = offsets_a.shape[0] - 1
+        if is_dense_b:
+            assert max_len_b is not None
+            B = total_len_b // max_len_b
+        else:
+            assert offsets_b is not None
+            B = offsets_b.shape[0] - 1
+        total_seq_len = total_len_a + total_len_b
+        BLOCK_D = triton.next_power_of_2(D)
+        values_out = torch.empty(
+            (total_seq_len, D), device=values_a.device, dtype=values_a.dtype
+        )
+        _triton_concat_2D_jagged_internal(
+            values_a=values_a,
+            values_b=values_b,
+            values_out=values_out,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            max_len_a=max_len_a,
+            max_len_b=max_len_b,
+            D=D,
+            n_prefix_from_B=n_prefix_from_B,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.total_len_a = total_len_a
+        ctx.total_len_b = total_len_b
+        ctx.is_dense_a = is_dense_a
+        ctx.is_dense_b = is_dense_b
+        ctx.max_len_a = max_len_a
+        ctx.max_len_b = max_len_b
+        ctx.B = B
+        ctx.n_prefix_from_B = n_prefix_from_B
+        return values_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, torch.Tensor, torch.Tensor, None, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        _, D = d_out.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        d_values_a = torch.zeros(
+            (ctx.total_len_a, D), device=d_out.device, dtype=d_out.dtype
+        )
+        d_values_b = torch.empty(
+            (ctx.total_len_b, D), device=d_out.device, dtype=d_out.dtype
+        )
+        _split_2D_jagged[(ctx.max_seq_len, ctx.B)](
+            JaggedIn=d_out,
+            OffsetsA=offsets_a,
+            OffsetsB=offsets_b,
+            MaxLenA=ctx.max_len_a,
+            MaxLenB=ctx.max_len_b,
+            OutA=d_values_a,
+            OutB=d_values_b,
+            D=D,
+            stride_id=d_out.stride(-2),
+            stride_ad=d_values_a.stride(-2),
+            stride_bd=d_values_b.stride(-2),
+            n_prefix_to_B=ctx.n_prefix_from_B,
+            BLOCK_D=BLOCK_D,
+            IS_DENSE_A=ctx.is_dense_a,
+            IS_DENSE_B=ctx.is_dense_b,
+        )
+        return None, d_values_a, d_values_b, None, None, None, None, None
+
+
+class _Split2DJaggedFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values: torch.Tensor,
+        total_len_left: Optional[int],
+        total_len_right: Optional[int],
+        max_len_a: Optional[int],
+        max_len_b: Optional[int],
+        offsets_a: Optional[torch.Tensor],
+        offsets_b: Optional[torch.Tensor],
+        n_prefix_to_B: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        values = switch_to_contiguous_if_needed(values)
+        is_dense_a: bool = offsets_a is None
+        is_dense_b: bool = offsets_b is None
+        total_seq_len, D = values.shape
+        if is_dense_a:
+            assert is_dense_b is False
+            assert offsets_b is not None
+            assert max_len_a is not None
+            B = offsets_b.shape[0] - 1
+            total_len_a = max_len_a * B
+            total_len_b = total_seq_len - total_len_a
+        elif is_dense_b:
+            assert is_dense_a is False
+            assert offsets_a is not None
+            assert max_len_b is not None
+            B = offsets_a.shape[0] - 1
+            total_len_b = max_len_b * B
+            total_len_a = total_seq_len - total_len_b
+        else:
+            assert offsets_a is not None and offsets_b is not None
+            B = offsets_a.shape[0] - 1
+            if total_len_left is not None and total_len_right is not None:
+                assert total_len_left + total_len_right == total_seq_len
+                total_len_a = total_len_left
+                total_len_b = total_len_right
+            else:
+                total_len_a = int(offsets_a[-1].item())
+                total_len_b = values.size(0) - total_len_a
+        _, D = values.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        values_a = torch.empty(
+            (total_len_a, D), device=values.device, dtype=values.dtype
+        )
+        values_b = torch.empty(
+            (total_len_b, D), device=values.device, dtype=values.dtype
+        )
+        _triton_split_2D_jagged_internal(
+            jagged_in=values,
+            max_seq_len=max_seq_len,
+            B=B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            max_len_a=max_len_a,
+            max_len_b=max_len_b,
+            out_a=values_a,
+            out_b=values_b,
+            D=D,
+            n_prefix_to_B=n_prefix_to_B,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_a, offsets_b)
+        ctx.max_seq_len = max_seq_len
+        ctx.total_seq_len = total_seq_len
+        ctx.max_len_a = max_len_a
+        ctx.max_len_b = max_len_b
+        ctx.is_dense_a = is_dense_a
+        ctx.is_dense_b = is_dense_b
+        ctx.B = B
+        ctx.D = D
+        ctx.n_prefix_to_B = n_prefix_to_B
+        return values_a, values_b
+
+    @staticmethod
+    def backward(
+        ctx, *d_values
+    ) -> Tuple[None, torch.Tensor, None, None, None, None, None, None, None]:
+        offsets_a, offsets_b = ctx.saved_tensors
+        d_values_a, d_values_b = d_values
+        BLOCK_D = triton.next_power_of_2(ctx.D)
+        d_jagged_in = torch.empty(
+            (ctx.total_seq_len, ctx.D),
+            device=d_values_a.device,
+            dtype=d_values_a.dtype,
+        )
+        _triton_concat_2D_jagged_internal(
+            values_a=d_values_a,
+            values_b=d_values_b,
+            values_out=d_jagged_in,
+            max_seq_len=ctx.max_seq_len,
+            B=ctx.B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            max_len_a=ctx.max_len_a,
+            max_len_b=ctx.max_len_b,
+            D=ctx.D,
+            n_prefix_from_B=ctx.n_prefix_to_B,
+            is_dense_a=ctx.is_dense_a,
+            is_dense_b=ctx.is_dense_b,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return None, d_jagged_in, None, None, None, None, None, None, None
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_concat_2D_jagged(
+    max_seq_len: int,
+    values_left: torch.Tensor,
+    values_right: torch.Tensor,
+    max_len_left: Optional[int],
+    max_len_right: Optional[int],
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+    n_prefix_from_right: int = 0,
+) -> torch.Tensor:
+    return _Concat2DJaggedFunction.apply(
+        max_seq_len,
+        values_left,
+        values_right,
+        max_len_left,
+        max_len_right,
+        offsets_left,
+        offsets_right,
+        n_prefix_from_right,
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_split_2D_jagged(
+    max_seq_len: int,
+    values: torch.Tensor,
+    total_len_left: Optional[int],
+    total_len_right: Optional[int],
+    max_len_left: Optional[int],
+    max_len_right: Optional[int],
+    offsets_left: Optional[torch.Tensor],
+    offsets_right: Optional[torch.Tensor],
+    n_prefix_to_right: int = 0,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return _Split2DJaggedFunction.apply(
+        max_seq_len,
+        values,
+        total_len_left,
+        total_len_right,
+        max_len_left,
+        max_len_right,
+        offsets_left,
+        offsets_right,
+        n_prefix_to_right,
+    )
+
+
+class _Concat2DJaggedMultirowFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values_left: torch.Tensor,
+        values_right: torch.Tensor,
+        max_len_left: Optional[int],
+        max_len_right: Optional[int],
+        offsets_left: Optional[torch.Tensor],
+        offsets_right: Optional[torch.Tensor],
+        n_prefix_from_right: int,
+    ) -> torch.Tensor:
+        values_left = switch_to_contiguous_if_needed(values_left)
+        values_right = switch_to_contiguous_if_needed(values_right)
+        is_dense_left = offsets_left is None
+        is_dense_right = offsets_right is None
+        total_len_left, D = values_left.shape
+        total_len_right, _ = values_right.shape
+        if is_dense_left:
+            assert max_len_left is not None
+            B = total_len_left // max_len_left
+        else:
+            assert offsets_left is not None
+            B = offsets_left.shape[0] - 1
+        if is_dense_right:
+            assert max_len_right is not None
+            B = total_len_right // max_len_right
+        else:
+            assert offsets_right is not None
+            B = offsets_right.shape[0] - 1
+        total_seq_len = total_len_left + total_len_right
+        BLOCK_D = triton.next_power_of_2(D)
+        values_out = torch.empty(
+            (total_seq_len, D), device=values_left.device, dtype=values_left.dtype
+        )
+
+        def grid(meta):
+            return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+        concat_2D_jagged_multirow[grid](
+            ValuesA=values_left,
+            ValuesB=values_right,
+            OffsetsA=offsets_left,
+            OffsetsB=offsets_right,
+            MaxLenA=max_len_left,
+            MaxLenB=max_len_right,
+            Out=values_out,
+            D=D,
+            stride_ad=values_left.stride(-2),
+            stride_bd=values_right.stride(-2),
+            stride_od=values_out.stride(-2),
+            n_prefix_from_B=n_prefix_from_right,
+            IS_DENSE_A=is_dense_left,
+            IS_DENSE_B=is_dense_right,
+            BLOCK_D=BLOCK_D,
+        )
+        ctx.save_for_backward(offsets_left, offsets_right)
+        ctx.max_seq_len = max_seq_len
+        ctx.total_len_left = total_len_left
+        ctx.total_len_right = total_len_right
+        ctx.is_dense_left = is_dense_left
+        ctx.is_dense_right = is_dense_right
+        ctx.max_len_left = max_len_left
+        ctx.max_len_right = max_len_right
+        ctx.B = B
+        ctx.n_prefix_from_right = n_prefix_from_right
+        return values_out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[None, torch.Tensor, torch.Tensor, None, None, None, None, None]:
+        offsets_left, offsets_right = ctx.saved_tensors
+        _, D = d_out.shape
+        BLOCK_D = triton.next_power_of_2(D)
+        d_values_left = torch.zeros(
+            (ctx.total_len_left, D), device=d_out.device, dtype=d_out.dtype
+        )
+        d_values_right = torch.empty(
+            (ctx.total_len_right, D), device=d_out.device, dtype=d_out.dtype
+        )
+
+        def grid(meta):
+            return (triton.cdiv(ctx.max_seq_len, meta["BLOCK_N"]), ctx.B)
+
+        split_2D_jagged_multirow[grid](
+            JaggedIn=d_out,
+            OffsetsA=offsets_left,
+            OffsetsB=offsets_right,
+            MaxLenA=ctx.max_len_left,
+            MaxLenB=ctx.max_len_right,
+            OutA=d_values_left,
+            OutB=d_values_right,
+            D=D,
+            stride_id=d_out.stride(-2),
+            stride_ad=d_values_left.stride(-2),
+            stride_bd=d_values_right.stride(-2),
+            n_prefix_to_B=ctx.n_prefix_from_right,
+            IS_DENSE_A=ctx.is_dense_left,
+            IS_DENSE_B=ctx.is_dense_right,
+            BLOCK_D=BLOCK_D,
+        )
+        return None, d_values_left, d_values_right, None, None, None, None, None
+
+
+class _Split2DJaggedMultirowFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        max_seq_len: int,
+        values: torch.Tensor,
+        total_len_left: Optional[int],
+        total_len_right: Optional[int],
+        max_len_left: Optional[int],
+        max_len_right: Optional[int],
+        offsets_left: Optional[torch.Tensor],
+        offsets_right: Optional[torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        values = switch_to_contiguous_if_needed(values)
+        is_dense_left: bool = offsets_left is None
+        is_dense_right: bool = offsets_right is None
+        total_seq_len, D = values.shape
+
+        if is_dense_left:
+            assert is_dense_right is False
+            assert offsets_right is not None
+            assert max_len_left is not None
+            B = offsets_right.shape[0] - 1
+            total_len_a = max_len_left * B
+            total_len_b = total_seq_len - total_len_a
+        elif is_dense_right:
+            assert is_dense_left is False
+            assert offsets_left is not None
+            assert max_len_right is not None
+            B = offsets_left.shape[0] - 1
+            total_len_b = max_len_right * B
+            total_len_a = total_seq_len - total_len_b
+        else:
+            assert offsets_left is not None and offsets_right is not None
+            B = offsets_left.shape[0] - 1
+            if total_len_left is not None and total_len_right is not None:
+                assert total_len_left + total_len_right == total_seq_len
+                total_len_a = total_len_left
+                total_len_b = total_len_right
+            else:
+                total_len_a = int(offsets_left[-1].item())
+                total_len_b = values.size(0) - total_len_a
+
+        BLOCK_D = triton.next_power_of_2(D)
+        values_a = torch.empty(
+            (total_len_a, D), device=values.device, dtype=values.dtype
+        )
+        values_b = torch.empty(
+            (total_len_b, D), device=values.device, dtype=values.dtype
+        )
+
+        def grid(meta):
+            return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
+
+        split_2D_jagged_multirow[grid](
+            JaggedIn=values,
+            OffsetsA=offsets_left,
+            OffsetsB=offsets_right,
+            MaxLenA=max_len_left,
+            MaxLenB=max_len_right,
+            OutA=values_a,
+            OutB=values_b,
+            D=D,
+            stride_id=values.stride(-2),
+            stride_ad=values_a.stride(-2),
+            stride_bd=values_b.stride(-2),
+            n_prefix_to_B=0,
+            IS_DENSE_A=is_dense_left,
+            IS_DENSE_B=is_dense_right,
+            BLOCK_D=BLOCK_D,
+        )
+
+        ctx.save_for_backward(offsets_left, offsets_right)
+        ctx.max_seq_len = max_seq_len
+        ctx.total_seq_len = total_seq_len
+        ctx.max_len_left = max_len_left
+        ctx.max_len_right = max_len_right
+        ctx.is_dense_left = is_dense_left
+        ctx.is_dense_right = is_dense_right
+        ctx.B = B
+        ctx.D = D
+
+        return values_a, values_b
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, *d_values
+    ) -> Tuple[None, torch.Tensor, None, None, None, None, None, None]:
+        offsets_left, offsets_right = ctx.saved_tensors
+        d_values_a, d_values_b = d_values
+        BLOCK_D = triton.next_power_of_2(ctx.D)
+        d_jagged_in = torch.empty(
+            (ctx.total_seq_len, ctx.D),
+            device=d_values_a.device,
+            dtype=d_values_a.dtype,
+        )
+
+        def grid(meta):
+            return (triton.cdiv(ctx.max_seq_len, meta["BLOCK_N"]), ctx.B)
+
+        concat_2D_jagged_multirow[grid](
+            ValuesA=d_values_a,
+            ValuesB=d_values_b,
+            OffsetsA=offsets_left,
+            OffsetsB=offsets_right,
+            MaxLenA=ctx.max_len_left,
+            MaxLenB=ctx.max_len_right,
+            Out=d_jagged_in,
+            D=ctx.D,
+            stride_ad=d_values_a.stride(-2),
+            stride_bd=d_values_b.stride(-2),
+            stride_od=d_jagged_in.stride(-2),
+            n_prefix_from_B=0,
+            IS_DENSE_A=ctx.is_dense_left,
+            IS_DENSE_B=ctx.is_dense_right,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return None, d_jagged_in, None, None, None, None, None, None
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_concat_2D_jagged_multirow(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor],
+    offsets_b: Optional[torch.Tensor],
+    max_len_a: int,
+    max_len_b: int,
+) -> torch.Tensor:
+    return _Concat2DJaggedMultirowFunction.apply(
+        max_seq_len,
+        values_a,
+        values_b,
+        max_len_a,
+        max_len_b,
+        offsets_a,
+        offsets_b,
+        0,
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_split_2D_jagged_multirow(
+    max_seq_len: int,
+    values: torch.Tensor,
+    total_len_left: Optional[int] = None,
+    total_len_right: Optional[int] = None,
+    max_len_left: Optional[int] = None,
+    max_len_right: Optional[int] = None,
+    offsets_left: Optional[torch.Tensor] = None,
+    offsets_right: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    return _Split2DJaggedMultirowFunction.apply(
+        max_seq_len,
+        values,
+        total_len_left,
+        total_len_right,
+        max_len_left,
+        max_len_right,
+        offsets_left,
+        offsets_right,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
new file mode 100644
index 000000000..1e997fd40
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
@@ -0,0 +1,1327 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+from generative_recommenders.ops.utils import (
+    is_sm100_plus,
+    is_sm90,
+    maybe_register_custom_op,
+)
+
+try:
+    # @manual=//triton:triton
+    from triton.language.extra.libdevice import fast_dividef, rsqrt as libdevice_rsqrt
+except ImportError:
+    try:
+        # @manual=//triton:triton
+        from triton.language.extra.cuda.libdevice import (
+            fast_dividef,
+            rsqrt as libdevice_rsqrt,
+        )
+    except ImportError:
+        # pyre-ignore: Undefined import [21]
+        # @manual=//triton:triton
+        from triton.language.math import fast_dividef, rsqrt as libdevice_rsqrt
+
+
+def _get_layer_norm_fwd_configs() -> List[triton.Config]:
+    """Generate autotune configs for multi-row LayerNorm kernels."""
+    configs = []
+    block_ns = [4, 8, 16] if is_sm100_plus() else [1, 2, 4, 8]
+    for BLOCK_N in block_ns:
+        for num_warps in [1, 2, 4, 8]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+def _bwd_pre_hook(nargs):
+    nargs["DW"].zero_()
+    if "DB" in nargs:
+        nargs["DB"].zero_()
+
+
+def _get_norm_bwd_configs() -> List[triton.Config]:
+    """Generate autotune configs for multi-row LayerNorm kernels."""
+    configs = []
+    if is_sm100_plus():
+        block_ns = [8, 16]
+        num_shards_list = [8, 16]
+        num_warps_list = [2, 4]
+    elif is_sm90():
+        block_ns = [2, 4]
+        num_shards_list = [8]
+        num_warps_list = [2, 4]
+    else:
+        block_ns = [1, 2]
+        num_shards_list = [8]
+        num_warps_list = [2, 4]
+    for BLOCK_N in block_ns:
+        for num_warps in num_warps_list:
+            for num_shards in num_shards_list:
+                configs.append(
+                    triton.Config(
+                        {"BLOCK_N": BLOCK_N, "SHARDS_PER_SM": num_shards},
+                        num_warps=num_warps,
+                    )
+                )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_layer_norm_fwd_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def _layer_norm_fwd(
+    X,
+    Y,
+    Mean,
+    Rstd,
+    N,
+    D,
+    eps,
+    stride_x,
+    stride_y,
+    TRAINING: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    COMPUTE_MEAN_AND_RSTD: tl.constexpr,
+):
+    block_id = tl.program_id(0)
+    start_row = block_id * BLOCK_N
+
+    X_block_ptr = tl.make_block_ptr(
+        base=X,
+        shape=(N, D),
+        strides=(stride_x, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    Y_block_ptr = tl.make_block_ptr(
+        base=Y,
+        shape=(N, D),
+        strides=(stride_y, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    rows = start_row + tl.arange(0, BLOCK_N)
+    row_mask = rows < N
+
+    if COMPUTE_MEAN_AND_RSTD:
+        mean = tl.sum(x_block, axis=1) / D
+        if TRAINING:
+            tl.store(Mean + rows, mean, row_mask)
+        mean = tl.expand_dims(mean, 1)
+    else:
+        mean = tl.load(Mean + rows, row_mask, other=0.0)
+        mean = tl.expand_dims(mean, 1)
+
+    x_mean = x_block - mean
+    x_mean = tl.where(row_mask[:, None] & col_mask[None, :], x_mean, 0.0)
+
+    if COMPUTE_MEAN_AND_RSTD:
+        _var = x_mean * x_mean
+        var = tl.sum(_var, axis=1) / D
+        rstd = 1 / tl.sqrt(var + eps)
+        if TRAINING:
+            tl.store(Rstd + rows, rstd, row_mask)
+    else:
+        rstd = tl.load(Rstd + rows, row_mask, other=0.0)
+
+    rstd = tl.expand_dims(rstd, 1)
+    y = x_mean * rstd
+
+    tl.store(Y_block_ptr, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton_autotune(
+    configs=_get_layer_norm_fwd_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def _weighted_layer_norm_fwd(
+    X,
+    Y,
+    W,
+    B,
+    Mean,
+    Rstd,
+    N,
+    D,
+    eps,
+    stride_x,
+    stride_y,
+    IS_SWISH: tl.constexpr,
+    TRAINING: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    COMPUTE_MEAN_AND_RSTD: tl.constexpr,
+):
+    # Get the block ID and calculate starting row
+    block_id = tl.program_id(0)
+    start_row = block_id * BLOCK_N
+
+    # Load weight and bias once (shared across all rows in this block)
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    w = tl.load(W + cols, mask=col_mask, other=0.0).to(tl.float32)
+    b = tl.load(B + cols, mask=col_mask, other=0.0).to(tl.float32)
+
+    # Create block pointers for X and Y
+    X_block_ptr = tl.make_block_ptr(
+        base=X,
+        shape=(N, D),
+        strides=(stride_x, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    Y_block_ptr = tl.make_block_ptr(
+        base=Y,
+        shape=(N, D),
+        strides=(stride_y, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+
+    rows = start_row + tl.arange(0, BLOCK_N)
+    row_mask = rows < N
+
+    if COMPUTE_MEAN_AND_RSTD:
+        mean = tl.sum(x_block, axis=1) / D
+        if TRAINING:
+            tl.store(Mean + rows, mean, row_mask)
+        mean = tl.expand_dims(mean, 1)
+    else:
+        mean = tl.load(Mean + rows, row_mask, other=0.0)
+        mean = tl.expand_dims(mean, 1)
+
+    x_mean = x_block - mean
+    x_mean = tl.where(row_mask[:, None] & col_mask[None, :], x_mean, 0.0)
+
+    if COMPUTE_MEAN_AND_RSTD:
+        _var = x_mean * x_mean
+        var = tl.sum(_var, axis=1) / D
+        rstd = libdevice_rsqrt(var + eps)
+        if TRAINING:
+            tl.store(Rstd + rows, rstd, row_mask)
+    else:
+        rstd = tl.load(Rstd + rows, row_mask, other=0.0)
+
+    rstd = tl.expand_dims(rstd, 1)
+    y = x_mean * rstd
+    y = y * w[None, :] + b[None, :]
+
+    if IS_SWISH:
+        y = tl.sigmoid(y) * x_block
+
+    tl.store(Y_block_ptr, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.jit
+def _layer_norm_bwd_dx(
+    DX,
+    DY,
+    X,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_dy,
+    stride_x,
+    D,
+    eps,
+    BLOCK_D: tl.constexpr,
+):
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+    X += row.to(tl.int64) * stride_x
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+
+    # Load data to SRAM
+    x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+    dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+    mean = tl.load(Mean + row)
+    rstd = tl.load(Rstd + row)
+
+    # Compute dx
+    xhat = (x - mean) * rstd
+    xhat = tl.where(mask, xhat, 0.0)
+    dy = tl.where(mask, dy, 0.0)
+    c1 = tl.sum(xhat * dy, axis=0) / D
+    c2 = tl.sum(dy, axis=0) / D
+    dx = (dy - (xhat * c1 + c2)) * rstd
+    # Write dx
+    tl.store(DX + cols, dx, mask=mask)
+
+
+@triton_autotune(
+    configs=_get_layer_norm_fwd_configs(),
+    key=["BLOCK_D"],
+)
+@triton.jit
+def _weighted_layer_norm_bwd_dx(
+    DX,
+    DY,
+    DW,
+    DB,
+    X,
+    W,
+    B,
+    Mean,
+    Rstd,
+    stride_dx,
+    stride_dy,
+    stride_x,
+    D,
+    eps,
+    IS_SWISH: tl.constexpr,
+    N,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    num_blocks = tl.cdiv(N, BLOCK_N)
+    blocks_per_tile = num_blocks // tile_num
+    if pid < num_blocks % tile_num:
+        blocks_per_tile += 1
+
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    w = tl.load(W + cols, mask=col_mask, other=0.0).to(tl.float32)
+
+    acc_dw = tl.zeros([BLOCK_D], dtype=tl.float32)
+    acc_db = tl.zeros([BLOCK_D], dtype=tl.float32)
+
+    start_block = pid
+
+    for idx in range(blocks_per_tile):
+        current_block = start_block + idx * tile_num
+        start_row = current_block * BLOCK_N
+
+        X_block_ptr = tl.make_block_ptr(
+            base=X,
+            shape=(N, D),
+            strides=(stride_x, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        DX_block_ptr = tl.make_block_ptr(
+            base=DX,
+            shape=(N, D),
+            strides=(stride_dx, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        DY_block_ptr = tl.make_block_ptr(
+            base=DY,
+            shape=(N, D),
+            strides=(stride_dy, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        # Load data blocks
+        x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+            tl.float32
+        )
+        dy_block = tl.load(
+            DY_block_ptr, boundary_check=(0, 1), padding_option="zero"
+        ).to(tl.float32)
+
+        # Load mean and rstd for all rows in this block
+        rows = start_row + tl.arange(0, BLOCK_N)
+        row_mask = rows < N
+        mean = tl.load(Mean + rows, row_mask, other=0.0)
+        rstd = tl.load(Rstd + rows, row_mask, other=0.0)
+
+        # Expand dimensions for broadcasting
+        mean = tl.expand_dims(mean, 1)
+        rstd = tl.expand_dims(rstd, 1)
+
+        xhat = (x_block - mean) * rstd
+
+        xhat = tl.where(row_mask[:, None] & col_mask[None, :], xhat, 0.0)
+        wdy = w[None, :] * dy_block
+        wdy = tl.where(row_mask[:, None] & col_mask[None, :], wdy, 0.0)
+
+        # Compute dx
+        if IS_SWISH:
+            b = tl.load(B + cols, mask=col_mask, other=0.0).to(tl.float32)
+            sigmoid_layer_norm = tl.sigmoid(xhat * w[None, :] + b[None, :])
+            sigmoid_layer_norm = tl.where(
+                row_mask[:, None] & col_mask[None, :], sigmoid_layer_norm, 0.0
+            )
+
+            sigmoid_deriv = sigmoid_layer_norm * (1 - sigmoid_layer_norm)
+            x_ = wdy * x_block * sigmoid_deriv
+            x_ = tl.where(row_mask[:, None] & col_mask[None, :], x_, 0.0)
+
+            c1 = tl.sum(xhat * x_, axis=1) / D
+            c2 = tl.sum(x_, axis=1) / D
+            c1 = tl.expand_dims(c1, 1)
+            c2 = tl.expand_dims(c2, 1)
+            dx = (x_ - (xhat * c1 + c2)) * rstd
+
+            dx = dy_block * sigmoid_layer_norm + dx
+            # Write dx
+            tl.store(DX_block_ptr, dx.to(DX.dtype.element_ty), boundary_check=(0, 1))
+            partial_dw = tl.sum(dy_block * x_block * xhat * sigmoid_deriv, axis=0)
+            partial_db = tl.sum(dy_block * x_block * sigmoid_deriv, axis=0)
+        else:
+            c1 = tl.sum(xhat * wdy, axis=1) / D
+            c2 = tl.sum(wdy, axis=1) / D
+            c1 = tl.expand_dims(c1, 1)
+            c2 = tl.expand_dims(c2, 1)
+            dx = (wdy - (xhat * c1 + c2)) * rstd
+            # Write dx
+            tl.store(DX_block_ptr, dx.to(DX.dtype.element_ty), boundary_check=(0, 1))
+            partial_dw = tl.sum(dy_block * xhat, axis=0)
+            partial_db = tl.sum(dy_block, axis=0)
+
+        # Accumulate partial sums in shared memory
+        acc_dw += partial_dw
+        acc_db += partial_db
+
+    # Store accumulated sums back to global memory
+    dw_ptrs = DW + pid.to(tl.int64) * D + cols
+    db_ptrs = DB + pid.to(tl.int64) * D + cols
+    tl.store(dw_ptrs, acc_dw, mask=col_mask)
+    tl.store(db_ptrs, acc_db, mask=col_mask)
+
+
+def _get_bwd_dwdb_configs() -> List[triton.Config]:
+    configs = []
+    BLOCK_N_CHOICES = [32, 64, 128, 256]
+    if is_sm100_plus():
+        BLOCK_N_CHOICES = [128, 256, 512, 1024]
+    for BLOCK_N in BLOCK_N_CHOICES:
+        for num_warps in [8, 16] + ([] if torch.ops.hip else [32]):
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_bwd_dwdb_configs(),
+    key=["D"],
+)
+@triton.jit
+def _layer_norm_bwd_dwdb(
+    DW,
+    DB,
+    FINAL_DW,
+    FINAL_DB,
+    N,
+    D,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    cols = pid * BLOCK_D + tl.arange(0, BLOCK_D)
+    dw = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+    db = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+        mask = (rows[:, None] < N) & (cols[None, :] < D)
+        offs = rows[:, None] * D + cols[None, :]
+        dw += tl.load(DW + offs, mask=mask, other=0.0)
+        db += tl.load(DB + offs, mask=mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    sum_db = tl.sum(db, axis=0)
+    tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D)
+    tl.store(FINAL_DB + cols, sum_db.to(FINAL_DB.dtype.element_ty), mask=cols < D)
+
+
+def compute_BLOCK_D(x: torch.Tensor) -> int:
+    """Compute the BLOCK_D parameter for layer norm kernels."""
+    D = x.shape[-1]
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    return min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::triton_weighted_layer_norm_fwd", mutates_args=()
+)
+def triton_weighted_layer_norm_fwd(
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    eps: float,
+    mean: Optional[torch.Tensor] = None,
+    rstd: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
+    x = switch_to_contiguous_if_needed(x)
+    N, D = x.shape
+    learnable = weight is not None
+    if learnable:
+        assert bias is not None and weight is not None
+        assert weight.dim() == 1
+        assert bias.dim() == 1
+        assert weight.numel() == D
+        assert bias.numel() == D
+
+    y = torch.empty_like(x)
+    compute_mean_and_rstd = mean is None or rstd is None
+    # Always allocate new tensors to avoid aliasing inputs with outputs
+    out_mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+    out_rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+    if not compute_mean_and_rstd:
+        assert mean is not None and rstd is not None
+        out_mean.copy_(mean)
+        out_rstd.copy_(rstd)
+
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D: int = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+    if D > BLOCK_D:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+    if N == 0:
+        return y, out_mean, out_rstd
+
+    # pyre-ignore[28]
+    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]),)  # noqa E731
+    if learnable:
+        _weighted_layer_norm_fwd[grid](
+            x,
+            y,
+            weight,
+            bias,
+            out_mean,
+            out_rstd,
+            N,
+            D,
+            eps,
+            x.stride(0),
+            y.stride(0),
+            IS_SWISH=False,
+            TRAINING=True,
+            BLOCK_D=BLOCK_D,
+            COMPUTE_MEAN_AND_RSTD=compute_mean_and_rstd,
+        )
+    else:
+        _layer_norm_fwd[grid](
+            x,
+            y,
+            out_mean,
+            out_rstd,
+            N,
+            D,
+            eps,
+            x.stride(0),
+            y.stride(0),
+            TRAINING=True,
+            BLOCK_D=BLOCK_D,
+            COMPUTE_MEAN_AND_RSTD=compute_mean_and_rstd,
+        )
+
+    return y, out_mean, out_rstd
+
+
+@triton_weighted_layer_norm_fwd.register_fake
+def _(
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    eps: float,
+    mean: Optional[torch.Tensor] = None,
+    rstd: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    N = x.shape[0]
+    y = torch.empty_like(x)
+    # Always allocate new tensors to avoid aliasing inputs with outputs
+    out_mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+    out_rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+    return y, out_mean, out_rstd
+
+
+@maybe_register_custom_op(
+    "generative_recommenders::triton_weighted_layer_norm_bwd", mutates_args=()
+)
+def _triton_weighted_layer_norm_bwd_impl(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    learnable: bool,
+    eps: float,
+    BLOCK_D: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    num_warps: int = min(max(BLOCK_D // 256, 1), 8)
+    if learnable:
+        N, D = x.shape
+        dx = torch.empty_like(x)
+        sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+        tile_num = max(1, min(sms * 8, N // 4))
+        _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+        _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+        dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        if N == 0:
+            dweight.zero_()
+            dbias.zero_()
+            return dx, dweight, dbias
+        # pyre-ignore[28]
+        _weighted_layer_norm_bwd_dx[(tile_num,)](
+            dx,
+            dy,
+            _dweight,
+            _dbias,
+            x,
+            weight,
+            bias,
+            mean,
+            rstd,
+            dx.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            D,
+            eps,
+            IS_SWISH=False,
+            N=N,
+            BLOCK_D=BLOCK_D,
+        )
+
+        def grid(META):
+            return (triton.cdiv(D, META["BLOCK_D"]),)
+
+        blocks = triton.next_power_of_2(sms * 4)
+        BLOCK_D = triton.next_power_of_2(triton.cdiv(D, blocks))
+        BLOCK_D = min(max(BLOCK_D, 4), 128)
+        _layer_norm_bwd_dwdb[grid](
+            _dweight,
+            _dbias,
+            dweight,
+            dbias,
+            tile_num,
+            D,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return dx, dweight, dbias
+    else:
+        N, D = x.shape
+        dx = torch.empty_like(x)
+        # Return empty tensors as sentinels for None
+        dweight = torch.empty(0, dtype=x.dtype, device=x.device)
+        dbias = torch.empty(0, dtype=x.dtype, device=x.device)
+        if N == 0:
+            return dx, dweight, dbias
+        # pyre-ignore[28]
+        _layer_norm_bwd_dx[(N,)](
+            dx,
+            dy,
+            x,
+            mean,
+            rstd,
+            dx.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            D,
+            eps,
+            BLOCK_D=BLOCK_D,
+            num_warps=num_warps,
+        )
+        return dx, dweight, dbias
+
+
+@_triton_weighted_layer_norm_bwd_impl.register_fake
+def _(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    learnable: bool,
+    eps: float,
+    BLOCK_D: int,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    dx = torch.empty_like(x)
+    if learnable:
+        D = x.shape[-1]
+        dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+    else:
+        dweight = torch.empty(0, dtype=x.dtype, device=x.device)
+        dbias = torch.empty(0, dtype=x.dtype, device=x.device)
+    return dx, dweight, dbias
+
+
+def triton_weighted_layer_norm_bwd(
+    dy: torch.Tensor,
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    mean: torch.Tensor,
+    rstd: torch.Tensor,
+    learnable: bool,
+    eps: float,
+    BLOCK_D: int,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+    # Use sentinel tensors for custom_op compatibility (can't return Optional[Tensor])
+    _weight = (
+        weight if weight is not None else torch.empty(0, dtype=x.dtype, device=x.device)
+    )
+    _bias = bias if bias is not None else torch.empty(0, dtype=x.dtype, device=x.device)
+    dx, dweight, dbias = _triton_weighted_layer_norm_bwd_impl(
+        dy=dy,
+        x=x,
+        weight=_weight,
+        bias=_bias,
+        mean=mean,
+        rstd=rstd,
+        learnable=learnable,
+        eps=eps,
+        BLOCK_D=BLOCK_D,
+    )
+    if not learnable:
+        return dx, None, None
+    return dx, dweight, dbias
+
+
+class LayerNormFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        weight: Optional[torch.Tensor],
+        bias: Optional[torch.Tensor],
+        eps: float,
+    ) -> torch.Tensor:
+        y, mean, rstd = triton_weighted_layer_norm_fwd(
+            x=x,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+        )
+        BLOCK_D = compute_BLOCK_D(x)
+        learnable = weight is not None
+        if learnable:
+            ctx.save_for_backward(x, weight, bias, mean, rstd)
+        else:
+            ctx.save_for_backward(x, mean, rstd)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.eps = eps
+        ctx.learnable = learnable
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], None]:
+        if ctx.learnable:
+            x, weight, bias, mean, rstd = ctx.saved_tensors
+        else:
+            x, mean, rstd = ctx.saved_tensors
+            weight, bias = None, None
+        dx, dweight, dbias = triton_weighted_layer_norm_bwd(
+            dy=dy,
+            x=x,
+            weight=weight,
+            bias=bias,
+            mean=mean,
+            rstd=rstd,
+            learnable=ctx.learnable,
+            eps=ctx.eps,
+            BLOCK_D=ctx.BLOCK_D,
+        )
+        return dx, dweight, dbias, None
+
+
+def _get_rms_norm_fwd_configs() -> List[triton.Config]:
+    """Generate autotune configs for multi-row RMSNorm kernels."""
+    configs = []
+    for BLOCK_N in [1, 4, 16]:
+        for num_warps in [2, 4]:
+            configs.append(
+                triton.Config(
+                    {"BLOCK_N": BLOCK_N},
+                    num_warps=num_warps,
+                )
+            )
+    return configs
+
+
+@triton.autotune(
+    configs=_get_rms_norm_fwd_configs(),
+    key=["BLOCK_D", "SILU"],
+)
+@triton.jit
+def _weighted_rms_norm_fwd(
+    X,
+    Y,
+    W,
+    Rstd,
+    N,
+    D: tl.constexpr,
+    eps,
+    stride_x,
+    stride_y,
+    SILU: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    block_id = tl.program_id(0)
+    start_row = block_id * BLOCK_N
+
+    # Load weight once (shared across all rows in this block)
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    w = tl.load(W + cols, mask=col_mask, other=0.0).to(tl.float32)
+
+    # Create block pointers for X and Y
+    X_block_ptr = tl.make_block_ptr(
+        base=X,
+        shape=(N, D),
+        strides=(stride_x, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    Y_block_ptr = tl.make_block_ptr(
+        base=Y,
+        shape=(N, D),
+        strides=(stride_y, 1),
+        offsets=(start_row, 0),
+        block_shape=(BLOCK_N, BLOCK_D),
+        order=(1, 0),
+    )
+
+    x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+        tl.float32
+    )
+
+    rows = start_row + tl.arange(0, BLOCK_N)
+    row_mask = rows < N
+
+    # Compute variance (RMS norm uses x directly, not x - mean)
+    x_masked = tl.where(row_mask[:, None] & col_mask[None, :], x_block, 0.0)
+    _var = x_masked * x_masked
+    var = tl.sum(_var, axis=1) / D
+    rstd = 1 / tl.sqrt(var + eps)
+    tl.store(Rstd + rows, rstd, row_mask)
+
+    # Normalize and apply linear transformation
+    rstd = tl.expand_dims(rstd, 1)
+    y = x_block * rstd
+    y = y * w[None, :]
+
+    if SILU:
+        # pyre-ignore[16]: Module `triton.language.math` has no attribute `fast_dividef`
+        y = fast_dividef(y, 1.0 + tl.exp(-y))
+
+    tl.store(Y_block_ptr, y.to(Y.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.jit
+def _weighted_rms_norm_bwd_dx(
+    DX,
+    DY,
+    DW,
+    X,
+    W,
+    Rstd,
+    Lock,
+    stride_dx,
+    stride_dy,
+    stride_x,
+    D: tl.constexpr,
+    eps,
+    GROUP_N,
+    BLOCK_D: tl.constexpr,
+):
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_D)
+    mask = cols < D
+    X += row.to(tl.int64) * stride_x
+    DY += row.to(tl.int64) * stride_dy
+    DX += row.to(tl.int64) * stride_dx
+
+    # Load data to SRAM
+    x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+    dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+    rstd = tl.load(Rstd + row)
+
+    # Compute dx
+    xhat = x * rstd
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    wdy = w * dy
+
+    xhat = tl.where(mask, xhat, 0.0)
+    wdy = tl.where(mask, wdy, 0.0)
+    c1 = tl.sum(xhat * wdy, axis=0) / D
+    dx = (wdy - (xhat * c1)) * rstd
+    # Write dx
+    tl.store(DX + cols, dx, mask=mask)
+
+    # Offset locks and weights/biases gradient pointer for parallel reduction
+    lock_id = row % GROUP_N
+    Lock += lock_id
+    Count = Lock + GROUP_N
+    DW = DW + lock_id * D + cols
+    # Accumulate partial sums for dw/db
+    partial_dw = dy * xhat
+    while tl.atomic_cas(Lock, 0, 1) == 1:
+        pass
+    count = tl.load(Count)
+    # First store doesn't accumulate
+    if count == 0:
+        tl.atomic_xchg(Count, 1)
+    else:
+        partial_dw += tl.load(DW, mask=mask)
+    tl.store(DW, partial_dw, mask=mask)
+    # Release the lock
+    tl.atomic_xchg(Lock, 0)
+
+
+@triton_autotune(
+    configs=_get_norm_bwd_configs(),
+    key=["BLOCK_D", "SILU"],
+    reset_to_zero=["DW"],
+)
+@triton.jit
+def _weighted_rms_norm_bwd(
+    DX,
+    DY,
+    DW,
+    X,
+    W,
+    Rstd,
+    stride_dx,
+    stride_dy,
+    stride_x,
+    D,
+    eps,
+    N,
+    SILU: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    SHARDS_PER_SM: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    tile_num = tl.num_programs(0)
+    num_blocks = tl.cdiv(N, BLOCK_N)
+    blocks_per_tile = num_blocks // tile_num
+    if pid < num_blocks % tile_num:
+        blocks_per_tile += 1
+
+    cols = tl.arange(0, BLOCK_D)
+    col_mask = cols < D
+    w = tl.load(W + cols, mask=col_mask, other=0.0).to(tl.float32)
+
+    start_block = pid
+
+    acc_dw = tl.zeros([BLOCK_D], dtype=tl.float32)
+
+    for idx in range(blocks_per_tile):
+        current_block = start_block + idx * tile_num
+        start_row = current_block * BLOCK_N
+
+        X_block_ptr = tl.make_block_ptr(
+            base=X,
+            shape=(N, D),
+            strides=(stride_x, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        DX_block_ptr = tl.make_block_ptr(
+            base=DX,
+            shape=(N, D),
+            strides=(stride_dx, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        DY_block_ptr = tl.make_block_ptr(
+            base=DY,
+            shape=(N, D),
+            strides=(stride_dy, 1),
+            offsets=(start_row, 0),
+            block_shape=(BLOCK_N, BLOCK_D),
+            order=(1, 0),
+        )
+
+        # Load data blocks
+        x_block = tl.load(X_block_ptr, boundary_check=(0, 1), padding_option="zero").to(
+            tl.float32
+        )
+        dy_block = tl.load(
+            DY_block_ptr, boundary_check=(0, 1), padding_option="zero"
+        ).to(tl.float32)
+
+        # Load rstd for all rows in this block
+        rows = start_row + tl.arange(0, BLOCK_N)
+        row_mask = rows < N
+        rstd = tl.load(Rstd + rows, row_mask, other=0.0)
+
+        # Expand dimensions for broadcasting
+        rstd = tl.expand_dims(rstd, 1)
+
+        # Compute dx
+        xhat = x_block * rstd
+
+        # Apply SILU backward if enabled
+        if SILU:
+            y_before_silu = xhat * w[None, :]
+            # pyre-fixme[16]
+            sig_y = fast_dividef(1.0, 1.0 + tl.exp(-y_before_silu))
+            # SILU derivative: sigmoid(y) + y * sigmoid(y) * (1 - sigmoid(y))
+            dy_block = dy_block * (sig_y + y_before_silu * sig_y * (1.0 - sig_y))
+
+        wdy = w[None, :] * dy_block
+
+        c1 = tl.sum(xhat * wdy, axis=1) / D
+        c1 = tl.expand_dims(c1, 1)
+        dx = (wdy - (xhat * c1)) * rstd
+
+        # Write dx
+        tl.store(DX_block_ptr, dx.to(DX.dtype.element_ty), boundary_check=(0, 1))
+
+        # Accumulate partial sums for dw
+        # Compute dw for all rows, then sum locally before atomic operation
+        partial_dw_block = dy_block * xhat
+        # Local reduction: sum across all rows in this block
+        partial_dw = tl.sum(partial_dw_block, axis=0)
+        acc_dw += partial_dw
+
+    DW_ptr = DW + cols
+    tl.atomic_add(DW_ptr, acc_dw, col_mask)
+
+
+@triton_autotune(
+    configs=_get_bwd_dwdb_configs(),
+    key=["D"],
+)
+@triton.jit
+def _rms_norm_bwd_dwdb(
+    DW,
+    FINAL_DW,
+    N,
+    D,
+    BLOCK_N: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    cols = pid * BLOCK_D + tl.arange(0, BLOCK_D)
+    dw = tl.zeros((BLOCK_N, BLOCK_D), dtype=tl.float32)
+
+    for i in range(0, N, BLOCK_N):
+        rows = i + tl.arange(0, BLOCK_N)
+        # pyre-fixme[16]: `int` has no attribute `__getitem__`.
+        mask = (rows[:, None] < N) & (cols[None, :] < D)
+        offs = rows[:, None] * D + cols[None, :]
+        dw += tl.load(DW + offs, mask=mask, other=0.0)
+
+    sum_dw = tl.sum(dw, axis=0)
+    tl.store(FINAL_DW + cols, sum_dw.to(FINAL_DW.dtype.element_ty), mask=cols < D)
+
+
+class RMSNormFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        eps: float,
+        silu: bool,
+    ) -> torch.Tensor:
+        assert x.dim() == 2
+        x = switch_to_contiguous_if_needed(x)
+        N, D = x.shape
+        assert weight.dim() == 1
+        assert weight.numel() == D
+
+        y = torch.empty_like(x)
+        rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+
+        # Less than 64KB per feature: enqueue fused kernel
+        MAX_FUSED_SIZE = 65536 // x.element_size()
+        BLOCK_D = min(MAX_FUSED_SIZE, triton.next_power_of_2(D))
+        if D > BLOCK_D:
+            raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+        ctx.save_for_backward(x, weight, rstd)
+        ctx.silu = silu
+        if N == 0:
+            return y
+
+        # pyre-ignore[28]
+        grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]),)  # noqa E731
+        _weighted_rms_norm_fwd[grid](
+            x,
+            y,
+            weight,
+            rstd,
+            N,
+            D,
+            eps,
+            x.stride(0),
+            y.stride(0),
+            SILU=silu,
+            BLOCK_D=BLOCK_D,
+        )
+
+        ctx.BLOCK_D = BLOCK_D
+        ctx.eps = eps
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], None, None]:
+        x, weight, rstd = ctx.saved_tensors
+        N, D = x.shape
+        dx = torch.empty_like(x)
+        dweight = torch.zeros((D,), dtype=weight.dtype, device=x.device)
+        if N == 0:
+            dweight.zero_()
+            return dx, dweight, None, None
+
+        sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+
+        # pyre-ignore[28]
+        grid = lambda meta: (  # noqa E731
+            max(1, min(sms * meta["SHARDS_PER_SM"], N // 4)),
+        )
+        _weighted_rms_norm_bwd[grid](
+            dx,
+            dy,
+            dweight,
+            x,
+            weight,
+            rstd,
+            dx.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            D,
+            ctx.eps,
+            N=N,
+            SILU=ctx.silu,
+            BLOCK_D=ctx.BLOCK_D,
+        )
+
+        return dx, dweight, None, None
+
+
+class SwishLayerNormFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        x: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+    ) -> torch.Tensor:
+        assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
+        x = switch_to_contiguous_if_needed(x)
+        N, D = x.shape
+
+        assert bias is not None and weight is not None
+        assert weight.dim() == 1
+        assert bias.dim() == 1
+        assert weight.numel() == D
+        assert bias.numel() == D
+
+        y = torch.empty_like(x)
+        mean = torch.empty((N,), dtype=torch.float32, device=x.device)
+        rstd = torch.empty((N,), dtype=torch.float32, device=x.device)
+
+        BLOCK_D = triton.next_power_of_2(D)
+        num_warps = min(max(BLOCK_D // 256, 1), 8)
+
+        ctx.save_for_backward(x, weight, bias, mean, rstd)
+        ctx.BLOCK_D = BLOCK_D
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        if N == 0:
+            return y
+
+        # pyre-ignore[28]
+        grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]),)  # noqa E731
+        _weighted_layer_norm_fwd[grid](
+            x,
+            y,
+            weight,
+            bias,
+            mean,
+            rstd,
+            N,
+            D,
+            eps,
+            x.stride(0),
+            y.stride(0),
+            IS_SWISH=True,
+            TRAINING=True,
+            BLOCK_D=BLOCK_D,
+            COMPUTE_MEAN_AND_RSTD=True,
+        )
+
+        return y
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, dy: torch.Tensor
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], None]:
+        x, weight, bias, mean, rstd = ctx.saved_tensors
+        N, D = x.shape
+        dx = torch.empty_like(x)
+        sms = torch.cuda.get_device_properties(x.device).multi_processor_count
+        tile_num = max(1, min(sms * 8, N // 4))
+        _dweight = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+        _dbias = torch.empty((tile_num, D), dtype=torch.float32, device=x.device)
+        dweight = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        dbias = torch.empty((D,), dtype=weight.dtype, device=x.device)
+        if N == 0:
+            dweight.zero_()
+            dbias.zero_()
+            return dx, dweight, dbias, None
+        # pyre-ignore[28]
+        _weighted_layer_norm_bwd_dx[(tile_num,)](
+            dx,
+            dy,
+            _dweight,
+            _dbias,
+            x,
+            weight,
+            bias,
+            mean,
+            rstd,
+            dx.stride(0),
+            dy.stride(0),
+            x.stride(0),
+            D,
+            ctx.eps,
+            IS_SWISH=True,
+            N=N,
+            BLOCK_D=ctx.BLOCK_D,
+        )
+
+        def grid(META):
+            return (triton.cdiv(D, META["BLOCK_D"]),)
+
+        blocks = triton.next_power_of_2(sms * 4)
+        BLOCK_D = triton.next_power_of_2(triton.cdiv(D, blocks))
+        BLOCK_D = min(max(BLOCK_D, 4), 128)
+        _layer_norm_bwd_dwdb[grid](
+            _dweight,
+            _dbias,
+            dweight,
+            dbias,
+            tile_num,
+            D,
+            BLOCK_D=BLOCK_D,
+        )
+
+        return dx, dweight, dbias, None
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_layer_norm(
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    eps: float,
+) -> torch.Tensor:
+    return LayerNormFunction.apply(x, weight, bias, eps)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_rms_norm(
+    x: torch.Tensor,
+    weight: Optional[torch.Tensor],
+    eps: float,
+    silu: bool = False,
+) -> torch.Tensor:
+    return RMSNormFunction.apply(x, weight, eps, silu)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_swish_layer_norm(
+    x: torch.Tensor,
+    normalized_shape: List[int],
+    weight: Optional[torch.Tensor],
+    bias: Optional[torch.Tensor],
+    eps: float,
+) -> torch.Tensor:
+    return SwishLayerNormFunction.apply(x, weight, bias, eps)
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_position.py b/recommendation_v4/generative_recommenders/ops/triton/triton_position.py
new file mode 100644
index 000000000..72c43f9ac
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_position.py
@@ -0,0 +1,438 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+#!/usr/bin/env python3
+
+
+from typing import List, Optional, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+
+try:
+    torch.ops.load_library("//hammer/ops/cuda:cuda_ops")
+except OSError:
+    pass
+
+from generative_recommenders.common import (
+    autotune_max_seq_len,
+    prev_power_of_2,
+    switch_to_contiguous_if_needed,
+    triton_autotune,
+)
+
+
+def _autotune_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK_N in [16, 32, 64]:
+        for num_stages in [1, 2]:
+            for num_warps in [2, 4, 8]:
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK_N": BLOCK_N,
+                        },
+                        num_stages=num_stages,
+                        num_warps=num_warps,
+                    )
+                )
+    return configs
+
+
+@triton_autotune(
+    configs=_autotune_configs(),
+    key=["AUTOTUNE_MAX_SEQ_LEN"],
+)
+@triton.jit
+def _add_timestamp_position_embeddings_kernel(
+    SeqEmb,
+    Offsets,
+    Lengths,
+    PosEmb,
+    TsEmb,
+    Out,
+    TS,
+    PosInds,
+    TsInds,
+    NumTargets,
+    AUTOTUNE_MAX_SEQ_LEN,
+    D,
+    num_time_buckets,
+    time_bucket_increments,
+    time_bucket_scale,
+    time_delta,
+    max_contextual_seq_len,
+    max_pos_ind,
+    stride_sn,
+    stride_pn,
+    stride_tn,
+    stride_on,
+    TRAINING: tl.constexpr,
+    HAS_MULTIPLE_TARGETS: tl.constexpr,
+    INTERLEAVE_TARGETS: tl.constexpr,
+    TIME_BUCKET_FN: tl.constexpr,
+    BLOCK_D: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    """
+    SeqEmb has shape (sum_B(N_i), D),
+    PosEmb has shape (N_p, D),
+    TsEmb has shape (N_t, D),
+    Out has shape (sum_B(N_i), D)
+    """
+
+    off_b = tl.program_id(0)
+    off_n = tl.program_id(1)
+    seq_start = tl.load(Offsets + off_b)
+    seq_end = tl.load(Offsets + off_b + 1)
+    seq_len = seq_end - seq_start
+    start_n = off_n * BLOCK_N
+    if start_n >= seq_len:
+        return
+    offs_n = start_n + tl.arange(0, BLOCK_N)
+    offs_d = tl.arange(0, BLOCK_D)
+    seq_emb_offsets = offs_n[:, None] * stride_sn + offs_d[None, :]
+    SeqEmb += seq_start.to(tl.int64) * stride_sn
+    mask_n = offs_n < seq_len
+    # position encoding
+    seq_len = tl.load(Lengths + off_b)
+    if HAS_MULTIPLE_TARGETS:
+        num_targets = tl.load(NumTargets + off_b)
+        if INTERLEAVE_TARGETS:
+            high_ind = seq_len - num_targets * 2
+        else:
+            high_ind = seq_len - num_targets
+    else:
+        high_ind = seq_len
+    pos_inds = tl.where(offs_n < high_ind, offs_n, high_ind)
+    pos_inds = high_ind - pos_inds + max_contextual_seq_len
+    pos_inds = tl.where(pos_inds < max_pos_ind - 1, pos_inds, max_pos_ind - 1)
+    pos_inds = tl.where(offs_n < max_contextual_seq_len, offs_n, pos_inds)
+    if TRAINING:
+        tl.store(PosInds + seq_start + offs_n, pos_inds, mask=mask_n)
+    pos_emb_offsets = pos_inds[:, None] * stride_pn + offs_d[None, :]
+    # timestamp encoding
+    ts = tl.load(TS + seq_start + offs_n, mask=mask_n)
+    query_time = tl.load(TS + seq_end - 1)
+    ts = query_time - ts + time_delta
+    ts = tl.where(ts > 1e-6, ts, 1e-6) / time_bucket_increments
+    if TIME_BUCKET_FN == "log":
+        ts = tl.log(ts)
+    else:
+        ts = tl.sqrt(ts)
+    ts = ts * time_bucket_scale
+    ts = ts.to(tl.int32)
+    ts = tl.where(ts > 0, ts, 0)
+    ts = tl.where(ts < num_time_buckets, ts, num_time_buckets)
+    if TRAINING:
+        tl.store(TsInds + seq_start + offs_n, ts, mask=mask_n)
+    ts_emb_offsets = ts[:, None] * stride_tn + offs_d[None, :]
+    Out += seq_start.to(tl.int64) * stride_on
+    out_offsets = Out + offs_n[:, None] * stride_on + offs_d[None, :]
+    for _d in range(0, D, BLOCK_D):
+        mask = (offs_n[:, None] < seq_len) and offs_d[None, :] < D
+        seq_emb = tl.load(SeqEmb + seq_emb_offsets, mask=mask)
+        pos_emb = tl.load(PosEmb + pos_emb_offsets, mask=mask)
+        ts_emb = tl.load(TsEmb + ts_emb_offsets, mask=mask)
+        tl.store(out_offsets, seq_emb + (pos_emb + ts_emb).to(seq_emb.dtype), mask=mask)
+        seq_emb_offsets += BLOCK_D
+        pos_emb_offsets += BLOCK_D
+        ts_emb_offsets += BLOCK_D
+        out_offsets += BLOCK_D
+        offs_d += BLOCK_D
+
+
+def bwd_pre_hook(nargs):
+    nargs["Out"].zero_()
+
+
+def _add_embeddings_bwd_configs() -> List[triton.Config]:
+    configs = []
+    for BLOCK in [32, 64, 128]:
+        for num_stages in [2, 3, 4]:
+            for num_warps in [2, 4, 8]:
+                configs.append(
+                    triton.Config(
+                        {
+                            "BLOCK": BLOCK,
+                        },
+                        num_stages=num_stages,
+                        num_warps=num_warps,
+                        pre_hook=bwd_pre_hook,
+                    )
+                )
+    return configs
+
+
+@triton_autotune(
+    configs=_add_embeddings_bwd_configs(),
+    key=["AUTOTUNE_MAX_SEQ_LEN", "AUTOTUNE_B", "D"],
+)
+@triton.jit
+def _add_embeddings_bwd_kernel(
+    In,
+    KeyInds,
+    ValueInds,
+    Out,
+    AUTOTUNE_MAX_SEQ_LEN,
+    AUTOTUNE_B,
+    D,
+    jagged_size,
+    stride_in,
+    stride_on,
+    BLOCK_D: tl.constexpr,
+    BLOCK: tl.constexpr,
+):
+    off_block = tl.program_id(0)
+    offs_d = tl.arange(0, BLOCK_D)
+    mask_d = offs_d < D
+    key_ind = -1
+    key_ind = key_ind.to(KeyInds.dtype.element_ty)  # pyre-ignore[16]
+    accumulator = tl.zeros((BLOCK_D,), dtype=In.dtype.element_ty)
+    for off_i in range(0, BLOCK):
+        off = off_block * BLOCK + off_i
+        if off < jagged_size:
+            value_ind = tl.load(ValueInds + off)
+            in_offset = In + value_ind.to(tl.int64) * stride_in
+            jagged_in = tl.load(in_offset + offs_d, mask=mask_d)
+            key_ind_new = tl.load(KeyInds + off)
+            if key_ind == key_ind_new:
+                accumulator += jagged_in
+            else:
+                if key_ind >= 0:
+                    out_offset = Out + key_ind.to(tl.int64) * stride_on
+                    tl.atomic_add(
+                        out_offset + offs_d,
+                        accumulator.to(Out.dtype.element_ty),
+                        mask=mask_d,
+                        sem="relaxed",
+                    )
+                key_ind = key_ind_new
+                accumulator = jagged_in
+    if key_ind >= 0:
+        out_offset = Out + key_ind.to(tl.int64) * stride_on
+        tl.atomic_add(
+            out_offset + offs_d,
+            accumulator.to(Out.dtype.element_ty),
+            mask=mask_d,
+            sem="relaxed",
+        )
+
+
+class _AddTimestampPositionEmbeddingsFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]
+    def forward(
+        ctx,
+        seq_embeddings: torch.Tensor,
+        seq_offsets: torch.Tensor,
+        pos_embeddings: torch.Tensor,
+        ts_embeddings: torch.Tensor,
+        timestamps: torch.Tensor,
+        max_seq_len: int,
+        max_contextual_seq_len: int,
+        seq_lengths: torch.Tensor,
+        num_targets: Optional[torch.Tensor],
+        interleave_targets: bool,
+        time_bucket_fn: str,
+    ):
+        seq_embeddings = switch_to_contiguous_if_needed(seq_embeddings)
+        pos_embeddings = switch_to_contiguous_if_needed(pos_embeddings)
+        ts_embeddings = switch_to_contiguous_if_needed(ts_embeddings)
+
+        max_pos_ind = pos_embeddings.shape[0]
+        B = seq_lengths.shape[0]
+        N, D = seq_embeddings.shape
+        assert len(pos_embeddings.shape) == 2
+        assert len(ts_embeddings.shape) == 2
+        assert pos_embeddings.shape[1] == D, (
+            "shape[1] of pos_embeddings much match seq_embeddings"
+        )
+        assert ts_embeddings.shape[1] == D, (
+            "shape[1] of ts_embeddings much match seq_embeddings"
+        )
+        out = torch.empty_like(seq_embeddings)
+
+        timestamps = switch_to_contiguous_if_needed(timestamps)
+        ts_inds = torch.empty_like(seq_embeddings[:, 0], dtype=torch.int32)
+        pos_inds = torch.empty_like(seq_embeddings[:, 0], dtype=torch.int32)
+        ts_emb_size = ts_embeddings.shape[0]
+
+        grid = lambda meta: (  # noqa E731
+            B,
+            triton.cdiv(max_seq_len, meta["BLOCK_N"]),
+        )
+        BLOCK_D = triton.next_power_of_2(D) if D < 64 else 64
+        _add_timestamp_position_embeddings_kernel[grid](
+            SeqEmb=seq_embeddings,
+            Offsets=seq_offsets,
+            Lengths=seq_lengths,
+            PosEmb=pos_embeddings,
+            TsEmb=ts_embeddings,
+            Out=out,
+            TS=timestamps,
+            PosInds=pos_inds,
+            TsInds=ts_inds,
+            NumTargets=num_targets,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(max_seq_len),
+            D=D,
+            num_time_buckets=ts_emb_size - 1,
+            time_bucket_increments=60.0,
+            time_bucket_scale=1.0,
+            time_delta=0,
+            max_contextual_seq_len=max_contextual_seq_len,
+            max_pos_ind=max_pos_ind,
+            stride_sn=seq_embeddings.stride(0),
+            stride_pn=pos_embeddings.stride(0),
+            stride_tn=ts_embeddings.stride(0),
+            stride_on=out.stride(0),
+            TRAINING=True,
+            HAS_MULTIPLE_TARGETS=num_targets is not None,
+            INTERLEAVE_TARGETS=interleave_targets,
+            TIME_BUCKET_FN=time_bucket_fn,
+            BLOCK_D=BLOCK_D,
+        )
+        try:
+            values = torch.arange(0, N, dtype=torch.int32, device=timestamps.device)
+            sorted_ts_key_inds, sorted_ts_value_inds = torch.ops.hammer.sort_kv_pairs(
+                ts_inds, values
+            )
+            sorted_pos_key_inds, sorted_pos_value_inds = torch.ops.hammer.sort_kv_pairs(
+                pos_inds, values
+            )
+        except Exception:
+            sorted_ts_key_inds, sorted_ts_value_inds = torch.sort(ts_inds)
+            sorted_pos_key_inds, sorted_pos_value_inds = torch.sort(pos_inds)
+        ctx.save_for_backward(
+            sorted_pos_key_inds,
+            sorted_pos_value_inds,
+            sorted_ts_key_inds,
+            sorted_ts_value_inds,
+        )
+        ctx.B = B
+        ctx.D = D
+        ctx.max_seq_len = max_seq_len
+        ctx.pos_emb_size = pos_embeddings.shape[0]
+        ctx.ts_emb_size = ts_emb_size
+        ctx.pos_dtype = pos_embeddings.dtype
+        ctx.ts_dtype = ts_embeddings.dtype
+        return out
+
+    @staticmethod
+    # pyre-ignore[14]
+    def backward(
+        ctx, d_out: torch.Tensor
+    ) -> Tuple[
+        torch.Tensor,
+        None,
+        torch.Tensor,
+        torch.Tensor,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+    ]:
+        (
+            sorted_pos_key_inds,
+            sorted_pos_value_inds,
+            sorted_ts_key_inds,
+            sorted_ts_value_inds,
+        ) = ctx.saved_tensors
+        d_pos_embeddings = torch.empty(
+            (ctx.pos_emb_size, ctx.D), device=d_out.device, dtype=torch.float32
+        )
+        d_ts_embeddings = torch.empty(
+            (ctx.ts_emb_size, ctx.D), device=d_out.device, dtype=torch.float32
+        )
+        grid = lambda meta: (triton.cdiv(d_out.shape[0], meta["BLOCK"]),)  # noqa E731
+        AUTOTUNE_B = prev_power_of_2(ctx.B)
+        _add_embeddings_bwd_kernel[grid](
+            In=d_out,
+            KeyInds=sorted_pos_key_inds,
+            ValueInds=sorted_pos_value_inds,
+            Out=d_pos_embeddings,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(ctx.max_seq_len),
+            AUTOTUNE_B=AUTOTUNE_B,
+            D=ctx.D,
+            jagged_size=d_out.shape[0],
+            stride_in=d_out.stride(0),
+            stride_on=d_pos_embeddings.stride(0),
+            BLOCK_D=triton.next_power_of_2(ctx.D),
+        )
+        _add_embeddings_bwd_kernel[grid](
+            In=d_out,
+            KeyInds=sorted_ts_key_inds,
+            ValueInds=sorted_ts_value_inds,
+            Out=d_ts_embeddings,
+            AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(ctx.max_seq_len),
+            AUTOTUNE_B=AUTOTUNE_B,
+            D=ctx.D,
+            jagged_size=d_out.shape[0],
+            stride_in=d_out.stride(0),
+            stride_on=d_ts_embeddings.stride(0),
+            BLOCK_D=triton.next_power_of_2(ctx.D),
+        )
+        return (
+            d_out,
+            None,
+            d_pos_embeddings.to(ctx.pos_dtype),
+            d_ts_embeddings.to(ctx.ts_dtype),
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+        )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def triton_add_timestamp_positional_embeddings(
+    seq_embeddings: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    pos_embeddings: torch.Tensor,
+    ts_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str,
+) -> torch.Tensor:
+    return _AddTimestampPositionEmbeddingsFunction.apply(
+        seq_embeddings,
+        seq_offsets,
+        pos_embeddings,
+        ts_embeddings,
+        timestamps,
+        max_seq_len,
+        max_contextual_seq_len,
+        seq_lengths,
+        num_targets,
+        interleave_targets,
+        time_bucket_fn,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_swiglu.py b/recommendation_v4/generative_recommenders/ops/triton/triton_swiglu.py
new file mode 100644
index 000000000..5f30da53b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_swiglu.py
@@ -0,0 +1,753 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-unsafe
+
+from typing import List
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import triton_autotune
+from generative_recommenders.ops.utils import is_sm100_plus
+
+TMA_AVAILABLE = False
+try:
+    # @manual=//triton:triton
+    from triton.tools.tensor_descriptor import TensorDescriptor
+
+    TMA_AVAILABLE = True
+except ImportError:
+    pass
+
+HAS_TLX = False
+try:
+    # @manual=//triton:triton
+    import triton.language.extra.tlx as tlx  # type: ignore
+
+    HAS_TLX = True
+except ImportError:
+    pass
+
+
+def is_blackwell_triton_swiglu_supported() -> bool:
+    return is_sm100_plus() and TMA_AVAILABLE and HAS_TLX
+
+
+def _swiglu_tma_set_block_size_hook(nargs) -> None:
+    BLOCK_M = nargs["BLOCK_M"]
+    BLOCK_N = nargs["BLOCK_N"]
+    BLOCK_K = nargs["BLOCK_K"]
+    EPILOGUE_SUBTILE = nargs.get("EPILOGUE_SUBTILE", 1)
+
+    nargs["x_desc"].block_shape = [BLOCK_M, BLOCK_K]
+    nargs["w_gate_desc"].block_shape = [BLOCK_N, BLOCK_K]
+    nargs["w_up_desc"].block_shape = [BLOCK_N, BLOCK_K]
+    nargs["out_desc"].block_shape = [BLOCK_M, BLOCK_N // EPILOGUE_SUBTILE]
+
+
+def get_swiglu_configs(pre_hook) -> List[triton.Config]:
+    return [
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 128,
+                "BLOCK_K": 64,
+                "GROUP_M": 8,
+            },
+            num_stages=1,
+            num_warps=4,
+            pre_hook=pre_hook,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 128,
+                "BLOCK_K": 64,
+                "GROUP_M": 8,
+            },
+            num_stages=1,
+            num_warps=4,
+            pre_hook=pre_hook,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "BLOCK_K": 64,
+                "GROUP_M": 8,
+            },
+            num_stages=1,
+            num_warps=4,
+            pre_hook=pre_hook,
+        ),
+    ]
+
+
+@triton.jit
+def _compute_pid_swiglu(
+    tile_id,
+    num_pid_in_group,
+    num_pid_m,
+    GROUP_M: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+):
+    group_id = tile_id // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+    pid_m = first_pid_m + (tile_id % group_size_m)
+    pid_n = (tile_id % num_pid_in_group) // group_size_m
+    return pid_m, pid_n
+
+
+@triton_autotune(
+    configs=get_swiglu_configs(pre_hook=_swiglu_tma_set_block_size_hook),
+    key=["M_BLOCK", "N", "K"],
+)
+@triton.jit
+def _swiglu_fwd_tma_ws_persistent(
+    x_desc,
+    w_gate_desc,
+    w_up_desc,
+    out_desc,
+    M,
+    N,
+    K,
+    M_BLOCK,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    NUM_SMEM_BUFFERS: tl.constexpr,
+    NUM_TMEM_BUFFERS: tl.constexpr,
+    NUM_SMS: tl.constexpr,
+    EPILOGUE_SUBTILE: tl.constexpr,
+):
+    # Allocate SMEM buffers
+    x_buffers = tlx.local_alloc((BLOCK_M, BLOCK_K), x_desc.dtype, NUM_SMEM_BUFFERS)
+
+    # Allocate SMEM buffers for W_gate and W_up
+    w_gate_buffers = tlx.local_alloc(
+        (BLOCK_N, BLOCK_K), w_gate_desc.dtype, NUM_SMEM_BUFFERS
+    )
+    w_up_buffers = tlx.local_alloc(
+        (BLOCK_N, BLOCK_K), w_up_desc.dtype, NUM_SMEM_BUFFERS
+    )
+
+    # Allocate TMEM for accumulators
+    tmem_gate_buffers = tlx.local_alloc(
+        (BLOCK_M, BLOCK_N), tl.float32, NUM_TMEM_BUFFERS, tlx.storage_kind.tmem
+    )
+    tmem_up_buffers = tlx.local_alloc(
+        (BLOCK_M, BLOCK_N), tl.float32, NUM_TMEM_BUFFERS, tlx.storage_kind.tmem
+    )
+
+    # Barriers for Producer <-> MMA synchronization
+    smem_full_bars_x_gate = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS,
+        arrive_count=1,  # pyre-ignore[6]
+    )
+    smem_full_bars_up = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS,
+        arrive_count=1,  # pyre-ignore[6]
+    )
+    # Empty barriers: arrive_count=2 because both GEMM1 and GEMM2 signal completion
+    smem_empty_bars = tlx.alloc_barriers(
+        num_barriers=NUM_SMEM_BUFFERS,
+        arrive_count=2,  # pyre-ignore[6]
+    )
+
+    # Barriers for MMA <-> Epilogue synchronization
+    # pyre-ignore[6]
+    tmem_full_bars = tlx.alloc_barriers(num_barriers=NUM_TMEM_BUFFERS, arrive_count=1)
+    # pyre-ignore[6]
+    tmem_empty_bars = tlx.alloc_barriers(num_barriers=NUM_TMEM_BUFFERS, arrive_count=1)
+
+    with tlx.async_tasks():
+        # Epilogue Consumer: Reads from TMEM, applies SwiGLU, and stores to output
+        with tlx.async_task("default"):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+
+            # Initialize buffer tracking
+            processed_tiles = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid_swiglu(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+                offs_m = pid_m * BLOCK_M
+                offs_n = pid_n * BLOCK_N
+
+                cur_tmem_buf = processed_tiles % int(NUM_TMEM_BUFFERS)
+                tmem_read_phase = (processed_tiles // int(NUM_TMEM_BUFFERS)) & 1
+
+                # Wait for MMA to finish writing to TMEM
+                # pyre-ignore[16]
+                tlx.barrier_wait(tmem_full_bars[cur_tmem_buf], tmem_read_phase)
+
+                # Load gate and up results from TMEM
+                # pyre-ignore[16]
+                gate_tmem = tmem_gate_buffers[cur_tmem_buf]
+                up_tmem = tmem_up_buffers[cur_tmem_buf]
+
+                if EPILOGUE_SUBTILE > 1:
+                    # Process tile in subtiles
+                    slice_size: tl.constexpr = BLOCK_N // EPILOGUE_SUBTILE
+                    for slice_id in tl.static_range(EPILOGUE_SUBTILE):
+                        gate_subslice = tlx.local_slice(
+                            gate_tmem,
+                            [0, slice_id * slice_size],
+                            # pyre-ignore[6]
+                            [BLOCK_M, slice_size],
+                        )
+                        up_subslice = tlx.local_slice(
+                            up_tmem,
+                            [0, slice_id * slice_size],
+                            # pyre-ignore[6]
+                            [BLOCK_M, slice_size],
+                        )
+
+                        gate = tlx.local_load(gate_subslice).to(out_desc.dtype)
+                        up = tlx.local_load(up_subslice).to(out_desc.dtype)
+
+                        gate_fp32 = gate.to(tl.float32)
+                        silu_gate = (gate_fp32 * tl.sigmoid(gate_fp32)).to(
+                            out_desc.dtype
+                        )
+                        result = silu_gate * up
+
+                        out_desc.store([offs_m, offs_n + slice_id * slice_size], result)
+                else:
+                    # Process full tile
+                    gate = tlx.local_load(gate_tmem).to(out_desc.dtype)
+                    up = tlx.local_load(up_tmem).to(out_desc.dtype)
+
+                    gate_fp32 = gate.to(tl.float32)
+                    silu_gate = (gate_fp32 * tl.sigmoid(gate_fp32)).to(out_desc.dtype)
+                    result = silu_gate * up
+
+                    out_desc.store([offs_m, offs_n], result)
+
+                # Signal MMA that TMEM buffer is free
+                # pyre-ignore[6]
+                tlx.barrier_arrive(tmem_empty_bars[cur_tmem_buf], 1)
+
+                processed_tiles += 1
+
+        # MMA Consumer: Computes both GEMMs: gate = X @ W_gate, up = X @ W_up
+        with tlx.async_task(num_warps=4, num_regs=232):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            processed_k_iters = 0
+            processed_tiles = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid_swiglu(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+
+                cur_tmem_buf = processed_tiles % int(NUM_TMEM_BUFFERS)
+                tmem_write_phase = (processed_tiles // int(NUM_TMEM_BUFFERS)) & 1
+
+                # Wait for epilogue to finish
+                tlx.barrier_wait(tmem_empty_bars[cur_tmem_buf], tmem_write_phase ^ 1)
+
+                # Perform K-dimension reduction for both GEMMs
+                for k in range(0, k_tiles):
+                    buf = (processed_k_iters + k) % int(NUM_SMEM_BUFFERS)
+
+                    total_iters = processed_k_iters + k
+                    dot_phase = (total_iters // int(NUM_SMEM_BUFFERS)) & 1
+
+                    # Wait for x and w_gate to be loaded, then start GEMM1
+                    tlx.barrier_wait(smem_full_bars_x_gate[buf], dot_phase)
+
+                    # Transpose weight buffer for MMA
+                    w_gate_trans = tlx.local_trans(w_gate_buffers[buf])
+
+                    # GEMM 1: gate = X @ W_gate.T
+                    tlx.async_dot(
+                        x_buffers[buf],
+                        w_gate_trans,
+                        tmem_gate_buffers[cur_tmem_buf],
+                        # pyre-ignore[6]
+                        use_acc=(k > 0),
+                        mBarriers=[smem_empty_bars[buf]],
+                        out_dtype=tl.float32,
+                    )
+
+                    # Wait for w_up to be loaded before starting GEMM2
+                    tlx.barrier_wait(smem_full_bars_up[buf], dot_phase)
+
+                    w_up_trans = tlx.local_trans(w_up_buffers[buf])
+
+                    # GEMM 2: up = X @ W_up.T
+                    tlx.async_dot(
+                        x_buffers[buf],
+                        w_up_trans,
+                        tmem_up_buffers[cur_tmem_buf],
+                        # pyre-ignore[6]
+                        use_acc=(k > 0),
+                        mBarriers=[smem_empty_bars[buf]],
+                        out_dtype=tl.float32,
+                    )
+
+                # Wait for last MMA to complete
+                last_buf = (processed_k_iters + k_tiles - 1) % int(NUM_SMEM_BUFFERS)
+                last_total_iters = processed_k_iters + k_tiles - 1
+                last_dot_phase = (last_total_iters // int(NUM_SMEM_BUFFERS)) & 1
+                tlx.barrier_wait(smem_empty_bars[last_buf], last_dot_phase)
+
+                # Signal epilogue that results are ready
+                # pyre-ignore[6]
+                tlx.barrier_arrive(tmem_full_bars[cur_tmem_buf], 1)
+
+                processed_tiles += 1
+                processed_k_iters += k_tiles
+
+        # Producer: TMA loads for X, W_gate, W_up
+        with tlx.async_task(num_warps=1, num_regs=24):
+            start_pid = tl.program_id(axis=0)
+            num_pid_m = tl.cdiv(M, BLOCK_M)
+            num_pid_n = tl.cdiv(N, BLOCK_N)
+            num_pid_in_group = GROUP_M * num_pid_n
+            num_tiles = num_pid_m * num_pid_n
+            k_tiles = tl.cdiv(K, BLOCK_K)
+
+            # Initialize phase tracking
+            processed_k_iters = 0
+
+            for tile_id in range(start_pid, num_tiles, NUM_SMS):
+                pid_m, pid_n = _compute_pid_swiglu(
+                    tile_id, num_pid_in_group, num_pid_m, GROUP_M, NUM_SMS
+                )
+                offs_m = pid_m * BLOCK_M
+                offs_n = pid_n * BLOCK_N
+
+                for k in range(0, k_tiles):
+                    buf = (processed_k_iters + k) % int(NUM_SMEM_BUFFERS)
+
+                    total_iters = processed_k_iters + k
+                    load_phase = (total_iters // int(NUM_SMEM_BUFFERS)) & 1
+
+                    # Wait for buffer to be free
+                    tlx.barrier_wait(smem_empty_bars[buf], load_phase ^ 1)
+
+                    offs_k = k * BLOCK_K
+
+                    # Set expected bytes for x+w_gate barrier
+                    tlx.barrier_expect_bytes(
+                        smem_full_bars_x_gate[buf],
+                        # pyre-ignore[6]
+                        2 * (BLOCK_M * BLOCK_K + BLOCK_K * BLOCK_N),
+                    )
+
+                    # Set expected bytes for w_up barrier
+                    tlx.barrier_expect_bytes(
+                        smem_full_bars_up[buf],
+                        # pyre-ignore[6]
+                        2 * (BLOCK_K * BLOCK_N),
+                    )
+
+                    # Load x and w_gate first, signal smem_full_bars_x_gate
+                    tlx.async_descriptor_load(
+                        x_desc,
+                        x_buffers[buf],
+                        [offs_m, offs_k],
+                        smem_full_bars_x_gate[buf],
+                    )
+
+                    # Weights are in [N, K] layout, load with [offs_n, offs_k]
+                    tlx.async_descriptor_load(
+                        w_gate_desc,
+                        w_gate_buffers[buf],
+                        [offs_n, offs_k],
+                        smem_full_bars_x_gate[buf],
+                    )
+
+                    # Load w_up separately, signal smem_full_bars_up
+                    tlx.async_descriptor_load(
+                        w_up_desc,
+                        w_up_buffers[buf],
+                        [offs_n, offs_k],
+                        smem_full_bars_up[buf],
+                    )
+
+                processed_k_iters += k_tiles
+
+
+@torch.fx.wrap
+def triton_swiglu_fwd_tma_ws_persistent_tlx(
+    x: torch.Tensor,
+    w_gate: torch.Tensor,
+    w_up: torch.Tensor,
+) -> torch.Tensor:
+    M, K = x.shape
+    N, K_gate = w_gate.shape
+    N_up, K_up = w_up.shape
+
+    # Only bf16/fp16 supported by the kernel
+    supported_dtypes = (torch.bfloat16, torch.float16)
+    assert x.dtype in supported_dtypes, (
+        f"x.dtype must be bfloat16 or float16, got {x.dtype}"
+    )
+    assert w_gate.dtype in supported_dtypes, (
+        f"w_gate.dtype must be bfloat16 or float16, got {w_gate.dtype}"
+    )
+    assert w_up.dtype in supported_dtypes, (
+        f"w_up.dtype must be bfloat16 or float16, got {w_up.dtype}"
+    )
+
+    assert K == K_gate, f"Incompatible dimensions: x.K={K}, w_gate.K={K_gate}"
+    assert K == K_up, f"Incompatible dimensions: x.K={K}, w_up.K={K_up}"
+    assert N == N_up, f"Incompatible dimensions: w_gate.N={N}, w_up.N={N_up}"
+
+    # Allocate output
+    out = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return out
+
+    M_BLOCK = triton.next_power_of_2(M)
+
+    NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
+
+    # A dummy block value that will be overwritten by the hook
+    dummy_block = [1, 1]
+
+    # pyre-ignore[6]
+    x_desc = TensorDescriptor(x, x.shape, x.stride(), dummy_block)
+    # pyre-ignore[6]
+    w_gate_desc = TensorDescriptor(w_gate, w_gate.shape, w_gate.stride(), dummy_block)
+    # pyre-ignore[6]
+    w_up_desc = TensorDescriptor(w_up, w_up.shape, w_up.stride(), dummy_block)
+    # pyre-ignore[6]
+    out_desc = TensorDescriptor(out, out.shape, out.stride(), dummy_block)
+
+    def grid(meta):
+        BLOCK_M = meta["BLOCK_M"]
+        BLOCK_N = meta["BLOCK_N"]
+        return (
+            min(
+                NUM_SMS,
+                triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N),
+            ),
+        )
+
+    _swiglu_fwd_tma_ws_persistent[grid](
+        x_desc,
+        w_gate_desc,
+        w_up_desc,
+        out_desc,
+        M,
+        N,
+        K,
+        M_BLOCK,
+        NUM_SMS=NUM_SMS,
+        NUM_SMEM_BUFFERS=4,
+        NUM_TMEM_BUFFERS=2,
+        EPILOGUE_SUBTILE=2,
+    )
+    return out
+
+
+# =============================================================================
+# Standard fused SwiGLU kernel for A100/H100 (non-TLX path).
+#
+# Fuses silu(x @ W_gate^T) * (x @ W_up^T) into a single kernel launch.
+# Uses standard Triton pointer arithmetic (no TMA), works on SM80+.
+#
+# Key optimization: x is loaded from HBM ONCE and reused for both GEMMs.
+# Activation (silu * up) is computed in float32 registers, no HBM round-trip.
+#
+# Weight layout: expects [N, K] (nn.Linear native format).
+# The wrapper transposes to [K, N] for the GEMM internally.
+# =============================================================================
+
+
+def _get_swiglu_fwd_configs() -> List[triton.Config]:
+    """
+    Autotune configs for the standard (non-TLX) fused SwiGLU kernel.
+
+    Two float32 accumulators (gate + up) double register pressure vs single
+    GEMM, so smaller block sizes are included.
+    """
+    configs = [
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8},
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {"BLOCK_M": 32, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 32, "BLOCK_K": 32, "GROUP_M": 8},
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 64, "GROUP_M": 8},
+            num_stages=3,
+            num_warps=8,
+        ),
+    ]
+    if torch.version.hip:
+        hip_num_stages = 2 if triton.__version__ >= "3.2.0" else 0
+        configs.extend(
+            [
+                triton.Config(
+                    {"BLOCK_M": 128, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_M": 8},
+                    num_stages=hip_num_stages,
+                    num_warps=4,
+                ),
+                triton.Config(
+                    {"BLOCK_M": 64, "BLOCK_N": 128, "BLOCK_K": 32, "GROUP_M": 8},
+                    num_stages=hip_num_stages,
+                    num_warps=4,
+                ),
+                triton.Config(
+                    {"BLOCK_M": 128, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+                    num_stages=hip_num_stages,
+                    num_warps=4,
+                ),
+                triton.Config(
+                    {"BLOCK_M": 64, "BLOCK_N": 64, "BLOCK_K": 32, "GROUP_M": 8},
+                    num_stages=hip_num_stages,
+                    num_warps=4,
+                ),
+            ]
+        )
+    return configs
+
+
+@triton_autotune(
+    configs=_get_swiglu_fwd_configs(),
+    key=["M_BLOCK", "N", "K"],
+)
+@triton.jit
+def _swiglu_fwd_kernel(
+    # Pointers to input/output tensors
+    x_ptr,  # [M, K] input activation
+    w_gate_ptr,  # [K, N] gate weight (already transposed from [N, K])
+    w_up_ptr,  # [K, N] up weight (already transposed from [N, K])
+    out_ptr,  # [M, N] output = silu(x @ w_gate) * (x @ w_up)
+    # Matrix dimensions
+    M,  # rows in x (batch_size * seq_len)
+    N,  # output dimension (hidden_dim)
+    K,  # input/reduction dimension (input_dim)
+    M_BLOCK,  # next_power_of_2(M) for stable autotuning
+    # Strides
+    stride_xm,
+    stride_xk,
+    stride_wgk,
+    stride_wgn,
+    stride_wuk,
+    stride_wun,
+    stride_om,
+    stride_on,
+    # Compile-time constants
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+):
+    """
+    Fused SwiGLU forward: out = silu(x @ W_gate) * (x @ W_up).
+
+    Each thread block computes one [BLOCK_M, BLOCK_N] output tile.
+    Two accumulators share the same x tile loads (the fusion benefit).
+    """
+    # -- Step 1: Compute tile coordinates with grouped ordering (L2 reuse) --
+    pid = tl.program_id(axis=0)
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+    pid_in_group = pid % num_pid_in_group
+    pid_m = first_pid_m + (pid_in_group % group_size_m)
+    pid_n = pid_in_group // group_size_m
+
+    # -- Step 2: Set up pointers for x, w_gate, w_up tiles --
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_n = tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_K)
+    mask_m = (pid_m * BLOCK_M + offs_m)[:, None] < M
+    mask_n = (pid_n * BLOCK_N + offs_n)[None, :] < N
+    # [BLOCK_M, BLOCK_K]
+    x_ptrs = (
+        x_ptr
+        + (pid_m.to(tl.int64) * BLOCK_M + offs_m)[:, None] * stride_xm
+        + offs_k[None, :] * stride_xk
+    )
+    # [BLOCK_K, BLOCK_N]
+    wg_ptrs = (
+        w_gate_ptr
+        + offs_k[:, None] * stride_wgk
+        + (pid_n.to(tl.int64) * BLOCK_N + offs_n)[None, :] * stride_wgn
+    )
+
+    # [BLOCK_K, BLOCK_N]
+    wu_ptrs = (
+        w_up_ptr
+        + offs_k[:, None] * stride_wuk
+        + (pid_n.to(tl.int64) * BLOCK_N + offs_n)[None, :] * stride_wun
+    )
+
+    # -- Step 3: K-loop - two GEMMs sharing the same x tile --
+    acc_gate = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    acc_up = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        mask_k = offs_k[None, :] < K - k * BLOCK_K
+        x = tl.load(x_ptrs, mask=mask_m & mask_k, other=0.0)
+        mask_k = offs_k[:, None] < K - k * BLOCK_K
+        wg = tl.load(wg_ptrs, mask=mask_k & mask_n, other=0.0)
+        wu = tl.load(wu_ptrs, mask=mask_k & mask_n, other=0.0)
+
+        acc_gate += tl.dot(x, wg, allow_tf32=ALLOW_TF32)
+        acc_up += tl.dot(x, wu, allow_tf32=ALLOW_TF32)
+
+        x_ptrs += BLOCK_K * stride_xk
+        wg_ptrs += BLOCK_K * stride_wgk
+        wu_ptrs += BLOCK_K * stride_wuk
+
+    # -- Step 4: Apply SwiGLU activation in registers (no HBM round-trip) --
+    gate_activated = acc_gate * tl.sigmoid(acc_gate)  # silu
+    result = (gate_activated * acc_up).to(out_ptr.dtype.element_ty)
+
+    # -- Step 5: Store result --
+    offs_m = pid_m * BLOCK_M + offs_m
+    offs_n = pid_n * BLOCK_N + offs_n
+    out_ptrs = out_ptr + offs_m[:, None] * stride_om + offs_n[None, :] * stride_on
+    tl.store(out_ptrs, result, mask=mask_m & mask_n)
+
+
+def triton_swiglu_fwd(
+    x: torch.Tensor,
+    w_gate: torch.Tensor,
+    w_up: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Forward pass of fused SwiGLU (non-TLX path). Works on A100/H100/MI300X.
+
+    Computes: silu(x @ w_gate^T) * (x @ w_up^T)
+
+    Args:
+        x: [M, K] input tensor
+        w_gate: [N, K] gate weight (nn.Linear format)
+        w_up: [N, K] up weight (nn.Linear format)
+
+    Returns:
+        [M, N] output tensor
+    """
+    M, K = x.shape
+    N, K_gate = w_gate.shape
+    N_up, K_up = w_up.shape
+    assert K == K_gate, f"x.K={K} != w_gate.K={K_gate}"
+    assert K == K_up, f"x.K={K} != w_up.K={K_up}"
+    assert N == N_up, f"w_gate.N={N} != w_up.N={N_up}"
+
+    out = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return out
+
+    M_BLOCK = triton.next_power_of_2(M)
+
+    # Transpose weights from [N, K] to [K, N] for the GEMM kernel
+    w_gate_t = w_gate.t().contiguous()
+    w_up_t = w_up.t().contiguous()
+
+    grid = lambda meta: (  # noqa E731
+        triton.cdiv(M, meta["BLOCK_M"]) * triton.cdiv(N, meta["BLOCK_N"]),
+    )
+
+    _swiglu_fwd_kernel[grid](
+        x,
+        w_gate_t,
+        w_up_t,
+        out,
+        M,
+        N,
+        K,
+        M_BLOCK,
+        x.stride(0),
+        x.stride(1),
+        w_gate_t.stride(0),
+        w_gate_t.stride(1),
+        w_up_t.stride(0),
+        w_up_t.stride(1),
+        out.stride(0),
+        out.stride(1),
+        ALLOW_TF32=torch.backends.cuda.matmul.allow_tf32,
+    )
+    return out
+
+
+def triton_swiglu(
+    x: torch.Tensor,
+    w_gate: torch.Tensor,
+    w_up: torch.Tensor,
+) -> torch.Tensor:
+    if is_sm100_plus() and TMA_AVAILABLE and HAS_TLX:
+        # Blackwell: use the fast TLX persistent kernel with TMA
+        _, K = x.shape
+        N, _ = w_gate.shape
+        assert K % 16 == 0 and N % 16 == 0, (
+            f"K ({K}) and N ({N}) must be divisible by 16 for TMA alignment"
+        )
+        return triton_swiglu_fwd_tma_ws_persistent_tlx(x, w_gate, w_up)
+    else:
+        # A100/H100: use the standard fused kernel
+        return triton_swiglu_fwd(x, w_gate, w_up)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/README.md b/recommendation_v4/generative_recommenders/ops/triton_aot/README.md
new file mode 100644
index 000000000..2b0b1a834
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/README.md
@@ -0,0 +1,54 @@
+# Local Triton AOT Support
+
+This package is a minimal local copy of the Triton AOT pieces needed by the
+DLRM v3 HSTU inference end-to-end test. It avoids depending on the standalone
+`fbcode/triton_aot` package while preserving the compile, transform, and
+runtime-loading flow used by `generative_recommenders`.
+
+This is not intended to be a full fork of `fbcode/triton_aot`. Keep changes
+scoped to the GR inference use case unless a broader migration plan exists.
+
+## Code Structure
+
+- `types.py`: local `TritonAOT` registration object and `triton_aot` helper used
+  by GR AOT wrapper modules.
+- `preprocess.py`: FX graph preprocessing helpers, including wrapper-node
+  unwrapping before compile/transform.
+- `triton_*.py`: GR kernel-specific AOT wrapper modules for addmm, jagged
+  concat/split, layer norm variants, HSTU attention, and timestamp position
+  embeddings.
+- `compile/`: compile-time state, Triton signature/spec processing, generated
+  C++ codegen, and the `TritonAOTCompile` context manager.
+- `transform/`: FX graph transformation and generated Python wrapper code that
+  swaps Python AOT wrappers for `torch.ops.triton_aot.*` calls backed by built
+  shared libraries.
+- `build/`: extension builders and CUBIN embedding utilities used to create
+  loadable kernel libraries from compiled Triton artifacts.
+- `templates/`: C++ template files used by the compile/codegen path for kernel
+  entry points, embedded CUBIN data, and Torch operator registration.
+- `shared/`: compatibility helpers and type/spec conversion utilities shared by
+  compile and transform code.
+
+## Runtime Flow
+
+1. GR `triton_*.py` wrappers expose Triton kernels through local `triton_aot`
+   descriptors.
+2. `TritonAOTCompile` runs representative CUDA inputs, records kernel specs, and
+   compiles the collected Triton kernels into shared libraries.
+3. `transform_kernels` rewrites the FX graph so wrapper calls dispatch through
+   `torch.ops.triton_aot.*`.
+4. The e2e test copies the generated libraries into its workdir and passes them
+   to the C++ runner before executing the scripted sparse/dense modules.
+
+
+## Authors
+
+- Chang Pan <changpan@meta.com>
+- Zhiyong Wang (MRS) <zhiywang@meta.com>
+- Chenzhi Yu <chenzhi@meta.com>
+- Runming Lu <runming@meta.com>
+- Chun-Wei Chen <jcwchen@meta.com>
+- Michael He <michaelhe@meta.com>
+- Linjian Ma <linjianma@meta.com>
+- Xing Liu <xingl@meta.com>
+- Zhuoran Zhao <zhuoran@meta.com>
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py
new file mode 100644
index 000000000..bc5963674
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py
@@ -0,0 +1,146 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+"""ArgDescriptor — per-arg codegen descriptor for AOT-T.
+
+Centralises arg classification (pointer / scalar / constant) so every
+``gen_*`` function in ``codegen.py`` iterates descriptors instead of
+doing its own dict lookups into ``OpsUnit`` fields.
+
+Also provides type-mapping helpers that convert ``ArgDescriptor``
+metadata into context-specific C++ / TorchScript type strings.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from generative_recommenders.ops.triton_aot.compile.spec_processing import OpsUnit
+from triton.runtime.jit import JITFunction
+
+
+# ---------------------------------------------------------------------------
+# Type-mapping helpers
+# ---------------------------------------------------------------------------
+
+CONSTANT_SELECTOR_CTYPE: dict[type[Any], str] = {
+    bool: "bool",
+    int: "int",
+    str: "const std::string&",
+}
+
+CONSTANT_CPP_OP_CTYPE: dict[type[Any], str] = {
+    bool: "bool",
+    int: "int64_t",
+    str: "const std::string&",
+}
+
+CONSTANT_TORCH_SCHEMA: dict[type[Any], str] = {
+    bool: "bool",
+    int: "int",
+    str: "str",
+}
+
+
+def scalar_cpp_op_ctype(triton_dtype: str) -> str:
+    """Triton scalar dtype → widened C++ type for cpp_op / torch_op params."""
+    if triton_dtype.startswith("i"):
+        return "int64_t"
+    if triton_dtype.startswith("f"):
+        return "double"
+    if triton_dtype == "bool":
+        return "bool"
+    raise ValueError(f"Unsupported scalar dtype for cpp_op: {triton_dtype}")
+
+
+def scalar_torch_schema(triton_dtype: str) -> str:
+    """Triton scalar dtype → TorchScript schema type string."""
+    if triton_dtype.startswith("i"):
+        return "int"
+    if triton_dtype.startswith("f"):
+        return "float"
+    if triton_dtype == "bool":
+        return "bool"
+    raise ValueError(f"Unsupported scalar dtype for torch schema: {triton_dtype}")
+
+
+# ---------------------------------------------------------------------------
+# ArgDescriptor hierarchy
+# ---------------------------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class ArgDescriptor:
+    """Base class for per-arg codegen descriptors.
+
+    Built once by ``build_arg_descriptors`` and consumed by all ``gen_*``
+    functions.  Use ``isinstance`` to dispatch on arg kind:
+
+    - ``PointerArg`` — tensor pointer (required or optional)
+    - ``ScalarArg`` — non-pointer signature arg with a Triton dtype
+    - ``ConstantArg`` — compile-time constant with a Python type
+    """
+
+    name: str
+    index: int
+
+
+@dataclass(frozen=True)
+class PointerArg(ArgDescriptor):
+    """Tensor pointer arg (required or optional)."""
+
+    is_optional: bool
+
+
+@dataclass(frozen=True)
+class ScalarArg(ArgDescriptor):
+    """Non-pointer signature arg with a Triton dtype (e.g., ``"i32"``, ``"fp32"``).
+
+    ``triton_dtype`` is the **widest** type across all specs for this
+    position, computed by ``_compute_invariants`` via ``_wider_type``.
+    Individual specs may use a narrower type (e.g., ``"i32"`` when
+    ``triton_dtype`` is ``"i64"``); codegen adds ``fits_i32`` guards
+    and ``static_cast`` for narrowing.
+    """
+
+    triton_dtype: str
+
+
+@dataclass(frozen=True)
+class ConstantArg(ArgDescriptor):
+    """Compile-time constant arg with a Python type (``int``, ``str``, ``bool``)."""
+
+    python_type: type[Any]
+
+
+def build_arg_descriptors(
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+) -> list[ArgDescriptor]:
+    """Build ordered arg descriptors from func arg names + OpsUnit invariants.
+
+    Single source of truth for arg classification.  Called once in
+    ``compile_to_cpp`` and passed to all downstream codegen functions.
+    """
+    result: list[ArgDescriptor] = []
+    for i, name in enumerate(func.arg_names):
+        if i in unit.pointer_args:
+            result.append(
+                PointerArg(name=name, index=i, is_optional=i in unit.optional)
+            )
+        elif i in unit.scalar_dtypes:
+            result.append(
+                ScalarArg(name=name, index=i, triton_dtype=unit.scalar_dtypes[i])
+            )
+        elif i in unit.constant_types:
+            result.append(
+                ConstantArg(name=name, index=i, python_type=unit.constant_types[i])
+            )
+        else:
+            raise ValueError(
+                f"Arg {name} (index {i}) not classified as pointer, scalar, "
+                f"or constant in OpsUnit"
+            )
+    return result
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py
new file mode 100644
index 000000000..f1b03848f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py
@@ -0,0 +1,780 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+"""C++ and Python code generation for AOT-T compiled kernels.
+
+Generates:
+  - kernel.h      (header with gridDims, tuner meta, selector proto)
+  - kernel.cpp    (cubin externs, loaders, launchers, selector)
+  - _torch_op.cpp (torch op registration)
+  - _meta.py      (Python autotuner meta function)
+"""
+
+import textwrap
+from collections import Counter
+from typing import Any
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.ops.triton_aot.compile.arg_descriptor import (
+    ArgDescriptor,
+    CONSTANT_CPP_OP_CTYPE,
+    CONSTANT_SELECTOR_CTYPE,
+    CONSTANT_TORCH_SCHEMA,
+    ConstantArg,
+    PointerArg,
+    scalar_cpp_op_ctype,
+    scalar_torch_schema,
+    ScalarArg,
+)
+from generative_recommenders.ops.triton_aot.compile.spec_processing import (
+    KernelSpec,
+    OpsUnit,
+)
+from generative_recommenders.ops.triton_aot.compile.stable_types import (
+    PY_TYPES_TO_CPP_TYPES,
+    SCALAR_TYPES,
+)
+from generative_recommenders.ops.triton_aot.compile.utils import (
+    hash_kernel_name,
+    unwrap_heuristic,
+)
+from generative_recommenders.ops.triton_aot.shared.compat import get_scratch_parameters
+from generative_recommenders.ops.triton_aot.shared.types import AUTOTUNE_ATTRs, CTYPES
+from generative_recommenders.ops.triton_aot.templates.template_utils import (
+    load_template,
+    render_template,
+)
+from triton.runtime.jit import JITFunction
+
+
+# ---------------------------------------------------------------------------
+# Kernel naming and binary generation
+# ---------------------------------------------------------------------------
+
+
+def gen_kernel_name(
+    fn: Any,
+    spec: KernelSpec,
+    cc: int | str,
+) -> str:
+    name = fn.__name__
+    sig = "_".join([p.replace("*", "p") for p in spec.signature.values()])
+    const = "_".join(map(str, spec.constants.values()))
+    cc_str = f"sm{cc}"
+    autotune_configs = []
+    autotune_configs.append(f"w{spec.num_warps}")
+    autotune_configs.append(f"s{spec.num_stages}")
+    # AMD only
+    autotune_configs.append(f"matrix{spec.matrix_instr_nonkdim}")
+    autotune_configs.append(f"wave{spec.waves_per_eu}")
+    autotune_configs.append(f"kpack{spec.kpack}")
+    # See kernel_suffix in triton/compiler/code_generator.py
+    suffix = ""
+    for i, _ in enumerate(spec.signature):
+        suffix += str(i)
+        if i in spec.divisible_by_16:
+            suffix += "d"
+        if i in spec.divisible_by_8:
+            suffix += "e"
+    return "_".join([name, cc_str, sig, const] + autotune_configs + [suffix])
+
+
+def gen_cubin(kernel_name: str, kernel: Any, install_dir: str, backend: str) -> str:
+    """Generate kernel binary file (.cubin or .hsaco) and return extern declaration.
+
+    Args:
+        kernel_name: Full kernel name including specialization suffix.
+        kernel: Compiled Triton kernel object containing binary in kernel.asm.
+        install_dir: Directory to write binary file.
+        backend: GPU backend ("cuda" or "hip").
+
+    Returns:
+        C++ extern declaration for the kernel binary array.
+    """
+    hashed = hash_kernel_name(kernel_name)
+    if backend == "hip":
+        binary_file = f"{install_dir}/{hashed}.hsaco"
+        with open(binary_file, "wb") as hsaco:
+            hsaco.write(kernel.asm["hsaco"])
+        target_symbol_name = f"{kernel_name}_cubin"
+    else:
+        binary_file = f"{install_dir}/{hashed}.cubin"
+        with open(binary_file, "wb") as cubin:
+            cubin.write(kernel.asm["cubin"])
+        target_symbol_name = f"{kernel_name}_cubin"
+
+    # We return extern declarations for both the array and its pointer.
+    # The pointer is used by gen_loader() to generate R_X86_64_64 relocations
+    # instead of R_X86_64_32, which allows the .triton section to be placed
+    # beyond the 4GB address limit in large binaries.
+    # Note: The pointer is volatile to prevent optimizer constant-propagation.
+    return f'extern "C" {{ extern unsigned char {target_symbol_name}[]; extern const void* volatile {target_symbol_name}_ptr; }}'
+
+
+def gen_loader(kernel_name: str, cubin_name: str, shared: int) -> str:
+    # TODO(changpan): Extract inline cuModuleLoadData/cuModuleGetFunction error
+    # handling into a shared helper to reduce generated code size.
+    return textwrap.dedent(
+        f"""
+        CUfunction load_{kernel_name}(void)
+        {{
+            thread_local std::unordered_map<int32_t, CUfunction> cache;
+            auto idx = torch::stable::accelerator::getCurrentDeviceIndex();
+            auto res = cache.find(idx);
+            if (res != cache.end()) {{
+                return res->second;
+            }}
+            CUfunction func;
+            CUmodule mod_ptr;
+            CUresult err;
+            // Use pointer to cubin data to generate R_X86_64_64 relocation
+            // instead of R_X86_64_32, allowing cubin data to be placed beyond 4GB
+            const void *image = {kernel_name}_cubin_ptr;
+
+            err = cuModuleLoadData(&mod_ptr, image);
+            if (err != 0) {{
+                const char* errStr;
+                cuGetErrorString(err, &errStr);
+                throw std::runtime_error("cuModuleLoadData failed for {kernel_name}: error " + std::to_string(err) + " (" + (errStr ? errStr : "unknown") + ")");
+            }}
+
+            err = cuModuleGetFunction(&func, mod_ptr, "{cubin_name}");
+            if (err != 0) {{
+                const char* errStr;
+                cuGetErrorString(err, &errStr);
+                throw std::runtime_error("cuModuleGetFunction failed for {kernel_name}: error " + std::to_string(err) + " (" + (errStr ? errStr : "unknown") + ")");
+            }}
+
+            check_errors({shared}, func);
+            cache.emplace(idx, func);
+            return func;
+        }}
+    """
+    )
+
+
+# ---------------------------------------------------------------------------
+# Launcher codegen (per-spec)
+# ---------------------------------------------------------------------------
+
+
+def gen_launcher_params(
+    descriptors: list[ArgDescriptor],
+    signature: dict[int, str],
+) -> str:
+    args = ["gridDims grid"]
+    for d in descriptors:
+        if d.index in signature:
+            if isinstance(d, PointerArg):
+                ctype = "void*"
+            else:
+                ctype = CTYPES[signature[d.index]]
+            args.append(f"{ctype} {d.name}")
+    return ", ".join(args)
+
+
+def gen_launch_args(
+    func: JITFunction[list[Any]],
+    spec: KernelSpec,
+) -> list[str]:
+    """Generate kernel launch argument list (pointers to non-constant arguments)."""
+    args = []
+    for i, arg in enumerate(func.arg_names):
+        if i in spec.constants:
+            continue
+        assert i in spec.signature, f"Argument {i} ({arg}) does not appear in signature"
+        args.append(f"&{arg}")
+    return args
+
+
+def gen_launcher(
+    kernel_name: str,
+    func: JITFunction[list[Any]],
+    kernel: Any,
+    shared: int,
+    warp_size: int,
+    spec: KernelSpec,
+    descriptors: list[ArgDescriptor],
+) -> str:
+    params = gen_launcher_params(descriptors, spec.signature)
+    args = gen_launch_args(func, spec)
+
+    scratch_declarations, scratch_args = get_scratch_parameters(kernel)
+    args.extend(scratch_args)
+
+    args_str = ", ".join(args)
+
+    return textwrap.dedent(
+        f"""
+        void {kernel_name}({params}) {{
+            CUfunction func = load_{kernel_name}();
+            cudaStream_t stream = grid.stream ? grid.stream : triton_aot_get_current_stream();
+            {scratch_declarations}
+            void *args[] = {{ {args_str} }};
+            auto res = cuLaunchKernel(func, grid.x, grid.y, grid.z, {warp_size} * {spec.num_warps}, 1, 1, {shared}, stream, args, NULL);
+            TRITON_AOT_CU_CHECK(res);
+        }}
+    """
+    )
+
+
+# ---------------------------------------------------------------------------
+# Selector codegen (invariant)
+# ---------------------------------------------------------------------------
+
+
+def gen_selector_params(
+    descriptors: list[ArgDescriptor],
+) -> str:
+    """Generate C++ selector function parameter list."""
+    args = ["gridDims grid"]
+    for d in descriptors:
+        if isinstance(d, PointerArg):
+            args.append(f"const std::optional<torch::stable::Tensor>& {d.name}")
+        elif isinstance(d, ScalarArg):
+            args.append(f"{CTYPES[d.triton_dtype]} {d.name}")
+        elif isinstance(d, ConstantArg):
+            args.append(f"{CONSTANT_SELECTOR_CTYPE[d.python_type]} {d.name}")
+
+    for name, value in AUTOTUNE_ATTRs.items():
+        args.append(f"{type(value).__name__} {name}")
+    return ", ".join(args)
+
+
+def gen_launcher_call_args(
+    descriptors: list[ArgDescriptor],
+    signature: dict[int, str],
+) -> str:
+    args = ["grid"]
+    for d in descriptors:
+        if d.index in signature:
+            if isinstance(d, PointerArg):
+                args.append(f"{d.name}.value().data_ptr()")
+            elif isinstance(d, ScalarArg) and signature[d.index] != d.triton_dtype:
+                args.append(f"static_cast<{CTYPES[signature[d.index]]}>({d.name})")
+            else:
+                args.append(d.name)
+    return ", ".join(args)
+
+
+def gen_guarded_calls(  # noqa: C901
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+    descriptors: list[ArgDescriptor],
+) -> str:
+    desc_by_idx: dict[int, ArgDescriptor] = {d.index: d for d in descriptors}
+    calls = []
+    for spec in unit.specs:
+        kernel_name = gen_kernel_name(func, spec, unit.cc)
+        args = gen_launcher_call_args(descriptors, spec.signature)
+        guards = ""
+
+        # Guard on tensor dtypes (per-spec: different specs may have different dtypes)
+        for i, ttype in spec.signature.items():
+            d = desc_by_idx[i]
+            if not isinstance(d, PointerArg):
+                continue
+            arg = d.name
+            atype = SCALAR_TYPES[ttype]
+            guards += f"if ({arg}.has_value()) "
+            guards += f"if ({arg}.value().scalar_type() == {atype}) "
+
+        # Guard on int range (spec uses narrower type than selector)
+        for i, dtype in spec.signature.items():
+            d = desc_by_idx[i]
+            if isinstance(d, ScalarArg) and dtype != d.triton_dtype:
+                if dtype == "i32":
+                    guards += f"if (fits_i32({d.name})) "
+
+        # Guard on constant values.
+        for i, val in spec.constants.items():
+            arg = desc_by_idx[i].name
+            if isinstance(val, bool):
+                guards += f"if ({arg}) " if val else f"if (!({arg})) "
+            elif isinstance(val, str):
+                guards += f'if ({arg} == "{val}") '
+            elif val is None:
+                guards += f"if (!{arg}.has_value()) "
+            else:
+                guards += f"if ({arg} == {val}) "
+
+        # Guard on special constants
+        for name in AUTOTUNE_ATTRs.keys():
+            guards += f"if ({name} == {getattr(spec, name)}) "
+
+        # Guard on divisible_by_16
+        for i in spec.divisible_by_16:
+            arg = desc_by_idx[i].name
+            if i in spec.signature:
+                ttype = spec.signature[i]
+                if ttype.startswith("*"):
+                    guards += f"if ((((uintptr_t){arg}.value().data_ptr()) % 16) == 0) "
+                else:
+                    guards += f"if (({arg} % 16) == 0) "
+            elif i in spec.constants:
+                assert (spec.constants[i] % 16) == 0
+
+        # Guard on divisible_by_8
+        for i in spec.divisible_by_8:
+            arg = desc_by_idx[i].name
+            if i in spec.signature:
+                ttype = spec.signature[i]
+                # divisible_by_8 is only applied to int
+                if not ttype.startswith("*"):
+                    guards += f"if (({arg} % 8) == 0) "
+            elif i in spec.constants:
+                assert (spec.constants[i] % 8) == 0
+
+        # Call the specialization.
+        calls.append(f"{guards}return {kernel_name}({args});\n")
+    return "".join(calls)
+
+
+def gen_selector_proto(
+    descriptors: list[ArgDescriptor],
+    func_name: str,
+) -> str:
+    params = gen_selector_params(descriptors)
+    # Add Triton's default values for num warps/stages, etc
+    for name, value in AUTOTUNE_ATTRs.items():
+        params = params.replace(name, f"{name}={value}")
+    return f"void {func_name}({params});"
+
+
+def gen_failure_msg(
+    descriptors: list[ArgDescriptor],
+) -> str:
+    """Generate C++ ``<<``-chain for the dispatch-failure error message.
+
+    Groups parameters by category (Tensors / Scalars / Constants /
+    Autotune / Device).  Tensor entries include aligned16 status.
+    """
+    tensors: list[str] = []
+    scalars: list[str] = []
+    constants: list[str] = []
+
+    for d in descriptors:
+        if isinstance(d, PointerArg):
+            dtype_expr = (
+                f"({d.name}.has_value()"
+                f" ? c10::toString({d.name}.value().scalar_type())"
+                f' : "nullptr")'
+            )
+            align_expr = (
+                f"(({d.name}.has_value()"
+                f" && (((uintptr_t){d.name}.value().data_ptr()) % 16) == 0)"
+                f' ? "true" : "false")'
+            )
+            tensors.append(
+                f'" {d.name}=" << {dtype_expr} << "(aligned16=" << {align_expr} << ")"'
+            )
+        elif isinstance(d, ScalarArg):
+            scalars.append(f'" {d.name}=" << {d.name}')
+        elif isinstance(d, ConstantArg):
+            constants.append(f'" {d.name}=" << {d.name}')
+
+    autotune: list[str] = [f'" {n}=" << {n}' for n in AUTOTUNE_ATTRs]
+
+    sections: list[str] = []
+    if tensors:
+        sections.append('"\\n  Tensors:" << ' + " << ".join(tensors))
+    if scalars:
+        sections.append('"\\n  Scalars:" << ' + " << ".join(scalars))
+    if constants:
+        sections.append('"\\n  Constants:" << ' + " << ".join(constants))
+    sections.append('"\\n  Autotune:" << ' + " << ".join(autotune))
+    sections.append('"\\n  Device: cc=" << cc')
+
+    return " << ".join(sections)
+
+
+def gen_selector(
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+    descriptors: list[ArgDescriptor],
+) -> str:
+    params = gen_selector_params(descriptors)
+    guarded_calls = gen_guarded_calls(func, unit, descriptors)
+    failure_msg = gen_failure_msg(descriptors)
+    return f"""
+        void {func.__name__}({params}) {{
+            auto cc = compute_capability();
+            if (grid.x * grid.y * grid.z > 0) {{
+                {guarded_calls}
+                std::stringstream ss;
+                ss << "[TritonAOT] No implementation found for {func.__name__}" << {failure_msg};
+                throw std::runtime_error(ss.str());
+            }}
+        }}
+    """
+
+
+# ---------------------------------------------------------------------------
+# Torch op codegen (invariant)
+# ---------------------------------------------------------------------------
+
+
+def gen_cpp_op_params(
+    descriptors: list[ArgDescriptor],
+) -> str:
+    args = []
+    for d in descriptors:
+        if isinstance(d, PointerArg):
+            args.append(f"std::optional<torch::stable::Tensor> {d.name}")
+        elif isinstance(d, ScalarArg):
+            args.append(f"{scalar_cpp_op_ctype(d.triton_dtype)} {d.name}")
+        elif isinstance(d, ConstantArg):
+            args.append(f"{CONSTANT_CPP_OP_CTYPE[d.python_type]} {d.name}")
+    for name, value in AUTOTUNE_ATTRs.items():
+        args.append(f"{PY_TYPES_TO_CPP_TYPES[type(value)]} {name}")
+    return ", ".join(args)
+
+
+def gen_torch_op_params(
+    descriptors: list[ArgDescriptor],
+    default_values: dict[str, Any],
+) -> str:
+    args = []
+
+    def gen_str_wrap(value: Any) -> Any:
+        return f'\\"{value}\\"' if isinstance(value, str) else value
+
+    def gen_default_str(arg: str) -> str:
+        return (
+            f" = {gen_str_wrap(default_values[arg])}" if arg in default_values else ""
+        )
+
+    for d in descriptors:
+        df_str = gen_default_str(d.name)
+        if isinstance(d, PointerArg):
+            t = chr(ord("a") + d.index)
+            args.append(f"Tensor({t}!)? {d.name}")
+        elif isinstance(d, ScalarArg):
+            args.append(f"{scalar_torch_schema(d.triton_dtype)} {d.name}{df_str}")
+        elif isinstance(d, ConstantArg):
+            args.append(f"{CONSTANT_TORCH_SCHEMA[d.python_type]} {d.name}{df_str}")
+    for name, value in AUTOTUNE_ATTRs.items():
+        args.append(f"{type(value).__name__} {name}={value}")
+    return ", ".join(args)
+
+
+def gen_torch_op(
+    func: JITFunction[list[Any]],
+    descriptors: list[ArgDescriptor],
+    default_values: dict[str, Any],
+) -> str:
+    cpp_params = gen_cpp_op_params(descriptors)
+    torch_params = gen_torch_op_params(descriptors, default_values)
+    arg_names = list(func.arg_names) + list(AUTOTUNE_ATTRs.keys())
+    args = ", ".join(arg_names)
+
+    # Generate a comment noting which tensor params are non-optional but
+    # promoted to Tensor? for TorchScript compatibility.
+    promoted = [
+        d.name for d in descriptors if isinstance(d, PointerArg) and not d.is_optional
+    ]
+    type_comment = ""
+    if promoted:
+        type_comment = (
+            f"// Note: {', '.join(promoted)} are non-optional but use Tensor? "
+            "for TorchScript compatibility.\n"
+            "// Dispatch uses HAS_XXX constexpr ints, not tensor presence.\n"
+        )
+    return textwrap.dedent(
+        f"""
+        namespace {{
+        triton::aot::gridDims dims_from_vec(
+            const std::vector<int64_t>& grid
+        ) {{
+          return triton::aot::gridDims(
+              grid.size() > 0 ? grid[0] : 1,
+              grid.size() > 1 ? grid[1] : 1,
+              grid.size() > 2 ? grid[2] : 1
+          );
+        }}
+
+        {type_comment}void {func.__name__}_op(
+            std::vector<int64_t> grid,
+            {cpp_params}
+        ) {{
+            triton::aot::{func.__name__}(
+                dims_from_vec(grid),
+                {args}
+            );
+        }}
+
+        void {func.__name__}_dummy_op(
+            std::vector<int64_t> grid,
+            {cpp_params}
+        ) {{
+            // Do nothing.  The op is a dummy for model transform,
+            // processing, and splitting services.
+        }}
+        }}
+
+        STABLE_TORCH_LIBRARY_FRAGMENT(triton_aot, m) {{
+          m.def("{func.__name__}(int[] grid, {torch_params}) -> ()");
+        }}
+        STABLE_TORCH_LIBRARY_IMPL(triton_aot, CUDA, m) {{
+          m.impl("{func.__name__}", TORCH_BOX(&{func.__name__}_op));
+        }}
+
+        STABLE_TORCH_LIBRARY_IMPL(triton_aot, CPU, m) {{
+          m.impl("{func.__name__}", TORCH_BOX(&{func.__name__}_dummy_op));
+        }}
+
+        STABLE_TORCH_LIBRARY_IMPL(triton_aot, Meta, m) {{
+          m.impl("{func.__name__}", TORCH_BOX(&{func.__name__}_dummy_op));
+        }}
+        """
+    )
+
+
+# ---------------------------------------------------------------------------
+# Tuner meta codegen
+# ---------------------------------------------------------------------------
+
+
+def key_names_and_idx(func: Any) -> tuple[list[str], list[int]]:
+    if hasattr(func, "key_idx"):
+        arg_names = [func.arg_names[idx] for idx in func.key_idx]
+        key_idx = func.key_idx
+    else:
+        arg_names = func.keys
+        key_idx = [func.arg_names.index(arg) for arg in arg_names]
+    return arg_names, key_idx
+
+
+def is_non_empty_mapping_of_type(obj: object, value_type: type[Any]) -> bool:
+    """Check if object is a non-empty dict with all values of specific type"""
+    if not obj or not isinstance(obj, dict):
+        return False
+
+    return all(isinstance(value, value_type) for value in obj.values())
+
+
+_LAUNCH_PARAM_NAMES: list[str] = ["num_warps", "num_stages"]
+
+
+def gen_tuner_meta_py(
+    func: Any,
+    tuner_fallback: bool,
+    unit: OpsUnit,
+) -> str:
+    vals = []
+
+    guard_list = []
+
+    # Use custom meta generation function if available
+    if hasattr(func, "gen_autotune_select_meta_src"):
+        return func.gen_autotune_select_meta_src(unit.constant_types)
+
+    if hasattr(func, "cache") and is_non_empty_mapping_of_type(
+        func.cache, triton.runtime.autotuner.Config
+    ):
+        # auto tuned configs
+        arg_names, key_idx = key_names_and_idx(func)
+
+        in_args = ", ".join(
+            [
+                f"{name}: {unit.constant_types[idx].__name__ if idx in unit.constant_types else 'int'}"
+                for idx, name in zip(key_idx, arg_names)
+            ]
+        )
+
+        cfg_first = next(iter(func.cache.values()))
+        return_names = list(cfg_first.kwargs.keys()) + _LAUNCH_PARAM_NAMES
+
+        for key, cfg in func.cache.items():
+            val = list(cfg.kwargs.values()) + [cfg.num_warps, cfg.num_stages]
+            val = tuple(val)
+            vals.append(val)
+            equations = []
+            for arg, value in zip(arg_names, key):
+                if isinstance(value, str):
+                    equations.append(f"{arg} == '{value}'")
+                elif isinstance(value, bool):
+                    equations.append(f"{arg} == {int(value)}")
+                else:
+                    equations.append(f"{arg} == {value}")
+            guard_list.append(f"if {' and '.join(equations)}: return {val}")
+
+    else:
+        # default configs — single spec, use specs[0]
+        in_args = ""
+        arg_names = list(_LAUNCH_PARAM_NAMES)
+        return_names = list(_LAUNCH_PARAM_NAMES)
+        val = unit.specs[0].num_warps, unit.specs[0].num_stages
+        vals.append(val)
+
+    name = unwrap_heuristic(func, JITFunction).__name__
+    meta = name + "_meta"
+
+    guards = "\n        ".join(guard_list)
+
+    fmt_args = ", ".join([f"{{{arg_name}}}" for arg_name in arg_names])
+
+    raise_runtime_error_str = (
+        f"""raise RuntimeError(f"No autotuning config found for {name}({fmt_args})")"""
+    )
+    fallback_str = f"""return {Counter(vals).most_common(1)[0][0]}"""
+
+    returns_comment = f"# Returns: ({', '.join(return_names)})"
+
+    return textwrap.dedent(
+        f"""
+    def {meta}({in_args}):
+        {returns_comment}
+        {guards}
+        {fallback_str if tuner_fallback else raise_runtime_error_str}
+    """
+    )
+
+
+def gen_tuner_meta_cpp(
+    func: Any,
+    tuner_fallback: bool,
+    constant_types: dict[int, type[Any]],
+) -> str:
+    # TODO(changpan): This C++ inline _meta is currently dead code — no C++ caller
+    # invokes it.  The Python _meta.py (gen_tuner_meta_py) is the only consumer.
+    # Double check, try remove this and the TUNER_META_CPP template region.
+    def infer_arg_type(idx: int) -> str:
+        if idx in constant_types:
+            return PY_TYPES_TO_CPP_TYPES[constant_types[idx]]
+        else:
+            return "int64_t"
+
+    arg_names, key_idx = key_names_and_idx(func)
+
+    in_args = ", ".join(
+        [f"{infer_arg_type(idx)} {name}" for idx, name in zip(key_idx, arg_names)]
+    )
+
+    vals = []
+    guard_list = []
+    for key, cfg in func.cache.items():
+        val = list(cfg.kwargs.values()) + [cfg.num_warps, cfg.num_stages]
+        val = tuple(val)
+        vals.append(val)
+        equations = []
+        for arg, value in zip(arg_names, key):
+            if isinstance(value, str):
+                equations.append(f'{arg} == "{value}"')
+            elif isinstance(value, bool):
+                equations.append(f"{arg} == {int(value)}")
+            else:
+                equations.append(f"{arg} == {value}")
+        guard_list.append(f"if ({' && '.join(equations)}) return std::make_tuple{val};")
+    guards = "\n        ".join(guard_list)
+    name = unwrap_heuristic(func, JITFunction).__name__
+    meta = name + "_meta"
+    fmt_args = ", ".join([f"{arg_name}" for arg_name in arg_names])
+    raise_runtime_error_str = f"""throw std::runtime_error("No autotuning config found for {name}({fmt_args})");"""
+    fallback_str = f"""return std::make_tuple{Counter(vals).most_common(1)[0][0]};"""
+    # Infer the return type from the actual values
+    return_type = _infer_return_type(vals[0])
+    return textwrap.dedent(
+        f"""
+    inline std::tuple<{return_type}> {meta}({in_args}) {{
+        {guards}
+        {fallback_str if tuner_fallback else raise_runtime_error_str}
+    }}
+    """
+    )
+
+
+def _infer_return_type(vals: tuple[Any, ...]) -> str:
+    types = [PY_TYPES_TO_CPP_TYPES.get(type(val)) for val in vals]
+    try:
+        # pyre-fixme[6]: For 1st argument expected
+        #  `Iterable[typing_extensions.LiteralString]` but got `List[Optional[str]]`.
+        return ", ".join(types)
+    except TypeError:  # one of the types cannot be inferred, e.g. `None`
+        raise ValueError("Cannot infer return type from `vals`")
+
+
+# ---------------------------------------------------------------------------
+# Top-level codegen entry points
+# ---------------------------------------------------------------------------
+
+
+def generate_header_content(
+    tuned_func: triton.runtime.autotuner.Autotuner | None,
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+    descriptors: list[ArgDescriptor],
+    tuner_fallback: bool,
+) -> str:
+    """Generate the content of the .h header file."""
+    h_template = load_template("kernel.h")
+    tuner_meta_cpp = (
+        gen_tuner_meta_cpp(tuned_func, tuner_fallback, unit.constant_types)
+        if tuned_func
+        else ""
+    )
+    selector_proto = gen_selector_proto(descriptors, func.__name__)
+    return render_template(
+        h_template,
+        {
+            "TUNER_META_CPP": tuner_meta_cpp,
+            "SELECTOR_PROTO": selector_proto,
+        },
+    )
+
+
+def generate_kernel_cpp_content(
+    func: JITFunction[list[Any]],
+    unit: OpsUnit,
+    descriptors: list[ArgDescriptor],
+    prefix: str,
+    generated_specs: list[str],
+    backend: str,
+) -> str:
+    """Generate the content of the kernel .cpp file.
+
+    All tensor params use Tensor? for TorchScript compatibility.
+    Dispatch relies on HAS_XXX constexpr ints, not tensor presence.
+    """
+    cpp_template = load_template("kernel.cpp")
+    kernel_specs = "\n".join(generated_specs)
+    selector = gen_selector(func, unit, descriptors)
+
+    # On AMD, apply hipification to generated code (KERNEL_SPECS, SELECTOR)
+    # Templates are already hipified at load time (from hip/ subdirectory)
+    if backend == "hip":
+        from torch._inductor.codegen.aoti_hipify_utils import maybe_hipify_code_wrapper
+
+        kernel_specs = maybe_hipify_code_wrapper(kernel_specs, force_hipify=True)
+        selector = maybe_hipify_code_wrapper(selector, force_hipify=True)
+
+    cpp_content = render_template(
+        cpp_template,
+        {
+            "HEADER_INCLUDE": f'#include "{prefix}.h"\n',
+            "KERNEL_SPECS": kernel_specs,
+            "SELECTOR": selector,
+        },
+    )
+    return cpp_content
+
+
+def generate_torch_op_content(
+    func: JITFunction[list[Any]],
+    descriptors: list[ArgDescriptor],
+    prefix: str,
+    default_values: dict[str, Any],
+) -> str:
+    """Generate the content of the torch_op .cpp file."""
+    torch_template = load_template("torch_op.cpp")
+    torch_op_content = gen_torch_op(func, descriptors, default_values)
+    torch_content = render_template(
+        torch_template,
+        {
+            "HEADER_INCLUDE": f'#include "{prefix}.h"\n',
+            "TORCH_OP": torch_op_content,
+        },
+    )
+    return torch_content
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py
new file mode 100644
index 000000000..231cb52f1
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py
@@ -0,0 +1,409 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# pyre-strict
+
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+import tempfile
+from inspect import getcallargs, Parameter, signature
+from typing import Any, Callable, Dict, List, Optional, Set
+
+import torch
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.ops.triton_aot.compile.stable_types import SCALAR_TYPES
+from generative_recommenders.ops.triton_aot.compile.utils import is_autotuner
+from generative_recommenders.ops.triton_aot.types import (
+    Annotation,
+    AnnotationHint,
+    TritonAOT,
+)
+
+# @manual=//triton:triton
+from triton.runtime.jit import KernelInterface, mangle_type
+
+
+class CustomEncoder(json.JSONEncoder):
+    # pyre-ignore[14]: Inconsistent override
+    def default(self, obj: object) -> Any:
+        if isinstance(obj, set):
+            return {"__set__": True, "items": sorted(obj)}
+        # Handle other non-serializable types
+        return super().default(obj)
+
+
+def hash_spec(spec: Dict[str, Any]) -> str:
+    serialized_dict = json.dumps(spec, cls=CustomEncoder, sort_keys=True)
+    return hashlib.sha256(serialized_dict.encode("utf-8")).hexdigest()
+
+
+class AOTTCompileState:
+    """
+    Singleton state container for Triton AOT compilation.
+
+    Description:
+    This singleton pattern enables state sharing between code loaded via
+    torch.package (which creates isolated module namespaces) and the regular
+    Python import system. Without this pattern, the packaged module would have
+    its own copy of global state, leading to inconsistencies.
+
+    Usage:
+        # Normal usage - get the singleton instance
+        state = AOTTCompileState.get_instance()
+
+        # For torch.package integration - inject shared instance into packaged module
+        packaged_module = package_importer.import_module("triton_aot.compile.compile_state")
+        packaged_module.AOTTCompileState.set_instance(AOTTCompileState.get_instance())
+    """
+
+    _instance: Optional["AOTTCompileState"] = None
+
+    kernel_specs: Dict[KernelInterface[List[Any]], List[Dict[str, List[Any]]]] = {}
+    specs_hashset: Dict[KernelInterface[List[Any]], Set[str]] = {}
+    enable_aott_compile: bool = False
+    compile_base_dir: str = ""
+    compile_path: str = ""
+
+    def __new__(cls) -> "AOTTCompileState":
+        if cls._instance is None:
+            instance = super().__new__(cls)
+            instance._initialize()
+            cls._instance = instance
+        return cls._instance
+
+    def _initialize(self) -> None:
+        """Initialize the singleton state. Called only once."""
+        self.kernel_specs: Dict[
+            KernelInterface[List[Any]], List[Dict[str, List[Any]]]
+        ] = {}
+        self.specs_hashset: Dict[KernelInterface[List[Any]], Set[str]] = {}
+        self.enable_aott_compile: bool = False
+        self.compile_base_dir: str = os.getenv("TRITON_AOT_PATH_PREFIX", "/var/tmp")
+        self.compile_path: str = tempfile.mkdtemp(
+            dir=self.compile_base_dir, prefix="triton_aot_compile_"
+        )
+
+    @classmethod
+    def get_instance(cls) -> "AOTTCompileState":
+        """Get the singleton instance, creating it if necessary."""
+        if cls._instance is None:
+            cls._instance = cls()
+        return cls._instance
+
+    @classmethod
+    def set_instance(cls, instance: "AOTTCompileState") -> None:
+        """
+        Set the singleton instance. Used for torch.package integration.
+
+        When code is loaded via torch.package, it creates a separate module
+        namespace with its own class objects. This method allows injecting
+        a shared instance from the main module into the packaged module.
+        """
+        cls._instance = instance
+
+    def reset(self) -> None:
+        """Reset all state to initial values."""
+        self.kernel_specs = {}
+        self.specs_hashset = {}
+        self.disable()
+        self.compile_base_dir = os.getenv("TRITON_AOT_PATH_PREFIX", "/var/tmp")
+        self.compile_path = tempfile.mkdtemp(
+            dir=self.compile_base_dir, prefix="triton_aot_compile_"
+        )
+
+    def add_kernel_spec(
+        self,
+        fn: KernelInterface[List[Any]],
+        spec: Dict[str, List[Any]],
+        hashed_spec: str,
+    ) -> None:
+        """Add a kernel spec if not already present (based on hash).
+        If the same Triton kernel is used at multiple locations in a model:
+            - All calls share one spec list under the same kernel function key
+            - Specs with identical signatures (same dtypes, shapes) are deduplicated via hash
+            - Specs with different signatures (e.g., fp32 vs bf16) are recorded separately
+
+        Example:
+            # Two call sites using the same kernel:
+            my_kernel[grid](tensor_fp32, ...)  # Records spec with "*fp32"
+            my_kernel[grid](tensor_bf16, ...)  # Records spec with "*bf16"
+            my_kernel[grid](tensor_fp32, ...)  # Deduplicated, same hash as first call
+
+            # Result: kernel_specs[my_kernel] = [fp32_spec, bf16_spec]
+        """
+        if fn not in self.kernel_specs:
+            self.kernel_specs[fn] = []
+            self.specs_hashset[fn] = set()
+        if hashed_spec not in self.specs_hashset[fn]:
+            self.kernel_specs[fn].append(spec)
+            self.specs_hashset[fn].add(hashed_spec)
+
+    def _collect_spec(
+        self,
+        fn: KernelInterface[List[Any]],
+        annotations: Dict[str, Annotation],
+        *args: Any,
+        **kwargs: Any,
+    ) -> None:
+        """Spec collection callback registered on TritonAOT during compile.
+
+        Always collects the annotated spec (which equals the inferred spec
+        when no annotations are present).  Also collects the inferred spec
+        when it differs and either:
+        - annotations conflict with sample (fallback for safety), or
+        - inferred has perf hints the annotation lacks (perf variant).
+        """
+        spec = infer_spec(fn, annotations, *args, **kwargs)
+        annotated_hash = hash_spec(spec)
+        self.add_kernel_spec(fn, spec, annotated_hash)
+
+        if annotations:
+            inferred = infer_spec(fn, {}, *args, **kwargs)
+            inferred_hash = hash_spec(inferred)
+            if inferred_hash == annotated_hash:
+                return
+            if _annotation_conflicts_with_sample(
+                fn, annotations, *args, **kwargs
+            ) or _inferred_has_perf_advantage(spec, inferred):
+                self.add_kernel_spec(fn, inferred, inferred_hash)
+
+    def enable(self) -> None:
+        """Enable AOT compile and register the spec collection hook."""
+        self.enable_aott_compile = True
+        TritonAOT.set_spec_collector(self._collect_spec)
+
+    def disable(self) -> None:
+        """Disable AOT compile and unregister the spec collection hook."""
+        self.enable_aott_compile = False
+        TritonAOT.set_spec_collector(None)
+
+
+def get_aott_compile_state() -> AOTTCompileState:
+    """Get the current AOTTCompileState singleton.
+
+    Uses get_instance() so injected instances (via set_instance() for
+    torch.package integration) are respected.
+    """
+    return AOTTCompileState.get_instance()
+
+
+########
+# Module-level global accessors that delegate to singleton
+########
+
+
+def get_triton_aot_kernel_specs() -> Dict[
+    KernelInterface[List[Any]], List[Dict[str, List[Any]]]
+]:
+    return get_aott_compile_state().kernel_specs
+
+
+def get_triton_aot_specs_hashset() -> Dict[KernelInterface[List[Any]], Set[str]]:
+    return get_aott_compile_state().specs_hashset
+
+
+def get_aott_compile_path() -> str:
+    return get_aott_compile_state().compile_path
+
+
+def add_kernel_spec(
+    fn: KernelInterface[List[Any]], spec: Dict[str, List[Any]], hashed_spec: str
+) -> None:
+    get_aott_compile_state().add_kernel_spec(fn, spec, hashed_spec)
+
+
+def _unwrap_triton_fn(
+    fn: KernelInterface[List[Any]],
+) -> Callable[..., Any]:
+    while isinstance(fn, KernelInterface):
+        # pyre-ignore[16]: KernelInterface has `fn` attribute at runtime
+        fn = fn.fn
+    return fn
+
+
+def _inferred_has_perf_advantage(
+    annotated_spec: Dict[str, List[Any]],
+    inferred_spec: Dict[str, List[Any]],
+) -> bool:
+    """True if inferred spec has alignment/divisibility hints the annotated lacks.
+
+    A tuple element ``(type, N)`` carries alignment or divisibility info
+    that a bare string does not.  When inference adds such hints (e.g.,
+    tensor alignment from ``data_ptr() % 16 == 0``), the inferred spec
+    produces a more optimized cubin worth keeping as a perf variant.
+    """
+    for ann_elem, inf_elem in zip(
+        annotated_spec["signature"], inferred_spec["signature"]
+    ):
+        if isinstance(inf_elem, tuple) and not isinstance(ann_elem, tuple):
+            return True
+    return False
+
+
+# Triton-internal kwargs injected by KernelInterface.__getitem__
+# (triton/runtime/jit.py).  These are not kernel parameters and must
+# be stripped before getcallargs.
+_TRITON_INTERNAL_KWARGS: frozenset[str] = frozenset({"warmup", "grid"})
+
+
+def _resolve_call_args(
+    fn: KernelInterface[List[Any]],
+    *args: Any,
+    **kwargs: Any,
+) -> tuple[Callable[..., Any], dict[str, Any]]:
+    """Unwrap kernel and resolve call args with autotune placeholder fill."""
+    triton_fn = _unwrap_triton_fn(fn)
+    # Filter Triton-internal kwargs injected by KernelInterface.__getitem__
+    # (triton/runtime/jit.py) — not part of the kernel signature.
+    clean_kwargs = {k: v for k, v in kwargs.items() if k not in _TRITON_INTERNAL_KWARGS}
+    if is_autotuner(fn):
+        # pyre-ignore[16]: Attributes checked by is_autotuner
+        for arg_name in fn.configs[0].kwargs.keys():
+            if arg_name not in clean_kwargs:
+                clean_kwargs[arg_name] = -1
+    return triton_fn, getcallargs(triton_fn, *args, **clean_kwargs)
+
+
+_I32_MIN: int = -(2**31)
+_I32_MAX: int = 2**31 - 1
+
+
+def _sample_satisfies_int_type(sample: int, ann_type: str) -> bool:
+    """True if sample int fits the annotated type range."""
+    if ann_type == "i32":
+        return _I32_MIN <= sample <= _I32_MAX
+    return True
+
+
+def _sample_satisfies_annotation(sample: Any, ann: Annotation) -> bool:
+    """True if a single sample value satisfies its annotation constraint."""
+    if isinstance(ann, AnnotationHint):
+        if isinstance(sample, torch.Tensor):
+            return sample.data_ptr() % ann.hint == 0
+        if isinstance(sample, int):
+            if ann.hint == 1:
+                return sample == 1
+            if not _sample_satisfies_int_type(sample, ann.dtype):
+                return False
+            if ann.hint > 1:
+                return sample % ann.hint == 0
+        return True
+    if isinstance(ann, str) and not ann.startswith("*") and isinstance(sample, int):
+        return _sample_satisfies_int_type(sample, ann)
+    return True
+
+
+def _annotation_conflicts_with_sample(
+    fn: KernelInterface[List[Any]],
+    annotations: Dict[str, Annotation],
+    *args: Any,
+    **kwargs: Any,
+) -> bool:
+    """True if any annotated param's sample value doesn't satisfy the annotation.
+
+    Used by ``_collect_spec`` to decide whether to generate an inferred
+    fallback spec.  When the sample satisfies all annotations, only the
+    annotated spec is needed (the user's constraints hold for this input).
+    """
+    _, sample_args = _resolve_call_args(fn, *args, **kwargs)
+
+    for param_name, ann in annotations.items():
+        sample = sample_args.get(param_name)
+        if sample is None:
+            continue
+        if not _sample_satisfies_annotation(sample, ann):
+            return True
+
+    return False
+
+
+def _infer_spec_entry(
+    arg_name: str,
+    arg: Any,
+    arg_annotation: Any,
+    annotations: Dict[str, Annotation],
+) -> Any:
+    if arg_annotation != Parameter.empty:
+        if arg_annotation == tl.constexpr:
+            return arg
+        raise RuntimeError(
+            f"TritonAOT: unsupported scalar annotation {arg_annotation}."
+        )
+
+    if arg_name in annotations:
+        ann = annotations[arg_name]
+        # Convert to tuple for raw spec format (shared/spec_conversion
+        # processes plain tuples).
+        return ann.to_tuple() if isinstance(ann, AnnotationHint) else ann
+
+    if arg is None:
+        return None
+
+    if isinstance(arg, torch.Tensor):
+        # Reject dtypes SCALAR_TYPES can't render (e.g. *u1, *u16, *fp8e5)
+        # so codegen doesn't KeyError downstream.
+        type_str = mangle_type(arg)
+        if type_str not in SCALAR_TYPES:
+            raise RuntimeError(
+                f"TritonAOT: unsupported tensor type for {arg_name}: "
+                f"{arg.dtype} (Triton mangled to {type_str!r}). "
+                f"Supported tensor dtypes: {sorted(SCALAR_TYPES.keys())}."
+            )
+        return (type_str, 16) if arg.data_ptr() % 16 == 0 else type_str
+
+    if isinstance(arg, bool):
+        # bool is subclass of int; must check before int.
+        # Non-constexpr bools have no CTYPES entry for codegen.
+        raise RuntimeError(
+            f"TritonAOT: parameter {arg_name} is a bool without "
+            f"tl.constexpr annotation.  Add `{arg_name}: tl.constexpr` "
+            f"to the kernel signature."
+        )
+
+    if isinstance(arg, int):
+        # Always i64 for safety; users annotate "i32" for narrower variant via
+        # annotation-as-variant.
+        if not -(2**63) <= arg <= 2**63 - 1:
+            raise RuntimeError(
+                f"TritonAOT: unsupported int value for {arg_name}: "
+                f"value exceeds i64 range. Use a smaller value or tl.constexpr."
+            )
+        return "i64"
+
+    if isinstance(arg, float):
+        return "fp32"
+
+    raise RuntimeError(f"TritonAOT: parameter {arg_name} needs annotation.")
+
+
+def infer_spec(
+    fn: KernelInterface[List[Any]],
+    annotations: Dict[str, Annotation],
+    *args: Any,
+    **kwargs: Any,
+) -> Dict[str, List[Any]]:
+    """Infer kernel spec from sample args.
+
+    Tensor dtype: ``mangle_type``, alignment: ``data_ptr() % 16``.
+    Scalar int: always ``"i64"`` (safe default; user can annotate ``"i32"``
+    to get a narrower variant via annotation-as-variant).
+    Float: ``mangle_type`` → fp32.
+    """
+    triton_fn, call_args = _resolve_call_args(fn, *args, **kwargs)
+    fn_sig = signature(triton_fn)
+    arg_annotations = {
+        name: param.annotation for name, param in fn_sig.parameters.items()
+    }
+    spec = []
+
+    for arg_name in fn_sig.parameters.keys():
+        arg = call_args[arg_name]
+        spec.append(
+            _infer_spec_entry(arg_name, arg, arg_annotations[arg_name], annotations)
+        )
+    return {"signature": spec}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py
new file mode 100644
index 000000000..cdf4d1821
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py
@@ -0,0 +1,300 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+"""AOT-T compilation pipeline.
+
+Orchestrates: spec processing → Triton native compile → C++ / Python codegen.
+"""
+
+from __future__ import annotations
+
+import logging
+import multiprocessing as mp
+import os
+import signal
+import threading
+from concurrent.futures import ThreadPoolExecutor
+from types import FrameType, ModuleType
+from typing import Any, Callable
+
+# @manual=//triton:triton
+import triton
+import triton.compiler
+from generative_recommenders.ops.triton_aot.compile.arg_descriptor import (
+    ArgDescriptor,
+    build_arg_descriptors,
+)
+from generative_recommenders.ops.triton_aot.compile.codegen import (
+    gen_cubin,
+    gen_kernel_name,
+    gen_launcher,
+    gen_loader,
+    gen_tuner_meta_py,
+    generate_header_content,
+    generate_kernel_cpp_content,
+    generate_torch_op_content,
+)
+from generative_recommenders.ops.triton_aot.compile.spec_processing import (
+    gen_compile_arg,
+    KernelSpec,
+    OpsUnit,
+    RawKernelSpec,
+)
+from generative_recommenders.ops.triton_aot.compile.utils import (
+    is_autotuner,
+    unwrap_heuristic,
+)
+from generative_recommenders.ops.triton_aot.shared.types import AUTOTUNE_ATTRs
+from triton.backends.compiler import GPUTarget
+from triton.runtime.jit import JITFunction, KernelInterface
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def compile_specs_parallel(
+    specs: list[KernelSpec],
+    install_dir: str,
+    module: str,
+    name: str,
+    gpu_target: GPUTarget,
+    import_module: Callable[[str], ModuleType],
+    descriptors: list[ArgDescriptor],
+) -> list[str]:
+    """Compile kernel specs in parallel using multiprocessing.
+
+    When TRITON_AOT_DEBUG=1 is set, compiles sequentially for easier debugging.
+
+    Args:
+        specs: List of kernel specifications to compile
+        install_dir: Directory to install generated files
+        module: The module name of the function
+        name: The function name
+        gpu_target: GPU target for compilation
+        import_module: Function to import modules (e.g., importlib.import_module or PackageImporter.import_module)
+
+    Returns:
+        List of generated code strings for each spec (cubin, loader, launcher)
+    """
+
+    debug = os.environ.get("TRITON_AOT_DEBUG", "0") == "1"
+    if debug:
+        outputs = [
+            spec_gen(
+                install_dir,
+                spec,
+                module,
+                name,
+                gpu_target,
+                import_module,
+                descriptors,
+            )
+            for spec in specs
+        ]
+    else:
+        max_workers = mp.cpu_count() // 2 + 1
+        with ThreadPoolExecutor(max_workers=min(len(specs), max_workers)) as executor:
+            outputs = list(
+                executor.map(
+                    lambda spec: spec_gen(
+                        install_dir,
+                        spec,
+                        module,
+                        name,
+                        gpu_target,
+                        import_module,
+                        descriptors,
+                    ),
+                    specs,
+                )
+            )
+    return outputs
+
+
+# For each spec, generate a kernel:
+# - cubin
+# - loader
+# - launcher
+def spec_gen(
+    install_dir: str,
+    spec: KernelSpec,
+    module: str,
+    name: str,
+    gpu_target: GPUTarget,
+    import_module: Callable[[str], ModuleType],
+    descriptors: list[ArgDescriptor],
+) -> str:
+    # To run this function with multiprocessing, we need to import the function by name,
+    # since JITFunction cannot be pickled.
+    # we have the case where the func name is injected with a suffix, like "_cuda" or "_amd",
+    # we should use the original name to import the func in such case
+    original_name = name
+    splits = name.split("_")
+    end_idx = len(splits)
+
+    while end_idx > 0:
+        original_name = "_".join(splits[:end_idx])
+        if hasattr(import_module(module), original_name):
+            break
+        end_idx -= 1
+    func = unwrap_heuristic(getattr(import_module(module), original_name), JITFunction)
+    func.__name__ = name
+
+    # Generate cubin.
+    kernel_name = gen_kernel_name(func, spec, gpu_target.arch)
+
+    compile_arg = gen_compile_arg(spec, func)
+    options = {name: getattr(spec, name) for name in AUTOTUNE_ATTRs.keys()}
+    compile_kwargs = {
+        "target": gpu_target,
+        "options": options,
+    }
+    kernel = triton.compiler.compile(*compile_arg, **compile_kwargs)
+    if getattr(kernel.metadata, "global_scratch_size", 0) > 0:
+        raise RuntimeError(f"{kernel_name=} with global scratch is not supported.")
+
+    metadata_name = kernel.metadata.name
+    metadata_shared = kernel.metadata.shared
+
+    cubin = gen_cubin(kernel_name, kernel, install_dir, gpu_target.backend)
+    out = [
+        cubin,
+        # Generate loader.
+        gen_loader(kernel_name, metadata_name, metadata_shared),
+        # Generate launcher.
+        gen_launcher(
+            kernel_name,
+            func,
+            kernel,
+            metadata_shared,
+            gpu_target.warp_size,
+            spec,
+            descriptors,
+        ),
+    ]
+    return "".join(out)
+
+
+def sigchld_handler(signum: int, frame: FrameType | None) -> None:
+    sketchy_signals = map(int, [signal.SIGSEGV, signal.SIGABRT, signal.SIGBUS])
+    try:
+        # Consume all pending SIGCHLDs, looking for unexpected failures
+        while True:
+            pid, status = os.waitpid(-1, os.WNOHANG)
+            if pid == 0:
+                break
+            if os.WIFSIGNALED(status) and os.WTERMSIG(status) in sketchy_signals:
+                logger.error(
+                    f"Child process {pid} exited catastrophically with signal {os.WTERMSIG(status)}, terminating!"
+                )
+
+                # Avoid triggering atexit etc which can get stuck and behave improperly
+                # because multiprocessing sets up an atexit handler to join workers
+                # (sigh).  We want to exit, now, so use os._exit instead of sys.exit.
+                os._exit(1)
+    except ChildProcessError:
+        pass
+
+
+def compile_to_cpp(
+    func: KernelInterface[list[Any]] | triton.runtime.autotuner.Autotuner,
+    base_specs: list[RawKernelSpec],
+    install_dir: str,
+    prefix: str,
+    *,
+    gpu_target: GPUTarget,
+    import_module: Callable[[str], ModuleType],
+    default_values: dict[str, Any] | None = None,
+    tuner_fallback: bool = False,
+) -> None:
+    """Compile a Triton kernel into .cpp, .h, _torch_op.cpp, _meta.py files.
+
+    Args:
+        func: Triton JITFunction or Autotuner to compile.
+        base_specs: List of kernel specialization specs.
+        install_dir: Directory to output generated files.
+        prefix: Kernel name prefix, e.g., "_addmm_fwd".
+        gpu_target: GPU target for compilation.
+        import_module: torch.package importer for loading kernels source code.
+        default_values: Default values for kernel arguments.
+        tuner_fallback: If True, generate fallback tuner code.
+    """
+    tuned_func = func if is_autotuner(func) else None
+    # pyre-ignore[6]: Attributes verified by is_autotuner
+    unit = OpsUnit.from_raw_specs(base_specs, gpu_target, tuned_func)
+    default_values = {} if default_values is None else default_values
+
+    func_unwrapped = unwrap_heuristic(func, JITFunction)
+    descriptors = build_arg_descriptors(func_unwrapped, unit)
+
+    # Python's multiprocessing.Pool class is not great at handling unexpected child
+    # failures such as segfaults.  Account for this by temporarily installing a signal
+    # handler that considers such signals a catastrophic compilation failure.  If not
+    # for this, the Pool will deadlock.
+    if threading.current_thread() is threading.main_thread():
+        previous_child_handler = signal.signal(signal.SIGCHLD, sigchld_handler)
+    else:
+        previous_child_handler = None
+
+    func = func_unwrapped
+
+    # sanity check to make sure args with default values are always at the end
+    has_default_value_arg = False
+    for name in func.arg_names:
+        if name in default_values:
+            has_default_value_arg = True
+        elif has_default_value_arg:
+            raise RuntimeError(
+                f"default values must be at the end of the argument list. {func.arg_names=} {default_values=}"
+            )
+
+    h_out = f"{install_dir}/{prefix}.h"
+    cu_out = f"{install_dir}/{prefix}.cpp"
+    torch_out = f"{install_dir}/{prefix}_torch_op.cpp"
+    py_out = f"{install_dir}/{prefix}_meta.py"
+
+    # Generate kernel.h file
+    h_content = generate_header_content(
+        tuned_func,  # pyre-ignore[6]: Autotuner when set (verified by is_autotuner)
+        func,
+        unit,
+        descriptors,
+        tuner_fallback,
+    )
+
+    with open(h_out, "w") as fp:
+        fp.write(h_content)
+
+    generated_specs = compile_specs_parallel(
+        unit.specs,
+        install_dir,
+        func.__module__,
+        func.__name__,
+        gpu_target,
+        import_module,
+        descriptors,
+    )
+    # Generate kernel.cpp file
+    cu_content = generate_kernel_cpp_content(
+        func, unit, descriptors, prefix, generated_specs, gpu_target.backend
+    )
+    with open(cu_out, "w") as fp:
+        fp.write(cu_content)
+
+    # Generate torch_op.cpp file
+    torch_op_content = generate_torch_op_content(
+        func, descriptors, prefix, default_values
+    )
+
+    with open(torch_out, "w") as fp:
+        fp.write(torch_op_content)
+
+    if tuned_func:
+        with open(py_out, "w") as fp:
+            fp.write(gen_tuner_meta_py(tuned_func, tuner_fallback, unit))
+    else:
+        with open(py_out, "w") as fp:
+            fp.write(gen_tuner_meta_py(func, tuner_fallback, unit))
+
+    if previous_child_handler is not None:
+        signal.signal(signal.SIGCHLD, previous_child_handler)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py
new file mode 100644
index 000000000..e8c181121
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py
@@ -0,0 +1,593 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+"""Kernel spec processing for AOT-T compilation.
+
+Transforms raw kernel specs (from infer_spec) into compiled specs ready
+for Triton native compile and C++ codegen.
+"""
+
+from __future__ import annotations
+
+import copy
+import dataclasses
+import logging
+from dataclasses import dataclass
+from typing import Any, cast
+
+# @manual=//triton:triton
+import triton
+from generative_recommenders.ops.triton_aot.compile.compile_state import hash_spec
+from generative_recommenders.ops.triton_aot.shared.spec_conversion import (
+    collect_constraints,
+    extract_constants,
+    get_fp8_replacement_signature_for_amd,
+    get_fp8_replacement_signature_for_sm80,
+    signature_list_to_dict,
+    SignatureElement,
+)
+from generative_recommenders.ops.triton_aot.shared.types import AUTOTUNE_ATTRs
+from triton.backends.compiler import BaseBackend, GPUTarget
+from triton.compiler.compiler import ASTSource
+from triton.runtime.jit import JITFunction
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+TRITON_VERSION: str = triton.__version__
+
+# A raw kernel spec produced by infer_spec.  The only key is "signature".
+RawKernelSpec = dict[str, list[SignatureElement]]
+
+
+@dataclass
+class KernelSpec:
+    """A single compilation variant for a kernel.
+
+    Each variant represents one combination of dtypes, constant values,
+    alignment constraints, and autotune configuration.  Multiple variants
+    are grouped together in an ``OpsUnit``.
+
+    Attributes:
+        signature: Non-constant arg index → dtype string (e.g., ``{0: "*fp32", 4: "i32"}``).
+        constants: Arg index → compile-time constant value.  Includes bare
+            literals (128, ``"leaky_relu"``), absent optional tensors (None),
+            and equal-to-1 specializations (stride=1 → constexpr folding).
+        divisible_by_16: Indices of args whose values are divisible by 16.
+            For pointers this means the address is 16-byte aligned;
+            for scalars it means the value itself is a multiple of 16.
+        divisible_by_8: Indices of args whose values are divisible by 8.
+            Only meaningful for scalars (pointer alignment is always ≥16).
+        num_warps: Number of warps per block.
+        num_stages: Number of pipeline stages.
+        matrix_instr_nonkdim: AMD matrix instruction non-K dimension.
+        waves_per_eu: AMD waves per execution unit.
+        kpack: AMD kpack factor.
+    """
+
+    signature: dict[int, str]
+    constants: dict[int, Any]
+    divisible_by_16: set[int]
+    divisible_by_8: set[int]
+    num_warps: int = 4
+    num_stages: int = 3
+    matrix_instr_nonkdim: int = 0
+    waves_per_eu: int = 1
+    kpack: int = 1
+
+
+@dataclass
+class OpsUnit:
+    """All compilation variants for a single kernel op.
+
+    Groups per-kernel invariants with the list of ``KernelSpec`` variants.
+    Use ``OpsUnit.from_raw_specs()`` to build — it performs the complete
+    spec processing pipeline (convert → detect optional → validate →
+    autotune → dedup → compute invariants).
+
+    Attributes:
+        cc: Compute capability (int for NVIDIA, str for AMD).
+        optional: Indices of optional tensor args (unified across all call sites).
+        pointer_args: Indices of all tensor pointer args (required + optional).
+            Invariant across specs — a pointer arg never becomes a non-pointer.
+        scalar_dtypes: Non-pointer signature arg index → widest dtype string
+            across all specs (e.g., ``"i32"``, ``"i64"``, ``"fp32"``).
+            Computed by ``_wider_type`` — individual specs may use narrower types.
+        constant_types: Python type per constant arg position (e.g., ``{15: int, 19: bool}``).
+            Excludes optional tensor args (None constants).
+            Invariant across specs — same Python type for each position.
+        specs: Per-variant compilation specs.
+    """
+
+    cc: int | str
+    optional: set[int]
+    pointer_args: set[int]
+    scalar_dtypes: dict[int, str]
+    constant_types: dict[int, type[Any]]
+    specs: list[KernelSpec]
+
+    @classmethod
+    def from_raw_specs(
+        cls,
+        base_specs: list[RawKernelSpec],
+        gpu_target: GPUTarget,
+        tuned_func: triton.runtime.autotuner.Autotuner | None = None,
+    ) -> OpsUnit:
+        """Build an OpsUnit from raw kernel specs.
+
+        Performs the complete spec processing pipeline:
+        1. Convert raw specs to KernelSpecs
+        2. Detect optional tensor args (cross-spec + 3-tuple)
+        3. Validate consistency across converted specs
+        4. Apply autotuning (if tuned_func provided)
+        5. Deduplicate specs
+        6. Compute shared invariants (pointer_args, scalar_dtypes, constant_types)
+        """
+        # Validate raw specs upfront, before any rewriting.
+        num_params = _check_uniform_signature_length(base_specs)
+        specs, three_tuple_optional = _convert_raw_specs(base_specs, gpu_target)
+        optional = _detect_optional_args(specs) | three_tuple_optional
+
+        _validate_converted_specs(specs, optional, num_params)
+
+        # Plain @triton.jit kernels (no @triton.autotune) skip config expansion.
+        if tuned_func is not None:
+            specs = _autotune_specs(tuned_func, gpu_target, specs)
+
+        specs = _dedup_specs(specs)
+
+        pointer_args, scalar_dtypes, constant_types = _compute_invariants(
+            specs, optional
+        )
+
+        return cls(
+            cc=gpu_target.arch,
+            optional=optional,
+            pointer_args=pointer_args,
+            scalar_dtypes=scalar_dtypes,
+            constant_types=constant_types,
+            specs=specs,
+        )
+
+
+# ---------------------------------------------------------------------------
+# Public helpers (used outside spec processing)
+# ---------------------------------------------------------------------------
+
+
+def gen_compile_arg(
+    spec: KernelSpec,
+    func: JITFunction[list[Any]],
+) -> tuple[ASTSource]:
+    # ASTSource expects tuple-keyed dicts: {(idx,): value} for constants,
+    # {(idx,): [[attr_name, attr_val], ...]} for attrs.  Tuple keys support
+    # nested paths into structured types (asserted by ASTSource.__init__).
+    new_signature = {}
+    new_constants = {}
+    param_names = list(func.signature.parameters.keys())
+    for idx, param in enumerate(param_names):
+        if idx in spec.signature:
+            new_signature[param] = spec.signature[idx]
+        if idx in spec.constants:
+            new_constants[(idx,)] = spec.constants[idx]
+            new_signature[param] = "constexpr"
+
+    # parse_attr("D") returns a fresh [["tt.divisibility", 16]] each call.
+    new_attrs = {(idx,): BaseBackend.parse_attr("D") for idx in spec.divisible_by_16}
+
+    return (
+        ASTSource(
+            func,
+            new_signature,
+            constexprs=new_constants,
+            attrs=new_attrs,
+        ),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Int width helpers
+# ---------------------------------------------------------------------------
+
+_INT_WIDTH_RANK: dict[str, int] = {"i32": 0, "i64": 1}
+
+
+def _wider_type(t1: str, t2: str) -> str:
+    """Return the wider of two scalar dtypes.
+
+    Only i32/i64 widening is supported.  All other types must match exactly.
+    """
+    if t1 == t2:
+        return t1
+    r1 = _INT_WIDTH_RANK.get(t1)
+    r2 = _INT_WIDTH_RANK.get(t2)
+    if r1 is not None and r2 is not None:
+        return t1 if r1 >= r2 else t2
+    raise ValueError(f"Cannot widen incompatible types: {t1!r} vs {t2!r}")
+
+
+# ---------------------------------------------------------------------------
+# Private helpers — called by OpsUnit.from_raw_specs
+# ---------------------------------------------------------------------------
+
+
+def _detect_optional_args(specs: list[KernelSpec]) -> set[int]:
+    """Detect optional tensor args by cross-spec comparison.
+
+    An arg at index ``i`` is optional if:
+    - Some specs have ``i`` in ``signature`` as a pointer type (``*...``)
+    - Other specs have ``constants[i] = None``
+
+    Single-spec None args (always-absent tensors) are NOT detected here
+    but are handled by ``_compute_invariants`` which adds any
+    ``constants[i] = None`` to ``pointer_args``.
+    """
+    if len(specs) <= 1:
+        return set()
+    optional: set[int] = set()
+    all_indices: set[int] = set()
+    for spec in specs:
+        all_indices |= spec.signature.keys()
+        all_indices |= spec.constants.keys()
+    for i in all_indices:
+        has_pointer = any(
+            i in s.signature and s.signature[i].startswith("*") for s in specs
+        )
+        has_none_const = any(i in s.constants and s.constants[i] is None for s in specs)
+        if has_pointer and has_none_const:
+            optional.add(i)
+    return optional
+
+
+def _check_uniform_signature_length(base_specs: list[RawKernelSpec]) -> int:
+    """All raw specs must declare the same param count; return that count.
+
+    Each raw spec is one ``infer_spec`` call site for the same kernel,
+    so all should have ``len(fn.signature.parameters)`` entries.  Differing
+    lengths means upstream bug (mixed kernels, truncated spec, etc.) and
+    would surface later as silent IndexError or wrong bound checks.
+    """
+    if not base_specs:
+        return 0
+    sig_lens = {len(spec["signature"]) for spec in base_specs}
+    if len(sig_lens) != 1:
+        raise ValueError(
+            f"Raw specs declare inconsistent signature lengths: "
+            f"{sorted(sig_lens)}.  All specs for the same kernel must have "
+            f"one entry per declared param."
+        )
+    return sig_lens.pop()
+
+
+def _check_arg_indices_in_range(
+    specs: list[KernelSpec],
+    num_params: int,
+) -> None:
+    """Every spec arg index must be in ``[0, num_params)``.
+
+    Out-of-range indices would silently drop in ``gen_compile_arg``'s
+    ``enumerate(param_names)`` loop.  ``num_params <= 0`` disables the check.
+    """
+    if num_params <= 0:
+        return
+    for idx, spec in enumerate(specs):
+        all_indices = (
+            spec.signature.keys()
+            | spec.constants.keys()
+            | spec.divisible_by_16
+            | spec.divisible_by_8
+        )
+        for i in all_indices:
+            if not 0 <= i < num_params:
+                raise ValueError(
+                    f"Spec {idx}: arg index {i} out of range "
+                    f"[0, {num_params}) — kernel has {num_params} declared params"
+                )
+
+
+def _collect_pointer_args(
+    specs: list[KernelSpec],
+    optional: set[int],
+) -> set[int]:
+    """Collect all tensor pointer indices across all specs.
+
+    Includes optional args (from _detect_optional_args) AND any arg
+    whose constant value is None (single-spec optional tensor case
+    where _detect_optional_args didn't fire).
+    """
+    pointer_args: set[int] = set(optional)
+    for spec in specs:
+        for i, dtype in spec.signature.items():
+            if dtype.startswith("*"):
+                pointer_args.add(i)
+        for i, val in spec.constants.items():
+            if val is None:
+                pointer_args.add(i)
+    return pointer_args
+
+
+def _collect_scalar_dtypes(
+    specs: list[KernelSpec],
+    pointer_args: set[int],
+) -> dict[int, str]:
+    """Collect non-pointer signature arg dtypes, widening compatible int types.
+
+    Invariant across specs (validated by _validate_converted_specs).
+    """
+    scalar_dtypes: dict[int, str] = {}
+    for spec in specs:
+        for i, dtype in spec.signature.items():
+            if i not in pointer_args:
+                if i in scalar_dtypes:
+                    scalar_dtypes[i] = _wider_type(scalar_dtypes[i], dtype)
+                else:
+                    scalar_dtypes[i] = dtype
+    return scalar_dtypes
+
+
+def _collect_constant_types(
+    specs: list[KernelSpec],
+) -> dict[int, type[Any]]:
+    """Collect Python type per constant position.
+
+    Excludes None constants (optional tensor args — already in pointer_args).
+    """
+    constant_types: dict[int, type[Any]] = {}
+    for spec in specs:
+        for i, val in spec.constants.items():
+            if val is not None and i not in constant_types:
+                constant_types[i] = type(val)
+    return constant_types
+
+
+def _compute_invariants(
+    specs: list[KernelSpec],
+    optional: set[int],
+) -> tuple[set[int], dict[int, str], dict[int, type[Any]]]:
+    """Compute shared invariants from processed specs.
+
+    Returns (pointer_args, scalar_dtypes, constant_types).
+
+    When annotation-as-variant produces mixed partitions (arg in
+    ``signature`` in some specs, ``constants`` in others), the arg
+    appears in both ``scalar_dtypes`` and ``constant_types``.  The
+    selector must receive it as a runtime parameter for dispatch,
+    so ``scalar_dtypes`` wins and the arg is removed from
+    ``constant_types``.
+    """
+    pointer_args = _collect_pointer_args(specs, optional)
+    scalar_dtypes = _collect_scalar_dtypes(specs, pointer_args)
+    constant_types = _collect_constant_types(specs)
+
+    # Resolve overlap: if any spec has the arg in signature (scalar),
+    # the selector needs it as a runtime parameter → not a constant.
+    for i in scalar_dtypes:
+        constant_types.pop(i, None)
+
+    return pointer_args, scalar_dtypes, constant_types
+
+
+def _validate_converted_specs(
+    specs: list[KernelSpec],
+    optional: set[int],
+    num_params: int = 0,
+) -> None:
+    """Validate that converted specs are consistent before further processing.
+
+    Checks that all specs produce identical C++ function signatures:
+    - All arg indices are in ``[0, num_params)`` (when ``num_params > 0``)
+    - Optional args: each spec has either a pointer in signature or None in constants
+    - Non-optional scalar args: same dtype (or compatible int widths)
+    - Non-optional constant args: same Python type
+
+    Called after _convert_raw_specs + _detect_optional_args, before autotuning.
+    """
+    _check_arg_indices_in_range(specs, num_params)
+    if len(specs) <= 1:
+        return
+    ref = specs[0]
+    for idx, spec in enumerate(specs[1:], 1):
+        _check_optional_consistency(ref, spec, idx, optional)
+        _check_signature_consistency(ref, spec, idx, optional)
+        _check_constants_consistency(ref, spec, idx, optional)
+
+
+def _check_optional_consistency(
+    ref: KernelSpec,
+    spec: KernelSpec,
+    idx: int,
+    optional: set[int],
+) -> None:
+    """Optional positions must be pointer-in-signature or None-in-constants.
+
+    Validates that optional tensor args are not misclassified as scalars
+    or non-None constants, which would produce incompatible C++ types.
+    """
+    for i in optional:
+        for label, s in [("spec 0", ref), (f"spec {idx}", spec)]:
+            if i in s.signature:
+                if not s.signature[i].startswith("*"):
+                    raise ValueError(
+                        f"Arg {i}: optional position has non-pointer type "
+                        f"'{s.signature[i]}' in {label}"
+                    )
+            elif i in s.constants:
+                if s.constants[i] is not None:
+                    raise ValueError(
+                        f"Arg {i}: optional position has non-None constant "
+                        f"{s.constants[i]!r} in {label}"
+                    )
+
+
+def _check_signature_consistency(
+    ref: KernelSpec,
+    spec: KernelSpec,
+    idx: int,
+    optional: set[int],
+) -> None:
+    """Non-optional, non-pointer scalar args must have compatible dtypes.
+
+    Pointer args are skipped (different tensor dtypes are dispatched by
+    the dtype guard in ``gen_guarded_calls``).  Compatible int widths
+    (i32/i64) are allowed — handled by ``_wider_type`` and int range guards.
+    Optional positions are validated by ``_check_optional_consistency``.
+
+    Partition differences are allowed: an arg may be in ``signature`` in
+    one spec and in ``constants`` in another (e.g., annotation-as-variant
+    where stride=1 is constexpr in one spec but a runtime parameter in
+    another).  The per-spec codegen handles this correctly.
+    """
+    for i in ref.signature.keys() | spec.signature.keys():
+        if i in optional:
+            continue
+        if (i in ref.signature and ref.signature[i].startswith("*")) or (
+            i in spec.signature and spec.signature[i].startswith("*")
+        ):
+            continue
+        # Allow partition differences: arg in signature in one spec,
+        # in constants in another (annotation-as-variant pattern).
+        if i not in ref.signature or i not in spec.signature:
+            continue
+        if ref.signature[i] != spec.signature[i]:
+            r1 = _INT_WIDTH_RANK.get(ref.signature[i])
+            r2 = _INT_WIDTH_RANK.get(spec.signature[i])
+            if r1 is not None and r2 is not None:
+                continue
+            raise ValueError(
+                f"Arg {i}: dtype mismatch '{ref.signature[i]}' vs "
+                f"'{spec.signature[i]}' (spec 0 vs spec {idx})"
+            )
+
+
+def _check_constants_consistency(
+    ref: KernelSpec,
+    spec: KernelSpec,
+    idx: int,
+    optional: set[int],
+) -> None:
+    """Non-optional constant args must have the same Python type across specs.
+
+    C++ codegen uses one type per constant arg position (``PY_TYPES_TO_CPP_TYPES``),
+    so ``BLOCK_M=64`` (int) and ``BLOCK_M=64.0`` (float) would produce
+    incompatible launchers.  Optional positions are validated separately
+    by ``_check_optional_consistency``.
+    """
+    for i in ref.constants.keys() | spec.constants.keys():
+        if i in optional:
+            continue
+        if ref.constants.get(i) is None or spec.constants.get(i) is None:
+            continue
+        if type(ref.constants[i]) is not type(spec.constants[i]):
+            raise ValueError(
+                f"Arg {i}: constant type mismatch "
+                f"{type(ref.constants[i]).__name__} vs "
+                f"{type(spec.constants[i]).__name__} (spec 0 vs spec {idx})"
+            )
+
+
+def _convert_raw_specs(
+    base_specs: list[RawKernelSpec],
+    gpu_target: GPUTarget,
+) -> tuple[list[KernelSpec], set[int]]:
+    """Convert raw specs to KernelSpecs.
+
+    Returns (specs, three_tuple_optional) where three_tuple_optional is the
+    union of optional_args detected from 3-tuple signature elements across
+    all specs (backward compat with ``collect_constraints``).
+    """
+    raw_specs = cast(list[dict[str, Any]], copy.deepcopy(base_specs))
+    is_amd = gpu_target.backend == "hip"
+
+    result: list[KernelSpec] = []
+    three_tuple_optional: set[int] = set()
+    for raw_spec in raw_specs:
+        constraints = collect_constraints(raw_spec["signature"])
+        constants = extract_constants(raw_spec["signature"], constraints)
+        signature: dict[int, str] = signature_list_to_dict(
+            raw_spec["signature"], constants
+        )
+        three_tuple_optional |= constraints.optional_args
+
+        spec = KernelSpec(
+            signature=signature,
+            constants=constants,
+            divisible_by_16=constraints.divisible_by_16,
+            divisible_by_8=constraints.divisible_by_8,
+        )
+
+        if constraints.has_fp8:
+            if is_amd:
+                spec.signature = get_fp8_replacement_signature_for_amd(
+                    {"signature": spec.signature}, {str(gpu_target.arch)}
+                )
+            elif gpu_target.arch == 80:
+                spec.signature = get_fp8_replacement_signature_for_sm80(
+                    {"signature": spec.signature}
+                )
+
+        result.append(spec)
+
+    return result, three_tuple_optional
+
+
+def _autotune_specs(
+    func: triton.runtime.autotuner.Autotuner,
+    target: GPUTarget,
+    specs: list[KernelSpec],
+) -> list[KernelSpec]:
+    tuned_specs: list[KernelSpec] = []
+    for spec in specs:
+        for cfg in func.cache.values():
+            constants = spec.constants.copy()
+            for arg_name, arg_val in cfg.kwargs.items():
+                if arg_name in AUTOTUNE_ATTRs:
+                    continue
+                arg_idx = func.arg_names.index(arg_name)
+                if constants.get(arg_idx, -1) == -1:
+                    constants[arg_idx] = arg_val
+
+            autotune_values: dict[str, int] = {}
+            for name, default in AUTOTUNE_ATTRs.items():
+                if name in cfg.kwargs:
+                    autotune_values[name] = cfg.kwargs[name]
+                else:
+                    autotune_values[name] = getattr(cfg, name, default)
+                # AMD has changed their software pipeliner in Triton
+                # It now expects num_stages == 2 instead of 0
+                # see: https://github.com/pytorch/pytorch/pull/139881
+                # if we see someone try to set num_stages == 0, set it to the default (2) instead
+                # We can't use the Triton hook to get the default value because it requires the AMD runtime to be loaded
+                if (
+                    target.backend == "hip"
+                    and name == "num_stages"
+                    and autotune_values[name] == 0
+                    and TRITON_VERSION >= "3.2.0"
+                ):
+                    autotune_values[name] = 2
+
+            tuned_spec = dataclasses.replace(
+                spec,
+                constants=constants,
+                # pyrefly: ignore [bad-argument-type]
+                **autotune_values,
+            )
+            tuned_specs.append(tuned_spec)
+    return tuned_specs
+
+
+def _dedup_specs(specs: list[KernelSpec]) -> list[KernelSpec]:
+    deduped_specs: list[KernelSpec] = []
+    duplicated_specs: list[KernelSpec] = []
+    hash_spec_ids: set[str] = set()
+    for spec in specs:
+        id = hash_spec(dataclasses.asdict(spec))
+        if id in hash_spec_ids:
+            duplicated_specs.append(spec)
+        else:
+            hash_spec_ids.add(id)
+            deduped_specs.append(spec)
+
+    logger.debug(
+        f"[TritonAOT Dedup] {len(specs)=} {len(deduped_specs)=} {len(duplicated_specs)=}"
+    )
+    return deduped_specs
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py
new file mode 100644
index 000000000..33038a534
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py
@@ -0,0 +1,35 @@
+# pyre-strict
+
+"""AOTT-local type mappings for stable ABI codegen.
+
+These replace ``shared.types.ATYPES`` and ``shared.types.PY_TYPES_TO_CPP_TYPES``
+with versions that have zero link dependency on ATen.  The shared dicts are kept
+unchanged so TritonCC is not affected.
+"""
+
+from typing import Any
+
+# Stable ABI scalar type mapping: Triton pointer dtype → c10::ScalarType enum.
+# Uses c10::ScalarType:: (from torch/headeronly/core/ScalarType.h) instead of
+# at::kFloat aliases (which require ATen headers).
+SCALAR_TYPES: dict[str, str] = {
+    "*i1": "c10::ScalarType::Bool",
+    "*u8": "c10::ScalarType::Byte",
+    "*i8": "c10::ScalarType::Char",
+    "*i16": "c10::ScalarType::Short",
+    "*i32": "c10::ScalarType::Int",
+    "*i64": "c10::ScalarType::Long",
+    "*fp16": "c10::ScalarType::Half",
+    "*fp32": "c10::ScalarType::Float",
+    "*fp64": "c10::ScalarType::Double",
+    "*bf16": "c10::ScalarType::BFloat16",
+    "*fp8e4nv": "c10::ScalarType::Float8_e4m3fn",
+    "*fp8e4b8": "c10::ScalarType::Float8_e4m3fnuz",
+}
+
+# Stable ABI override: str → "std::string" instead of "at::string".
+PY_TYPES_TO_CPP_TYPES: dict[type[Any], str] = {
+    int: "int64_t",
+    str: "std::string",
+    float: "double",
+}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py
new file mode 100644
index 000000000..74179435a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py
@@ -0,0 +1,149 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# pyre-strict
+
+import importlib
+import logging
+import os
+import pickle
+from types import ModuleType, TracebackType
+from typing import Any, Callable, Optional, Type
+
+from generative_recommenders.ops.triton_aot.build.extension_builder import (
+    build_triton_aot_extension,
+)
+from generative_recommenders.ops.triton_aot.compile.codegen import (
+    is_non_empty_mapping_of_type,
+)
+from generative_recommenders.ops.triton_aot.compile.compile_state import (
+    get_aott_compile_path,
+    get_aott_compile_state,
+    get_triton_aot_kernel_specs,
+)
+from generative_recommenders.ops.triton_aot.compile.pipeline import compile_to_cpp
+from generative_recommenders.ops.triton_aot.compile.utils import unwrap_heuristic
+from torch import package
+from triton.backends.compiler import GPUTarget
+from triton.runtime import driver, JITFunction
+
+# @manual=//triton:triton
+from triton.runtime.autotuner import Config
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class TritonAOTCompile:
+    """
+    Context manager to compile Triton kernels to C++ and build a shared library.
+    The compiled kernels are cached in a temporary directory.
+
+    - package_importer:
+        torch.package importer for loading kernels source code (aott/ops).
+        If not provided, the default importlib is used (for local use cases)
+    - gpu_target:
+        GPU target to compile for (default: active GPU target, determined by Triton driver)
+    This local copy intentionally omits Manifold autotune-cache overrides. The
+    HSTU e2e path only needs representative-input autotuning captured during
+    the compile context.
+    """
+
+    def __init__(
+        self,
+        package_importer: Optional[package.PackageImporter] = None,
+        gpu_target: Optional[GPUTarget] = None,
+        auto_tune_cache_override_path: Optional[str] = None,
+    ) -> None:
+        self._import_module: Callable[[str], ModuleType] = (
+            package_importer.import_module
+            if package_importer is not None
+            else importlib.import_module
+        )
+        self.gpu_target: GPUTarget = gpu_target or driver.active.get_current_target()
+        self.auto_tune_cache_override_path: Optional[str] = (
+            auto_tune_cache_override_path
+        )
+
+    def _load_autotune_cache_overrides(
+        self,
+    ) -> dict[str, Any]:
+        if self.auto_tune_cache_override_path is None:
+            return {}
+        raise NotImplementedError(
+            "Local generative_recommenders AOT-T compile does not support "
+            "auto_tune_cache_override_path."
+        )
+
+    def __enter__(self) -> None:
+        state = get_aott_compile_state()
+        state.reset()
+        state.enable()
+        logger.info(
+            f"Start AOTT compile, output dir: {get_aott_compile_path()}, gpu_target: {self.gpu_target}"
+        )
+
+    def _resolve_autotune_cache(
+        self,
+        fn: Any,
+        fn_name: str,
+        fn_dir: str,
+        overrides: dict[str, Any],
+    ) -> None:
+        """Apply override (if matched) and dump the autotune cache to fn_dir."""
+        override = overrides.get(fn_name)
+        if override is not None:
+            logger.info(
+                f"[AOTT]: Overriding autotune cache for {fn_name} "
+                f"from {self.auto_tune_cache_override_path}"
+            )
+            fn.cache = override
+
+        # cache are dumped just for testing
+        if hasattr(fn, "cache") and is_non_empty_mapping_of_type(fn.cache, Config):
+            with open(f"{fn_dir}/{fn_name}_autotune_cache", "wb") as data:
+                # @lint-ignore PYTHONPICKLEISBAD
+                pickle.dump(fn.cache, data)
+
+    def __exit__(
+        self,
+        exc_type: Optional[Type[BaseException]],
+        exc_value: Optional[BaseException],
+        traceback: Optional[TracebackType],
+    ) -> None:
+        compile_path = get_aott_compile_path()
+        if not os.path.exists(compile_path):
+            os.makedirs(compile_path)
+
+        kernel_specs = get_triton_aot_kernel_specs()
+        auto_tune_overrides = self._load_autotune_cache_overrides()
+
+        logger.info(f"[AOTT]: compiling {len(kernel_specs)} kernels")
+
+        for fn, specs in kernel_specs.items():
+            jit_fn = unwrap_heuristic(fn, JITFunction)
+            fn_name = jit_fn.__name__
+
+            logger.info(f"[AOTT]: compiling {fn_name} with specs: {specs}")
+
+            module_suffix = jit_fn.__module__.rsplit(".", 1)[-1]
+            fn_dir = f"{compile_path}/{module_suffix}_{fn_name}"
+            if not os.path.exists(fn_dir):
+                os.makedirs(fn_dir)
+
+            self._resolve_autotune_cache(fn, fn_name, fn_dir, auto_tune_overrides)
+
+            compile_to_cpp(
+                func=fn,
+                base_specs=specs,
+                install_dir=f"{fn_dir}",
+                prefix=f"{fn_name}",
+                gpu_target=self.gpu_target,
+                tuner_fallback=True,
+                import_module=self._import_module,
+            )
+
+            build_triton_aot_extension(
+                source_dir=fn_dir,
+                kernel_name=fn_name,
+                output_dir=fn_dir,
+            )
+
+        get_aott_compile_state().disable()
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py
new file mode 100644
index 000000000..e3848fd2e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py
@@ -0,0 +1,47 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+import hashlib
+from typing import Any, Type, TypeVar
+
+T = TypeVar("T")
+
+
+def unwrap_heuristic(func: Any, return_type: Type[T]) -> T:
+    while not isinstance(func, return_type):
+        func = func.fn
+        if not hasattr(func, "fn"):
+            # pyre-fixme[7]: Incompatible return type [7]: Expected `Variable[T]` but got `None`.
+            return None
+    return func
+
+
+def is_autotuner(obj: Any) -> bool:
+    """Check whether *obj* is a Triton Autotuner using duck typing.
+
+    In Buck builds the ``Autotuner`` class can be loaded from multiple module
+    paths (e.g. via ``torch.package`` re-imports), causing ``isinstance`` to
+    return ``False`` for genuine Autotuner instances.  We combine a class-name
+    check with duck-typing on the attributes that callers actually need
+    (``cache``, ``configs``, ``arg_names``), making detection robust against
+    module-path aliasing.
+    """
+    return "Autotuner" in type(obj).__name__ and all(
+        hasattr(obj, attr) for attr in ("cache", "configs", "arg_names")
+    )
+
+
+def hash_kernel_name(kernel_name: str) -> str:
+    """Hash kernel name to create shorter, filesystem-safe names.
+
+    Args:
+        kernel_name: Full kernel name (can be very long with specialization suffixes).
+            e.g., "_addmm_fwd_sm80_pfp32_pfp32_pfp32_pfp32_i32_..."
+
+    Returns:
+        Hashed name in format "kernel_<sha256_hex>".
+            e.g., "kernel_a1b2c3d4e5f6..."
+
+    """
+    return "kernel_" + hashlib.sha256(kernel_name.encode("utf-8")).hexdigest()
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py b/recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py
new file mode 100644
index 000000000..ce2e63d43
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py
@@ -0,0 +1,76 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+"""
+Preprocessing utilities for triton_aot models before AOT compilation.
+"""
+
+import logging
+
+from tgif.fx.tgif_tracer import TGIFTracer
+from torch.fx import GraphModule
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+AOTT_WRAPPER_PREFIX: str = "aot_triton_kernel_wrapper_"
+
+
+def unwrap_aott_wrapper_nodes(fx_m: GraphModule, tracer: TGIFTracer) -> GraphModule:
+    """Mark ``aot_triton_kernel_wrapper_*`` FX nodes as unwrapped and re-trace.
+
+    In the traced FX graph, outer wrapper functions (prefixed with
+    ``aot_triton_kernel_wrapper_``) are ``@torch.fx.wrap`` leaves.
+    Setting ``node.meta["is_wrapped"] = False`` causes a subsequent
+    ``symbolic_trace`` to trace *through* them, exposing the inner
+    ``@torch.fx.wrap`` functions (e.g., ``_triton_aot_grouped_gemm``)
+    that contain the actual kernel calls.
+
+    Any ``_body_transformer`` hook (e.g. one registered by
+    ``early_return_fx_code_transform``) is temporarily removed before
+    re-tracing to avoid injecting un-traceable control flow
+    (``if Proxy: …``) into the generated ``forward``.  After re-trace
+    the hook is restored on the new module.  See P2266562545.
+
+    Args:
+        fx_m: The FX GraphModule to modify **in-place** before re-trace.
+        tracer: Tracer instance used for the re-trace step.
+
+    Returns:
+        The re-traced ``GraphModule`` with AOTT wrappers expanded.
+    """
+    logger.info("Re-trace to get the AOTT node exposed.")
+
+    # Save and clear the body transformer so that re-trace does not hit
+    # ``if Proxy:`` from code-level hooks like early_return_fx_code_transform.
+    saved_body_transformer = fx_m.graph._codegen._body_transformer
+    fx_m.graph._codegen._body_transformer = None
+
+    unwrap_count = 0
+    for node in fx_m.graph.nodes:
+        if node.op == "call_function":
+            target = node.target
+            if hasattr(target, "__name__") and target.__name__.startswith(
+                AOTT_WRAPPER_PREFIX
+            ):
+                logger.info(f"[AOTT] Found inference wrapper node: {node=}")
+                node.meta["is_wrapped"] = False
+                unwrap_count += 1
+
+    if unwrap_count > 0:
+        logger.info(f"[AOTT] Found {unwrap_count} inference wrapper nodes.")
+        fx_m.recompile()
+    else:
+        logger.warning("[AOTT] No inference wrapper node found. Skip re-compile.")
+
+    result = tracer.symbolic_trace(fx_m)
+
+    # Restore the body transformer on the new module.
+    if saved_body_transformer is not None:
+        result.graph._codegen._body_transformer = saved_body_transformer
+        result.recompile()
+
+    return result
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py
new file mode 100644
index 000000000..be6235701
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py
@@ -0,0 +1,91 @@
+# pyre-strict
+"""
+This module provides shared utilities that handle differences between
+Triton versions.
+"""
+
+from typing import Any
+
+# @manual=//triton:triton
+import triton
+from packaging.version import Version
+from triton.runtime.jit import JITFunction
+
+TRITON_VERSION: str = triton.__version__
+
+
+def version_gte(version: str, target: str) -> bool:
+    """
+    Check if version >= target using semantic version comparison.
+    Simple string comparison fails for versions like "3.10" vs "3.5"
+    """
+    return Version(version) >= Version(target)
+
+
+def get_kernel_name(jit_fn: JITFunction[Any]) -> str:
+    """
+    Get the simple kernel name from a JITFunction.
+
+    In Triton 3.5+, JITFunction._fn_name returns the full qualified name
+    (e.g., "generative_recommenders.ops.triton_aot.triton_addmm._addmm_fwd").
+    In older versions, it returns just the simple name (e.g., "_addmm_fwd").
+
+    This function normalizes the behavior to always return the simple name.
+
+    Args:
+        jit_fn: A Triton JITFunction
+
+    Returns:
+        The simple kernel name (e.g., "_addmm_fwd")
+    """
+    fn_name = jit_fn._fn_name
+    if version_gte(TRITON_VERSION, "3.5"):
+        # Triton 3.5+ uses get_full_name(fn) which returns qualified name
+        return fn_name.rsplit(".", 1)[-1]
+    else:
+        # Older versions use fn.__name__ which is already simple
+        return fn_name
+
+
+def get_scratch_parameters(kernel: Any) -> tuple[str, list[str]]:
+    """
+    Get scratch parameter declarations and argument pointers for the kernel launcher.
+
+    Scratch parameters are backend and version-specific features for profiling
+    and global memory management.
+
+    Detection Strategy:
+        1. Check metadata first for each parameter
+        2. Fall back to version-based detection if metadata unavailable
+
+    Version Requirements (fallback):
+        - v3.4+: both global_scratch and profile_scratch
+        - v3.3: only global_scratch
+        - v3.2 and earlier: no scratch parameters
+
+    Args:
+        kernel: Compiled Triton kernel with metadata attribute
+
+    Returns:
+        Tuple of (declarations, arg_pointers):
+        - declarations: C++ variable declarations for scratch parameters
+        - arg_pointers: List of argument pointers to append to kernel args
+    """
+    declarations = []
+    arg_pointers = []
+
+    if hasattr(kernel.metadata, "global_scratch_size"):
+        declarations.append("CUdeviceptr global_scratch = 0;")
+        arg_pointers.append("&global_scratch")
+    elif version_gte(TRITON_VERSION, "3.3"):
+        declarations.append("CUdeviceptr global_scratch = 0;")
+        arg_pointers.append("&global_scratch")
+
+    if hasattr(kernel.metadata, "profile_scratch_size"):
+        declarations.append("CUdeviceptr profile_scratch = 0;")
+        arg_pointers.append("&profile_scratch")
+    elif version_gte(TRITON_VERSION, "3.4"):
+        declarations.append("CUdeviceptr profile_scratch = 0;")
+        arg_pointers.append("&profile_scratch")
+
+    return ("\n            ".join(declarations), arg_pointers)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py
new file mode 100644
index 000000000..4a1ebb133
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py
@@ -0,0 +1,389 @@
+# pyre-strict
+
+"""Functions for converting kernel specs to architecture-specific formats.
+
+A "spec" (specification) describes how to compile a Triton kernel for a specific
+set of input shapes and types. Users provide "base specs" in a human-friendly
+format that describes kernel arguments:
+
+    {"signature": [("*fp32", 16), ("*bf16", 16), ("i32", None), 128]}
+
+This format encodes dtypes, alignment hints, and constant values together.
+Before compilation, base specs must be converted to "compiled specs" that
+separate this information into distinct fields the compiler understands:
+
+    {"signature": {0: "*fp32", 1: "*bf16"},
+     "constants": {2: None, 3: 128},
+     "configs": (instance_descriptor(...),),
+     "cc": 80}
+
+This module provides the functionality to perform this transformation,
+extracting constraints, identifying constants, and preparing specs for each
+target GPU architecture.
+"""
+
+from collections import namedtuple
+from dataclasses import dataclass
+from typing import Any, TypeAlias
+
+from generative_recommenders.ops.triton_aot.shared.types import CTYPES
+
+# Compile-time constant values that can appear in signatures or be returned
+# by constexpr(). These are values the compiler can fold into generated code.
+ConstantValue: TypeAlias = str | int | float | bool | None
+
+# A single element in a kernel signature list.
+# Can be: dtype string, (dtype, alignment) tuple, (dtype, alignment, has_value)
+# triple for optional args, or a bare literal constant.
+SignatureElement: TypeAlias = (
+    ConstantValue | tuple[str, int | None] | tuple[str, int | None, bool]
+)
+
+
+instance_descriptor = namedtuple(
+    "instance_descriptor",
+    [
+        "divisible_by_16",
+        "equal_to_1",
+        "ids_of_folded_args",
+        "divisible_by_8",
+    ],
+)
+
+
+def constexpr(s: SignatureElement) -> ConstantValue:
+    """Identify compile-time constant expressions in signature elements.
+
+    Args:
+        s: A signature element.
+
+    Returns:
+        The constant value if s is a compile-time constant, None otherwise.
+        Constants are: int, float, bool, or strings that aren't dtype names.
+    """
+    expr = s[0] if isinstance(s, tuple) and len(s) > 1 else s
+
+    if expr is None:
+        return expr
+
+    try:
+        ret = int(expr)
+        return ret
+    except (ValueError, TypeError):
+        pass
+    try:
+        ret = float(expr)
+        return ret
+    except (ValueError, TypeError):
+        pass
+
+    if isinstance(expr, bool):
+        return expr
+    if isinstance(expr, str) and expr not in CTYPES and not expr.startswith("*"):
+        return expr
+    return None
+
+
+@dataclass
+class SignatureConstraints:
+    """Constraints extracted from parsing a kernel signature.
+
+    When compiling a Triton kernel, the compiler can generate more efficient
+    code if it knows certain properties about the arguments:
+
+    - Pointer alignment: If a pointer is always 16-byte aligned, the compiler
+      can use faster aligned memory operations.
+    - Constant values: Arguments known at compile time can be folded into the
+      generated code, eliminating runtime checks.
+    - FP8 dtypes: Some GPU architectures require dtype substitutions for FP8
+      types (e.g., gfx942 needs fp8e4b8 instead of fp8e4nv).
+
+    This dataclass collects all these constraints from a single pass over the
+    signature, so downstream code can use them without re-parsing.
+
+    Attributes:
+        divisible_by_16: Indices of args with values divisible by 16.
+        divisible_by_8: Indices of args with values divisible by 8.
+        equal_to_1: Indices of args with value equal to 1.
+        none_args: Indices of args that are None (not provided).
+        optional_args: Indices of optional arguments.
+        has_fp8: Whether any argument has an FP8 dtype.
+    """
+
+    divisible_by_16: set[int]
+    divisible_by_8: set[int]
+    equal_to_1: set[int]
+    none_args: set[int]
+    optional_args: set[int]
+    has_fp8: bool
+
+
+def collect_constraints(signature: list[SignatureElement]) -> SignatureConstraints:
+    """Collect divisibility and type constraints from a signature list.
+
+    Iterates through signature elements and identifies:
+    - Arguments divisible by 16 or 8 (for memory alignment)
+    - Arguments equal to 1 (for optimization)
+    - Optional arguments and those not provided (None)
+    - Whether any FP8 dtypes are present
+
+    Args:
+        signature: List of signature elements. The input format is unfortunately
+            variable; each element can be one of several types:
+
+            1. Plain string (dtype only, no alignment info):
+               "*fp32"           - A float32 pointer
+               "i32"             - A 32-bit integer scalar
+
+            2. Tuple of (dtype, value) where value indicates alignment or constness:
+               ("*fp32", 16)     - Float32 pointer, 16-byte aligned
+               ("i32", None)     - Integer arg not provided (becomes constant None)
+               ("*bf16", 1)      - Pointer with value=1 (folded as constant)
+
+            3. Triple of (dtype, value, has_value) for optional arguments:
+               ("*fp32", 16, True)  - Optional arg that IS provided, 16-byte aligned
+               ("*fp32", 16, False) - Optional arg NOT provided (becomes None)
+
+            4. Bare literals (become compile-time constants):
+               128               - Integer constant
+               "leaky_relu"      - String constant (e.g., activation name)
+
+    Returns:
+        SignatureConstraints with all constraint sets populated.
+
+    Example:
+        >>> sig = [("*fp32", 16), ("i32", None), ("*fp8e4nv", 8)]
+        >>> c = collect_constraints(sig)
+        >>> 0 in c.divisible_by_16
+        True
+        >>> c.has_fp8
+        True
+    """
+    divisible_by_16: set[int] = set()
+    divisible_by_8: set[int] = set()
+    equal_to_1: set[int] = set()
+    none_args: set[int] = set()
+    optional_args: set[int] = set()
+    has_fp8: bool = False
+
+    for i, s in enumerate(signature):
+        # Handle optional tensor case: tuple with 3 elements where s[2] indicates
+        # whether the optional arg has a value
+        if isinstance(s, tuple) and len(s) > 2:
+            optional_args.add(i)
+            # pyrefly: ignore [bad-index]
+            if not s[2]:  # has_value is False
+                none_args.add(i)
+                continue
+
+        # Extract dtype
+        dtype = s[0] if isinstance(s, tuple) else s
+
+        # Check for FP8 types
+        if isinstance(dtype, str) and ("fp8e4nv" in dtype or "fp8e4b8" in dtype):
+            has_fp8 = True
+
+        # Extract value (alignment or constant)
+        value = s[1] if isinstance(s, tuple) else s
+
+        # Check divisibility and equality constraints
+        if isinstance(value, int):
+            if value % 16 == 0:
+                divisible_by_16.add(i)
+            if value % 8 == 0:
+                divisible_by_8.add(i)
+            if value == 1:
+                equal_to_1.add(i)
+
+        if value is None:
+            none_args.add(i)
+
+    return SignatureConstraints(
+        divisible_by_16=divisible_by_16,
+        divisible_by_8=divisible_by_8,
+        equal_to_1=equal_to_1,
+        none_args=none_args,
+        optional_args=optional_args,
+        has_fp8=has_fp8,
+    )
+
+
+def make_instance_descriptor(
+    constraints: SignatureConstraints,
+) -> tuple[instance_descriptor]:
+    """Create an instance_descriptor tuple from constraints.
+
+    Args:
+        constraints: The collected signature constraints.
+
+    Returns:
+        A tuple containing a single instance_descriptor namedtuple with
+        divisible_by_16, equal_to_1, ids_of_folded_args, and divisible_by_8.
+    """
+    ids_of_folded_args = constraints.equal_to_1 | constraints.none_args
+    return (
+        instance_descriptor(
+            divisible_by_16=constraints.divisible_by_16,
+            equal_to_1=constraints.equal_to_1,
+            ids_of_folded_args=ids_of_folded_args,
+            divisible_by_8=constraints.divisible_by_8,
+        ),
+    )
+
+
+def extract_constants(
+    signature: list[SignatureElement],
+    constraints: SignatureConstraints,
+) -> dict[int, ConstantValue]:
+    """Extract compile-time constant values from signature elements.
+
+    Identifies arguments that can be folded into generated code at compile time.
+    Constants come from three sources:
+
+    1. Bare literals in the signature (e.g., 128 for block size, "leaky_relu"
+       for activation type, True for a boolean flag)
+    2. Arguments with value=1 (tracked in constraints.equal_to_1)
+    3. Arguments not provided (tracked in constraints.none_args)
+
+    Args:
+        signature: List of signature elements in input format.
+        constraints: The collected signature constraints.
+
+    Returns:
+        Dict mapping argument indices to their constant values.
+    """
+    # Use constexpr to identify constant expressions
+    constexprs = {i: constexpr(s) for i, s in enumerate(signature)}
+    constants: dict[int, ConstantValue] = {
+        k: v for k, v in constexprs.items() if v is not None
+    }
+
+    # Add equal_to_1 args with value 1
+    for k in constraints.equal_to_1:
+        constants[k] = 1
+
+    # Add none_args with value None
+    for k in constraints.none_args:
+        constants[k] = None
+
+    return constants
+
+
+def signature_list_to_dict(
+    signature: list[SignatureElement],
+    constants: dict[int, ConstantValue],
+) -> dict[int, str]:
+    """Convert signature from list format to dict format.
+
+    Transforms the input signature list into a dict mapping argument
+    indices to dtype strings. Arguments that are constants are excluded
+    since they don't need runtime type information.
+
+    Args:
+        signature: List of signature elements in input format.
+        constants: Dict of constant argument indices to exclude.
+
+    Returns:
+        Dict mapping non-constant argument indices to their dtype strings.
+    """
+    result: dict[int, str] = {}
+    for i, s in enumerate(signature):
+        if i in constants:
+            continue
+        # After filtering out constants, remaining elements are dtype declarations.
+        # For tuples like ("*fp32", 16), s[0] is the dtype string.
+        # For plain strings like "*fp32", the element itself is the dtype.
+        if isinstance(s, tuple) and len(s) > 1:
+            dtype = s[0]
+        else:
+            dtype = s
+        assert isinstance(dtype, str)
+        result[i] = dtype
+    return result
+
+
+# CC (compute capability) to AMD GPU architecture mapping
+# CC is a 2-digit shorthand: 94 -> gfx942, 95 -> gfx950
+HIP_CC_TO_ARCH_INFO: dict[int, str] = {
+    90: "gfx90a",
+    94: "gfx942",
+    95: "gfx950",
+}
+
+# Reverse mapping: architecture string -> CC string
+HIP_ARCH_TO_CC: dict[str, str] = {v: str(k) for k, v in HIP_CC_TO_ARCH_INFO.items()}
+
+HIP_CC_MI350X: str = "95"  # CC string for gfx950 (MI350X/MI355X)
+
+
+def _normalize_cc(cc: set[str]) -> set[str]:
+    """Normalize CC values to 2-digit format for internal comparison.
+
+    Accepts both tritoncc format ("94", "95") and Triton driver format
+    ("gfx942", "gfx950"). Returns 2-digit CC strings.
+    """
+    return {HIP_ARCH_TO_CC.get(c, c) for c in cc}
+
+
+def get_fp8_replacement_signature_for_amd(
+    spec: dict[str, Any], cc: set[str]
+) -> dict[int, str]:
+    """Replace FP8 dtypes in signature for AMD architectures.
+
+    Args:
+        spec: Compiled spec dict with 'signature' in dict format.
+        cc: Set of CC strings in either format:
+            - 2-digit tritoncc format: {"94"} for gfx942
+            - Triton driver format: {"gfx942"}
+            See HIP_CC_TO_ARCH_INFO.
+
+    Returns:
+        Dict mapping argument indices to dtype strings with FP8 types replaced.
+    """
+    normalized_cc: set[str] = _normalize_cc(cc)
+
+    def replace_fp8_type(dtype_str: str) -> str:
+        if "fp8e4nv" in dtype_str:
+            if HIP_CC_MI350X not in normalized_cc:
+                return dtype_str.replace("fp8e4nv", "fp8e4b8")
+        elif "fp8e4b8" in dtype_str and HIP_CC_MI350X in normalized_cc:
+            return dtype_str.replace("fp8e4b8", "fp8e4nv")
+        return dtype_str
+
+    replace_fp8_signatures: dict[int, str] = {}
+    for key, value in spec["signature"].items():
+        if isinstance(value, str):
+            replace_fp8_signatures[key] = replace_fp8_type(value)
+        else:
+            replace_fp8_signatures[key] = value
+
+    return replace_fp8_signatures
+
+
+def get_fp8_replacement_signature_for_sm80(
+    spec: dict[str, Any],
+) -> dict[int, Any]:
+    """Replace FP8 dtypes with bf16 for SM80 (A100) which lacks native FP8 support.
+
+    Args:
+        spec: Compiled spec dict with 'signature' in dict format.
+
+    Returns:
+        Dict mapping argument indices to dtype strings with FP8 types replaced by bf16.
+    """
+
+    def replace_fp8_type(dtype_str: str) -> str:
+        if "fp8e4nv" in dtype_str:
+            return dtype_str.replace("fp8e4nv", "bf16")
+        return dtype_str
+
+    replace_fp8_signatures: dict[int, Any] = {}
+    for key, value in spec["signature"].items():
+        if isinstance(value, tuple) and isinstance(value[0], str):
+            replace_fp8_signatures[key] = (replace_fp8_type(value[0]), value[1])
+        elif isinstance(value, str):
+            replace_fp8_signatures[key] = replace_fp8_type(value)
+        else:
+            replace_fp8_signatures[key] = value
+
+    return replace_fp8_signatures
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py
new file mode 100644
index 000000000..6a870fc4d
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py
@@ -0,0 +1,58 @@
+# pyre-strict
+
+"""Shared type definitions for AOTT and Triton CC.
+
+This module contains fundamental type mappings used across the compiler.
+"""
+
+from typing import Any
+
+# Mapping from Triton dtype names to C type names
+CTYPES: dict[str, str] = {
+    "i1": "bool",
+    "u8": "uint8_t",
+    "i8": "int8_t",
+    "i16": "int16_t",
+    "i32": "int32_t",
+    "i64": "int64_t",
+    "fp16": "half",
+    "fp32": "float",
+    "fp64": "double",
+    "bf16": "__nv_bfloat16",
+    "fp8e4nv": "__nv_fp8_e4m3",
+    "fp8e4b8": "__hip_fp8_e4m3_fnuz",
+}
+
+# Mapping from Triton pointer dtype names to ATen scalar types
+ATYPES: dict[str, str] = {
+    "*i1": "at::kBool",
+    "*u8": "at::kByte",
+    "*i8": "at::kChar",
+    "*i16": "at::kShort",
+    "*i32": "at::kInt",
+    "*i64": "at::kLong",
+    "*fp16": "at::kHalf",
+    "*fp32": "at::kFloat",
+    "*fp64": "at::kDouble",
+    "*bf16": "at::kBFloat16",
+    "*fp8e4nv": "at::kFloat8_e4m3fn",
+    "*fp8e4b8": "at::kFloat8_e4m3fnuz",
+}
+
+# Mapping from Python types to C++ type names
+PY_TYPES_TO_CPP_TYPES: dict[type[Any], str] = {
+    int: "int64_t",
+    str: "at::string",
+    float: "double",
+}
+
+# Default values for autotuning attributes.
+# These are used as default kernel launch parameters.
+AUTOTUNE_ATTRs: dict[str, int] = {
+    "num_warps": 4,
+    "num_stages": 3,
+    # AMD only
+    "matrix_instr_nonkdim": 0,
+    "waves_per_eu": 1,
+    "kpack": 1,
+}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp
new file mode 100644
index 000000000..d269c3555
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp
@@ -0,0 +1,7 @@
+#include <cstddef>
+
+extern "C" {
+// __TRITON_AOT_GENERATE_BEGIN__ CUBIN_ARRAYS
+// placeholder
+// __TRITON_AOT_GENERATE_END__ CUBIN_ARRAYS
+}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp
new file mode 100644
index 000000000..7f882f11f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp
@@ -0,0 +1,104 @@
+// __TRITON_AOT_GENERATE_BEGIN__ HEADER_INCLUDE
+#include "kernel.h"
+// __TRITON_AOT_GENERATE_END__ HEADER_INCLUDE
+// These headers are used by code generated at runtime in KERNEL_SPECS blocks
+#include <torch/csrc/stable/accelerator.h>
+#include <torch/headeronly/core/ScalarType.h> // NOLINT(facebook-unused-include-check)
+
+inline void triton_aot_cu_check(CUresult err, const char* file, int line) {
+  if (err != CUDA_SUCCESS) {
+    const char* err_str;
+    cuGetErrorString(err, &err_str);
+    throw std::runtime_error(
+        std::string(file) + ":" + std::to_string(line) +
+        " CUDA driver error: " + (err_str ? err_str : "unknown"));
+  }
+}
+#define TRITON_AOT_CU_CHECK(EXPR) triton_aot_cu_check(EXPR, __FILE__, __LINE__)
+
+// NOLINTNEXTLINE(facebook-hte-NullableReturn): error path throws
+inline cudaStream_t triton_aot_get_current_stream() {
+  auto device_idx = torch::stable::accelerator::getCurrentDeviceIndex();
+  void* stream_ptr = nullptr;
+  // TODO: No torch::stable op provides the same functionality
+  // today. Revisit if torch exposes a proper stable::accelerator stream API.
+  if (aoti_torch_get_current_cuda_stream(device_idx, &stream_ptr) != 0) {
+    throw std::runtime_error("Failed to get current CUDA stream");
+  }
+  return reinterpret_cast<cudaStream_t>(stream_ptr);
+}
+
+namespace triton {
+namespace aot {
+
+namespace {
+[[maybe_unused]] int compute_capability() {
+  // Cached: AOTT hosts use homogeneous GPUs.
+  static int cc = 0;
+  if (cc == 0) {
+    CUdevice device;
+    TRITON_AOT_CU_CHECK(cuCtxGetDevice(&device));
+    int major, minor;
+    TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
+        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
+    TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
+        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
+    cc = major * 10 + minor;
+  }
+  return cc;
+}
+} // namespace
+
+namespace {
+#ifdef USE_ROCM
+[[maybe_unused]] void check_errors(int shared, hipFunction_t func) {
+  // HIP doesn't need the same shared memory configuration as CUDA
+  return;
+}
+#else
+[[maybe_unused]] void check_errors(int shared, CUfunction func) {
+  int shared_optin;
+  int device = 0;
+  TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
+      &shared_optin,
+      CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
+      device));
+  if (shared > 49152 && shared_optin > 49152) {
+    // If requested/shared_optin exceed 48 KB, it switches cache to prefer
+    // shared memory and sets the max dynamic shared memory so the kernel can
+    // allocate the larger amount needed.
+    TRITON_AOT_CU_CHECK(
+        cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_SHARED));
+    int shared_total, shared_static;
+    TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
+        &shared_total,
+        CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
+        device));
+    TRITON_AOT_CU_CHECK(cuFuncGetAttribute(
+        &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func));
+    TRITON_AOT_CU_CHECK(cuFuncSetAttribute(
+        func,
+        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+        shared_optin - shared_static));
+  }
+}
+#endif
+} // namespace
+
+// __TRITON_AOT_GENERATE_BEGIN__ KERNEL_SPECS
+// __TRITON_AOT_GENERATE_END__ KERNEL_SPECS
+
+// __TRITON_AOT_GENERATE_BEGIN__ SELECTOR
+// __TRITON_AOT_GENERATE_END__ SELECTOR
+
+} // namespace aot
+} // namespace triton
+
+// Anchor: keeps the inline `triton_aot_get_current_stream` (and its reference
+// to `aoti_torch_get_current_cuda_stream`) from being dead-stripped at
+// buck-build time, where KERNEL_SPECS is empty. `weak` dedups the symbol
+// across the per-op .so files generated by the runtime template substitution.
+extern "C" __attribute__((weak, visibility("default"))) cudaStream_t
+__triton_aot_anchor_get_stream() {
+  return triton_aot_get_current_stream();
+}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h
new file mode 100644
index 000000000..6b6f76e17
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <torch/csrc/stable/tensor.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+namespace triton {
+namespace aot {
+
+#ifndef GRID_DIM_DEFINED_MACRO
+struct gridDims {
+  int x = 1;
+  int y = 1;
+  int z = 1;
+  cudaStream_t stream = nullptr;
+  gridDims(int _x = 1, int _y = 1, int _z = 1, cudaStream_t _stream = nullptr)
+      : x(_x), y(_y), z(_z), stream(_stream) {}
+};
+#define GRID_DIM_DEFINED_MACRO
+#endif
+
+#ifndef FITS_I32_DEFINED_MACRO
+constexpr bool fits_i32(int64_t v) {
+  return v >= INT32_MIN && v <= INT32_MAX;
+}
+#define FITS_I32_DEFINED_MACRO
+#endif
+
+// __TRITON_AOT_GENERATE_BEGIN__ TUNER_META_CPP
+// __TRITON_AOT_GENERATE_END__ TUNER_META_CPP
+// __TRITON_AOT_GENERATE_BEGIN__ SELECTOR_PROTO
+// __TRITON_AOT_GENERATE_END__ SELECTOR_PROTO
+
+} // namespace aot
+} // namespace triton
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py
new file mode 100644
index 000000000..d89bbe8b6
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py
@@ -0,0 +1,96 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# pyre-strict
+
+"""
+Common utilities for template loading and rendering.
+
+This module provides functions to load template files from the templates
+directory and render them by replacing marker blocks with actual values.
+"""
+
+import re
+from collections import Counter
+from importlib import resources
+
+
+def load_template(name: str) -> str:
+    """Load template file content from Buck resources.
+
+    Templates are loaded from the resources bundled with this package via Buck's
+    select_accelerator mechanism:
+    - AMD builds: hipified templates with HIP APIs
+    - NVIDIA builds: original templates with CUDA APIs
+
+    Args:
+        name: Template filename (e.g., 'kernel.cpp', 'embedded_cubins.cpp').
+
+    Returns:
+        The template file content as a string.
+    """
+    return resources.files(__package__).joinpath(name).read_text()
+
+
+def render_template(template: str, replacements: dict[str, str]) -> str:
+    """Replace block markers in template with actual values.
+
+    Replaces content between "// __TRITON_AOT_GENERATE_BEGIN__ NAME"
+    and "// __TRITON_AOT_GENERATE_END__ NAME" with the value for key "NAME".
+    Each key must have exactly one BEGIN/END pair in the template.
+    The markers are preserved for easier debugging.
+
+    Args:
+        template: Template string containing marker blocks.
+        replacements: Dict mapping marker names to replacement values.
+
+    Returns:
+        Rendered template with all marker blocks replaced.
+
+    Raises:
+        AssertionError: If markers are duplicated, mismatched, or keys don't match.
+    """
+    BEGIN_PREFIX = "// __TRITON_AOT_GENERATE_BEGIN__ "
+    END_PREFIX = "// __TRITON_AOT_GENERATE_END__ "
+
+    begin_keys = re.findall(r"// __TRITON_AOT_GENERATE_BEGIN__ (\w+)", template)
+    end_keys = re.findall(r"// __TRITON_AOT_GENERATE_END__ (\w+)", template)
+
+    # Check for duplicate keys
+    begin_key_counts = Counter(begin_keys)
+    end_key_counts = Counter(end_keys)
+    for key, count in begin_key_counts.items():
+        assert count == 1, f"Duplicate BEGIN marker for key: {key}"
+    for key, count in end_key_counts.items():
+        assert count == 1, f"Duplicate END marker for key: {key}"
+
+    # Check BEGIN and END keys match
+    template_keys = set(begin_keys)
+    assert template_keys == set(end_keys), (
+        f"Mismatched BEGIN/END markers: BEGIN={template_keys}, END={set(end_keys)}"
+    )
+
+    # Validate keys match between template and replacements
+    replacement_keys = set(replacements.keys())
+    assert template_keys == replacement_keys, (
+        f"Keys mismatch: in template but not in replacements: {template_keys - replacement_keys}, "
+        f"in replacements but not in template: {replacement_keys - template_keys}"
+    )
+
+    # Do the replacements
+    result = template
+    for key, value in replacements.items():
+        begin_marker = f"{BEGIN_PREFIX}{key}"
+        end_marker = f"{END_PREFIX}{key}"
+
+        begin_idx = result.find(begin_marker)
+        newline_idx = result.find("\n", begin_idx)
+        assert newline_idx != -1, (
+            f"BEGIN marker for key '{key}' must be followed by newline"
+        )
+        content_start = newline_idx + 1
+        end_idx = result.find(end_marker, begin_idx)
+
+        result = result[:content_start] + value + result[end_idx:]
+
+    return result
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp
new file mode 100644
index 000000000..c2e5a4063
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp
@@ -0,0 +1,22 @@
+// __TRITON_AOT_GENERATE_BEGIN__ HEADER_INCLUDE
+#include "kernel.h"
+// __TRITON_AOT_GENERATE_END__ HEADER_INCLUDE
+#include <torch/csrc/stable/library.h>
+#include <torch/csrc/stable/tensor.h> // NOLINT(facebook-unused-include-check)
+
+// __TRITON_AOT_GENERATE_BEGIN__ TORCH_OP
+namespace {
+// no-op, force link StableLibrary
+torch::stable::Tensor _triton_aot_placeholder_noop(
+    torch::stable::Tensor input) {
+  return input;
+}
+} // namespace
+
+STABLE_TORCH_LIBRARY_FRAGMENT(triton_aot, m) {
+  m.def("_placeholder_noop(Tensor input) -> Tensor");
+}
+STABLE_TORCH_LIBRARY_IMPL(triton_aot, CPU, m) {
+  m.impl("_placeholder_noop", TORCH_BOX(&_triton_aot_placeholder_noop));
+}
+// __TRITON_AOT_GENERATE_END__ TORCH_OP
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py
new file mode 100644
index 000000000..e3c4f1955
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py
@@ -0,0 +1,89 @@
+# pyre-strict
+
+"""
+Import-header utilities for triton_aot codegen.
+"""
+
+import ast
+
+from torch import package
+
+
+def get_original_import_header(source_code: str) -> str:
+    """Extract all import statements from *source_code* as a single string."""
+    tree = ast.parse(source_code)
+    import_header = ""
+    for node in ast.walk(tree):
+        if isinstance(node, ast.ImportFrom):
+            import_header += ast.unparse(node) + "\n"
+        elif isinstance(node, ast.Import):
+            import_header += ast.unparse(node) + "\n"
+    return import_header
+
+
+def _is_extern_module(module_name: str, extern_modules: set[str]) -> bool:
+    """Return True if *module_name* (or a parent) is in the extern set."""
+    if module_name in extern_modules:
+        return True
+    parts = module_name.split(".")
+    for i in range(1, len(parts)):
+        if ".".join(parts[:i]) in extern_modules:
+            return True
+    return False
+
+
+def rewrite_package_imports(
+    import_header: str,
+    package_importer: package.PackageImporter,
+) -> str:
+    """Rewrite interned imports to use ``_package_importer``.
+
+    Extern modules (``torch``, ``typing``, …) keep regular ``import``
+    statements.  Interned modules (for example, local
+    ``generative_recommenders.*`` modules) are rewritten to::
+
+        _pkg_mod = _package_importer.import_module(
+            'generative_recommenders.ops.triton.triton_utils'
+        )
+        helper = _pkg_mod.helper
+
+    The ``_package_importer`` object is injected into the wrapper module's
+    namespace by ``replace_kernels`` before ``exec_module`` is called.
+    """
+    extern_modules = set(package_importer.extern_modules)
+    header_tree = ast.parse(import_header)
+
+    regular: list[str] = []
+    from_package: list[str] = []
+
+    for node in header_tree.body:
+        if isinstance(node, ast.Import):
+            for alias in node.names:
+                if _is_extern_module(alias.name, extern_modules):
+                    regular.append(ast.unparse(node))
+                else:
+                    local = alias.asname or alias.name
+                    from_package.append(
+                        f"{local} = _package_importer.import_module('{alias.name}')"
+                    )
+        elif isinstance(node, ast.ImportFrom):
+            mod = node.module or ""
+            if _is_extern_module(mod, extern_modules):
+                regular.append(ast.unparse(node))
+            else:
+                var = f"_pkg_{mod.replace('.', '_')}"
+                from_package.append(f"{var} = _package_importer.import_module('{mod}')")
+                for alias in node.names:
+                    local = alias.asname or alias.name
+                    from_package.append(f"{local} = {var}.{alias.name}")
+        else:
+            # Non-import statement (should not appear, but preserve if it does)
+            regular.append(ast.unparse(node))
+
+    parts: list[str] = []
+    if regular:
+        parts.append("\n".join(regular))
+    if from_package:
+        parts.append("# Imports resolved from torch package via _package_importer")
+        parts.append("\n".join(from_package))
+    return "\n".join(parts) + "\n" if parts else ""
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py
new file mode 100644
index 000000000..51d144f1b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py
@@ -0,0 +1,500 @@
+# pyre-strict
+import ast
+import inspect
+import os
+from typing import Any, Callable, Dict, List, Optional
+
+from generative_recommenders.ops.triton_aot.compile.compile_state import (
+    get_aott_compile_path,
+    get_triton_aot_kernel_specs,
+)
+from generative_recommenders.ops.triton_aot.compile.utils import unwrap_heuristic
+from generative_recommenders.ops.triton_aot.shared.compat import get_kernel_name
+from generative_recommenders.ops.triton_aot.transform.import_utils import (
+    get_original_import_header,
+    rewrite_package_imports,
+)
+from generative_recommenders.ops.triton_aot.types import TritonAOT
+from pyre_extensions import none_throws
+from torch import package
+from torch.fx import GraphModule
+
+# @manual=//triton:triton
+from triton.runtime.autotuner import Autotuner
+from triton.runtime.jit import JITFunction, KernelInterface
+
+
+def _is_torch_package_module(module_name: str) -> bool:
+    """Check if a module name is from torch.package namespace."""
+    return module_name.startswith("<torch_package")
+
+
+def _strip_torch_package_prefix(module_name: str) -> str:
+    """Strip the torch.package namespace prefix from a module name.
+
+    Example:
+        '<torch_package_0>.generative_recommenders.ops.triton_aot.triton_layer_norm'
+        -> 'generative_recommenders.ops.triton_aot.triton_layer_norm'
+    """
+    if _is_torch_package_module(module_name):
+        # Remove '<torch_package_N>.' prefix
+        return module_name.split(".", 1)[1]
+    return module_name
+
+
+def _get_clean_module_basename(module_name: str) -> str:
+    """Get the basename of a module, stripping torch.package prefix if present.
+
+    Example:
+        '<torch_package_0>.generative_recommenders.ops.triton_aot.triton_layer_norm'
+        -> 'triton_layer_norm'
+        'generative_recommenders.ops.triton_aot.triton_layer_norm'
+        -> 'triton_layer_norm'
+    """
+    clean_name = _strip_torch_package_prefix(module_name)
+    return clean_name.rsplit(".", 1)[-1]
+
+
+def _extract_function_source(module_source: str, fn_name: str) -> str:
+    """Extract a function's source code from module source.
+
+    Parses the module source and extracts just the function definition.
+    """
+    tree = ast.parse(module_source)
+    for node in ast.walk(tree):
+        if isinstance(node, ast.FunctionDef) and node.name == fn_name:
+            return ast.unparse(node)
+    raise ValueError(f"Function '{fn_name}' not found in module source")
+
+
+def _get_module_and_source(
+    target: Callable[..., Any],
+    package_importer: Optional[package.PackageImporter],
+) -> tuple[Any, str, str]:
+    """Get module, module source, and function source for a callable.
+
+    Handles both regular modules and torch.package loaded modules.
+
+    Args:
+        target: The callable (function) to get source for
+        package_importer: Optional PackageImporter for torch.package modules
+
+    Returns:
+        Tuple of (module, module_source, function_source)
+    """
+    module_name = target.__module__
+    fn_name = target.__name__
+
+    if _is_torch_package_module(module_name) and package_importer is not None:
+        # Handle torch.package namespace
+        real_module_name = _strip_torch_package_prefix(module_name)
+        assert real_module_name.startswith(
+            "generative_recommenders.ops.triton_aot"
+        ) or real_module_name.startswith("prime_perf_optimizer"), (
+            f"Expected module under 'generative_recommenders.ops.triton_aot' or 'prime_perf_optimizer', got: {real_module_name}"
+        )
+
+        # Get module source from package
+        module_source = package_importer.get_source(real_module_name)
+
+        # Import the module through the package importer
+        fn_module = package_importer.import_module(real_module_name)
+
+        # Extract function source from module source
+        fn_source = _extract_function_source(module_source, fn_name)
+
+        return fn_module, module_source, fn_source
+    else:
+        # Standard module handling
+        fn_module = inspect.getmodule(target)
+        module_source = inspect.getsource(none_throws(fn_module))
+        fn_source = inspect.getsource(target)
+
+        return fn_module, module_source, fn_source
+
+
+def _calls_triton_aot_kernel(node: ast.FunctionDef, kernel_name: str) -> bool:
+    """
+    kernel_name is the JIT function name (e.g. "_weighted_layer_norm_fwd"),
+    which may differ from the wrapper function name (e.g.
+    "_triton_aot_swish_layer_norm").  We match by looking for a
+    Subscript-call ``kernel_name[grid](...)`` inside the function body.
+    """
+    for child in ast.walk(node):
+        if (
+            isinstance(child, ast.Call)
+            and isinstance(child.func, ast.Subscript)
+            and isinstance(child.func.value, ast.Name)
+            and child.func.value.id == kernel_name
+        ):
+            return True
+    return False
+
+
+def _is_torch_jit_unused(d: ast.expr) -> bool:
+    """Check if a decorator AST node represents @torch.jit.unused."""
+    return (
+        isinstance(d, ast.Attribute)
+        and d.attr == "unused"
+        and isinstance(d.value, ast.Attribute)
+        and d.value.attr == "jit"
+        and isinstance(d.value.value, ast.Name)
+        and d.value.value.id == "torch"
+    )
+
+
+def strip_jit_unused_decorator(
+    node: ast.FunctionDef, kernel_name: str
+) -> ast.FunctionDef:
+    """Strip @torch.jit.unused if the function body calls ``kernel_name[grid](...)``.
+
+    kernel_name is the TritonAOT kernel's JIT function name (e.g.
+    ``_weighted_layer_norm_fwd``), not the wrapper function name.  This avoids
+    relying on a naming convention on the wrapper function itself.
+    """
+    if _calls_triton_aot_kernel(node, kernel_name):
+        node.decorator_list = [
+            d for d in node.decorator_list if not _is_torch_jit_unused(d)
+        ]
+    return node
+
+
+class TritonAOTOperatorTransform(ast.NodeTransformer):
+    def __init__(self, kernel: Any) -> None:
+        super().__init__()
+        self._kernel: Any = kernel
+        self._kernel_jit_fn: JITFunction[List[Any]] = unwrap_heuristic(
+            kernel, return_type=JITFunction
+        )
+        self._kernel_autotuner: Optional[Autotuner] = unwrap_heuristic(
+            kernel, return_type=Autotuner
+        )
+        self._kernel_name: str = get_kernel_name(self._kernel_jit_fn)
+        # Only transform the function body
+        self._autotune_params: List[str] = (
+            list(list(self._kernel_autotuner.cache.values())[0].kwargs.keys())
+            if self._kernel_autotuner is not None
+            else []
+        )
+        self._autotune_params += ["num_warps", "num_stages"]
+
+        self._lambda_arg_name: Optional[str] = None
+        self._grid_name: Optional[str] = None
+        self._autotune_key_id: Optional[Dict[str, int]] = None
+        self._autotune_key_map: Optional[Dict[str, ast.expr]] = None
+        self._kernel_meta: Optional[ast.Assign] = None
+
+        if self._kernel_autotuner is not None:
+            autotune_key_id: Dict[str, int] = {}
+            self._autotune_key_id = autotune_key_id
+            # pyre-ignore[16]: JITFunction has arg_names at runtime
+            for key in self._kernel_autotuner.keys:
+                autotune_key_id[key] = self._kernel_jit_fn.arg_names.index(key)
+
+    def generate_function_meta(self) -> None:
+        targets = [
+            ast.Name(id=param, ctx=ast.Store()) for param in self._autotune_params
+        ]
+        autotune_key_map = self._autotune_key_map
+        kernel_autotuner = self._kernel_autotuner
+        call = ast.Call(
+            func=ast.Name(id=f"{self._kernel_name}_meta", ctx=ast.Load()),
+            args=[
+                none_throws(autotune_key_map)[key]
+                for key in none_throws(kernel_autotuner).keys
+            ]
+            if kernel_autotuner is not None
+            else [],
+            keywords=[],
+        )
+        self._kernel_meta = ast.Assign(
+            # pyre-ignore[6]: ast.Assign targets type
+            targets=[ast.Tuple(elts=targets, ctx=ast.Store())],
+            value=call,
+        )
+
+    def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.FunctionDef:
+        strip_jit_unused_decorator(node, self._kernel_name)
+
+        new_body: List[ast.stmt] = []
+        stmts = node.body
+        for stmt in stmts:
+            if isinstance(stmt, ast.Assign):
+                for target in stmt.targets:
+                    if isinstance(target, ast.Name) and target.id == self._grid_name:
+                        assert self._kernel_meta is not None
+                        self._kernel_meta.lineno = stmt.lineno
+                        new_body.append(self._kernel_meta)
+            new_body.append(self.visit(stmt))
+        node.body = new_body
+        return node
+
+    def visit_Assign(self, node: ast.Assign) -> ast.Assign:
+        for target in node.targets:
+            if isinstance(target, ast.Name) and isinstance(node.value, ast.Lambda):
+                lambda_node = node.value
+                self._lambda_arg_name = lambda_node.args.args[0].arg
+                lambda_body = lambda_node.body
+                assert isinstance(lambda_body, ast.Tuple)
+                new_elts: List[ast.expr] = []
+                for elt in lambda_body.elts:
+                    new_elts.append(self.visit(elt))
+                node.value = ast.Tuple(elts=new_elts, ctx=ast.Load())
+                self._lambda_arg_name = None
+        return node
+
+    def visit_Subscript(self, node: ast.Subscript) -> ast.expr:
+        if isinstance(node.value, ast.Name) and node.value.id == self._lambda_arg_name:
+            assert isinstance(node.slice, ast.Constant)
+            assert isinstance(node.slice.value, str)
+            var_name = node.slice.value
+            # pyre-ignore
+            node = ast.Name(id=var_name, ctx=ast.Load())
+        return node
+
+    def visit_Expr(self, node: ast.Expr) -> ast.Expr:
+        if isinstance(node.value, ast.Call):
+            call = node.value
+            if (
+                isinstance(call.func, ast.Subscript)
+                and isinstance(call.func.value, ast.Name)
+                and call.func.value.id == self._kernel_name
+            ):
+                grid_arg = call.func.slice
+                new_func = ast.Attribute(
+                    value=ast.Attribute(
+                        value=ast.Attribute(
+                            value=ast.Name(id="torch", ctx=ast.Load()),
+                            attr="ops",
+                            ctx=ast.Load(),
+                        ),
+                        attr="triton_aot",
+                        ctx=ast.Load(),
+                    ),
+                    attr=self._kernel_name,
+                    ctx=ast.Load(),
+                )
+                new_args = [grid_arg] + call.args
+                new_keywords = call.keywords + [
+                    ast.keyword(arg=param, value=ast.Name(id=param, ctx=ast.Load()))
+                    for param in self._autotune_params
+                ]
+                node.value = ast.Call(
+                    func=new_func,
+                    args=new_args,
+                    keywords=new_keywords,
+                )
+        return node
+
+    def contains_triton_call(self, node: ast.AST) -> bool:
+        for child in ast.walk(node):
+            if (
+                isinstance(child, ast.Call)
+                and isinstance(child.func, ast.Subscript)
+                # pyre-ignore[16]: ast.expr may have `id` attribute at runtime
+                and child.func.value.id == self._kernel_name
+            ):
+                # pyrefly: ignore [missing-attribute]
+                self._grid_name = child.func.slice.id
+
+                if self._kernel_autotuner is not None:
+                    autotune_key_map: Dict[str, ast.expr] = {}
+                    self._autotune_key_map = autotune_key_map
+                    # pyre-ignore[16]: Autotuner has keys at runtime
+                    for key in self._kernel_autotuner.keys:
+                        found_key = False
+                        for keyword in child.keywords:
+                            if keyword.arg == key:
+                                autotune_key_map[key] = keyword.value
+                                found_key = True
+                                break
+
+                        if not found_key:
+                            autotune_key_id = self._autotune_key_id
+                            assert autotune_key_id is not None
+                            assert key in autotune_key_id
+                            key_id = autotune_key_id[key]
+                            autotune_key_map[key] = child.args[key_id]
+
+                self.generate_function_meta()
+                return True
+        return False
+
+    def contains_lambda(self, node: ast.AST) -> bool:
+        for child in ast.walk(node):
+            if isinstance(child, ast.Lambda):
+                return True
+        return False
+
+    def _get_grid_name(self, node: ast.AST) -> Optional[str]:
+        for child in ast.walk(node):
+            if (
+                isinstance(child, ast.Call)
+                and isinstance(child.func, ast.Subscript)
+                # pyre-ignore[16]: ast.expr may have `id` attribute at runtime
+                and child.func.value.id == self._kernel_name
+            ):
+                # pyrefly: ignore [missing-attribute]
+                return child.func.slice.id
+        return None
+
+    def generate_so_loading_code(
+        self,
+        node: ast.AST,
+        abs_triton_aot_path: str,
+    ) -> str:
+        """Return auto-generated code to load the compiled kernel at runtime.
+
+        If *node* contains a call to this transformer's kernel, returns
+        ``import importlib.util`` + meta-module loading + ``torch.ops.load_library``
+        code.  Otherwise returns an empty string.
+
+        This method also sets up internal transformer state (grid name,
+        autotune key map, etc.) via ``contains_triton_call`` as a side effect.
+
+        Example for _addmm_fwd kernel:
+            kernel_dir = "triton_addmm__addmm_fwd"
+            meta_module_path = "/path/to/triton_aot_compile/triton_addmm__addmm_fwd/_addmm_fwd_meta.py"
+            so_path = "/path/to/triton_aot_compile/triton_addmm__addmm_fwd/addmm_fwd.so"
+        """
+        if not self.contains_triton_call(node):
+            return ""
+
+        kernel_dir = f"{_get_clean_module_basename(self._kernel_jit_fn.__module__)}_{self._kernel_name}"
+
+        meta_module_path = os.path.join(
+            abs_triton_aot_path, kernel_dir, f"{self._kernel_name}_meta.py"
+        )
+
+        so_path = os.path.join(
+            abs_triton_aot_path,
+            kernel_dir,
+            f"{self._kernel_name.lstrip('_')}.so",
+        )
+
+        return f"""
+# Auto-generated by triton_aot.kernel_wrapper_codegen
+import importlib.util
+_meta_spec = importlib.util.spec_from_file_location("{self._kernel_name}_meta", "{meta_module_path}")
+_meta_module = importlib.util.module_from_spec(_meta_spec)
+_meta_spec.loader.exec_module(_meta_module)
+{self._kernel_name}_meta = _meta_module.{self._kernel_name}_meta
+
+torch.ops.load_library("{so_path}")
+"""
+
+
+def _find_triton_aot_kernel(
+    node_target: Any,
+    kernel_specs: Dict[KernelInterface[List[Any]], List[Dict[str, List[Any]]]],
+) -> Optional[TritonAOT]:
+    """Find the single TritonAOT kernel referenced in a node target's globals.
+
+    Scans ``node_target.__globals__`` for ``TritonAOT`` instances, validates
+    that every instance appears in *kernel_specs*, and asserts at most one
+    kernel is present (per the one-kernel-per-wrapper invariant).
+
+    Returns the kernel, or ``None`` if the function references no kernels.
+    """
+    kernels: set[TritonAOT] = set()
+    for _, var in node_target.__globals__.items():
+        if isinstance(var, TritonAOT):
+            if var.fn in kernel_specs:
+                kernels.add(var)
+            else:
+                raise RuntimeError(
+                    f"Cannot find TritonAOT kernel {var.fn} in TRITON_AOT_KERNEL_SPECS"
+                )
+
+    if len(kernels) == 0:
+        return None
+
+    fn_name = node_target.__name__
+    assert len(kernels) == 1, (
+        f"Expected exactly 1 kernel per wrapper function '{fn_name}', "
+        f"got {len(kernels)}"
+    )
+    (kernel_obj,) = kernels
+    return kernel_obj
+
+
+def _generate_wrapper_files(
+    node_target: Any,
+    kernel: TritonAOT,
+    compile_path: str,
+    package_importer: Optional[package.PackageImporter],
+) -> None:
+    """Generate ``_original.py`` and ``_wrapper.py`` for a single kernel.
+
+    Creates a per-kernel subdirectory under *compile_path*, writes the
+    original function source, then AST-transforms the wrapper to replace
+    ``kernel[grid](...)`` with ``torch.ops.triton_aot.*`` calls.
+    """
+    fn_name = node_target.__name__
+
+    jit_fn = none_throws(
+        unwrap_heuristic(kernel, return_type=JITFunction),
+        f"Failed to unwrap kernel to JITFunction: {kernel}",
+    )
+    kernel_dir = (
+        f"{_get_clean_module_basename(jit_fn.__module__)}_{get_kernel_name(jit_fn)}"
+    )
+    output_dir = os.path.join(compile_path, kernel_dir)
+    os.makedirs(output_dir, exist_ok=True)
+
+    _, module_code, wrapper_code = _get_module_and_source(node_target, package_importer)
+    import_header = get_original_import_header(module_code)
+
+    with open(os.path.join(output_dir, f"{fn_name}_original.py"), "w") as f:
+        f.write(import_header)
+        f.write(wrapper_code)
+
+    # When source comes from a torch package, rewrite interned
+    # imports to use _package_importer, which
+    # is injected by replace_kernels at load time.  Must happen
+    # before auto-generated code is appended (so stdlib imports
+    # like ``import importlib.util`` are not touched).
+    if package_importer is not None:
+        import_header = rewrite_package_imports(import_header, package_importer)
+
+    tree = ast.parse(wrapper_code)
+    transformer = TritonAOTOperatorTransform(kernel=kernel)
+    import_header += transformer.generate_so_loading_code(tree, compile_path)
+    tree = transformer.visit(tree)
+
+    new_source_code = ast.unparse(tree)
+
+    with open(os.path.join(output_dir, f"{fn_name}_wrapper.py"), "w") as f:
+        f.write(import_header)
+        f.write(new_source_code)
+
+
+def kernel_wrapper_codegen(
+    module: GraphModule, packageImporter: package.PackageImporter | None = None
+) -> None:
+    """
+    Generate wrapper files for TritonAOT kernels.
+    Requirement: under wrapper.py, @triton.jit kernel/func is imported without 'as' alias.
+
+    For each function containing TritonAOT kernels, generates:
+    - {fn_name}_original.py: Original source code with imports
+    - {fn_name}_wrapper.py: Transformed wrapper that uses torch.ops.triton_aot
+    """
+    compile_path = get_aott_compile_path()
+    if not os.path.exists(compile_path):
+        os.makedirs(compile_path)
+
+    transformed_ops: set[Callable[..., Any]] = set()
+    kernel_specs = get_triton_aot_kernel_specs()
+    for node in module.graph.nodes:
+        if node.op == "call_function" and hasattr(node.target, "__globals__"):
+            if node.target not in transformed_ops:
+                transformed_ops.add(node.target)
+            else:
+                continue
+
+            kernel = _find_triton_aot_kernel(node.target, kernel_specs)
+            if kernel is not None:
+                _generate_wrapper_files(
+                    node.target, kernel, compile_path, packageImporter
+                )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py
new file mode 100644
index 000000000..53e5d83e9
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py
@@ -0,0 +1,137 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import importlib.util
+import logging
+import os
+import sys
+from typing import Any, Dict, Optional
+
+from generative_recommenders.ops.triton_aot.compile.compile_state import (
+    get_aott_compile_path,
+)
+from torch import package
+from torch.fx import GraphModule
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+def _find_wrapper_files(
+    compile_path: str,
+) -> list[tuple[str, str, str]]:
+    """Find all ``*_wrapper.py`` files under *compile_path*.
+
+    Walks one level deep into kernel subdirectories and returns a list of
+    ``(wrapper_name, fn_name, wrapper_path)`` tuples.
+    """
+    results: list[tuple[str, str, str]] = []
+    for dirpath, dirnames, filenames in os.walk(compile_path):
+        if dirpath != compile_path:
+            dirnames.clear()  # only recurse one level into kernel subdirs
+        for item in filenames:
+            if item.endswith("_wrapper.py"):
+                wrapper_name = item.removesuffix(".py")
+                fn_name = wrapper_name.removesuffix("_wrapper")
+                wrapper_path = os.path.join(dirpath, item)
+                results.append((wrapper_name, fn_name, wrapper_path))
+    return results
+
+
+def _load_wrapper_module(
+    wrapper_name: str,
+    fn_name: str,
+    wrapper_path: str,
+    package_importer: Optional[package.PackageImporter],
+) -> Optional[Any]:
+    """Dynamically import a single ``*_wrapper.py`` and return its wrapper callable.
+
+    Returns ``None`` if the module does not expose a function named *fn_name*.
+    """
+    spec = importlib.util.spec_from_file_location(wrapper_name, wrapper_path)
+    assert spec is not None, f"Failed to create spec for {wrapper_path}"
+    assert spec.loader is not None, f"Spec has no loader for {wrapper_path}"
+
+    loader = spec.loader
+    wrapper_module = importlib.util.module_from_spec(spec)
+
+    sys.modules[wrapper_name] = wrapper_module
+
+    if package_importer is not None:
+        wrapper_module._package_importer = package_importer  # type: ignore[attr-defined]
+
+    loader.exec_module(wrapper_module)
+
+    if hasattr(wrapper_module, fn_name):
+        return getattr(wrapper_module, fn_name)
+    return None
+
+
+def replace_kernels(
+    fx_m: GraphModule,
+    eager: bool = False,
+    package_importer: Optional[package.PackageImporter] = None,
+) -> GraphModule:
+    if eager:
+        raise NotImplementedError(
+            "Local generative_recommenders AOT-T transform does not support "
+            "eager replacement."
+        )
+
+    compile_path = get_aott_compile_path()
+    assert os.path.exists(compile_path), "triton_aot_compile dir does not exist"
+
+    wrapper_dict: Dict[str, Any] = {}
+    for wrapper_name, fn_name, wrapper_path in _find_wrapper_files(compile_path):
+        wrapper_fn = _load_wrapper_module(
+            wrapper_name, fn_name, wrapper_path, package_importer
+        )
+        if wrapper_fn is not None:
+            wrapper_dict[fn_name] = wrapper_fn
+
+    logger.info(f"replace_kernels: {wrapper_dict=}")
+
+    # Phase 2: Replace FX graph nodes
+    #   Walk the FX graph, find call_function nodes whose target name
+    #   matches a loaded wrapper, and swap the target so that
+    #   kernel[grid](...) calls become torch.ops.triton_aot.* calls.
+    replaced_count = 0
+    for nodes in fx_m.graph.nodes:
+        if nodes.op == "call_function" and nodes.target.__name__ in wrapper_dict.keys():
+            logger.info(
+                f"Replaced node: {nodes.op} {nodes.target} -> {wrapper_dict[nodes.target.__name__]} {nodes.meta}"
+            )
+            nodes.target = wrapper_dict[nodes.target.__name__]
+            replaced_count += 1
+
+    assert replaced_count > 0, (
+        f"No ops were replaced with triton_aot wrappers. "
+        f"wrapper_dict={wrapper_dict}, compile_path={compile_path}"
+    )
+    logger.info(
+        f"Successfully replaced {replaced_count} op(s) with triton_aot wrappers."
+    )
+
+    fx_m.recompile()
+    return fx_m
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py
new file mode 100644
index 000000000..c2be78989
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# pyre-strict
+
+from typing import Optional
+
+from generative_recommenders.ops.triton_aot.transform.kernel_wrapper_codegen import (
+    kernel_wrapper_codegen,
+)
+from generative_recommenders.ops.triton_aot.transform.replace_kernels import (
+    replace_kernels,
+)
+from torch import package
+from torch.fx import GraphModule
+
+
+def transform_kernels(
+    fx_m: GraphModule,
+    eager: bool = False,
+    package_importer: Optional[package.PackageImporter] = None,
+) -> GraphModule:
+    """Generate AOT wrappers and replace FX graph nodes in one step.
+
+    1. kernel_wrapper_codegen: AST-transforms wrapper functions,
+       rewrites kernel[grid](...) -> torch.ops.triton_aot.kernel(...),
+       writes {fn}_wrapper.py
+    2. replace_kernels: loads wrappers and replaces graph node targets
+    """
+    kernel_wrapper_codegen(fx_m, package_importer)
+    return replace_kernels(fx_m, eager=eager, package_importer=package_importer)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py
new file mode 100644
index 000000000..b71ec8144
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py
@@ -0,0 +1,347 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+# pyre-ignore-all-errors[2]: Triton has its own type system on func's input
+
+#!/usr/bin/env python3
+
+
+from typing import Any, List, Tuple
+
+import torch
+
+# @manual=//triton:triton
+import triton
+
+# @manual=//triton:triton
+import triton.language as tl
+from generative_recommenders.common import (
+    BACKEND_ALLOW_TF32,
+    cdiv,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+def get_mm_configs() -> List[triton.Config]:
+    return [
+        triton.Config(
+            {
+                "BLOCK_M": 32,
+                "BLOCK_N": 64,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=5,
+            num_warps=2,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 256,
+                "BLOCK_K": 64,
+                "GROUP_M": 8,
+            },
+            num_stages=3,
+            num_warps=8,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 256,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 128,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 64,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 128,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 128,
+                "BLOCK_N": 32,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=4,
+            num_warps=4,
+        ),
+        triton.Config(
+            {
+                "BLOCK_M": 64,
+                "BLOCK_N": 32,
+                "BLOCK_K": 32,
+                "GROUP_M": 8,
+            },
+            num_stages=5,
+            num_warps=2,
+        ),
+    ]
+
+
+@triton_aot(
+    annotations={
+        "M": "i32",
+        "N": ("i32", 16),
+        "K": ("i32", 16),
+        "stride_xm": ("i32", 16),
+        "stride_xk": ("i32", 1),
+        "stride_wk": ("i32", 16),
+        "stride_wn": ("i32", 1),
+        "stride_ym": ("i32", 16),
+        "stride_yn": ("i32", 1),
+        "stride_zm": ("i32", 16),
+        "stride_zn": ("i32", 1),
+    },
+)
+# pyre-ignore[56]: Pyre cannot infer triton.autotune decorator type
+@triton.autotune(
+    configs=get_mm_configs(),
+    key=["N", "K"],
+)
+@triton.jit
+def _addmm_fwd(
+    x_ptr,
+    w_ptr,
+    y_ptr,
+    z_ptr,
+    M,
+    N,
+    K,
+    stride_xm,
+    stride_xk,
+    stride_wk,
+    stride_wn,
+    stride_ym,
+    stride_yn,
+    stride_zm,
+    stride_zn,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+    BLOCK_K: tl.constexpr,
+    GROUP_M: tl.constexpr,
+    ALLOW_TF32: tl.constexpr,
+    BROADCAST_Y: tl.constexpr,
+) -> None:
+    pid_0, pid_1 = tl.program_id(axis=0), tl.program_id(axis=1)
+    pid = pid_0 * tl.num_programs(axis=1) + pid_1
+    num_pid_m = tl.cdiv(M, BLOCK_M)
+    num_pid_n = tl.cdiv(N, BLOCK_N)
+    num_pid_in_group = GROUP_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
+    pid_m = first_pid_m + (pid % group_size_m)
+    pid_n = (pid % num_pid_in_group) // group_size_m
+
+    offs_m = tl.arange(0, BLOCK_M)
+    offs_k = tl.arange(0, BLOCK_K)
+    offs_n = tl.arange(0, BLOCK_N)
+    mask_m = (pid_m * BLOCK_M + offs_m)[:, None] < M
+    mask_n = (pid_n * BLOCK_N + offs_n)[None, :] < N
+    x_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_xm
+    x_ptrs = x_ptr + (offs_m[:, None] * stride_xm + offs_k[None, :] * stride_xk)
+    w_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_wn
+    w_ptrs = w_ptr + (offs_k[:, None] * stride_wk + offs_n[None, :] * stride_wn)
+    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
+    for k in range(0, tl.cdiv(K, BLOCK_K)):
+        mask_k = offs_k[None, :] < K - k * BLOCK_K
+        x = tl.load(x_ptrs, mask=mask_k & mask_m, other=0.0)
+        mask_k = offs_k[:, None] < K - k * BLOCK_K
+        w = tl.load(w_ptrs, mask=mask_k & mask_n, other=0.0)
+        accumulator += tl.dot(x, w, allow_tf32=ALLOW_TF32)
+        x_ptrs += BLOCK_K * stride_xk
+        w_ptrs += BLOCK_K * stride_wk
+
+    z_mask = mask_m & mask_n
+    if BROADCAST_Y:
+        # y is a vector, broadcast to add to z
+        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
+        y_ptrs = y_ptr + stride_yn * offs_n[None, :]
+        y = tl.load(y_ptrs, mask=mask_n)
+    else:
+        y_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_ym
+        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
+        y_ptrs = y_ptr + stride_ym * offs_m[:, None] + stride_yn * offs_n[None, :]
+        y = tl.load(y_ptrs, mask=z_mask)
+    z = (accumulator + y.to(tl.float32)).to(z_ptr.dtype.element_ty)
+    z_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_zm
+    z_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_zn
+    z_ptrs = z_ptr + stride_zm * offs_m[:, None] + stride_zn * offs_n[None, :]
+    tl.store(z_ptrs, z, mask=z_mask)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_addmm_fwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    M, K = x.shape
+    KB, N = w.shape
+    assert K == KB, f"incompatible dimensions {K}, {KB}"
+
+    is_y_1d = y.dim() == 1
+    NY = y.shape[0] if is_y_1d else y.shape[1]
+    assert N == NY, f"incompatible dimensions {N}, {NY}"
+
+    # Allocate output
+    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
+    if M == 0 or N == 0:
+        return z
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(M, meta["BLOCK_M"]),
+        cdiv(N, meta["BLOCK_N"]),
+    )
+
+    _addmm_fwd[grid](
+        x,
+        w,
+        y,
+        z,
+        M,
+        N,
+        K,
+        x.stride(0),
+        x.stride(1),
+        w.stride(0),
+        w.stride(1),
+        y.stride(0) if not is_y_1d else 0,
+        y.stride(1) if not is_y_1d else y.stride(0),
+        z.stride(0),
+        z.stride(1),
+        ALLOW_TF32=allow_tf32,
+        BROADCAST_Y=is_y_1d,
+    )
+    return z
+
+
+def _triton_aot_addmm_fwd_eager(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    return torch.addmm(y, x, w)
+
+
+@torch.fx.wrap
+def _triton_aot_addmm_fwd_maybe_eager(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    y: torch.Tensor,
+) -> torch.Tensor:
+    if torch.jit.is_scripting():
+        # call eager
+        return torch.addmm(y, x, w)
+    else:
+        return _triton_aot_addmm_fwd(x, w, y)
+
+
+def triton_addmm_bwd(
+    x: torch.Tensor,
+    w: torch.Tensor,
+    dz: torch.Tensor,
+    is_y_1d: bool,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if is_y_1d:
+        dy = torch.sum(dz, dim=0)
+    else:
+        dy = dz
+    dw = torch.mm(x.t(), dz)
+    dx = torch.mm(dz, w.t())
+
+    return dx, dw, dy
+
+
+class _AddMmFunction(torch.autograd.Function):
+    @staticmethod
+    # pyre-ignore[14]: autograd.Function signature override
+    def forward(
+        ctx: Any,
+        x: torch.Tensor,
+        w: torch.Tensor,
+        y: torch.Tensor,
+    ) -> torch.Tensor:
+        ctx.save_for_backward(x, w)
+        ctx.is_y_1d = y.dim() == 1
+        return _triton_aot_addmm_fwd(x, w, y)
+
+    @staticmethod
+    # pyre-ignore[14]: autograd.Function signature override
+    def backward(
+        ctx: Any, dz: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        (x, w) = ctx.saved_tensors
+        return triton_addmm_bwd(x, w, dz, ctx.is_y_1d)
+
+
+def triton_addmm(
+    input: torch.Tensor,
+    mat1: torch.Tensor,
+    mat2: torch.Tensor,
+) -> torch.Tensor:
+    return _AddMmFunction.apply(mat1, mat2, input)
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_addmm(
+    input: torch.Tensor,
+    mat1: torch.Tensor,
+    mat2: torch.Tensor,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        return torch.addmm(input, mat1, mat2)
+    else:
+        return _triton_aot_addmm_fwd(mat1, mat2, input, allow_tf32=allow_tf32)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py
new file mode 100644
index 000000000..8afc3c18e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py
@@ -0,0 +1,183 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    fx_unwrap_optional_tensor,
+    next_power_of_2,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.pytorch.pt_jagged import (
+    pytorch_replace_last_n_with_jagged,
+)
+from generative_recommenders.ops.pytorch.pt_jagged_tensors import (
+    pytorch_concat_2D_jagged,
+)
+from generative_recommenders.ops.triton.triton_jagged import concat_2D_jagged
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+concat_2D_jagged = triton_aot(
+    annotations={
+        "DenseSize": "i32",
+        "D": "i32",
+        "stride_ad": "i32",
+        "stride_bd": "i32",
+        "stride_dense_batch": "i32",
+        "stride_od": "i32",
+    },
+    # pyrefly: ignore [bad-argument-type]
+)(concat_2D_jagged)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_concat_2D_jagged(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    is_replace: bool = False,
+) -> torch.Tensor:
+    is_dense_a = offsets_a is None
+    is_dense_b = offsets_b is None
+
+    dense_size: int = 0
+    if is_dense_a:
+        B, dense_size, D = values_a.size()
+        offsets_b = fx_unwrap_optional_tensor(offsets_b)
+        jagged_seq_len, _ = values_b.shape
+        values_out = torch.empty(
+            (dense_size * B + jagged_seq_len, D),
+            device=values_b.device,
+            dtype=values_b.dtype,
+        )
+        offsets_a = offsets_b.new_empty(0)
+        stride_dense_batch = values_a.stride(0)
+    elif is_dense_b:
+        B, dense_size, D = values_b.size()
+        offsets_a = fx_unwrap_optional_tensor(offsets_a)
+        jagged_seq_len, _ = values_a.shape
+        values_out = torch.empty(
+            (jagged_seq_len + dense_size * B, D),
+            device=values_a.device,
+            dtype=values_a.dtype,
+        )
+        offsets_b = offsets_a.new_empty(0)
+        stride_dense_batch = values_b.stride(0)
+    else:
+        offsets_a = fx_unwrap_optional_tensor(offsets_a)
+        offsets_b = fx_unwrap_optional_tensor(offsets_b)
+        B = offsets_a.size(0) - 1
+        seq_len_a, D = values_a.shape
+        seq_len_b, _ = values_b.shape
+        if is_replace:
+            values_out = torch.empty_like(values_a)
+        else:
+            values_out = torch.empty(
+                (seq_len_a + seq_len_b, D), device=values_a.device, dtype=values_a.dtype
+            )
+        stride_dense_batch = 0
+
+    # Make sure offsets are alignted on 16-byte to match AOTT spec
+    if (
+        offsets_a is not None
+        and (offsets_a.storage_offset() * offsets_a.element_size()) % 16 != 0
+    ):
+        offsets_a = offsets_a.clone()
+    if (
+        offsets_b is not None
+        and (offsets_b.storage_offset() * offsets_b.element_size()) % 16 != 0
+    ):
+        offsets_b = offsets_b.clone()
+
+    BLOCK_D = next_power_of_2(D)
+
+    grid = (max_seq_len, B)
+    # pyrefly: ignore [not-callable]
+    concat_2D_jagged[grid](
+        OffsetsA=offsets_a,
+        ValuesA=values_a,
+        OffsetsB=offsets_b,
+        ValuesB=values_b,
+        DenseSize=dense_size,
+        Out=values_out,
+        D=D,
+        stride_ad=(values_a.stride(1) if is_dense_a else values_a.stride(0)),
+        stride_bd=(values_b.stride(1) if is_dense_b else values_b.stride(0)),
+        stride_dense_batch=stride_dense_batch,
+        stride_od=values_out.stride(0),
+        # pyrefly: ignore [bad-argument-type]
+        IS_DENSE_A=is_dense_a,
+        # pyrefly: ignore [bad-argument-type]
+        IS_DENSE_B=is_dense_b,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_D=BLOCK_D,
+        # pyrefly: ignore [bad-argument-type]
+        IS_REPLACE=is_replace,
+    )
+    return values_out
+
+
+@torch.fx.wrap
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+def aot_triton_kernel_wrapper_concat_2D_jagged(
+    max_seq_len: int,
+    values_a: torch.Tensor,
+    values_b: torch.Tensor,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    is_replace: bool = False,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        if is_replace:
+            assert offsets_a is not None and offsets_b is not None
+            return pytorch_replace_last_n_with_jagged(
+                max_seq_len_left=max_seq_len,
+                offsets_left=offsets_a,
+                values_left=values_a,
+                offsets_right=offsets_b,
+                values_right=values_b,
+            )
+        return pytorch_concat_2D_jagged(
+            values_left=values_a,
+            values_right=values_b,
+            max_len_left=max_seq_len if offsets_a is None else None,
+            max_len_right=max_seq_len if offsets_b is None else None,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+        )
+    else:
+        return _triton_aot_concat_2D_jagged(
+            max_seq_len=max_seq_len,
+            values_a=values_a,
+            values_b=values_b,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            is_replace=is_replace,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py
new file mode 100644
index 000000000..15c609a3c
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py
@@ -0,0 +1,124 @@
+# pyre-strict
+
+import torch
+from generative_recommenders.common import next_power_of_2, should_trigger_eager_impl
+from generative_recommenders.ops.pytorch.pt_hstu_linear import pytorch_norm_mul_dropout
+from generative_recommenders.ops.triton.triton_hstu_linear import (
+    _group_norm_mul_dropout_fwd,
+)
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+_group_norm_mul_dropout_fwd = triton_aot(
+    annotations={
+        "D": ("i32", 16),
+        "eps": "fp32",
+        "seed": "i64",
+        "dropout_ratio": "fp32",
+        "stride_x": ("i32", 16),
+        "stride_u": ("i32", 16),
+        "stride_y": ("i32", 16),
+    },
+    # pyrefly: ignore [bad-argument-type]
+)(_group_norm_mul_dropout_fwd)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_group_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    silu_u: bool,
+    concat_ux: bool,
+    num_heads: int,
+    linear_dim: int,
+) -> torch.Tensor:
+    x = x.contiguous()
+    u = u.contiguous()
+    N, _ = x.shape
+    if concat_ux:
+        y = torch.empty((N, 3 * num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty((N, num_heads * linear_dim), dtype=x.dtype, device=x.device)
+    mean = torch.empty((N * num_heads,), dtype=x.dtype, device=x.device)
+    rstd = torch.empty((N * num_heads,), dtype=x.dtype, device=x.device)
+
+    BLOCK_D = next_power_of_2(linear_dim)
+    BLOCK_H = next_power_of_2(num_heads)
+
+    seed = 0
+    dropout_ratio = 0.0
+
+    grid = (N,)
+    # pyrefly: ignore [not-callable]
+    _group_norm_mul_dropout_fwd[grid](
+        x,  # X
+        u,  # U
+        y,  # Y
+        weight,  # W
+        bias,  # B
+        mean,  # Mean
+        rstd,  # Rstd
+        linear_dim,  # D
+        num_heads,  # Heads
+        eps,  # eps
+        seed,  # seed
+        dropout_ratio,  # dropout_ratio
+        x.stride(0),  # stride_x
+        u.stride(0),  # stride_u
+        y.stride(0),  # stride_y
+        # pyrefly: ignore [bad-argument-type]
+        SILU_U=silu_u,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_D=BLOCK_D,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_H=BLOCK_H,
+        # pyrefly: ignore [bad-argument-type]
+        TRAINING=False,
+        # pyrefly: ignore [bad-argument-type]
+        CONCAT_UX=concat_ux,
+    )
+    return y
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_group_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    silu_u: bool,
+    concat_ux: bool,
+    num_heads: int,
+    linear_dim: int,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        return pytorch_norm_mul_dropout(
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            dropout_ratio=0.0,
+            training=False,
+            silu_u=silu_u,
+            concat_u=concat_ux,
+            concat_x=concat_ux,
+            group_norm=True,
+            num_heads=num_heads,
+            linear_dim=linear_dim,
+        )
+    return _triton_aot_group_norm_mul_dropout(
+        x=x,
+        u=u,
+        weight=weight,
+        bias=bias,
+        eps=eps,
+        silu_u=silu_u,
+        concat_ux=concat_ux,
+        num_heads=num_heads,
+        linear_dim=linear_dim,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py
new file mode 100644
index 000000000..c5339033f
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py
@@ -0,0 +1,119 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+#!/usr/bin/env python3
+
+import torch
+from generative_recommenders.common import (
+    cdiv,
+    next_power_of_2,
+    should_trigger_eager_impl,
+    switch_to_contiguous_if_needed,
+)
+from generative_recommenders.ops.pytorch.pt_layer_norm import (
+    pytorch_layer_norm,
+    pytorch_swish_layer_norm,
+)
+from generative_recommenders.ops.triton.triton_layer_norm import (
+    _weighted_layer_norm_fwd,
+)
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+_weighted_layer_norm_fwd = triton_aot(
+    annotations={
+        "N": "i32",
+        "D": ("i32", 16),
+        "stride_x": ("i32", 16),
+        "stride_y": ("i32", 16),
+    },
+)(_weighted_layer_norm_fwd)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_swish_layer_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    is_swish: bool,
+) -> torch.Tensor:
+    assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
+    x = switch_to_contiguous_if_needed(x)
+    N, D = x.shape
+
+    assert weight.dim() == 1
+    assert bias.dim() == 1
+    assert weight.numel() == D
+    assert bias.numel() == D
+
+    y = torch.empty_like(x)
+
+    BLOCK_D = next_power_of_2(D)
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(N, meta["BLOCK_N"]),
+    )
+    # pyrefly: ignore [not-callable]
+    _weighted_layer_norm_fwd[grid](
+        x,
+        y,
+        weight,
+        bias,
+        torch.empty(0, dtype=torch.float32),
+        torch.empty(0, dtype=torch.float32),
+        N,
+        D,
+        eps,
+        stride_x=x.stride(0),
+        stride_y=y.stride(0),
+        IS_SWISH=is_swish,
+        TRAINING=False,
+        BLOCK_D=BLOCK_D,
+        COMPUTE_MEAN_AND_RSTD=True,
+    )
+
+    return y
+
+
+@torch.fx.wrap
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+def aot_triton_kernel_wrapper_swish_layer_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    is_swish: bool,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        if is_swish:
+            return pytorch_swish_layer_norm(x, [x.shape[1]], weight, bias, eps).to(
+                x.dtype
+            )
+        else:
+            return pytorch_layer_norm(x, [x.shape[1]], weight, bias, eps).to(x.dtype)
+    else:
+        return _triton_aot_swish_layer_norm(x, weight, bias, eps, is_swish)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py
new file mode 100644
index 000000000..7f7a6c743
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py
@@ -0,0 +1,162 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+#!/usr/bin/env python3
+
+
+import torch
+from generative_recommenders.common import next_power_of_2, should_trigger_eager_impl
+from generative_recommenders.ops.pytorch.pt_hstu_linear import pytorch_norm_mul_dropout
+from generative_recommenders.ops.triton.triton_hstu_linear import _ln_mul_dropout_fwd
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+_ln_mul_dropout_fwd = triton_aot(
+    annotations={
+        "D": ("i32", 16),
+        "stride_x": ("i32", 16),
+        "stride_u": ("i32", 16),
+        "stride_y": ("i32", 16),
+    },
+    # pyrefly: ignore [bad-argument-type]
+)(_ln_mul_dropout_fwd)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_layer_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool,
+    concat_ux: bool,
+    mul_u_activation_type: str,
+) -> torch.Tensor:
+    assert x.dim() == 2
+    if x.stride(1) != 1:
+        x = x.contiguous()
+    N, D = x.shape
+    assert weight.dim() == 1
+    assert bias.dim() == 1
+    assert weight.numel() == D
+    assert bias.numel() == D
+
+    if concat_ux:
+        y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
+    else:
+        y = torch.empty_like(x)
+    if N == 0:
+        return y
+    mean = x.new_empty((N,))
+    rstd = x.new_empty((N,))
+
+    # Less than 64KB per feature: enqueue fused kernel
+    MAX_FUSED_SIZE = 65536 // x.element_size()
+    BLOCK_D = min(MAX_FUSED_SIZE, next_power_of_2(D))
+    if D > BLOCK_D:
+        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+
+    seed = 0
+    # num_warps = min(max(BLOCK_D // 256, 1), 8)
+    grid = (N,)
+    # pyrefly: ignore [not-callable]
+    _ln_mul_dropout_fwd[grid](
+        x,
+        u,
+        y,
+        weight,
+        bias,
+        mean,
+        rstd,
+        D,
+        eps,
+        seed,
+        dropout_ratio,
+        x.stride(0),
+        u.stride(0),
+        y.stride(0),
+        # pyrefly: ignore [bad-argument-type]
+        SILU_U=silu_u,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_D=BLOCK_D,
+        # pyrefly: ignore [bad-argument-type]
+        TRAINING=training,
+        # pyrefly: ignore [bad-argument-type]
+        CONCAT_U=concat_ux,
+        # pyrefly: ignore [bad-argument-type]
+        CONCAT_X=concat_ux,
+        # pyrefly: ignore [bad-argument-type]
+        MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
+        # pyrefly: ignore [bad-argument-type]
+        FAST_DROPOUT=False,
+    )
+    return y
+
+
+@torch.fx.wrap
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+def aot_triton_kernel_wrapper_layer_norm_mul_dropout(
+    x: torch.Tensor,
+    u: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    dropout_ratio: float,
+    training: bool,
+    silu_u: bool,
+    concat_ux: bool,
+    mul_u_activation_type: str,
+) -> torch.Tensor:
+    if should_trigger_eager_impl():
+        return pytorch_norm_mul_dropout(
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            silu_u=silu_u,
+            concat_u=concat_ux,
+            concat_x=concat_ux,
+            mul_u_activation_type=mul_u_activation_type,
+            group_norm=False,
+        )
+    else:
+        return _triton_aot_layer_norm_mul_dropout(
+            x=x,
+            u=u,
+            weight=weight,
+            bias=bias,
+            eps=eps,
+            dropout_ratio=dropout_ratio,
+            training=training,
+            silu_u=silu_u,
+            concat_ux=concat_ux,
+            mul_u_activation_type=mul_u_activation_type,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py
new file mode 100644
index 000000000..828a4f4d4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py
@@ -0,0 +1,176 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    cdiv,
+    fx_unwrap_optional_tensor,
+    next_power_of_2,
+    prev_power_of_2,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.pytorch.pt_position import (
+    pytorch_add_timestamp_positional_embeddings,
+)
+from generative_recommenders.ops.triton.triton_position import (
+    _add_timestamp_position_embeddings_kernel,
+)
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+_add_timestamp_position_embeddings_kernel = triton_aot(
+    annotations={
+        "SeqEmb": ("*bf16", 16),
+        "Offsets": ("*i64", 16),
+        "Lengths": ("*i64", 16),
+        "PosEmb": ("*fp32", 16),
+        "TsEmb": ("*fp32", 16),
+        "Out": ("*bf16", 16),
+        "TS": ("*i64", 16),
+        "PosInds": ("*i32", 16),
+        "TsInds": ("*i32", 16),
+        "NumTargets": ("*i64", 16),
+        "AUTOTUNE_MAX_SEQ_LEN": "i32",
+        "D": "i32",
+        "num_time_buckets": "i32",
+        "time_bucket_increments": "fp32",
+        "time_bucket_scale": "fp32",
+        "time_delta": "i32",
+        "max_contextual_seq_len": "i32",
+        "max_pos_ind": "i32",
+        "stride_sn": ("i32", 16),
+        "stride_pn": ("i32", 16),
+        "stride_tn": ("i32", 16),
+        "stride_on": ("i32", 16),
+    },
+)(_add_timestamp_position_embeddings_kernel)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_position(
+    seq_embeddings: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    pos_embeddings: torch.Tensor,
+    ts_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    seq_lengths: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str,
+) -> torch.Tensor:
+    has_multiple_targets = num_targets is not None
+    if not has_multiple_targets:
+        num_targets_resolved = torch.empty(
+            0, dtype=torch.int64, device=seq_embeddings.device
+        )
+    else:
+        num_targets_resolved = fx_unwrap_optional_tensor(num_targets).to(torch.int64)
+
+    seq_embeddings = seq_embeddings.contiguous()
+    pos_embeddings = pos_embeddings.contiguous()
+    ts_embeddings = ts_embeddings.contiguous()
+
+    max_pos_ind = pos_embeddings.shape[0]
+    B = seq_lengths.shape[0]
+
+    N, D = seq_embeddings.shape
+    out = torch.empty_like(seq_embeddings)
+
+    timestamps = timestamps.contiguous()
+    ts_inds = torch.empty((N,), device=timestamps.device, dtype=torch.int32)
+    pos_inds = torch.empty((N,), device=timestamps.device, dtype=torch.int32)
+
+    autotune_max_seq_len = prev_power_of_2(max_seq_len)
+    BLOCK_D = next_power_of_2(D) if D < 64 else 64
+
+    grid = lambda meta: (  # noqa E731
+        B,
+        cdiv(max_seq_len, meta["BLOCK_N"]),
+    )
+    # pyrefly: ignore [not-callable]
+    _add_timestamp_position_embeddings_kernel[grid](
+        SeqEmb=seq_embeddings,
+        Offsets=seq_offsets,
+        Lengths=seq_lengths,
+        PosEmb=pos_embeddings,
+        TsEmb=ts_embeddings,
+        Out=out,
+        TS=timestamps,
+        PosInds=pos_inds,
+        TsInds=ts_inds,
+        NumTargets=num_targets_resolved,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len,
+        D=D,
+        num_time_buckets=2048,
+        time_bucket_increments=60.0,
+        time_bucket_scale=1.0,
+        time_delta=0,
+        max_contextual_seq_len=max_contextual_seq_len,
+        max_pos_ind=max_pos_ind,
+        stride_sn=seq_embeddings.stride(0),
+        stride_pn=pos_embeddings.stride(0),
+        stride_tn=ts_embeddings.stride(0),
+        stride_on=out.stride(0),
+        TRAINING=False,
+        HAS_MULTIPLE_TARGETS=has_multiple_targets,
+        INTERLEAVE_TARGETS=interleave_targets,
+        TIME_BUCKET_FN=time_bucket_fn,
+        BLOCK_D=BLOCK_D,
+    )
+
+    return out
+
+
+@torch.fx.wrap
+# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
+# AOT-T triton kernel wrapper functions. This is required for
+# AOT-T backend to recognize and trace correctly for ops transformation.
+def aot_triton_kernel_wrapper_position(
+    alpha: float,
+    max_seq_len: int,
+    max_contextual_seq_len: int,
+    position_embeddings_weight: torch.Tensor,
+    timestamp_embeddings_weight: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    seq_lengths: torch.Tensor,
+    seq_embeddings: torch.Tensor,
+    timestamps: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    interleave_targets: bool,
+    time_bucket_fn: str,
+) -> torch.Tensor:
+    seq_embeddings = seq_embeddings * alpha
+    if should_trigger_eager_impl():
+        return pytorch_add_timestamp_positional_embeddings(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
+    else:
+        return _triton_aot_position(
+            seq_embeddings=seq_embeddings,
+            seq_offsets=seq_offsets,
+            pos_embeddings=position_embeddings_weight,
+            ts_embeddings=timestamp_embeddings_weight,
+            timestamps=timestamps,
+            max_seq_len=max_seq_len,
+            max_contextual_seq_len=max_contextual_seq_len,
+            seq_lengths=seq_lengths,
+            num_targets=num_targets,
+            interleave_targets=interleave_targets,
+            time_bucket_fn=time_bucket_fn,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py
new file mode 100644
index 000000000..4fa11dddc
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py
@@ -0,0 +1,366 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+#!/usr/bin/env python3
+
+from typing import Optional
+
+import torch
+from generative_recommenders.common import (
+    autotune_max_seq_len,
+    BACKEND_ALLOW_TF32,
+    cdiv,
+    prev_power_of_2,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.pytorch.pt_hstu_attention import (
+    pytorch_cached_hstu_mha,
+    pytorch_hstu_mha,
+)
+from generative_recommenders.ops.triton.triton_hstu_attention import _hstu_attn_fwd
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+for _config in _hstu_attn_fwd.configs:
+    if isinstance(_config.kwargs.get("USE_TLX"), bool):
+        _config.kwargs["USE_TLX"] = int(_config.kwargs["USE_TLX"])
+
+
+_hstu_attn_fwd = triton_aot(
+    annotations={
+        "stride_qm": ("i32", 16),
+        "stride_qh": ("i32", 16),
+        "stride_kn": ("i32", 16),
+        "stride_kh": ("i32", 16),
+        "stride_vn": ("i32", 16),
+        "stride_vh": ("i32", 16),
+        "stride_om": ("i32", 16),
+        "stride_oh": ("i32", 16),
+        "contextual_seq_len": "i32",
+        "max_attn_len": "i32",
+        "Z": "i32",
+        "AUTOTUNE_Z": "i32",
+        "H": "i32",
+        "MAX_SEQ_LEN": "i32",
+        "AUTOTUNE_MAX_SEQ_LEN": "i32",
+        "DimQ": "i32",
+        "DimV": "i32",
+        "DeltaSize": "i32",
+        "workspace_ptr": "*i8",
+        "sort_by_length_indices": "*i64",
+    }
+)(_hstu_attn_fwd)
+
+
+def _check_common_args(
+    invalid_attn_mask_type: str,
+    attn_scale: Optional[torch.Tensor],
+    full_attn_size: int,
+    num_softmax_heads: int,
+) -> None:
+    assert invalid_attn_mask_type in ("causal", "lower_triangular"), (
+        f"unsupported invalid_attn_mask_type: {invalid_attn_mask_type}"
+    )
+    assert attn_scale is None, "attn_scale is not implemented for AOT-T HSTU MHA"
+    assert full_attn_size == 0, "full_attn_size is not implemented for AOT-T HSTU MHA"
+    assert num_softmax_heads == 0, (
+        "num_softmax_heads is not implemented for AOT-T HSTU MHA"
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_ragged_hstu_mha(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    invalid_attn_mask_type: str,
+    num_targets: Optional[torch.Tensor],
+    attn_scale: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    full_attn_size: int,
+    num_softmax_heads: int = 0,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    assert invalid_attn_mask_type in ("causal", "lower_triangular"), (
+        f"unsupported invalid_attn_mask_type: {invalid_attn_mask_type}"
+    )
+    assert attn_scale is None, "attn_scale is not implemented for AOT-T HSTU MHA"
+    assert full_attn_size == 0, "full_attn_size is not implemented for AOT-T HSTU MHA"
+    assert num_softmax_heads == 0, (
+        "num_softmax_heads is not implemented for AOT-T HSTU MHA"
+    )
+    Z = seq_offsets.numel() - 1
+    L, H, DimQ = q.shape
+    DimV = v.shape[2]
+
+    q = q.contiguous()
+    k = k.contiguous()
+    v = v.contiguous()
+    seq_offsets = seq_offsets.contiguous()
+
+    out = torch.empty_like(v)
+    if L == 0:
+        return out
+    workspace = torch.empty(0, dtype=torch.int8, device=q.device)
+    sort_by_length_indices = torch.empty(
+        0, dtype=torch.int64, device=seq_offsets.device
+    )
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(N, meta["BLOCK_M"]),
+        Z * H,
+    )
+    # pyrefly: ignore [not-callable]
+    _hstu_attn_fwd[grid](
+        Q=q,
+        K=k,
+        V=v,
+        workspace_ptr=workspace,
+        sort_by_length_indices=sort_by_length_indices,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        Out=out,
+        stride_qm=q.stride(0),
+        stride_qh=q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_om=out.stride(0),
+        stride_oh=out.stride(1),
+        alpha=alpha,
+        contextual_seq_len=contextual_seq_len,
+        max_attn_len=max_attn_len,
+        Z=Z,
+        AUTOTUNE_Z=prev_power_of_2(Z),
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        DeltaSize=0,
+        HAS_MULTIPLE_TARGETS=num_targets is not None,
+        IS_DELTA_Q=False,
+        ALLOW_TF32=allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_CONTEXTUAL_SEQ_LEN=contextual_seq_len > 0,
+        HAS_MAX_ATTN_LEN=max_attn_len > 0,
+        HAS_SORT_BY_LENGTH_INDICES=False,
+        ENABLE_TMA=False,
+        TMA_DESC_SIZE=128,
+    )
+    return out
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_ragged_hstu_mha(
+    N: int,
+    alpha: float,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    invalid_attn_mask_type: str,
+    num_targets: Optional[torch.Tensor],
+    attn_scale: Optional[torch.Tensor],
+    max_attn_len: int,
+    contextual_seq_len: int,
+    full_attn_size: int,
+    num_softmax_heads: int,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    _check_common_args(
+        invalid_attn_mask_type=invalid_attn_mask_type,
+        attn_scale=attn_scale,
+        full_attn_size=full_attn_size,
+        num_softmax_heads=num_softmax_heads,
+    )
+    if should_trigger_eager_impl():
+        return pytorch_hstu_mha(
+            max_seq_len=N,
+            alpha=alpha,
+            q=q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            causal=True,
+            dropout_pr=0.0,
+            training=False,
+            num_targets=num_targets,
+            attn_scale=attn_scale,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=contextual_seq_len,
+            min_full_attn_seq_len=full_attn_size,
+        )
+    return _triton_aot_ragged_hstu_mha(
+        N=N,
+        alpha=alpha,
+        q=q,
+        k=k,
+        v=v,
+        seq_offsets=seq_offsets,
+        invalid_attn_mask_type=invalid_attn_mask_type,
+        num_targets=num_targets,
+        attn_scale=attn_scale,
+        max_attn_len=max_attn_len,
+        contextual_seq_len=contextual_seq_len,
+        full_attn_size=full_attn_size,
+        num_softmax_heads=num_softmax_heads,
+        allow_tf32=allow_tf32,
+    )
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_cached_hstu_mha(
+    N: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    delta_x_offsets: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    attn_scale: Optional[torch.Tensor],
+    max_attn_len: int,
+    full_attn_size: int,
+    allow_tf32: bool = BACKEND_ALLOW_TF32,
+) -> torch.Tensor:
+    assert attn_scale is None, "attn_scale is not implemented for AOT-T HSTU MHA"
+    assert full_attn_size == 0, "full_attn_size is not implemented for AOT-T HSTU MHA"
+    Z = seq_offsets.size(0) - 1
+    DELTA_L, H, DimQ = delta_q.shape
+    DeltaSize = DELTA_L // Z
+    DimV = v.shape[2]
+
+    delta_q = delta_q.contiguous()
+    k = k.contiguous()
+    v = v.contiguous()
+    seq_offsets = seq_offsets.contiguous()
+
+    out = torch.empty((DELTA_L, H, DimV), dtype=delta_q.dtype, device=delta_q.device)
+    if DELTA_L == 0:
+        return out
+    workspace = torch.empty(0, dtype=torch.int8, device=delta_q.device)
+    sort_by_length_indices = torch.empty(
+        0, dtype=torch.int64, device=seq_offsets.device
+    )
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(DeltaSize, meta["BLOCK_M"]),
+        Z * H,
+    )
+    # pyrefly: ignore [not-callable]
+    _hstu_attn_fwd[grid](
+        Q=delta_q,
+        K=k,
+        V=v,
+        workspace_ptr=workspace,
+        sort_by_length_indices=sort_by_length_indices,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        Out=out,
+        stride_qm=delta_q.stride(0),
+        stride_qh=delta_q.stride(1),
+        stride_kn=k.stride(0),
+        stride_kh=k.stride(1),
+        stride_vn=v.stride(0),
+        stride_vh=v.stride(1),
+        stride_om=out.stride(0),
+        stride_oh=out.stride(1),
+        alpha=alpha,
+        contextual_seq_len=0,
+        max_attn_len=max_attn_len,
+        Z=Z,
+        AUTOTUNE_Z=prev_power_of_2(Z),
+        H=H,
+        MAX_SEQ_LEN=N,
+        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
+        DimQ=DimQ,
+        DimV=DimV,
+        DeltaSize=DeltaSize,
+        HAS_MULTIPLE_TARGETS=num_targets is not None,
+        IS_DELTA_Q=True,
+        ALLOW_TF32=allow_tf32,
+        BLOCK_D_Q=DimQ,
+        BLOCK_D_V=DimV,
+        HAS_CONTEXTUAL_SEQ_LEN=False,
+        HAS_MAX_ATTN_LEN=max_attn_len > 0,
+        HAS_SORT_BY_LENGTH_INDICES=False,
+        ENABLE_TMA=False,
+        TMA_DESC_SIZE=128,
+    )
+    return out
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_cached_hstu_mha(
+    N: int,
+    alpha: float,
+    delta_q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    delta_x_offsets: torch.Tensor,
+    seq_offsets: torch.Tensor,
+    num_targets: Optional[torch.Tensor],
+    attn_scale: Optional[torch.Tensor],
+    max_attn_len: int,
+    full_attn_size: int,
+) -> torch.Tensor:
+    _check_common_args(
+        invalid_attn_mask_type="causal",
+        attn_scale=attn_scale,
+        full_attn_size=full_attn_size,
+        num_softmax_heads=0,
+    )
+    if should_trigger_eager_impl():
+        return pytorch_cached_hstu_mha(
+            max_seq_len=N,
+            alpha=alpha,
+            delta_q=delta_q,
+            k=k,
+            v=v,
+            seq_offsets=seq_offsets,
+            num_targets=num_targets,
+            max_attn_len=max_attn_len,
+            contextual_seq_len=0,
+        )
+    return _triton_aot_cached_hstu_mha(
+        N=N,
+        alpha=alpha,
+        delta_q=delta_q,
+        k=k,
+        v=v,
+        delta_x_offsets=delta_x_offsets,
+        seq_offsets=seq_offsets,
+        num_targets=num_targets,
+        attn_scale=attn_scale,
+        max_attn_len=max_attn_len,
+        full_attn_size=full_attn_size,
+    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py
new file mode 100644
index 000000000..e5d9e093e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py
@@ -0,0 +1,114 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Permission is hereby granted, free of charge, to any person obtaining
+# a copy of this software and associated documentation files
+# (the "Software"), to deal in the Software without restriction,
+# including without limitation the rights to use, copy, modify, merge,
+# publish, distribute, sublicense, and/or sell copies of the Software,
+# and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# pyre-strict
+
+#!/usr/bin/env python3
+
+import torch
+from generative_recommenders.common import (
+    cdiv,
+    next_power_of_2,
+    should_trigger_eager_impl,
+    switch_to_contiguous_if_needed,
+)
+from generative_recommenders.ops.pytorch.pt_layer_norm import pytorch_rms_norm
+from generative_recommenders.ops.triton.triton_layer_norm import _weighted_rms_norm_fwd
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+_weighted_rms_norm_fwd = triton_aot(
+    annotations={
+        "N": "i32",
+        "D": ("i32", 16),
+        "stride_x": ("i32", 16),
+        "stride_y": ("i32", 16),
+    },
+)(_weighted_rms_norm_fwd)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    silu: bool,
+) -> torch.Tensor:
+    """Internal AOTT kernel function for RMS norm."""
+    assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
+    x = switch_to_contiguous_if_needed(x)
+    N, D = x.shape
+
+    assert weight.dim() == 1
+    assert weight.numel() == D
+
+    y = torch.empty_like(x)
+    rstd = torch.empty(N, dtype=torch.float32, device=x.device)
+
+    BLOCK_D = next_power_of_2(D)
+
+    grid = lambda meta: (  # noqa E731
+        cdiv(N, meta["BLOCK_N"]),
+    )
+    # pyrefly: ignore [not-callable]
+    _weighted_rms_norm_fwd[grid](
+        x,
+        y,
+        weight,
+        rstd,
+        N,
+        D,
+        eps,
+        stride_x=x.stride(0),
+        stride_y=y.stride(0),
+        SILU=silu,
+        BLOCK_D=BLOCK_D,
+    )
+
+    return y
+
+
+def _pytorch_rms_norm_fallback(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    silu: bool,
+) -> torch.Tensor:
+    """PyTorch fallback for RMS norm in eager mode."""
+
+    return pytorch_rms_norm(x, [x.shape[-1]], weight, eps, silu)
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_rms_norm(
+    x: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float,
+    silu: bool,
+) -> torch.Tensor:
+    """AOT-T wrapper for RMS norm.
+
+    Routes between PyTorch fallback (for tracing/serialization) and AOTT kernel path.
+    """
+    if should_trigger_eager_impl():
+        return _pytorch_rms_norm_fallback(x, weight, eps, silu)
+    else:
+        return _triton_aot_rms_norm(x, weight, eps, silu)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py
new file mode 100644
index 000000000..9aa0c655b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py
@@ -0,0 +1,138 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-strict
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.common import (
+    fx_unwrap_optional_tensor,
+    next_power_of_2,
+    should_trigger_eager_impl,
+)
+from generative_recommenders.ops.pytorch.pt_jagged_tensors import (
+    pytorch_split_2D_jagged,
+)
+from generative_recommenders.ops.triton.triton_jagged import split_2D_jagged
+from generative_recommenders.ops.triton_aot.types import triton_aot
+
+
+split_2D_jagged = triton_aot(
+    annotations={
+        "DenseSize": "i32",
+        "D": ("i32", 16),
+        "stride_id": ("i32", 16),
+        "stride_ad": ("i32", 16),
+        "stride_bd": ("i32", 16),
+    },
+    # pyrefly: ignore [bad-argument-type]
+)(split_2D_jagged)
+
+
+@torch.jit.unused
+@torch.fx.wrap
+def _triton_aot_split_2D_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: torch.Tensor,
+    offsets_b: torch.Tensor,
+    dense_size: int = 0,
+    is_dense_a: bool = False,
+    is_dense_b: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    _, D = values.shape
+    BLOCK_D = next_power_of_2(D)
+
+    if is_dense_a:
+        L, _ = values.shape
+        B = offsets_b.size(0) - 1
+        seq_len_a = dense_size * B
+        seq_len_b = L - seq_len_a
+    elif is_dense_b:
+        L, _ = values.shape
+        B = offsets_a.size(0) - 1
+        seq_len_b = dense_size * B
+        seq_len_a = L - seq_len_b
+    else:
+        B = offsets_a.size(0) - 1
+        seq_len_a = int(offsets_a[-1].item())
+        seq_len_b = int(offsets_b[-1].item())
+
+    values_a = torch.empty((seq_len_a, D), device=values.device, dtype=values.dtype)
+    values_b = torch.empty((seq_len_b, D), device=values.device, dtype=values.dtype)
+
+    grid = (max_seq_len, B)
+    # pyre-ignore[29]: TritonAOT.__getitem__ is callable at runtime
+    split_2D_jagged[grid](
+        JaggedIn=values,
+        DenseSize=dense_size,
+        OffsetsA=offsets_a,
+        OffsetsB=offsets_b,
+        OutA=values_a,
+        OutB=values_b,
+        D=D,
+        stride_id=values.stride(0),
+        stride_ad=values_a.stride(0),
+        stride_bd=values_b.stride(0),
+        # pyrefly: ignore [bad-argument-type]
+        IS_DENSE_A=is_dense_a,
+        # pyrefly: ignore [bad-argument-type]
+        IS_DENSE_B=is_dense_b,
+        # pyrefly: ignore [bad-argument-type]
+        BLOCK_D=BLOCK_D,
+        # pyrefly: ignore [bad-argument-type]
+        IS_REPLACE=False,
+    )
+
+    if is_dense_a:
+        values_a = values_a.reshape(B, dense_size, D)
+    if is_dense_b:
+        values_b = values_b.reshape(B, dense_size, D)
+
+    return values_a, values_b
+
+
+@torch.fx.wrap
+def aot_triton_kernel_wrapper_split_2D_jagged(
+    values: torch.Tensor,
+    max_seq_len: int,
+    offsets_a: Optional[torch.Tensor] = None,
+    offsets_b: Optional[torch.Tensor] = None,
+    dense_size: int = 0,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    if should_trigger_eager_impl():
+        assert offsets_a is not None and offsets_b is not None, (
+            "Eager fallback requires both offsets_a and offsets_b"
+        )
+        return pytorch_split_2D_jagged(
+            max_seq_len=max_seq_len,
+            values=values,
+            max_len_left=None,
+            max_len_right=None,
+            offsets_left=offsets_a,
+            offsets_right=offsets_b,
+        )
+    else:
+        is_dense_a: bool = offsets_a is None
+        is_dense_b: bool = offsets_b is None
+        resolved_offsets_a: torch.Tensor = values.new_empty(0)
+        resolved_offsets_b: torch.Tensor = values.new_empty(0)
+        if is_dense_a:
+            resolved_offsets_b = fx_unwrap_optional_tensor(offsets_b)
+            resolved_offsets_a = resolved_offsets_b.new_empty(0)
+        elif is_dense_b:
+            resolved_offsets_a = fx_unwrap_optional_tensor(offsets_a)
+            resolved_offsets_b = resolved_offsets_a.new_empty(0)
+        else:
+            resolved_offsets_a = fx_unwrap_optional_tensor(offsets_a)
+            resolved_offsets_b = fx_unwrap_optional_tensor(offsets_b)
+
+        return _triton_aot_split_2D_jagged(
+            values=values,
+            max_seq_len=max_seq_len,
+            offsets_a=resolved_offsets_a,
+            offsets_b=resolved_offsets_b,
+            dense_size=dense_size,
+            is_dense_a=is_dense_a,
+            is_dense_b=is_dense_b,
+        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/types.py b/recommendation_v4/generative_recommenders/ops/triton_aot/types.py
new file mode 100644
index 000000000..bd7b2a1ed
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton_aot/types.py
@@ -0,0 +1,181 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+
+# pyre-strict
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from typing import Any, Callable, ClassVar, Dict, List, Optional, Protocol, Union
+
+from generative_recommenders.ops.triton_aot.compile.utils import is_autotuner
+
+# @manual=//triton:triton
+from triton.runtime.jit import KernelInterface
+# triton.fb.triton_util depends on torch
+# @dep=//caffe2:_torch
+
+
+_VALID_HINTS: frozenset[int] = frozenset({1, 8, 16})
+_VALID_POINTER_HINTS: frozenset[int] = frozenset({16})
+
+
+@dataclass(frozen=True)
+class AnnotationHint:
+    """Annotation with a value hint (dtype + divisibility/alignment).
+
+    Valid hints: 16 (divisible_by_16), 8 (divisible_by_8), 1 (equal_to_1).
+    For pointers (dtype starts with ``*``), only 16 is valid — other values
+    would cause incorrect codegen (e.g. alignment=1 folds the pointer as a
+    constexpr constant, causing a segfault at launch).
+    """
+
+    dtype: str
+    hint: int
+
+    def __post_init__(self) -> None:
+        if self.hint not in _VALID_HINTS:
+            raise RuntimeError(
+                f"TritonAOT: invalid annotation hint {self.hint!r} for "
+                f"dtype {self.dtype!r}. Valid hints: {sorted(_VALID_HINTS)}."
+            )
+        if self.dtype.startswith("*") and self.hint not in _VALID_POINTER_HINTS:
+            raise RuntimeError(
+                f"TritonAOT: invalid pointer alignment {self.hint!r} for "
+                f"dtype {self.dtype!r}. Pointer annotations only support "
+                f"alignment={sorted(_VALID_POINTER_HINTS)}."
+            )
+
+    def to_tuple(self) -> tuple[str, int]:
+        """Convert to plain tuple for raw spec format."""
+        return (self.dtype, self.hint)
+
+
+# Internal annotation type (after normalization).
+Annotation = Union[str, AnnotationHint]
+
+# User-facing input type (also accepts raw tuples).
+AnnotationInput = Union[str, tuple[str, int], AnnotationHint]
+
+
+def _normalize_annotation(ann: AnnotationInput) -> Annotation:
+    """Convert a raw tuple to AnnotationHint (triggers validation)."""
+    if isinstance(ann, AnnotationHint):
+        return ann
+    if isinstance(ann, tuple):
+        return AnnotationHint(ann[0], ann[1])
+    return ann
+
+
+class SpecCollector(Protocol):
+    """Callback invoked by TritonAOT.run() to collect kernel specs during AOT compile."""
+
+    def __call__(
+        self,
+        fn: KernelInterface[List[Any]],
+        annotations: Dict[str, Annotation],
+        *args: Any,
+        **kwargs: Any,
+    ) -> None: ...
+
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class TritonAOTMeta(type):
+    # TODO consider merge with AOTTCompileState
+    def __init__(cls, name, bases, attrs):  # pyre-ignore [2,3]
+        super().__init__(name, bases, attrs)
+        # Initialize an empty list for each new class created
+        cls._instances: List["TritonAOT"] = []
+
+    def __call__(cls, *args, **kwargs):  # pyre-ignore [2,3]
+        # Create the instance using the default behavior
+        instance = super().__call__(*args, **kwargs)
+        # Store the instance in the class-specific list
+        cls._instances.append(instance)
+        return instance
+
+    def get_instances(cls) -> List["TritonAOT"]:
+        return cls._instances
+
+
+class TritonAOT(KernelInterface[List[Any]], metaclass=TritonAOTMeta):
+    """Wraps a Triton kernel for ahead-of-time compilation.
+
+    Annotations specify dtype and optional value hints for kernel parameters:
+
+    - Scalar:  ``"i32"``, ``"fp32"``, or ``AnnotationHint("i32", 16)``
+      where 16 means the runtime value is divisible by 16.
+    - Pointer: ``AnnotationHint("*fp32", 16)`` for 16-byte aligned tensors.
+      Only alignment=16 is valid for pointers.
+    - Tensor:  typically inferred from runtime ``torch.Tensor.dtype``.
+    - Optional tensor:  auto-detected when the same kernel is called
+      with a tensor at one site and ``None`` at another.
+    """
+
+    _spec_collector: ClassVar[Optional[SpecCollector]] = None
+
+    def __init__(
+        self,
+        fn: KernelInterface[List[Any]],
+        annotations: Dict[str, AnnotationInput],
+    ) -> None:
+        self.fn: KernelInterface[List[Any]] = fn
+        self.annotations: Dict[str, Annotation] = {
+            k: _normalize_annotation(v) for k, v in annotations.items()
+        }
+
+    @classmethod
+    def set_spec_collector(cls, collector: Optional[SpecCollector]) -> None:
+        """Register or unregister the spec collection callback.
+
+        When a collector is registered (not None), TritonAOT.run() will call
+        it to collect kernel specs for AOT compilation.  When None, run()
+        simply delegates to the underlying Triton kernel (normal JIT path).
+        """
+        cls._spec_collector = collector
+
+    # pyrefly: ignore [bad-override]
+    def run(self, *args: Any, **kwargs: Any) -> Any:
+        if self._spec_collector is not None:
+            self._spec_collector(self.fn, self.annotations, *args, **kwargs)
+        # pyre-ignore[29]: KernelInterface.run is callable at runtime
+        return self.fn.run(*args, **kwargs)
+
+
+def triton_aot(
+    annotations: Dict[str, AnnotationInput],
+) -> Callable[[KernelInterface[List[Any]]], TritonAOT]:
+    def decorator(fn: KernelInterface[List[Any]]) -> TritonAOT:
+        return TritonAOT(fn, annotations)
+
+    return decorator
+
+
+def get_all_triton_aot_instances() -> List[TritonAOT]:
+    """Return all triton aot function instances (e.g. decorated with @triton_aot)."""
+    return TritonAOT.get_instances()
+
+
+def reset_all_triton_aot_autotune_cache() -> bool:
+    """Reset triton autotune cache for all triton aot kernels.
+
+    If triton aot compile is not enabled, this function is no op. Return True if any
+    kernel's autotune cache is reset. Else return False.
+
+    """
+    if TritonAOT._spec_collector is None:
+        return False
+
+    reset = False
+    for triton_aot_kernel in get_all_triton_aot_instances():
+        if is_autotuner(triton_aot_kernel.fn):
+            autotune_fn = triton_aot_kernel.fn
+            autotune_fn.cache.clear()  # pyre-ignore [16]
+            logger.info(
+                f"Reset autotune cache for triton kernel {autotune_fn.fn.__name__}"  # pyre-ignore [16]
+            )
+            reset = True
+
+    return reset
diff --git a/recommendation_v4/generative_recommenders/ops/utils.py b/recommendation_v4/generative_recommenders/ops/utils.py
new file mode 100644
index 000000000..94ab69e30
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/utils.py
@@ -0,0 +1,93 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-ignore-all-errors
+
+import functools
+import os
+
+import torch
+
+
+class _PlainFuncWrapper:
+    """Thin wrapper around a plain function that provides no-op register_fake
+    and register_kernel methods, mirroring the CustomOpDef API so that
+    downstream @func.register_fake / func.register_kernel("cpu") calls
+    don't break when the function is not wrapped as a custom op."""
+
+    def __init__(self, func):
+        self._func = func
+        functools.update_wrapper(self, func)
+
+    def __call__(self, *args, **kwargs):
+        return self._func(*args, **kwargs)
+
+    def register_fake(self, fake_func):
+        return fake_func
+
+    def register_kernel(self, device):
+        def inner(func):
+            return func
+
+        return inner
+
+
+def maybe_register_custom_op(op_name, mutates_args):
+    """
+    Conditionally registers a function as a torch custom op.
+
+    When AOTI_LOWER is set in the environment, the function is returned
+    unwrapped so that torch.export / Dynamo can trace through the plain
+    Python implementation instead of treating the custom op as opaque.
+    """
+
+    def decorator(func):
+        if os.environ.get("AOTI_LOWER"):
+            return _PlainFuncWrapper(func)
+        return torch.library.custom_op(op_name, func, mutates_args=mutates_args)
+
+    return decorator
+
+
+def is_sm100_plus() -> bool:
+    """
+    Check if this is a Blackwell Datacenter GPU.
+    These are between 100 and 103 for B200-GB300.
+    """
+    if not torch.cuda.is_available():
+        return False
+    props = torch.cuda.get_device_properties(0)
+    return props.major == 10 and (props.minor >= 0 and props.minor <= 3)
+
+
+def is_sm90() -> bool:
+    if not torch.cuda.is_available():
+        return False
+    props = torch.cuda.get_device_properties(0)
+    return props.major == 9 and props.minor == 0
+
+
+def is_sm90_plus() -> bool:
+    return is_sm100_plus() or is_sm90()
+
+
+def copy_if_different_ptr(dst: torch.Tensor, src: torch.Tensor) -> None:
+    if torch.compiler.is_compiling():
+        # .data_ptr() will break PT2
+        dst.copy_(src)
+    else:
+        if dst.data_ptr() != src.data_ptr():
+            dst.copy_(src)
diff --git a/recommendation_v4/generative_recommenders/research/data/dataset.py b/recommendation_v4/generative_recommenders/research/data/dataset.py
new file mode 100644
index 000000000..09a18ae01
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/dataset.py
@@ -0,0 +1,248 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import csv
+import linecache
+from typing import Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+import torch
+
+
+class DatasetV2(torch.utils.data.Dataset):
+    """In reverse chronological order."""
+
+    def __init__(
+        self,
+        ratings_file: str,
+        padding_length: int,
+        ignore_last_n: int,  # used for creating train/valid/test sets
+        shift_id_by: int = 0,
+        chronological: bool = False,
+        sample_ratio: float = 1.0,
+    ) -> None:
+        """
+        Args:
+            csv_file (string): Path to the csv file.
+        """
+        super().__init__()
+
+        self.ratings_frame: pd.DataFrame = pd.read_csv(
+            ratings_file,
+            delimiter=",",
+            # iterator=True,
+        )
+        self._padding_length: int = padding_length
+        self._ignore_last_n: int = ignore_last_n
+        self._cache: Dict[int, Dict[str, torch.Tensor]] = dict()
+        self._shift_id_by: int = shift_id_by
+        self._chronological: bool = chronological
+        self._sample_ratio: float = sample_ratio
+
+    def __len__(self) -> int:
+        return len(self.ratings_frame)
+
+    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
+        if idx in self._cache.keys():
+            return self._cache[idx]
+        data = self.ratings_frame.iloc[idx]
+        sample = self.load_item(data)
+        self._cache[idx] = sample
+        return sample
+
+    def load_item(self, data) -> Dict[str, torch.Tensor]:
+        user_id = data.user_id
+
+        def eval_as_list(x: str, ignore_last_n: int) -> List[int]:
+            y = eval(x)
+            y_list = [y] if type(y) == int else list(y)
+            if ignore_last_n > 0:
+                # for training data creation
+                y_list = y_list[:-ignore_last_n]
+            return y_list
+
+        def eval_int_list(
+            x: str,
+            target_len: int,
+            ignore_last_n: int,
+            shift_id_by: int,
+            sampling_kept_mask: Optional[List[bool]],
+        ) -> Tuple[List[int], int]:
+            y = eval_as_list(x, ignore_last_n=ignore_last_n)
+            if sampling_kept_mask is not None:
+                y = [x for x, kept in zip(y, sampling_kept_mask) if kept]
+            y_len = len(y)
+            y.reverse()
+            if shift_id_by > 0:
+                y = [x + shift_id_by for x in y]
+            return y, y_len
+
+        if self._sample_ratio < 1.0:
+            raw_length = len(eval_as_list(data.sequence_item_ids, self._ignore_last_n))
+            sampling_kept_mask = (
+                torch.rand((raw_length,), dtype=torch.float32) < self._sample_ratio
+            ).tolist()
+        else:
+            sampling_kept_mask = None
+
+        movie_history, movie_history_len = eval_int_list(
+            data.sequence_item_ids,
+            self._padding_length,
+            self._ignore_last_n,
+            shift_id_by=self._shift_id_by,
+            sampling_kept_mask=sampling_kept_mask,
+        )
+        movie_history_ratings, ratings_len = eval_int_list(
+            data.sequence_ratings,
+            self._padding_length,
+            self._ignore_last_n,
+            0,
+            sampling_kept_mask=sampling_kept_mask,
+        )
+        movie_timestamps, timestamps_len = eval_int_list(
+            data.sequence_timestamps,
+            self._padding_length,
+            self._ignore_last_n,
+            0,
+            sampling_kept_mask=sampling_kept_mask,
+        )
+        assert movie_history_len == timestamps_len, (
+            f"history len {movie_history_len} differs from timestamp len {timestamps_len}."
+        )
+        assert movie_history_len == ratings_len, (
+            f"history len {movie_history_len} differs from ratings len {ratings_len}."
+        )
+
+        def _truncate_or_pad_seq(
+            y: List[int], target_len: int, chronological: bool
+        ) -> List[int]:
+            y_len = len(y)
+            if y_len < target_len:
+                y = y + [0] * (target_len - y_len)
+            else:
+                if not chronological:
+                    y = y[:target_len]
+                else:
+                    y = y[-target_len:]
+            assert len(y) == target_len
+            return y
+
+        historical_ids = movie_history[1:]
+        historical_ratings = movie_history_ratings[1:]
+        historical_timestamps = movie_timestamps[1:]
+        target_ids = movie_history[0]
+        target_ratings = movie_history_ratings[0]
+        target_timestamps = movie_timestamps[0]
+        if self._chronological:
+            historical_ids.reverse()
+            historical_ratings.reverse()
+            historical_timestamps.reverse()
+
+        max_seq_len = self._padding_length - 1
+        history_length = min(len(historical_ids), max_seq_len)
+        historical_ids = _truncate_or_pad_seq(
+            historical_ids,
+            max_seq_len,
+            self._chronological,
+        )
+        historical_ratings = _truncate_or_pad_seq(
+            historical_ratings,
+            max_seq_len,
+            self._chronological,
+        )
+        historical_timestamps = _truncate_or_pad_seq(
+            historical_timestamps,
+            max_seq_len,
+            self._chronological,
+        )
+        # moved to features.py
+        # if self._chronological:
+        #     historical_ids.append(0)
+        #     historical_ratings.append(0)
+        #     historical_timestamps.append(0)
+        # print(historical_ids, historical_ratings, historical_timestamps, target_ids, target_ratings, target_timestamps)
+        ret = {
+            "user_id": user_id,
+            "historical_ids": torch.tensor(historical_ids, dtype=torch.int64),
+            "historical_ratings": torch.tensor(historical_ratings, dtype=torch.int64),
+            "historical_timestamps": torch.tensor(
+                historical_timestamps, dtype=torch.int64
+            ),
+            "history_lengths": history_length,
+            "target_ids": target_ids,
+            "target_ratings": target_ratings,
+            "target_timestamps": target_timestamps,
+        }
+        return ret
+
+
+class MultiFileDatasetV2(DatasetV2, torch.utils.data.Dataset):
+    def __init__(
+        self,
+        file_prefix: str,
+        num_files: int,
+        padding_length: int,
+        ignore_last_n: int,  # used for creating train/valid/test sets
+        shift_id_by: int = 0,
+        chronological: bool = False,
+        sample_ratio: float = 1.0,
+    ) -> None:
+        torch.utils.data.Dataset().__init__()
+        self._file_prefix: str = file_prefix
+        self._num_files: int = num_files
+        with open(f"{file_prefix}_users.csv", "r") as file:
+            reader = csv.reader(file)
+            self.users_cumsum: List[int] = np.cumsum(
+                [int(row[1]) for row in reader]
+            ).tolist()
+        self._padding_length: int = padding_length
+        self._ignore_last_n: int = ignore_last_n
+        self._shift_id_by: int = shift_id_by
+        self._chronological: bool = chronological
+        self._sample_ratio: float = sample_ratio
+
+    def __len__(self) -> int:
+        return self.users_cumsum[-1]
+
+    def _process_line(self, line: str) -> pd.Series:
+        reader = csv.reader([line])
+        parsed_line = next(reader)
+        user_id = int(parsed_line[0])
+        sequence_item_ids = parsed_line[1]
+        sequence_ratings = parsed_line[2]
+        return pd.Series(
+            data={
+                "user_id": user_id,
+                "sequence_item_ids": sequence_item_ids,
+                "sequence_ratings": sequence_ratings,
+                "sequence_timestamps": sequence_item_ids,  # placeholder
+            }
+        )
+
+    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
+        assert idx < self.users_cumsum[-1]
+        file_idx: int = 0
+        while self.users_cumsum[file_idx] <= idx:
+            file_idx += 1
+        if file_idx == 0:
+            local_idx = idx
+        else:
+            local_idx = idx - self.users_cumsum[file_idx - 1]
+        line = linecache.getline(f"{self._file_prefix}_{file_idx}.csv", local_idx + 1)
+        data = self._process_line(line)
+        sample = self.load_item(data)
+        return sample
diff --git a/recommendation_v4/generative_recommenders/research/data/eval.py b/recommendation_v4/generative_recommenders/research/data/eval.py
new file mode 100644
index 000000000..16026e5c7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/eval.py
@@ -0,0 +1,263 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Callable, Dict, List, Optional, Set, Union
+
+import torch
+import torch.distributed as dist
+from generative_recommenders.research.indexing.candidate_index import (
+    CandidateIndex,
+    TopKModule,
+)
+from generative_recommenders.research.modeling.sequential.features import (
+    SequentialFeatures,
+)
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+from torch.utils.tensorboard import SummaryWriter
+
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+@dataclass
+class EvalState:
+    all_item_ids: Set[int]
+    candidate_index: CandidateIndex
+    top_k_module: TopKModule
+
+
+def get_eval_state(
+    model: SimilarityModule,
+    all_item_ids: List[int],  # [X]
+    negatives_sampler: torch.nn.Module,
+    top_k_module_fn: Callable[[torch.Tensor, torch.Tensor], TopKModule],
+    device: int,
+    float_dtype: Optional[torch.dtype] = None,
+) -> EvalState:
+    # Exhaustively eval all items (incl. seen ids).
+    eval_negatives_ids = torch.as_tensor(all_item_ids).to(device).unsqueeze(0)  # [1, X]
+    # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
+    eval_negative_embeddings = negatives_sampler.normalize_embeddings(
+        # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
+        model.get_item_embeddings(eval_negatives_ids)
+    )
+    if float_dtype is not None:
+        eval_negative_embeddings = eval_negative_embeddings.to(float_dtype)
+    candidates = CandidateIndex(
+        ids=eval_negatives_ids,
+        embeddings=eval_negative_embeddings,
+    )
+    return EvalState(
+        all_item_ids=set(all_item_ids),
+        candidate_index=candidates,
+        top_k_module=top_k_module_fn(eval_negative_embeddings, eval_negatives_ids),
+    )
+
+
+@torch.inference_mode  # pyre-ignore [56]
+def eval_metrics_v2_from_tensors(
+    eval_state: EvalState,
+    model: SimilarityModule,
+    seq_features: SequentialFeatures,
+    target_ids: torch.Tensor,  # [B, 1]
+    min_positive_rating: int = 4,
+    target_ratings: Optional[torch.Tensor] = None,  # [B, 1]
+    epoch: Optional[str] = None,
+    filter_invalid_ids: bool = True,
+    user_max_batch_size: Optional[int] = None,
+    dtype: Optional[torch.dtype] = None,
+) -> Dict[str, Union[float, torch.Tensor]]:
+    """
+    Args:
+        eval_negatives_ids: Optional[Tensor]. If not present, defaults to eval over
+            the entire corpus (`num_items`) excluding all the items that users have
+            seen in the past (historical_ids, target_ids). This is consistent with
+            papers like SASRec and TDM but may not be fair in practice as retrieval
+            modules don't have access to read state during the initial fetch stage.
+        filter_invalid_ids: bool. If true, filters seen ids by default.
+    Returns:
+        keyed metric -> list of values for each example.
+    """
+    B, _ = target_ids.shape
+    device = target_ids.device
+
+    for target_id in target_ids:
+        target_id = int(target_id)
+        if target_id not in eval_state.all_item_ids:
+            print(f"missing target_id {target_id}")
+
+    # computes ro- part exactly once.
+    # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
+    shared_input_embeddings = model.encode(
+        past_lengths=seq_features.past_lengths,
+        past_ids=seq_features.past_ids,
+        # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
+        past_embeddings=model.get_item_embeddings(seq_features.past_ids),
+        past_payloads=seq_features.past_payloads,
+    )
+    if dtype is not None:
+        shared_input_embeddings = shared_input_embeddings.to(dtype)
+
+    MAX_K = 2500
+    k = min(MAX_K, eval_state.candidate_index.ids.size(1))
+    user_max_batch_size = user_max_batch_size or shared_input_embeddings.size(0)
+    num_batches = (
+        shared_input_embeddings.size(0) + user_max_batch_size - 1
+    ) // user_max_batch_size
+    eval_top_k_ids_all = []
+    eval_top_k_prs_all = []
+    for mb in range(num_batches):
+        eval_top_k_ids, eval_top_k_prs, _ = (
+            eval_state.candidate_index.get_top_k_outputs(
+                query_embeddings=shared_input_embeddings[
+                    mb * user_max_batch_size : (mb + 1) * user_max_batch_size, ...
+                ],
+                top_k_module=eval_state.top_k_module,
+                k=k,
+                invalid_ids=(
+                    seq_features.past_ids[
+                        mb * user_max_batch_size : (mb + 1) * user_max_batch_size, :
+                    ]
+                    if filter_invalid_ids
+                    else None
+                ),
+                return_embeddings=False,
+            )
+        )
+        eval_top_k_ids_all.append(eval_top_k_ids)
+        eval_top_k_prs_all.append(eval_top_k_prs)
+
+    if num_batches == 1:
+        eval_top_k_ids = eval_top_k_ids_all[0]
+        eval_top_k_prs = eval_top_k_prs_all[0]
+    else:
+        eval_top_k_ids = torch.cat(eval_top_k_ids_all, dim=0)
+        eval_top_k_prs = torch.cat(eval_top_k_prs_all, dim=0)
+
+    assert eval_top_k_ids.size(1) == k
+    _, eval_rank_indices = torch.max(
+        torch.cat(
+            [eval_top_k_ids, target_ids],
+            dim=1,
+        )
+        == target_ids,
+        dim=1,
+    )
+    eval_ranks = torch.where(eval_rank_indices == k, MAX_K + 1, eval_rank_indices + 1)
+
+    output = {
+        "ndcg@1": torch.where(
+            eval_ranks <= 1,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "ndcg@10": torch.where(
+            eval_ranks <= 10,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "ndcg@50": torch.where(
+            eval_ranks <= 50,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "ndcg@100": torch.where(
+            eval_ranks <= 100,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "ndcg@200": torch.where(
+            eval_ranks <= 200,
+            torch.div(1.0, torch.log2(eval_ranks + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        ),
+        "hr@1": (eval_ranks <= 1),
+        "hr@10": (eval_ranks <= 10),
+        "hr@50": (eval_ranks <= 50),
+        "hr@100": (eval_ranks <= 100),
+        "hr@200": (eval_ranks <= 200),
+        "hr@500": (eval_ranks <= 500),
+        "hr@1000": (eval_ranks <= 1000),
+        "mrr": torch.div(1.0, eval_ranks),
+    }
+    if target_ratings is not None:
+        target_ratings = target_ratings.squeeze(1)  # [B]
+        output["ndcg@10_>=4"] = torch.where(
+            eval_ranks[target_ratings >= 4] <= 10,
+            torch.div(1.0, torch.log2(eval_ranks[target_ratings >= 4] + 1)),
+            torch.zeros(1, dtype=torch.float32, device=device),
+        )
+        output[f"hr@10_>={min_positive_rating}"] = (
+            eval_ranks[target_ratings >= min_positive_rating] <= 10
+        )
+        output[f"hr@50_>={min_positive_rating}"] = (
+            eval_ranks[target_ratings >= min_positive_rating] <= 50
+        )
+        output[f"mrr_>={min_positive_rating}"] = torch.div(
+            1.0, eval_ranks[target_ratings >= min_positive_rating]
+        )
+
+    return output  # pyre-ignore [7]
+
+
+def eval_recall_metrics_from_tensors(
+    eval_state: EvalState,
+    model: SimilarityModule,
+    seq_features: SequentialFeatures,
+    user_max_batch_size: Optional[int] = None,
+    dtype: Optional[torch.dtype] = None,
+) -> Dict[str, torch.Tensor]:
+    target_ids = seq_features.past_ids[:, -1].unsqueeze(1)
+    filtered_past_ids = seq_features.past_ids.detach().clone()
+    filtered_past_ids[:, -1] = torch.zeros_like(target_ids.squeeze(1))
+    return eval_metrics_v2_from_tensors(
+        eval_state=eval_state,
+        model=model,
+        seq_features=SequentialFeatures(
+            past_lengths=seq_features.past_lengths - 1,
+            past_ids=filtered_past_ids,
+            past_embeddings=seq_features.past_embeddings,
+            past_payloads=seq_features.past_payloads,
+        ),
+        target_ids=target_ids,
+        user_max_batch_size=user_max_batch_size,
+        dtype=dtype,
+    )
+
+
+def _avg(x: torch.Tensor, world_size: int) -> torch.Tensor:
+    _sum_and_numel = torch.tensor(
+        [x.sum(), x.numel()], dtype=torch.float32, device=x.device
+    )
+    if world_size > 1:
+        dist.all_reduce(_sum_and_numel, op=dist.ReduceOp.SUM)
+    return _sum_and_numel[0] / _sum_and_numel[1]
+
+
+def add_to_summary_writer(
+    writer: Optional[SummaryWriter],
+    batch_id: int,
+    metrics: Dict[str, torch.Tensor],
+    prefix: str,
+    world_size: int,
+) -> None:
+    for key, values in metrics.items():
+        avg_value = _avg(values, world_size)
+        if writer is not None:
+            writer.add_scalar(f"{prefix}/{key}", avg_value, batch_id)
diff --git a/recommendation_v4/generative_recommenders/research/data/item_features.py b/recommendation_v4/generative_recommenders/research/data/item_features.py
new file mode 100644
index 000000000..8ecb6ea6a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/item_features.py
@@ -0,0 +1,29 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import List
+
+import torch
+
+
+@dataclass
+class ItemFeatures:
+    num_items: int
+    max_jagged_dimension: int
+    max_ind_range: List[int]  # [(,)] x num_features
+    lengths: List[torch.Tensor]  # [(num_items,)] x num_features
+    values: List[torch.Tensor]  # [(num_items, max_jagged_dimension)] x num_features
diff --git a/recommendation_v4/generative_recommenders/research/data/preprocessor.py b/recommendation_v4/generative_recommenders/research/data/preprocessor.py
new file mode 100644
index 000000000..bf52f41da
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/preprocessor.py
@@ -0,0 +1,474 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+import logging
+import os
+import sys
+import tarfile
+from typing import Dict, Optional, Union
+from urllib.request import urlretrieve
+from zipfile import ZipFile
+
+import numpy as np
+import pandas as pd
+
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+class DataProcessor:
+    """
+    This preprocessor does not remap item_ids. This is intended so that we can easily join other
+    side-information based on item_ids later.
+    """
+
+    def __init__(
+        self,
+        prefix: str,
+        expected_num_unique_items: Optional[int],
+        expected_max_item_id: Optional[int],
+    ) -> None:
+        self._prefix: str = prefix
+        self._expected_num_unique_items = expected_num_unique_items
+        self._expected_max_item_id = expected_max_item_id
+
+    @abc.abstractmethod
+    def expected_num_unique_items(self) -> Optional[int]:
+        return self._expected_num_unique_items
+
+    @abc.abstractmethod
+    def expected_max_item_id(self) -> Optional[int]:
+        return self._expected_max_item_id
+
+    @abc.abstractmethod
+    def processed_item_csv(self) -> str:
+        pass
+
+    def output_format_csv(self) -> str:
+        return f"tmp/{self._prefix}/sasrec_format.csv"
+
+    def to_seq_data(
+        self,
+        ratings_data: pd.DataFrame,
+        user_data: Optional[pd.DataFrame] = None,
+    ) -> pd.DataFrame:
+        if user_data is not None:
+            ratings_data_transformed = ratings_data.join(
+                user_data.set_index("user_id"), on="user_id"
+            )
+        else:
+            ratings_data_transformed = ratings_data
+        ratings_data_transformed.item_ids = ratings_data_transformed.item_ids.apply(
+            lambda x: ",".join([str(v) for v in x])
+        )
+        ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply(
+            lambda x: ",".join([str(v) for v in x])
+        )
+        ratings_data_transformed.timestamps = ratings_data_transformed.timestamps.apply(
+            lambda x: ",".join([str(v) for v in x])
+        )
+        ratings_data_transformed.rename(
+            columns={
+                "item_ids": "sequence_item_ids",
+                "ratings": "sequence_ratings",
+                "timestamps": "sequence_timestamps",
+            },
+            inplace=True,
+        )
+        return ratings_data_transformed
+
+    def file_exists(self, name: str) -> bool:
+        return os.path.isfile("%s/%s" % (os.getcwd(), name))
+
+
+class MovielensSyntheticDataProcessor(DataProcessor):
+    def __init__(
+        self,
+        prefix: str,
+        expected_num_unique_items: Optional[int] = None,
+        expected_max_item_id: Optional[int] = None,
+    ) -> None:
+        super().__init__(prefix, expected_num_unique_items, expected_max_item_id)
+
+    def preprocess_rating(self) -> None:
+        return
+
+
+class MovielensDataProcessor(DataProcessor):
+    def __init__(
+        self,
+        download_path: str,
+        saved_name: str,
+        prefix: str,
+        convert_timestamp: bool,
+        expected_num_unique_items: Optional[int] = None,
+        expected_max_item_id: Optional[int] = None,
+    ) -> None:
+        super().__init__(prefix, expected_num_unique_items, expected_max_item_id)
+        self._download_path = download_path
+        self._saved_name = saved_name
+        self._convert_timestamp: bool = convert_timestamp
+
+    def download(self) -> None:
+        if not self.file_exists(self._saved_name):
+            urlretrieve(self._download_path, self._saved_name)
+        if self._saved_name[-4:] == ".zip":
+            ZipFile(self._saved_name, "r").extractall(path="tmp/")
+        else:
+            with tarfile.open(self._saved_name, "r:*") as tar_ref:
+                tar_ref.extractall("tmp/")
+
+    def processed_item_csv(self) -> str:
+        return f"tmp/processed/{self._prefix}/movies.csv"
+
+    def sasrec_format_csv_by_user_train(self) -> str:
+        return f"tmp/{self._prefix}/sasrec_format_by_user_train.csv"
+
+    def sasrec_format_csv_by_user_test(self) -> str:
+        return f"tmp/{self._prefix}/sasrec_format_by_user_test.csv"
+
+    def preprocess_rating(self) -> int:
+        self.download()
+
+        if self._prefix == "ml-1m":
+            users = pd.read_csv(
+                f"tmp/{self._prefix}/users.dat",
+                sep="::",
+                names=["user_id", "sex", "age_group", "occupation", "zip_code"],
+            )
+            ratings = pd.read_csv(
+                f"tmp/{self._prefix}/ratings.dat",
+                sep="::",
+                names=["user_id", "movie_id", "rating", "unix_timestamp"],
+            )
+            movies = pd.read_csv(
+                f"tmp/{self._prefix}/movies.dat",
+                sep="::",
+                names=["movie_id", "title", "genres"],
+                encoding="iso-8859-1",
+            )
+        elif self._prefix == "ml-20m":
+            # ml-20m
+            # ml-20m doesn't have user data.
+            users = None
+            # ratings: userId,movieId,rating,timestamp
+            ratings = pd.read_csv(
+                f"tmp/{self._prefix}/ratings.csv",
+                sep=",",
+            )
+            ratings.rename(
+                columns={
+                    "userId": "user_id",
+                    "movieId": "movie_id",
+                    "timestamp": "unix_timestamp",
+                },
+                inplace=True,
+            )
+            # movieId,title,genres
+            # 1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
+            # 2,Jumanji (1995),Adventure|Children|Fantasy
+            movies = pd.read_csv(
+                f"tmp/{self._prefix}/movies.csv",
+                sep=",",
+                encoding="iso-8859-1",
+            )
+            movies.rename(columns={"movieId": "movie_id"}, inplace=True)
+        else:
+            assert self._prefix == "ml-20mx16x32"
+            # ml-1b
+            user_ids = []
+            movie_ids = []
+            for i in range(16):
+                train_file = f"tmp/{self._prefix}/trainx16x32_{i}.npz"
+                with np.load(train_file) as data:
+                    user_ids.extend([x[0] for x in data["arr_0"]])
+                    movie_ids.extend([x[1] for x in data["arr_0"]])
+            ratings = pd.DataFrame(
+                data={
+                    "user_id": user_ids,
+                    "movie_id": movie_ids,
+                    "rating": user_ids,  # placeholder
+                    "unix_timestamp": movie_ids,  # placeholder
+                }
+            )
+            users = None
+            movies = None
+
+        if movies is not None:
+            # ML-1M and ML-20M only
+            movies["year"] = movies["title"].apply(lambda x: x[-5:-1])
+            movies["cleaned_title"] = movies["title"].apply(lambda x: x[:-7])
+            # movies.year = pd.Categorical(movies.year)
+            # movies["year"] = movies.year.cat.codes
+
+        if users is not None:
+            ## Users (ml-1m only)
+            users.sex = pd.Categorical(users.sex)
+            users["sex"] = users.sex.cat.codes
+
+            users.age_group = pd.Categorical(users.age_group)
+            users["age_group"] = users.age_group.cat.codes
+
+            users.occupation = pd.Categorical(users.occupation)
+            users["occupation"] = users.occupation.cat.codes
+
+            users.zip_code = pd.Categorical(users.zip_code)
+            users["zip_code"] = users.zip_code.cat.codes
+
+        # Normalize movie ids to speed up training
+        print(
+            f"{self._prefix} #item before normalize: {len(set(ratings['movie_id'].values))}"
+        )
+        print(
+            f"{self._prefix} max item id before normalize: {max(set(ratings['movie_id'].values))}"
+        )
+        # print(f"ratings.movie_id.cat.categories={ratings.movie_id.cat.categories}; {type(ratings.movie_id.cat.categories)}")
+        # print(f"ratings.movie_id.cat.codes={ratings.movie_id.cat.codes}; {type(ratings.movie_id.cat.codes)}")
+        # print(movie_id_to_cat)
+        # ratings["movie_id"] = ratings.movie_id.cat.codes
+        # print(f"{self._prefix} #item after normalize: {len(set(ratings['movie_id'].values))}")
+        # print(f"{self._prefix} max item id after normalize: {max(set(ratings['movie_id'].values))}")
+        # movies["remapped_id"] = movies["movie_id"].apply(lambda x: movie_id_to_cat[x])
+
+        if self._convert_timestamp:
+            ratings["unix_timestamp"] = pd.to_datetime(
+                ratings["unix_timestamp"], unit="s"
+            )
+
+        # Save primary csv's
+        if not os.path.exists(f"tmp/processed/{self._prefix}"):
+            os.makedirs(f"tmp/processed/{self._prefix}")
+        if users is not None:
+            users.to_csv(f"tmp/processed/{self._prefix}/users.csv", index=False)
+        if movies is not None:
+            movies.to_csv(f"tmp/processed/{self._prefix}/movies.csv", index=False)
+        ratings.to_csv(f"tmp/processed/{self._prefix}/ratings.csv", index=False)
+
+        num_unique_users = len(set(ratings["user_id"].values))
+        num_unique_items = len(set(ratings["movie_id"].values))
+
+        # SASRec version
+        ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")
+        seq_ratings_data = pd.DataFrame(
+            data={
+                "user_id": list(ratings_group.groups.keys()),
+                "item_ids": list(ratings_group.movie_id.apply(list)),
+                "ratings": list(ratings_group.rating.apply(list)),
+                "timestamps": list(ratings_group.unix_timestamp.apply(list)),
+            }
+        )
+
+        result = pd.DataFrame([[]])
+        for col in ["item_ids"]:
+            result[col + "_mean"] = seq_ratings_data[col].apply(len).mean()
+            result[col + "_min"] = seq_ratings_data[col].apply(len).min()
+            result[col + "_max"] = seq_ratings_data[col].apply(len).max()
+        print(self._prefix)
+        print(result)
+
+        seq_ratings_data = self.to_seq_data(seq_ratings_data, users)
+        seq_ratings_data.sample(frac=1).reset_index().to_csv(
+            self.output_format_csv(), index=False, sep=","
+        )
+
+        # Split by user ids (not tested yet)
+        user_id_split = int(num_unique_users * 0.9)
+        seq_ratings_data_train = seq_ratings_data[
+            seq_ratings_data["user_id"] <= user_id_split
+        ]
+        seq_ratings_data_train.sample(frac=1).reset_index().to_csv(
+            self.sasrec_format_csv_by_user_train(),
+            index=False,
+            sep=",",
+        )
+        seq_ratings_data_test = seq_ratings_data[
+            seq_ratings_data["user_id"] > user_id_split
+        ]
+        seq_ratings_data_test.sample(frac=1).reset_index().to_csv(
+            self.sasrec_format_csv_by_user_test(), index=False, sep=","
+        )
+        print(
+            f"{self._prefix}: train num user: {len(set(seq_ratings_data_train['user_id'].values))}"
+        )
+        print(
+            f"{self._prefix}: test num user: {len(set(seq_ratings_data_test['user_id'].values))}"
+        )
+
+        # print(seq_ratings_data)
+        if self.expected_num_unique_items() is not None:
+            assert self.expected_num_unique_items() == num_unique_items, (
+                f"Expected items: {self.expected_num_unique_items()}, got: {num_unique_items}"
+            )
+
+        return num_unique_items
+
+
+class AmazonDataProcessor(DataProcessor):
+    def __init__(
+        self,
+        download_path: str,
+        saved_name: str,
+        prefix: str,
+        expected_num_unique_items: Optional[int],
+    ) -> None:
+        super().__init__(
+            prefix,
+            expected_num_unique_items=expected_num_unique_items,
+            expected_max_item_id=None,
+        )
+        self._download_path = download_path
+        self._saved_name = saved_name
+        self._prefix = prefix
+
+    def download(self) -> None:
+        if not self.file_exists(self._saved_name):
+            urlretrieve(self._download_path, self._saved_name)
+
+    def preprocess_rating(self) -> int:
+        self.download()
+
+        ratings = pd.read_csv(
+            self._saved_name,
+            sep=",",
+            names=["user_id", "item_id", "rating", "timestamp"],
+        )
+        print(f"{self._prefix} #data points before filter: {ratings.shape[0]}")
+        print(
+            f"{self._prefix} #user before filter: {len(set(ratings['user_id'].values))}"
+        )
+        print(
+            f"{self._prefix} #item before filter: {len(set(ratings['item_id'].values))}"
+        )
+
+        # filter users and items with presence < 5
+        item_id_count = (
+            ratings["item_id"]
+            .value_counts()
+            .rename_axis("unique_values")
+            .reset_index(name="item_count")
+        )
+        user_id_count = (
+            ratings["user_id"]
+            .value_counts()
+            .rename_axis("unique_values")
+            .reset_index(name="user_count")
+        )
+        ratings = ratings.join(item_id_count.set_index("unique_values"), on="item_id")
+        ratings = ratings.join(user_id_count.set_index("unique_values"), on="user_id")
+        ratings = ratings[ratings["item_count"] >= 5]
+        ratings = ratings[ratings["user_count"] >= 5]
+        print(f"{self._prefix} #data points after filter: {ratings.shape[0]}")
+
+        # categorize user id and item id
+        ratings["item_id"] = pd.Categorical(ratings["item_id"])
+        ratings["item_id"] = ratings["item_id"].cat.codes
+        ratings["user_id"] = pd.Categorical(ratings["user_id"])
+        ratings["user_id"] = ratings["user_id"].cat.codes
+        print(
+            f"{self._prefix} #user after filter: {len(set(ratings['user_id'].values))}"
+        )
+        print(
+            f"{self._prefix} #item ater filter: {len(set(ratings['item_id'].values))}"
+        )
+
+        num_unique_items = len(set(ratings["item_id"].values))
+
+        # SASRec version
+        ratings_group = ratings.sort_values(by=["timestamp"]).groupby("user_id")
+
+        seq_ratings_data = pd.DataFrame(
+            data={
+                "user_id": list(ratings_group.groups.keys()),
+                "item_ids": list(ratings_group.item_id.apply(list)),
+                "ratings": list(ratings_group.rating.apply(list)),
+                "timestamps": list(ratings_group.timestamp.apply(list)),
+            }
+        )
+
+        seq_ratings_data = seq_ratings_data[
+            seq_ratings_data["item_ids"].apply(len) >= 5
+        ]
+
+        result = pd.DataFrame([[]])
+        for col in ["item_ids"]:
+            result[col + "_mean"] = seq_ratings_data[col].apply(len).mean()
+            result[col + "_min"] = seq_ratings_data[col].apply(len).min()
+            result[col + "_max"] = seq_ratings_data[col].apply(len).max()
+        print(self._prefix)
+        print(result)
+
+        if not os.path.exists(f"tmp/{self._prefix}"):
+            os.makedirs(f"tmp/{self._prefix}")
+
+        seq_ratings_data = self.to_seq_data(seq_ratings_data)
+        seq_ratings_data.sample(frac=1).reset_index().to_csv(
+            self.output_format_csv(), index=False, sep=","
+        )
+
+        if self.expected_num_unique_items() is not None:
+            assert self.expected_num_unique_items() == num_unique_items, (
+                f"expected: {self.expected_num_unique_items()}, actual: {num_unique_items}"
+            )
+            logging.info(f"{self.expected_num_unique_items()} unique items.")
+
+        return num_unique_items
+
+
+def get_common_preprocessors() -> Dict[
+    str,
+    Union[AmazonDataProcessor, MovielensDataProcessor, MovielensSyntheticDataProcessor],
+]:
+    ml_1m_dp = MovielensDataProcessor(  # pyre-ignore [45]
+        "http://files.grouplens.org/datasets/movielens/ml-1m.zip",
+        "tmp/movielens1m.zip",
+        prefix="ml-1m",
+        convert_timestamp=False,
+        expected_num_unique_items=3706,
+        expected_max_item_id=3952,
+    )
+    ml_20m_dp = MovielensDataProcessor(  # pyre-ignore [45]
+        "http://files.grouplens.org/datasets/movielens/ml-20m.zip",
+        "tmp/movielens20m.zip",
+        prefix="ml-20m",
+        convert_timestamp=False,
+        expected_num_unique_items=26744,
+        expected_max_item_id=131262,
+    )
+    ml_1b_dp = MovielensDataProcessor(  # pyre-ignore [45]
+        "https://files.grouplens.org/datasets/movielens/ml-20mx16x32.tar",
+        "tmp/movielens1b.tar",
+        prefix="ml-20mx16x32",
+        convert_timestamp=False,
+    )
+    ml_3b_dp = MovielensSyntheticDataProcessor(  # pyre-ignore [45]
+        prefix="ml-3b",
+        expected_num_unique_items=26743 * 32,
+        expected_max_item_id=26743 * 32,
+    )
+    amzn_books_dp = AmazonDataProcessor(  # pyre-ignore [45]
+        "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Books.csv",
+        "tmp/ratings_Books.csv",
+        prefix="amzn_books",
+        expected_num_unique_items=695762,
+    )
+    return {
+        "ml-1m": ml_1m_dp,
+        "ml-20m": ml_20m_dp,
+        "ml-1b": ml_1b_dp,
+        "ml-3b": ml_3b_dp,
+        "amzn-books": amzn_books_dp,
+    }
diff --git a/recommendation_v4/generative_recommenders/research/data/reco_dataset.py b/recommendation_v4/generative_recommenders/research/data/reco_dataset.py
new file mode 100644
index 000000000..eedcdc08a
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/data/reco_dataset.py
@@ -0,0 +1,176 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from dataclasses import dataclass
+from typing import List
+
+import pandas as pd
+import torch
+from generative_recommenders.research.data.dataset import DatasetV2, MultiFileDatasetV2
+from generative_recommenders.research.data.item_features import ItemFeatures
+from generative_recommenders.research.data.preprocessor import get_common_preprocessors
+
+
+@dataclass
+class RecoDataset:
+    max_sequence_length: int
+    num_unique_items: int
+    max_item_id: int
+    all_item_ids: List[int]
+    train_dataset: torch.utils.data.Dataset
+    eval_dataset: torch.utils.data.Dataset
+
+
+def get_reco_dataset(
+    dataset_name: str,
+    max_sequence_length: int,
+    chronological: bool,
+    positional_sampling_ratio: float = 1.0,
+) -> RecoDataset:
+    if dataset_name == "ml-1m":
+        dp = get_common_preprocessors()[dataset_name]
+        train_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=1,
+            chronological=chronological,
+            sample_ratio=positional_sampling_ratio,
+        )
+        eval_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=0,
+            chronological=chronological,
+            sample_ratio=1.0,  # do not sample
+        )
+    elif dataset_name == "ml-20m":
+        dp = get_common_preprocessors()[dataset_name]
+        train_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=1,
+            chronological=chronological,
+        )
+        eval_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=0,
+            chronological=chronological,
+        )
+    elif dataset_name == "ml-3b":
+        dp = get_common_preprocessors()[dataset_name]
+        train_dataset = MultiFileDatasetV2(
+            file_prefix="tmp/ml-3b/16x32",
+            num_files=16,
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=1,
+            chronological=chronological,
+        )
+        eval_dataset = MultiFileDatasetV2(
+            file_prefix="tmp/ml-3b/16x32",
+            num_files=16,
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=0,
+            chronological=chronological,
+        )
+    elif dataset_name == "amzn-books":
+        dp = get_common_preprocessors()[dataset_name]
+        train_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=1,
+            shift_id_by=1,  # [0..n-1] -> [1..n]
+            chronological=chronological,
+        )
+        eval_dataset = DatasetV2(
+            ratings_file=dp.output_format_csv(),
+            padding_length=max_sequence_length + 1,  # target
+            ignore_last_n=0,
+            shift_id_by=1,  # [0..n-1] -> [1..n]
+            chronological=chronological,
+        )
+    else:
+        raise ValueError(f"Unknown dataset {dataset_name}")
+
+    if dataset_name == "ml-1m" or dataset_name == "ml-20m":
+        items = pd.read_csv(dp.processed_item_csv(), delimiter=",")
+        max_jagged_dimension = 16
+        expected_max_item_id = dp.expected_max_item_id()
+        assert expected_max_item_id is not None
+        item_features: ItemFeatures = ItemFeatures(
+            max_ind_range=[63, 16383, 511],
+            num_items=expected_max_item_id + 1,
+            max_jagged_dimension=max_jagged_dimension,
+            lengths=[
+                torch.zeros((expected_max_item_id + 1,), dtype=torch.int64),
+                torch.zeros((expected_max_item_id + 1,), dtype=torch.int64),
+                torch.zeros((expected_max_item_id + 1,), dtype=torch.int64),
+            ],
+            values=[
+                torch.zeros(
+                    (expected_max_item_id + 1, max_jagged_dimension),
+                    dtype=torch.int64,
+                ),
+                torch.zeros(
+                    (expected_max_item_id + 1, max_jagged_dimension),
+                    dtype=torch.int64,
+                ),
+                torch.zeros(
+                    (expected_max_item_id + 1, max_jagged_dimension),
+                    dtype=torch.int64,
+                ),
+            ],
+        )
+        all_item_ids = []
+        for df_index, row in items.iterrows():
+            # print(f"index {df_index}: {row}")
+            movie_id = int(row["movie_id"])
+            genres = row["genres"].split("|")
+            titles = row["cleaned_title"].split(" ")
+            # print(f"{index}: genres{genres}, title{titles}")
+            genres_vector = [hash(x) % item_features.max_ind_range[0] for x in genres]
+            titles_vector = [hash(x) % item_features.max_ind_range[1] for x in titles]
+            years_vector = [hash(row["year"]) % item_features.max_ind_range[2]]
+            item_features.lengths[0][movie_id] = min(
+                len(genres_vector), max_jagged_dimension
+            )
+            item_features.lengths[1][movie_id] = min(
+                len(titles_vector), max_jagged_dimension
+            )
+            item_features.lengths[2][movie_id] = min(
+                len(years_vector), max_jagged_dimension
+            )
+            for f, f_values in enumerate([genres_vector, titles_vector, years_vector]):
+                for j in range(min(len(f_values), max_jagged_dimension)):
+                    item_features.values[f][movie_id][j] = f_values[j]
+            all_item_ids.append(movie_id)
+        max_item_id = dp.expected_max_item_id()
+        for x in all_item_ids:
+            assert x > 0, "x in all_item_ids should be positive"
+    else:
+        # expected_max_item_id and item_features are not set for Amazon datasets.
+        item_features = None
+        max_item_id = dp.expected_num_unique_items()
+        all_item_ids = [x + 1 for x in range(max_item_id)]  # pyre-ignore [6]
+
+    return RecoDataset(
+        max_sequence_length=max_sequence_length,
+        num_unique_items=dp.expected_num_unique_items(),  # pyre-ignore [6]
+        max_item_id=max_item_id,  # pyre-ignore [6]
+        all_item_ids=all_item_ids,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
diff --git a/recommendation_v4/generative_recommenders/research/indexing/candidate_index.py b/recommendation_v4/generative_recommenders/research/indexing/candidate_index.py
new file mode 100644
index 000000000..fee763eaa
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/indexing/candidate_index.py
@@ -0,0 +1,179 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import Optional, Tuple
+
+import torch
+from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
+
+
+class CandidateIndex(object):
+    def __init__(
+        self,
+        ids: torch.Tensor,
+        embeddings: torch.Tensor,
+        invalid_ids: Optional[torch.Tensor] = None,
+        debug_path: Optional[str] = None,
+    ) -> None:
+        super().__init__()
+
+        self._ids: torch.Tensor = ids
+        self._embeddings: torch.Tensor = embeddings
+        self._invalid_ids: Optional[torch.Tensor] = invalid_ids
+        self._debug_path: Optional[str] = debug_path
+
+    @property
+    def ids(self) -> torch.Tensor:
+        """
+        Returns:
+            (1, X) or (B, X), where valid ids are positive integers.
+        """
+        return self._ids
+
+    @property
+    def num_objects(self) -> int:
+        return self._ids.size(1)
+
+    @property
+    def embeddings(self) -> torch.Tensor:
+        """
+        Returns:
+            (1, X, D) or (B, X, D) with the same shape as `ids'.
+        """
+        return self._embeddings
+
+    def filter_invalid_ids(
+        self,
+        invalid_ids: torch.Tensor,
+    ) -> "CandidateIndex":
+        """
+        Filters invalid_ids (batch dimension dependent) from the current index.
+
+        Args:
+            invalid_ids: (B, N) x int64.
+
+        Returns:
+            CandidateIndex with invalid_ids filtered.
+        """
+        X = self._ids.size(1)
+        if self._ids.size(0) == 1:
+            # ((1, X, 1) == (B, 1, N)) -> (B, X)
+            invalid_mask, _ = (self._ids.unsqueeze(2) == invalid_ids.unsqueeze(1)).max(
+                dim=2
+            )
+            lengths = (~invalid_mask).int().sum(-1)  # (B,)
+            valid_1d_mask = (~invalid_mask).view(-1)
+            B: int = lengths.size(0)
+            D: int = self._embeddings.size(-1)
+            jagged_ids = self._ids.expand(B, -1).reshape(-1)[valid_1d_mask]
+            jagged_embeddings = self._embeddings.expand(B, -1, -1).reshape(-1, D)[
+                valid_1d_mask
+            ]
+            X_prime: int = lengths.max(-1)[0].item()
+            jagged_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+            return CandidateIndex(
+                ids=torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=jagged_ids.unsqueeze(-1),
+                    offsets=[jagged_offsets],
+                    max_lengths=[X_prime],
+                    padding_value=0,
+                ).squeeze(-1),
+                embeddings=torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=jagged_embeddings,
+                    offsets=[jagged_offsets],
+                    max_lengths=[X_prime],
+                    padding_value=0.0,
+                ),
+                debug_path=self._debug_path,
+            )
+        else:
+            assert self._invalid_ids == None
+            return CandidateIndex(
+                ids=self.ids,
+                embeddings=self.embeddings,
+                invalid_ids=invalid_ids,
+                debug_path=self._debug_path,
+            )
+
+    def get_top_k_outputs(
+        self,
+        query_embeddings: torch.Tensor,
+        k: int,
+        top_k_module: TopKModule,
+        invalid_ids: Optional[torch.Tensor],
+        r: int = 1,
+        return_embeddings: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Gets top-k outputs specified by `policy_fn', while filtering out
+        invalid ids per row as specified by `invalid_ids'.
+
+        Args:
+            k: int. top k to return.
+            policy_fn: lambda that takes in item-side embeddings (B, X, D,) and user-side
+                embeddings (B * r, ...), and returns predictions (unnormalized logits)
+                of shape (B * r, X,).
+            invalid_ids: (B * r, N_0) x int64. The list of ids (if > 0) to filter from
+                results if present. Expect N_0 to be a small constant.
+            return_embeddings: bool if we should additionally return embeddings for the
+                top k results.
+
+        Returns:
+            A tuple of (top_k_ids, top_k_prs, top_k_embeddings) of shape (B * r, k, ...).
+        """
+        B: int = query_embeddings.size(0)
+        max_num_invalid_ids = 0
+        if invalid_ids is not None:
+            max_num_invalid_ids = invalid_ids.size(1)
+
+        k_prime = min(k + max_num_invalid_ids, self.num_objects)
+        top_k_prime_scores, top_k_prime_ids = top_k_module(
+            query_embeddings=query_embeddings, k=k_prime
+        )
+        # Masks out invalid items rowwise.
+        if invalid_ids is not None:
+            id_is_valid = ~(
+                (top_k_prime_ids.unsqueeze(2) == invalid_ids.unsqueeze(1)).max(2)[0]
+            )  # [B, K + N_0]
+            id_is_valid = torch.logical_and(
+                id_is_valid, torch.cumsum(id_is_valid.int(), dim=1) <= k
+            )
+            # [[1, 0, 1, 0], [0, 1, 1, 1]], k=2 -> [[0, 2], [1, 2]]
+            top_k_rowwise_offsets = torch.nonzero(id_is_valid, as_tuple=True)[1].view(
+                -1, k
+            )
+            top_k_scores = torch.gather(
+                top_k_prime_scores, dim=1, index=top_k_rowwise_offsets
+            )
+            top_k_ids = torch.gather(
+                top_k_prime_ids, dim=1, index=top_k_rowwise_offsets
+            )
+        else:
+            top_k_scores = top_k_prime_scores
+            top_k_ids = top_k_prime_ids
+
+        # TODO: this should be decoupled from candidate_index.
+        if return_embeddings:
+            raise ValueError("return_embeddings not supported yet.")
+        else:
+            top_k_embeddings = None
+        return top_k_ids, top_k_scores, top_k_embeddings
+
+    def apply_object_filter(self) -> "CandidateIndex":
+        """
+        Applies general per batch filters.
+        """
+        raise NotImplementedError("not implemented.")
diff --git a/recommendation_v4/generative_recommenders/research/indexing/utils.py b/recommendation_v4/generative_recommenders/research/indexing/utils.py
new file mode 100644
index 000000000..972d3c2e7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/indexing/utils.py
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import torch
+from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
+from generative_recommenders.research.rails.indexing.mips_top_k import (
+    MIPSBruteForceTopK,
+)
+from generative_recommenders.research.rails.indexing.mol_top_k import MoLBruteForceTopK
+
+
+def get_top_k_module(
+    top_k_method: str,
+    model: torch.nn.Module,
+    item_embeddings: torch.Tensor,
+    item_ids: torch.Tensor,
+) -> TopKModule:
+    if top_k_method == "MIPSBruteForceTopK":
+        top_k_module = MIPSBruteForceTopK(
+            item_embeddings=item_embeddings,
+            item_ids=item_ids,
+        )
+    elif top_k_method == "MoLBruteForceTopK":
+        top_k_module = MoLBruteForceTopK(  # pyre-ignore [20]
+            item_embeddings=item_embeddings,
+            item_ids=item_ids,
+        )
+    else:
+        raise ValueError(f"Invalid top-k method {top_k_method}")
+    return top_k_module
diff --git a/recommendation_v4/generative_recommenders/research/modeling/initialization.py b/recommendation_v4/generative_recommenders/research/modeling/initialization.py
new file mode 100644
index 000000000..c80d60075
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/initialization.py
@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import torch
+
+
+def truncated_normal(x: torch.Tensor, mean: float, std: float) -> torch.Tensor:
+    with torch.no_grad():
+        size = x.shape
+        tmp = x.new_empty(size + (4,)).normal_()
+        valid = (tmp < 2) & (tmp > -2)
+        ind = valid.max(-1, keepdim=True)[1]
+        x.data.copy_(tmp.gather(-1, ind).squeeze(-1))
+        x.data.mul_(std).add_(mean)
+        return x
+
+
+def init_mlp_xavier_weights_zero_bias(m: torch.nn.Module) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform(m.weight)
+        if getattr(m, "bias", None) is not None:
+            m.bias.data.fill_(0.0)
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py
new file mode 100644
index 000000000..c32bedf0e
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py
@@ -0,0 +1,477 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+from collections import OrderedDict
+from typing import List, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+from torch.utils.checkpoint import checkpoint
+
+
+class NegativesSampler(torch.nn.Module):
+    def __init__(self, l2_norm: bool, l2_norm_eps: float) -> None:
+        super().__init__()
+
+        self._l2_norm: bool = l2_norm
+        self._l2_norm_eps: float = l2_norm_eps
+
+    def normalize_embeddings(self, x: torch.Tensor) -> torch.Tensor:
+        return self._maybe_l2_norm(x)
+
+    def _maybe_l2_norm(self, x: torch.Tensor) -> torch.Tensor:
+        if self._l2_norm:
+            x = x / torch.clamp(
+                torch.linalg.norm(x, ord=2, dim=-1, keepdim=True),
+                min=self._l2_norm_eps,
+            )
+        return x
+
+    @abc.abstractmethod
+    def debug_str(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def process_batch(
+        self,
+        ids: torch.Tensor,
+        presences: torch.Tensor,
+        embeddings: torch.Tensor,
+    ) -> None:
+        pass
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        positive_ids: torch.Tensor,
+        num_to_sample: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Returns:
+            A tuple of (sampled_ids, sampled_negative_embeddings).
+        """
+        pass
+
+
+class LocalNegativesSampler(NegativesSampler):
+    def __init__(
+        self,
+        num_items: int,
+        item_emb: torch.nn.Embedding,
+        all_item_ids: List[int],
+        l2_norm: bool,
+        l2_norm_eps: float,
+    ) -> None:
+        super().__init__(l2_norm=l2_norm, l2_norm_eps=l2_norm_eps)
+
+        self._num_items: int = len(all_item_ids)
+        self._item_emb: torch.nn.Embedding = item_emb
+        self.register_buffer("_all_item_ids", torch.tensor(all_item_ids))
+
+    def debug_str(self) -> str:
+        sampling_debug_str = (
+            f"local{f'-l2-eps{self._l2_norm_eps}' if self._l2_norm else ''}"
+        )
+        return sampling_debug_str
+
+    def process_batch(
+        self,
+        ids: torch.Tensor,
+        presences: torch.Tensor,
+        embeddings: torch.Tensor,
+    ) -> None:
+        pass
+
+    def forward(
+        self,
+        positive_ids: torch.Tensor,
+        num_to_sample: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Returns:
+            A tuple of (sampled_ids, sampled_negative_embeddings).
+        """
+        # assert torch.max(torch.abs(self._item_emb(positive_ids) - positive_embeddings)) < 1e-4
+        output_shape = positive_ids.size() + (num_to_sample,)
+        sampled_offsets = torch.randint(
+            low=0,
+            high=self._num_items,
+            size=output_shape,
+            dtype=positive_ids.dtype,
+            device=positive_ids.device,
+        )
+        sampled_ids = self._all_item_ids[sampled_offsets.view(-1)].reshape(output_shape)
+        return sampled_ids, self.normalize_embeddings(self._item_emb(sampled_ids))
+
+
+class InBatchNegativesSampler(NegativesSampler):
+    def __init__(
+        self,
+        l2_norm: bool,
+        l2_norm_eps: float,
+        dedup_embeddings: bool,
+    ) -> None:
+        super().__init__(l2_norm=l2_norm, l2_norm_eps=l2_norm_eps)
+
+        self._dedup_embeddings: bool = dedup_embeddings
+
+    def debug_str(self) -> str:
+        sampling_debug_str = (
+            f"in-batch{f'-l2-eps{self._l2_norm_eps}' if self._l2_norm else ''}"
+        )
+        if self._dedup_embeddings:
+            sampling_debug_str += "-dedup"
+        return sampling_debug_str
+
+    def process_batch(
+        self,
+        ids: torch.Tensor,
+        presences: torch.Tensor,
+        embeddings: torch.Tensor,
+    ) -> None:
+        """
+        Args:
+           ids: (N') or (B, N) x int64
+           presences: (N') or (B, N) x bool
+           embeddings: (N', D) or (B, N, D) x float
+        """
+        assert ids.size() == presences.size()
+        assert ids.size() == embeddings.size()[:-1]
+        if self._dedup_embeddings:
+            valid_ids = ids[presences]
+            unique_ids, unique_ids_inverse_indices = torch.unique(
+                input=valid_ids, sorted=False, return_inverse=True
+            )
+            device = unique_ids.device
+            unique_embedding_offsets = torch.empty(
+                (unique_ids.numel(),),
+                dtype=torch.int64,
+                device=device,
+            )
+            unique_embedding_offsets[unique_ids_inverse_indices] = torch.arange(
+                valid_ids.numel(), dtype=torch.int64, device=device
+            )
+            unique_embeddings = embeddings[presences][unique_embedding_offsets, :]
+            self._cached_embeddings = self._maybe_l2_norm(  # pyre-ignore [16]
+                unique_embeddings
+            )
+            self._cached_ids = unique_ids  # pyre-ignore [16]
+        else:
+            self._cached_embeddings = self._maybe_l2_norm(embeddings[presences])
+            self._cached_ids = ids[presences]
+
+    def get_all_ids_and_embeddings(self) -> Tuple[torch.Tensor, torch.Tensor]:
+        return self._cached_ids, self._cached_embeddings  # pyre-ignore [7]
+
+    def forward(
+        self,
+        positive_ids: torch.Tensor,
+        num_to_sample: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Returns:
+            A tuple of (sampled_ids, sampled_negative_embeddings,).
+        """
+        X = self._cached_ids.size(0)
+        sampled_offsets = torch.randint(
+            low=0,
+            high=X,
+            size=positive_ids.size() + (num_to_sample,),
+            dtype=positive_ids.dtype,
+            device=positive_ids.device,
+        )
+        return (
+            self._cached_ids[sampled_offsets],  # pyre-ignore [29]
+            self._cached_embeddings[sampled_offsets],  # pyre-ignore [29]
+        )
+
+
+class AutoregressiveLoss(torch.nn.Module):
+    @abc.abstractmethod
+    def jagged_forward(
+        self,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        """
+        Variant of forward() when the tensors are already in jagged format.
+
+        Args:
+            output_embeddings: [N', D] x float, embeddings for the current
+                input sequence.
+            supervision_ids: [N'] x int64, (positive) supervision ids.
+            supervision_embeddings: [N', D] x float.
+            supervision_weights: Optional [N'] x float. Optional weights for
+                masking out invalid positions, or reweighting supervision labels.
+            negatives_sampler: sampler used to obtain negative examples paired with
+                positives.
+
+        Returns:
+            (1), loss for the current engaged sequence.
+        """
+        pass
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        lengths: torch.Tensor,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        """
+        Args:
+            lengths: [B] x int32 representing number of non-zero elements per row.
+            output_embeddings: [B, N, D] x float, embeddings for the current
+                input sequence.
+            supervision_ids: [B, N] x int64, (positive) supervision ids.
+            supervision_embeddings: [B, N, D] x float.
+            supervision_weights: Optional [B, N] x float. Optional weights for
+                masking out invalid positions, or reweighting supervision labels.
+            negatives_sampler: sampler used to obtain negative examples paired with
+                positives.
+
+        Returns:
+            (1), loss for the current engaged sequence.
+        """
+        pass
+
+
+class BCELoss(AutoregressiveLoss):
+    def __init__(
+        self,
+        temperature: float,
+        model: SimilarityModule,
+    ) -> None:
+        super().__init__()
+        self._temperature: float = temperature
+        self._model = model
+
+    def jagged_forward(
+        self,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        assert supervision_ids.size() == supervision_weights.size()
+
+        sampled_ids, sampled_negative_embeddings = negatives_sampler(
+            positive_ids=supervision_ids,
+            num_to_sample=1,
+        )
+
+        positive_logits = (
+            self._model.interaction(  # pyre-ignore [29]
+                input_embeddings=output_embeddings,  # [B, D] = [N', D]
+                target_ids=supervision_ids.unsqueeze(1),  # [N', 1]
+                target_embeddings=supervision_embeddings.unsqueeze(
+                    1
+                ),  # [N', D] -> [N', 1, D]
+            )[0].squeeze(1)
+            / self._temperature
+        )  # [N']
+
+        sampled_negatives_logits = (
+            self._model.interaction(  # pyre-ignore [29]
+                input_embeddings=output_embeddings,  # [N', D]
+                target_ids=sampled_ids,  # [N', 1]
+                target_embeddings=sampled_negative_embeddings,  # [N', 1, D]
+            )[0].squeeze(1)
+            / self._temperature
+        )  # [N']
+        sampled_negatives_valid_mask = (
+            supervision_ids != sampled_ids.squeeze(1)
+        ).float()  # [N']
+        loss_weights = supervision_weights * sampled_negatives_valid_mask
+        weighted_losses = (
+            (
+                F.binary_cross_entropy_with_logits(
+                    input=positive_logits,
+                    target=torch.ones_like(positive_logits),
+                    reduction="none",
+                )
+                + F.binary_cross_entropy_with_logits(
+                    input=sampled_negatives_logits,
+                    target=torch.zeros_like(sampled_negatives_logits),
+                    reduction="none",
+                )
+            )
+            * loss_weights
+            * 0.5
+        )
+        return weighted_losses.sum() / loss_weights.sum()
+
+    def forward(
+        self,
+        lengths: torch.Tensor,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        """
+        Args:
+          lengths: [B] x int32 representing number of non-zero elements per row.
+          output_embeddings: [B, N, D] x float, embeddings for the current
+              input sequence.
+          supervision_ids: [B, N] x int64, (positive) supervision ids.
+          supervision_embeddings: [B, N, D] x float.
+          supervision_weights: Optional [B, N] x float. Optional weights for
+              masking out invalid positions, or reweighting supervision labels.
+          negatives_sampler: sampler used to obtain negative examples paired with
+              positives.
+        Returns:
+          (1), loss for the current engaged sequence.
+        """
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        jagged_id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+        jagged_supervision_ids = (
+            torch.ops.fbgemm.dense_to_jagged(
+                supervision_ids.unsqueeze(-1).float(), [jagged_id_offsets]
+            )[0]
+            .squeeze(1)
+            .long()
+        )
+        jagged_supervision_weights = torch.ops.fbgemm.dense_to_jagged(
+            supervision_weights.unsqueeze(-1),
+            [jagged_id_offsets],
+        )[0].squeeze(1)
+        return self.jagged_forward(
+            output_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                output_embeddings,
+                [jagged_id_offsets],
+            )[0],
+            supervision_ids=jagged_supervision_ids,
+            supervision_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                supervision_embeddings,
+                [jagged_id_offsets],
+            )[0],
+            supervision_weights=jagged_supervision_weights,
+            negatives_sampler=negatives_sampler,
+        )
+
+
+class BCELossWithRatings(AutoregressiveLoss):
+    def __init__(
+        self,
+        temperature: float,
+        model: SimilarityModule,
+    ) -> None:
+        super().__init__()
+        self._temperature: float = temperature
+        self._model = model
+
+    def jagged_forward(
+        self,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        supervision_ratings: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        assert supervision_ids.size() == supervision_weights.size()
+
+        target_logits = (
+            self._model.interaction(  # pyre-ignore [29]
+                input_embeddings=output_embeddings,  # [B, D] = [N', D]
+                target_ids=supervision_ids.unsqueeze(1),  # [N', 1]
+                target_embeddings=supervision_embeddings.unsqueeze(
+                    1
+                ),  # [N', D] -> [N', 1, D]
+            )[0].squeeze(1)
+            / self._temperature
+        )  # [N', 1]
+
+        weighted_losses = (
+            F.binary_cross_entropy_with_logits(
+                input=target_logits,
+                target=supervision_ratings.to(dtype=target_logits.dtype),
+                reduction="none",
+            )
+        ) * supervision_weights
+        return weighted_losses.sum() / supervision_weights.sum()
+
+    def forward(
+        self,
+        lengths: torch.Tensor,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        supervision_ratings: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+    ) -> torch.Tensor:
+        """
+        Args:
+          lengths: [B] x int32 representing number of non-zero elements per row.
+          output_embeddings: [B, N, D] x float, embeddings for the current
+              input sequence.
+          supervision_ids: [B, N] x int64, (positive) supervision ids.
+          supervision_embeddings: [B, N, D] x float.
+          supervision_weights: Optional [B, N] x float. Optional weights for
+              masking out invalid positions, or reweighting supervision labels.
+          negatives_sampler: sampler used to obtain negative examples paired with
+              positives.
+        Returns:
+          (1), loss for the current engaged sequence.
+        """
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        jagged_id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+        jagged_supervision_ids = (
+            torch.ops.fbgemm.dense_to_jagged(
+                supervision_ids.unsqueeze(-1).float(), [jagged_id_offsets]
+            )[0]
+            .squeeze(1)
+            .long()
+        )
+        jagged_supervision_weights = torch.ops.fbgemm.dense_to_jagged(
+            supervision_weights.unsqueeze(-1),
+            [jagged_id_offsets],
+        )[0].squeeze(1)
+        return self.jagged_forward(
+            output_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                output_embeddings,
+                [jagged_id_offsets],
+            )[0],
+            supervision_ids=jagged_supervision_ids,
+            supervision_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                supervision_embeddings,
+                [jagged_id_offsets],
+            )[0],
+            supervision_weights=jagged_supervision_weights,
+            supervision_ratings=torch.ops.fbgemm.dense_to_jagged(
+                supervision_ratings.unsqueeze(-1),
+                [jagged_id_offsets],
+            )[0].squeeze(1),
+            negatives_sampler=negatives_sampler,
+        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py
new file mode 100644
index 000000000..6e85a62dd
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py
@@ -0,0 +1,108 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+
+import torch
+from generative_recommenders.research.modeling.initialization import truncated_normal
+
+
+class EmbeddingModule(torch.nn.Module):
+    @abc.abstractmethod
+    def debug_str(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        pass
+
+    @property
+    @abc.abstractmethod
+    def item_embedding_dim(self) -> int:
+        pass
+
+
+class LocalEmbeddingModule(EmbeddingModule):
+    def __init__(
+        self,
+        num_items: int,
+        item_embedding_dim: int,
+    ) -> None:
+        super().__init__()
+
+        self._item_embedding_dim: int = item_embedding_dim
+        self._item_emb = torch.nn.Embedding(
+            num_items + 1, item_embedding_dim, padding_idx=0
+        )
+        self.reset_params()
+
+    def debug_str(self) -> str:
+        return f"local_emb_d{self._item_embedding_dim}"
+
+    def reset_params(self) -> None:
+        for name, params in self.named_parameters():
+            if "_item_emb" in name:
+                print(
+                    f"Initialize {name} as truncated normal: {params.data.size()} params"
+                )
+                truncated_normal(params, mean=0.0, std=0.02)
+            else:
+                print(f"Skipping initializing params {name} - not configured")
+
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        return self._item_emb(item_ids)
+
+    @property
+    def item_embedding_dim(self) -> int:
+        return self._item_embedding_dim
+
+
+class CategoricalEmbeddingModule(EmbeddingModule):
+    def __init__(
+        self,
+        num_items: int,
+        item_embedding_dim: int,
+        item_id_to_category_id: torch.Tensor,
+    ) -> None:
+        super().__init__()
+
+        self._item_embedding_dim: int = item_embedding_dim
+        self._item_emb: torch.nn.Embedding = torch.nn.Embedding(
+            num_items + 1, item_embedding_dim, padding_idx=0
+        )
+        self.register_buffer("_item_id_to_category_id", item_id_to_category_id)
+        self.reset_params()
+
+    def debug_str(self) -> str:
+        return f"cat_emb_d{self._item_embedding_dim}"
+
+    def reset_params(self) -> None:
+        for name, params in self.named_parameters():
+            if "_item_emb" in name:
+                print(
+                    f"Initialize {name} as truncated normal: {params.data.size()} params"
+                )
+                truncated_normal(params, mean=0.0, std=0.02)
+            else:
+                print(f"Skipping initializing params {name} - not configured")
+
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        item_ids = self._item_id_to_category_id[(item_ids - 1).clamp(min=0)] + 1
+        return self._item_emb(item_ids)
+
+    @property
+    def item_embedding_dim(self) -> int:
+        return self._item_embedding_dim
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py
new file mode 100644
index 000000000..dc64aa2cf
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py
@@ -0,0 +1,150 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import gin
+from generative_recommenders.research.modeling.sequential.embedding_modules import (
+    EmbeddingModule,
+)
+from generative_recommenders.research.modeling.sequential.hstu import HSTU
+from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
+    InputFeaturesPreprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.output_postprocessors import (
+    OutputPostprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.sasrec import SASRec
+from generative_recommenders.research.modeling.similarity_module import (
+    SequentialEncoderWithLearnedSimilarityModule,
+)
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+@gin.configurable
+def sasrec_encoder(
+    max_sequence_length: int,
+    max_output_length: int,
+    embedding_module: EmbeddingModule,
+    similarity_module: SimilarityModule,
+    input_preproc_module: InputFeaturesPreprocessorModule,
+    output_postproc_module: OutputPostprocessorModule,
+    activation_checkpoint: bool,
+    verbose: bool,
+    ffn_hidden_dim: int = 64,
+    ffn_activation_fn: str = "relu",
+    ffn_dropout_rate: float = 0.2,
+    num_blocks: int = 2,
+    num_heads: int = 1,
+) -> SequentialEncoderWithLearnedSimilarityModule:
+    return SASRec(
+        embedding_module=embedding_module,
+        max_sequence_len=max_sequence_length,
+        max_output_len=max_output_length,
+        embedding_dim=embedding_module.item_embedding_dim,
+        ffn_hidden_dim=ffn_hidden_dim,
+        ffn_activation_fn=ffn_activation_fn,
+        ffn_dropout_rate=ffn_dropout_rate,
+        num_blocks=num_blocks,
+        num_heads=num_heads,
+        similarity_module=similarity_module,  # pyre-ignore [6]
+        input_features_preproc_module=input_preproc_module,
+        output_postproc_module=output_postproc_module,
+        activation_checkpoint=activation_checkpoint,
+        verbose=verbose,
+    )
+
+
+@gin.configurable
+def hstu_encoder(
+    max_sequence_length: int,
+    max_output_length: int,
+    embedding_module: EmbeddingModule,
+    similarity_module: SimilarityModule,
+    input_preproc_module: InputFeaturesPreprocessorModule,
+    output_postproc_module: OutputPostprocessorModule,
+    activation_checkpoint: bool,
+    verbose: bool,
+    num_blocks: int = 2,
+    num_heads: int = 1,
+    dqk: int = 64,
+    dv: int = 64,
+    linear_dropout_rate: float = 0.0,
+    attn_dropout_rate: float = 0.0,
+    normalization: str = "rel_bias",
+    linear_config: str = "uvqk",
+    linear_activation: str = "silu",
+    concat_ua: bool = False,
+    enable_relative_attention_bias: bool = True,
+) -> SequentialEncoderWithLearnedSimilarityModule:
+    return HSTU(
+        embedding_module=embedding_module,
+        similarity_module=similarity_module,  # pyre-ignore [6]
+        input_features_preproc_module=input_preproc_module,
+        output_postproc_module=output_postproc_module,
+        max_sequence_len=max_sequence_length,
+        max_output_len=max_output_length,
+        embedding_dim=embedding_module.item_embedding_dim,
+        num_blocks=num_blocks,
+        num_heads=num_heads,
+        attention_dim=dqk,
+        linear_dim=dv,
+        linear_dropout_rate=linear_dropout_rate,
+        attn_dropout_rate=attn_dropout_rate,
+        linear_config=linear_config,
+        linear_activation=linear_activation,
+        normalization=normalization,
+        concat_ua=concat_ua,
+        enable_relative_attention_bias=enable_relative_attention_bias,
+        verbose=verbose,
+    )
+
+
+@gin.configurable
+def get_sequential_encoder(
+    module_type: str,
+    max_sequence_length: int,
+    max_output_length: int,
+    embedding_module: EmbeddingModule,
+    interaction_module: SimilarityModule,
+    input_preproc_module: InputFeaturesPreprocessorModule,
+    output_postproc_module: OutputPostprocessorModule,
+    verbose: bool,
+    activation_checkpoint: bool = False,
+) -> SequentialEncoderWithLearnedSimilarityModule:
+    if module_type == "SASRec":
+        model = sasrec_encoder(
+            max_sequence_length=max_sequence_length,
+            max_output_length=max_output_length,
+            embedding_module=embedding_module,
+            similarity_module=interaction_module,
+            input_preproc_module=input_preproc_module,
+            output_postproc_module=output_postproc_module,
+            activation_checkpoint=activation_checkpoint,
+            verbose=verbose,
+        )
+    elif module_type == "HSTU":
+        model = hstu_encoder(
+            max_sequence_length=max_sequence_length,
+            max_output_length=max_output_length,
+            embedding_module=embedding_module,
+            similarity_module=interaction_module,
+            input_preproc_module=input_preproc_module,
+            output_postproc_module=output_postproc_module,
+            activation_checkpoint=activation_checkpoint,
+            verbose=verbose,
+        )
+    else:
+        raise ValueError(f"Unsupported module_type {module_type}")
+    return model
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/features.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/features.py
new file mode 100644
index 000000000..70bf80cc0
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/features.py
@@ -0,0 +1,94 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import Dict, NamedTuple, Optional, Tuple
+
+import torch
+
+
+class SequentialFeatures(NamedTuple):
+    # (B,) x int64. Requires past_lengths[i] > 0 \forall i.
+    past_lengths: torch.Tensor
+    # (B, N,) x int64. 0 denotes valid ids.
+    past_ids: torch.Tensor
+    # (B, N, D) x float.
+    past_embeddings: Optional[torch.Tensor]
+    # Implementation-specific payloads.
+    # e.g., past timestamps, past event_types (e.g., clicks, likes), etc.
+    past_payloads: Dict[str, torch.Tensor]
+
+
+def movielens_seq_features_from_row(
+    row: Dict[str, torch.Tensor],
+    device: int,
+    max_output_length: int,
+) -> Tuple[SequentialFeatures, torch.Tensor, torch.Tensor]:
+    historical_lengths = row["history_lengths"].to(device)  # [B]
+    historical_ids = row["historical_ids"].to(device)  # [B, N]
+    historical_ratings = row["historical_ratings"].to(device)
+    historical_timestamps = row["historical_timestamps"].to(device)
+    target_ids = row["target_ids"].to(device).unsqueeze(1)  # [B, 1]
+    target_ratings = row["target_ratings"].to(device).unsqueeze(1)
+    target_timestamps = row["target_timestamps"].to(device).unsqueeze(1)
+    if max_output_length > 0:
+        B = historical_lengths.size(0)
+        historical_ids = torch.cat(
+            [
+                historical_ids,
+                torch.zeros(
+                    (B, max_output_length), dtype=historical_ids.dtype, device=device
+                ),
+            ],
+            dim=1,
+        )
+        historical_ratings = torch.cat(
+            [
+                historical_ratings,
+                torch.zeros(
+                    (B, max_output_length),
+                    dtype=historical_ratings.dtype,
+                    device=device,
+                ),
+            ],
+            dim=1,
+        )
+        historical_timestamps = torch.cat(
+            [
+                historical_timestamps,
+                torch.zeros(
+                    (B, max_output_length),
+                    dtype=historical_timestamps.dtype,
+                    device=device,
+                ),
+            ],
+            dim=1,
+        )
+        historical_timestamps.scatter_(
+            dim=1,
+            index=historical_lengths.view(-1, 1),
+            src=target_timestamps.view(-1, 1),
+        )
+        # print(f"historical_ids.size()={historical_ids.size()}, historical_timestamps.size()={historical_timestamps.size()}")
+    features = SequentialFeatures(
+        past_lengths=historical_lengths,
+        past_ids=historical_ids,
+        past_embeddings=None,
+        past_payloads={
+            "timestamps": historical_timestamps,
+            "ratings": historical_ratings,
+        },
+    )
+    return features, target_ids, target_ratings
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py
new file mode 100644
index 000000000..3c89245a2
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py
@@ -0,0 +1,808 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Implements HSTU (Hierarchical Sequential Transduction Unit) in
+Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations
+(https://arxiv.org/abs/2402.17152, ICML'24).
+"""
+
+import abc
+import math
+from typing import Callable, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.modeling.sequential.embedding_modules import (
+    EmbeddingModule,
+)
+from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
+    InputFeaturesPreprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.output_postprocessors import (
+    OutputPostprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.utils import (
+    get_current_embeddings,
+)
+from generative_recommenders.research.modeling.similarity_module import (
+    SequentialEncoderWithLearnedSimilarityModule,
+)
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+TIMESTAMPS_KEY = "timestamps"
+
+
+class RelativeAttentionBiasModule(torch.nn.Module):
+    @abc.abstractmethod
+    def forward(
+        self,
+        all_timestamps: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            all_timestamps: [B, N] x int64
+        Returns:
+            torch.float tensor broadcastable to [B, N, N]
+        """
+        pass
+
+
+class RelativePositionalBias(RelativeAttentionBiasModule):
+    def __init__(self, max_seq_len: int) -> None:
+        super().__init__()
+
+        self._max_seq_len: int = max_seq_len
+        self._w = torch.nn.Parameter(
+            torch.empty(2 * max_seq_len - 1).normal_(mean=0, std=0.02),
+        )
+
+    def forward(
+        self,
+        all_timestamps: torch.Tensor,
+    ) -> torch.Tensor:
+        del all_timestamps
+        n: int = self._max_seq_len
+        t = F.pad(self._w[: 2 * n - 1], [0, n]).repeat(n)
+        t = t[..., :-n].reshape(1, n, 3 * n - 2)
+        r = (2 * n - 1) // 2
+        return t[..., r:-r]
+
+
+class RelativeBucketedTimeAndPositionBasedBias(RelativeAttentionBiasModule):
+    """
+    Bucketizes timespans based on ts(next-item) - ts(current-item).
+    """
+
+    def __init__(
+        self,
+        max_seq_len: int,
+        num_buckets: int,
+        bucketization_fn: Callable[[torch.Tensor], torch.Tensor],
+    ) -> None:
+        super().__init__()
+
+        self._max_seq_len: int = max_seq_len
+        self._ts_w = torch.nn.Parameter(
+            torch.empty(num_buckets + 1).normal_(mean=0, std=0.02),
+        )
+        self._pos_w = torch.nn.Parameter(
+            torch.empty(2 * max_seq_len - 1).normal_(mean=0, std=0.02),
+        )
+        self._num_buckets: int = num_buckets
+        self._bucketization_fn: Callable[[torch.Tensor], torch.Tensor] = (
+            bucketization_fn
+        )
+
+    def forward(
+        self,
+        all_timestamps: torch.Tensor,
+    ) -> torch.Tensor:
+        """
+        Args:
+            all_timestamps: (B, N).
+        Returns:
+            (B, N, N).
+        """
+        B = all_timestamps.size(0)
+        N = self._max_seq_len
+        t = F.pad(self._pos_w[: 2 * N - 1], [0, N]).repeat(N)
+        t = t[..., :-N].reshape(1, N, 3 * N - 2)
+        r = (2 * N - 1) // 2
+
+        # [B, N + 1] to simplify tensor manipulations.
+        ext_timestamps = torch.cat(
+            [all_timestamps, all_timestamps[:, N - 1 : N]], dim=1
+        )
+        # causal masking. Otherwise [:, :-1] - [:, 1:] works
+        bucketed_timestamps = torch.clamp(
+            self._bucketization_fn(
+                ext_timestamps[:, 1:].unsqueeze(2) - ext_timestamps[:, :-1].unsqueeze(1)
+            ),
+            min=0,
+            max=self._num_buckets,
+        ).detach()
+        rel_pos_bias = t[:, :, r:-r]
+        rel_ts_bias = torch.index_select(
+            self._ts_w, dim=0, index=bucketed_timestamps.view(-1)
+        ).view(B, N, N)
+        return rel_pos_bias + rel_ts_bias
+
+
+HSTUCacheState = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+
+
+def _hstu_attention_maybe_from_cache(
+    num_heads: int,
+    attention_dim: int,
+    linear_dim: int,
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    cached_q: Optional[torch.Tensor],
+    cached_k: Optional[torch.Tensor],
+    delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]],
+    x_offsets: torch.Tensor,
+    all_timestamps: Optional[torch.Tensor],
+    invalid_attn_mask: torch.Tensor,
+    rel_attn_bias: RelativeAttentionBiasModule,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    B: int = x_offsets.size(0) - 1
+    n: int = invalid_attn_mask.size(-1)
+    if delta_x_offsets is not None:
+        padded_q, padded_k = cached_q, cached_k
+        flattened_offsets = delta_x_offsets[1] + torch.arange(
+            start=0,
+            end=B * n,
+            step=n,
+            device=delta_x_offsets[1].device,
+            dtype=delta_x_offsets[1].dtype,
+        )
+        assert isinstance(padded_q, torch.Tensor)
+        assert isinstance(padded_k, torch.Tensor)
+        padded_q = (
+            padded_q.view(B * n, -1)
+            .index_copy_(
+                dim=0,
+                index=flattened_offsets,
+                source=q,
+            )
+            .view(B, n, -1)
+        )
+        padded_k = (
+            padded_k.view(B * n, -1)
+            .index_copy_(
+                dim=0,
+                index=flattened_offsets,
+                source=k,
+            )
+            .view(B, n, -1)
+        )
+    else:
+        padded_q = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=q, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
+        )
+        padded_k = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=k, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
+        )
+
+    qk_attn = torch.einsum(
+        "bnhd,bmhd->bhnm",
+        padded_q.view(B, n, num_heads, attention_dim),
+        padded_k.view(B, n, num_heads, attention_dim),
+    )
+    if all_timestamps is not None:
+        qk_attn = qk_attn + rel_attn_bias(all_timestamps).unsqueeze(1)
+    qk_attn = F.silu(qk_attn) / n
+    qk_attn = qk_attn * invalid_attn_mask.unsqueeze(0).unsqueeze(0)
+    attn_output = torch.ops.fbgemm.dense_to_jagged(
+        torch.einsum(
+            "bhnm,bmhd->bnhd",
+            qk_attn,
+            torch.ops.fbgemm.jagged_to_padded_dense(v, [x_offsets], [n]).reshape(
+                B, n, num_heads, linear_dim
+            ),
+        ).reshape(B, n, num_heads * linear_dim),
+        [x_offsets],
+    )[0]
+    return attn_output, padded_q, padded_k
+
+
+class SequentialTransductionUnitJagged(torch.nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        linear_hidden_dim: int,
+        attention_dim: int,
+        dropout_ratio: float,
+        attn_dropout_ratio: float,
+        num_heads: int,
+        linear_activation: str,
+        relative_attention_bias_module: Optional[RelativeAttentionBiasModule] = None,
+        normalization: str = "rel_bias",
+        linear_config: str = "uvqk",
+        concat_ua: bool = False,
+        epsilon: float = 1e-6,
+        max_length: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        self._embedding_dim: int = embedding_dim
+        self._linear_dim: int = linear_hidden_dim
+        self._attention_dim: int = attention_dim
+        self._dropout_ratio: float = dropout_ratio
+        self._attn_dropout_ratio: float = attn_dropout_ratio
+        self._num_heads: int = num_heads
+        self._rel_attn_bias: Optional[RelativeAttentionBiasModule] = (
+            relative_attention_bias_module
+        )
+        self._normalization: str = normalization
+        self._linear_config: str = linear_config
+        if self._linear_config == "uvqk":
+            self._uvqk: torch.nn.Parameter = torch.nn.Parameter(
+                torch.empty(
+                    (
+                        embedding_dim,
+                        linear_hidden_dim * 2 * num_heads
+                        + attention_dim * num_heads * 2,
+                    )
+                ).normal_(mean=0, std=0.02),
+            )
+        else:
+            raise ValueError(f"Unknown linear_config {self._linear_config}")
+        self._linear_activation: str = linear_activation
+        self._concat_ua: bool = concat_ua
+        self._o = torch.nn.Linear(
+            in_features=linear_hidden_dim * num_heads * (3 if concat_ua else 1),
+            out_features=embedding_dim,
+        )
+        torch.nn.init.xavier_uniform_(self._o.weight)
+        self._eps: float = epsilon
+
+    def _norm_input(self, x: torch.Tensor) -> torch.Tensor:
+        return F.layer_norm(x, normalized_shape=[self._embedding_dim], eps=self._eps)
+
+    def _norm_attn_output(self, x: torch.Tensor) -> torch.Tensor:
+        return F.layer_norm(
+            x, normalized_shape=[self._linear_dim * self._num_heads], eps=self._eps
+        )
+
+    def forward(  # pyre-ignore [3]
+        self,
+        x: torch.Tensor,
+        x_offsets: torch.Tensor,
+        all_timestamps: Optional[torch.Tensor],
+        invalid_attn_mask: torch.Tensor,
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[HSTUCacheState] = None,
+        return_cache_states: bool = False,
+    ):
+        """
+        Args:
+            x: (\sum_i N_i, D) x float.
+            x_offsets: (B + 1) x int32.
+            all_timestamps: optional (B, N) x int64.
+            invalid_attn_mask: (B, N, N) x float, each element in {0, 1}.
+            delta_x_offsets: optional 2-tuple ((B,) x int32, (B,) x int32).
+                For the 1st element in the tuple, each element is in [0, x_offsets[-1]). For the
+                2nd element in the tuple, each element is in [0, N).
+            cache: Optional 4-tuple of (v, padded_q, padded_k, output) from prior runs,
+                where all except padded_q, padded_k are jagged.
+        Returns:
+            x' = f(x), (\sum_i N_i, D) x float.
+        """
+        n: int = invalid_attn_mask.size(-1)
+        cached_q = None
+        cached_k = None
+        if delta_x_offsets is not None:
+            # In this case, for all the following code, x, u, v, q, k become restricted to
+            # [delta_x_offsets[0], :].
+            assert cache is not None
+            x = x[delta_x_offsets[0], :]
+            cached_v, cached_q, cached_k, cached_outputs = cache
+
+        normed_x = self._norm_input(x)
+
+        if self._linear_config == "uvqk":
+            batched_mm_output = torch.mm(normed_x, self._uvqk)
+            if self._linear_activation == "silu":
+                batched_mm_output = F.silu(batched_mm_output)
+            elif self._linear_activation == "none":
+                batched_mm_output = batched_mm_output
+            u, v, q, k = torch.split(
+                batched_mm_output,
+                [
+                    self._linear_dim * self._num_heads,
+                    self._linear_dim * self._num_heads,
+                    self._attention_dim * self._num_heads,
+                    self._attention_dim * self._num_heads,
+                ],
+                dim=1,
+            )
+        else:
+            raise ValueError(f"Unknown self._linear_config {self._linear_config}")
+
+        if delta_x_offsets is not None:
+            v = cached_v.index_copy_(dim=0, index=delta_x_offsets[0], source=v)
+
+        B: int = x_offsets.size(0) - 1
+        if self._normalization == "rel_bias" or self._normalization == "hstu_rel_bias":
+            assert self._rel_attn_bias is not None
+            attn_output, padded_q, padded_k = _hstu_attention_maybe_from_cache(
+                num_heads=self._num_heads,
+                attention_dim=self._attention_dim,
+                linear_dim=self._linear_dim,
+                q=q,
+                k=k,
+                v=v,
+                cached_q=cached_q,
+                cached_k=cached_k,
+                delta_x_offsets=delta_x_offsets,
+                x_offsets=x_offsets,
+                all_timestamps=all_timestamps,
+                invalid_attn_mask=invalid_attn_mask,
+                rel_attn_bias=self._rel_attn_bias,
+            )
+        elif self._normalization == "softmax_rel_bias":
+            if delta_x_offsets is not None:
+                B = x_offsets.size(0) - 1
+                padded_q, padded_k = cached_q, cached_k
+                flattened_offsets = delta_x_offsets[1] + torch.arange(
+                    start=0,
+                    end=B * n,
+                    step=n,
+                    device=delta_x_offsets[1].device,
+                    dtype=delta_x_offsets[1].dtype,
+                )
+                assert padded_q is not None
+                assert padded_k is not None
+                padded_q = (
+                    padded_q.view(B * n, -1)
+                    .index_copy_(
+                        dim=0,
+                        index=flattened_offsets,
+                        source=q,
+                    )
+                    .view(B, n, -1)
+                )
+                padded_k = (
+                    padded_k.view(B * n, -1)
+                    .index_copy_(
+                        dim=0,
+                        index=flattened_offsets,
+                        source=k,
+                    )
+                    .view(B, n, -1)
+                )
+            else:
+                padded_q = torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=q, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
+                )
+                padded_k = torch.ops.fbgemm.jagged_to_padded_dense(
+                    values=k, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
+                )
+
+            qk_attn = torch.einsum("bnd,bmd->bnm", padded_q, padded_k)
+            if self._rel_attn_bias is not None:
+                qk_attn = qk_attn + self._rel_attn_bias(all_timestamps)
+            qk_attn = F.softmax(qk_attn / math.sqrt(self._attention_dim), dim=-1)
+            qk_attn = qk_attn * invalid_attn_mask
+            attn_output = torch.ops.fbgemm.dense_to_jagged(
+                torch.bmm(
+                    qk_attn,
+                    torch.ops.fbgemm.jagged_to_padded_dense(v, [x_offsets], [n]),
+                ),
+                [x_offsets],
+            )[0]
+        else:
+            raise ValueError(f"Unknown normalization method {self._normalization}")
+
+        attn_output = (
+            attn_output
+            if delta_x_offsets is None
+            else attn_output[delta_x_offsets[0], :]
+        )
+        if self._concat_ua:
+            a = self._norm_attn_output(attn_output)
+            o_input = torch.cat([u, a, u * a], dim=-1)
+        else:
+            o_input = u * self._norm_attn_output(attn_output)
+
+        new_outputs = (
+            self._o(
+                F.dropout(
+                    o_input,
+                    p=self._dropout_ratio,
+                    training=self.training,
+                )
+            )
+            + x
+        )
+
+        if delta_x_offsets is not None:
+            new_outputs = cached_outputs.index_copy_(
+                dim=0, index=delta_x_offsets[0], source=new_outputs
+            )
+
+        if return_cache_states and delta_x_offsets is None:
+            v = v.contiguous()
+
+        return new_outputs, (v, padded_q, padded_k, new_outputs)
+
+
+class HSTUJagged(torch.nn.Module):
+    def __init__(
+        self,
+        modules: List[SequentialTransductionUnitJagged],
+        autocast_dtype: Optional[torch.dtype],
+    ) -> None:
+        super().__init__()
+
+        self._attention_layers: torch.nn.ModuleList = torch.nn.ModuleList(
+            modules=modules
+        )
+        self._autocast_dtype: Optional[torch.dtype] = autocast_dtype
+
+    def jagged_forward(
+        self,
+        x: torch.Tensor,
+        x_offsets: torch.Tensor,
+        all_timestamps: Optional[torch.Tensor],
+        invalid_attn_mask: torch.Tensor,
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[List[HSTUCacheState]] = None,
+        return_cache_states: bool = False,
+    ) -> Tuple[torch.Tensor, List[HSTUCacheState]]:
+        """
+        Args:
+            x: (\sum_i N_i, D) x float
+            x_offsets: (B + 1) x int32
+            all_timestamps: (B, 1 + N) x int64
+            invalid_attn_mask: (B, N, N) x float, each element in {0, 1}
+            return_cache_states: bool. True if we should return cache states.
+
+        Returns:
+            x' = f(x), (\sum_i N_i, D) x float
+        """
+        cache_states: List[HSTUCacheState] = []
+
+        with torch.autocast(
+            "cuda",
+            enabled=self._autocast_dtype is not None,
+            dtype=self._autocast_dtype or torch.float16,
+        ):
+            for i, layer in enumerate(self._attention_layers):
+                x, cache_states_i = layer(
+                    x=x,
+                    x_offsets=x_offsets,
+                    all_timestamps=all_timestamps,
+                    invalid_attn_mask=invalid_attn_mask,
+                    delta_x_offsets=delta_x_offsets,
+                    cache=cache[i] if cache is not None else None,
+                    return_cache_states=return_cache_states,
+                )
+                if return_cache_states:
+                    cache_states.append(cache_states_i)
+
+        return x, cache_states
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        x_offsets: torch.Tensor,
+        all_timestamps: Optional[torch.Tensor],
+        invalid_attn_mask: torch.Tensor,
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[List[HSTUCacheState]] = None,
+        return_cache_states: bool = False,
+    ) -> Tuple[torch.Tensor, List[HSTUCacheState]]:
+        """
+        Args:
+            x: (B, N, D) x float.
+            x_offsets: (B + 1) x int32.
+            all_timestamps: (B, 1 + N) x int64
+            invalid_attn_mask: (B, N, N) x float, each element in {0, 1}.
+        Returns:
+            x' = f(x), (B, N, D) x float
+        """
+        if len(x.size()) == 3:
+            x = torch.ops.fbgemm.dense_to_jagged(x, [x_offsets])[0]
+
+        jagged_x, cache_states = self.jagged_forward(
+            x=x,
+            x_offsets=x_offsets,
+            all_timestamps=all_timestamps,
+            invalid_attn_mask=invalid_attn_mask,
+            delta_x_offsets=delta_x_offsets,
+            cache=cache,
+            return_cache_states=return_cache_states,
+        )
+        y = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=jagged_x,
+            offsets=[x_offsets],
+            max_lengths=[invalid_attn_mask.size(1)],
+            padding_value=0.0,
+        )
+        return y, cache_states
+
+
+class HSTU(SequentialEncoderWithLearnedSimilarityModule):
+    """
+    Implements HSTU (Hierarchical Sequential Transduction Unit) in
+    Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations,
+    https://arxiv.org/abs/2402.17152.
+
+    Note that this implementation is intended for reproducing experiments in
+    the traditional sequential recommender setting (Section 4.1.1), and does
+    not yet use optimized kernels discussed in the paper.
+    """
+
+    def __init__(
+        self,
+        max_sequence_len: int,
+        max_output_len: int,
+        embedding_dim: int,
+        num_blocks: int,
+        num_heads: int,
+        linear_dim: int,
+        attention_dim: int,
+        normalization: str,
+        linear_config: str,
+        linear_activation: str,
+        linear_dropout_rate: float,
+        attn_dropout_rate: float,
+        embedding_module: EmbeddingModule,
+        similarity_module: SimilarityModule,
+        input_features_preproc_module: InputFeaturesPreprocessorModule,
+        output_postproc_module: OutputPostprocessorModule,
+        enable_relative_attention_bias: bool = True,
+        concat_ua: bool = False,
+        verbose: bool = True,
+    ) -> None:
+        super().__init__(ndp_module=similarity_module)
+
+        self._embedding_dim: int = embedding_dim
+        self._item_embedding_dim: int = embedding_module.item_embedding_dim
+        self._max_sequence_length: int = max_sequence_len
+        self._embedding_module: EmbeddingModule = embedding_module
+        self._input_features_preproc: InputFeaturesPreprocessorModule = (
+            input_features_preproc_module
+        )
+        self._output_postproc: OutputPostprocessorModule = output_postproc_module
+        self._num_blocks: int = num_blocks
+        self._num_heads: int = num_heads
+        self._dqk: int = attention_dim
+        self._dv: int = linear_dim
+        self._linear_activation: str = linear_activation
+        self._linear_dropout_rate: float = linear_dropout_rate
+        self._attn_dropout_rate: float = attn_dropout_rate
+        self._enable_relative_attention_bias: bool = enable_relative_attention_bias
+        self._hstu = HSTUJagged(
+            modules=[
+                SequentialTransductionUnitJagged(
+                    embedding_dim=self._embedding_dim,
+                    linear_hidden_dim=linear_dim,
+                    attention_dim=attention_dim,
+                    normalization=normalization,
+                    linear_config=linear_config,
+                    linear_activation=linear_activation,
+                    num_heads=num_heads,
+                    # TODO: change to lambda x.
+                    relative_attention_bias_module=(
+                        RelativeBucketedTimeAndPositionBasedBias(
+                            max_seq_len=max_sequence_len
+                            + max_output_len,  # accounts for next item.
+                            num_buckets=128,
+                            bucketization_fn=lambda x: (
+                                torch.log(torch.abs(x).clamp(min=1)) / 0.301
+                            ).long(),
+                        )
+                        if enable_relative_attention_bias
+                        else None
+                    ),
+                    dropout_ratio=linear_dropout_rate,
+                    attn_dropout_ratio=attn_dropout_rate,
+                    concat_ua=concat_ua,
+                )
+                for _ in range(num_blocks)
+            ],
+            autocast_dtype=None,
+        )
+        # causal forward, w/ +1 for padding.
+        self.register_buffer(
+            "_attn_mask",
+            torch.triu(
+                torch.ones(
+                    (
+                        self._max_sequence_length + max_output_len,
+                        self._max_sequence_length + max_output_len,
+                    ),
+                    dtype=torch.bool,
+                ),
+                diagonal=1,
+            ),
+        )
+        self._verbose: bool = verbose
+        self.reset_params()
+
+    def reset_params(self) -> None:
+        for name, params in self.named_parameters():
+            if ("_hstu" in name) or ("_embedding_module" in name):
+                if self._verbose:
+                    print(f"Skipping init for {name}")
+                continue
+            try:
+                torch.nn.init.xavier_normal_(params.data)
+                if self._verbose:
+                    print(
+                        f"Initialize {name} as xavier normal: {params.data.size()} params"
+                    )
+            except:
+                if self._verbose:
+                    print(f"Failed to initialize {name}: {params.data.size()} params")
+
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        return self._embedding_module.get_item_embeddings(item_ids)
+
+    def debug_str(self) -> str:
+        debug_str = (
+            f"HSTU-b{self._num_blocks}-h{self._num_heads}-dqk{self._dqk}-dv{self._dv}"
+            + f"-l{self._linear_activation}d{self._linear_dropout_rate}"
+            + f"-ad{self._attn_dropout_rate}"
+        )
+        if not self._enable_relative_attention_bias:
+            debug_str += "-norab"
+        return debug_str
+
+    def generate_user_embeddings(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[List[HSTUCacheState]] = None,
+        return_cache_states: bool = False,
+    ) -> Tuple[torch.Tensor, List[HSTUCacheState]]:
+        """
+        [B, N] -> [B, N, D].
+        """
+        device = past_lengths.device
+        float_dtype = past_embeddings.dtype
+        B, N, _ = past_embeddings.size()
+
+        past_lengths, user_embeddings, _ = self._input_features_preproc(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+        )
+
+        float_dtype = user_embeddings.dtype
+        user_embeddings, cached_states = self._hstu(
+            x=user_embeddings,
+            x_offsets=torch.ops.fbgemm.asynchronous_complete_cumsum(past_lengths),
+            all_timestamps=(
+                past_payloads[TIMESTAMPS_KEY]
+                if TIMESTAMPS_KEY in past_payloads
+                else None
+            ),
+            invalid_attn_mask=1.0 - self._attn_mask.to(float_dtype),
+            delta_x_offsets=delta_x_offsets,
+            cache=cache,
+            return_cache_states=return_cache_states,
+        )
+        return self._output_postproc(user_embeddings), cached_states
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        batch_id: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Runs the main encoder.
+
+        Args:
+            past_lengths: (B,) x int64
+            past_ids: (B, N,) x int64 where the latest engaged ids come first. In
+                particular, past_ids[i, past_lengths[i] - 1] should correspond to
+                the latest engaged values.
+            past_embeddings: (B, N, D) x float or (\sum_b N_b, D) x float.
+            past_payloads: implementation-specific keyed tensors of shape (B, N, ...).
+
+        Returns:
+            encoded_embeddings of [B, N, D].
+        """
+        encoded_embeddings, _ = self.generate_user_embeddings(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+        )
+        return encoded_embeddings
+
+    def _encode(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]],
+        cache: Optional[List[HSTUCacheState]],
+        return_cache_states: bool,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[HSTUCacheState]]]:
+        """
+        Args:
+            past_lengths: (B,) x int64.
+            past_ids: (B, N,) x int64.
+            past_embeddings: (B, N, D,) x float.
+            past_payloads: implementation-specific keyed tensors of shape (B, N, ...).
+            return_cache_states: bool.
+
+        Returns:
+            (B, D) x float, representing embeddings for the current state.
+        """
+        encoded_seq_embeddings, cache_states = self.generate_user_embeddings(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+            delta_x_offsets=delta_x_offsets,
+            cache=cache,
+            return_cache_states=return_cache_states,
+        )  # [B, N, D]
+        current_embeddings = get_current_embeddings(
+            lengths=past_lengths, encoded_embeddings=encoded_seq_embeddings
+        )
+        if return_cache_states:
+            return current_embeddings, cache_states
+        else:
+            return current_embeddings
+
+    def encode(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        cache: Optional[List[HSTUCacheState]] = None,
+        return_cache_states: bool = False,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[HSTUCacheState]]]:
+        """
+        Runs encoder to obtain the current hidden states.
+
+        Args:
+            past_lengths: (B,) x int.
+            past_ids: (B, N,) x int.
+            past_embeddings: (B, N, D) x float.
+            past_payloads: implementation-specific keyed tensors of shape (B, N, ...).
+
+        Returns:
+            (B, D,) x float, representing encoded states at the most recent time step.
+        """
+        return self._encode(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+            delta_x_offsets=delta_x_offsets,
+            cache=cache,
+            return_cache_states=return_cache_states,
+        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py
new file mode 100644
index 000000000..a461ab879
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py
@@ -0,0 +1,259 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+import math
+from typing import Dict, Tuple
+
+import torch
+from generative_recommenders.research.modeling.initialization import truncated_normal
+
+
+class InputFeaturesPreprocessorModule(torch.nn.Module):
+    @abc.abstractmethod
+    def debug_str(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        pass
+
+
+class LearnablePositionalEmbeddingInputFeaturesPreprocessor(
+    InputFeaturesPreprocessorModule
+):
+    def __init__(
+        self,
+        max_sequence_len: int,
+        embedding_dim: int,
+        dropout_rate: float,
+    ) -> None:
+        super().__init__()
+
+        self._embedding_dim: int = embedding_dim
+        self._pos_emb: torch.nn.Embedding = torch.nn.Embedding(
+            max_sequence_len,
+            self._embedding_dim,
+        )
+        self._dropout_rate: float = dropout_rate
+        self._emb_dropout = torch.nn.Dropout(p=dropout_rate)
+        self.reset_state()
+
+    def debug_str(self) -> str:
+        return f"posi_d{self._dropout_rate}"
+
+    def reset_state(self) -> None:
+        truncated_normal(
+            self._pos_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        B, N = past_ids.size()
+        D = past_embeddings.size(-1)
+
+        user_embeddings = past_embeddings * (self._embedding_dim**0.5) + self._pos_emb(
+            torch.arange(N, device=past_ids.device).unsqueeze(0).repeat(B, 1)
+        )
+        user_embeddings = self._emb_dropout(user_embeddings)
+
+        valid_mask = (past_ids != 0).unsqueeze(-1).float()  # [B, N, 1]
+        user_embeddings *= valid_mask
+        return past_lengths, user_embeddings, valid_mask
+
+
+class LearnablePositionalEmbeddingRatedInputFeaturesPreprocessor(
+    InputFeaturesPreprocessorModule
+):
+    def __init__(
+        self,
+        max_sequence_len: int,
+        item_embedding_dim: int,
+        dropout_rate: float,
+        rating_embedding_dim: int,
+        num_ratings: int,
+    ) -> None:
+        super().__init__()
+
+        self._embedding_dim: int = item_embedding_dim + rating_embedding_dim
+        self._pos_emb: torch.nn.Embedding = torch.nn.Embedding(
+            max_sequence_len,
+            self._embedding_dim,
+        )
+        self._dropout_rate: float = dropout_rate
+        self._emb_dropout = torch.nn.Dropout(p=dropout_rate)
+        self._rating_emb: torch.nn.Embedding = torch.nn.Embedding(
+            num_ratings,
+            rating_embedding_dim,
+        )
+        self.reset_state()
+
+    def debug_str(self) -> str:
+        return f"posir_d{self._dropout_rate}"
+
+    def reset_state(self) -> None:
+        truncated_normal(
+            self._pos_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+        truncated_normal(
+            self._rating_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        B, N = past_ids.size()
+
+        user_embeddings = torch.cat(
+            [past_embeddings, self._rating_emb(past_payloads["ratings"].int())],
+            dim=-1,
+        ) * (self._embedding_dim**0.5) + self._pos_emb(
+            torch.arange(N, device=past_ids.device).unsqueeze(0).repeat(B, 1)
+        )
+        user_embeddings = self._emb_dropout(user_embeddings)
+
+        valid_mask = (past_ids != 0).unsqueeze(-1).float()  # [B, N, 1]
+        user_embeddings *= valid_mask
+        return past_lengths, user_embeddings, valid_mask
+
+
+class CombinedItemAndRatingInputFeaturesPreprocessor(InputFeaturesPreprocessorModule):
+    def __init__(
+        self,
+        max_sequence_len: int,
+        item_embedding_dim: int,
+        dropout_rate: float,
+        num_ratings: int,
+    ) -> None:
+        super().__init__()
+
+        self._embedding_dim: int = item_embedding_dim
+        # Due to [item_0, rating_0, item_1, rating_1, ...]
+        self._pos_emb: torch.nn.Embedding = torch.nn.Embedding(
+            max_sequence_len * 2,
+            self._embedding_dim,
+        )
+        self._dropout_rate: float = dropout_rate
+        self._emb_dropout = torch.nn.Dropout(p=dropout_rate)
+        self._rating_emb: torch.nn.Embedding = torch.nn.Embedding(
+            num_ratings,
+            item_embedding_dim,
+        )
+        self.reset_state()
+
+    def debug_str(self) -> str:
+        return f"combir_d{self._dropout_rate}"
+
+    def reset_state(self) -> None:
+        truncated_normal(
+            self._pos_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+        truncated_normal(
+            self._rating_emb.weight.data,
+            mean=0.0,
+            std=math.sqrt(1.0 / self._embedding_dim),
+        )
+
+    def get_preprocessed_ids(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Returns (B, N * 2,) x int64.
+        """
+        B, N = past_ids.size()
+        return torch.cat(
+            [
+                past_ids.unsqueeze(2),  # (B, N, 1)
+                past_payloads["ratings"].to(past_ids.dtype).unsqueeze(2),
+            ],
+            dim=2,
+        ).reshape(B, N * 2)
+
+    def get_preprocessed_masks(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Returns (B, N * 2,) x bool.
+        """
+        B, N = past_ids.size()
+        return (past_ids != 0).unsqueeze(2).expand(-1, -1, 2).reshape(B, N * 2)
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        B, N = past_ids.size()
+        D = past_embeddings.size(-1)
+
+        user_embeddings = torch.cat(
+            [
+                past_embeddings,  # (B, N, D)
+                self._rating_emb(past_payloads["ratings"].int()),
+            ],
+            dim=2,
+        ) * (self._embedding_dim**0.5)
+        user_embeddings = user_embeddings.view(B, N * 2, D)
+        user_embeddings = user_embeddings + self._pos_emb(
+            torch.arange(N * 2, device=past_ids.device).unsqueeze(0).repeat(B, 1)
+        )
+        user_embeddings = self._emb_dropout(user_embeddings)
+
+        valid_mask = (
+            self.get_preprocessed_masks(
+                past_lengths,
+                past_ids,
+                past_embeddings,
+                past_payloads,
+            )
+            .unsqueeze(2)
+            .float()
+        )  # (B, N * 2, 1,)
+        user_embeddings *= valid_mask
+        return past_lengths * 2, user_embeddings, valid_mask
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py
new file mode 100644
index 000000000..8e2195783
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py
@@ -0,0 +1,193 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from collections import OrderedDict
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.modeling.sequential.autoregressive_losses import (
+    AutoregressiveLoss,
+    NegativesSampler,
+)
+from torch.utils.checkpoint import checkpoint
+
+
+class SampledSoftmaxLoss(AutoregressiveLoss):
+    def __init__(
+        self,
+        num_to_sample: int,
+        softmax_temperature: float,
+        model,
+        activation_checkpoint: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self._num_to_sample: int = num_to_sample
+        self._softmax_temperature: float = softmax_temperature
+        self._model = model
+        self._activation_checkpoint: bool = activation_checkpoint
+
+    def jagged_forward(  # pyre-ignore [15]
+        self,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        assert output_embeddings.size() == supervision_embeddings.size()
+        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
+        assert supervision_ids.size() == supervision_weights.size()
+
+        sampled_ids, sampled_negative_embeddings = negatives_sampler(
+            positive_ids=supervision_ids,
+            num_to_sample=self._num_to_sample,
+        )
+        positive_embeddings = negatives_sampler.normalize_embeddings(
+            supervision_embeddings
+        )
+        positive_logits, aux_losses = self._model.similarity_fn(
+            query_embeddings=output_embeddings,  # [B, D] = [N', D]
+            item_ids=supervision_ids.unsqueeze(1),  # [N', 1]
+            item_embeddings=positive_embeddings.unsqueeze(1),  # [N', D] -> [N', 1, D]
+            **kwargs,
+        )
+        positive_logits = positive_logits / self._softmax_temperature  # [0]
+        sampled_negatives_logits, _ = self._model.similarity_fn(
+            query_embeddings=output_embeddings,  # [N', D]
+            item_ids=sampled_ids,  # [N', R]
+            item_embeddings=sampled_negative_embeddings,  # [N', R, D]
+            **kwargs,
+        )  # [N', R]  # [0]
+        sampled_negatives_logits = torch.where(
+            supervision_ids.unsqueeze(1) == sampled_ids,  # [N', R]
+            -5e4,
+            sampled_negatives_logits / self._softmax_temperature,
+        )
+        jagged_loss = -F.log_softmax(
+            torch.cat([positive_logits, sampled_negatives_logits], dim=1), dim=1
+        )[:, 0]
+        return (
+            jagged_loss * supervision_weights
+        ).sum() / supervision_weights.sum(), aux_losses
+
+    def forward(  # pyre-ignore [15]
+        self,
+        lengths: torch.Tensor,
+        output_embeddings: torch.Tensor,
+        supervision_ids: torch.Tensor,
+        supervision_embeddings: torch.Tensor,
+        supervision_weights: torch.Tensor,
+        negatives_sampler: NegativesSampler,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            lengths: [B] x int32 representing number of non-zero elements per row.
+            output_embeddings: [B, N, D] x float, embeddings for the current
+                input sequence.
+            supervision_ids: [B, N] x int64, (positive) supervision ids.
+            supervision_embeddings: [B, N, D] x float.
+            supervision_weights: Optional [B, N] x float. Optional weights for
+                masking out invalid positions, or reweighting supervision labels.
+            negatives_sampler: sampler used to obtain negative examples paired with
+                positives.
+
+        Returns:
+            Tuple of (loss for the current engaged sequence, str-keyed aux_losses).
+        """
+        torch._assert(
+            output_embeddings.size() == supervision_embeddings.size(),
+            "Invalid supervision embeddings size.",
+        )
+        torch._assert(
+            supervision_ids.size() == supervision_embeddings.size()[:-1],
+            "Invalid supervision ids size.",
+        )
+
+        jagged_id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
+        jagged_supervision_ids = (
+            torch.ops.fbgemm.dense_to_jagged(
+                supervision_ids.unsqueeze(-1).float(), [jagged_id_offsets]
+            )[0]
+            .squeeze(1)
+            .long()
+        )
+        if "user_ids" in kwargs:
+            # expand to jagged.
+            max_length: int = int(lengths.max())
+            kwargs["user_ids"] = torch.ops.fbgemm.dense_to_jagged(
+                kwargs["user_ids"]
+                .unsqueeze(1)
+                .expand(-1, max_length)
+                .unsqueeze(2),  # (B, max_length, 1)
+                [jagged_id_offsets],
+            )[0].squeeze(1)
+
+        args = OrderedDict(
+            [
+                (
+                    "output_embeddings",
+                    torch.ops.fbgemm.dense_to_jagged(
+                        output_embeddings,
+                        [jagged_id_offsets],
+                    )[0],
+                ),
+                ("supervision_ids", jagged_supervision_ids),
+                (
+                    "supervision_embeddings",
+                    torch.ops.fbgemm.dense_to_jagged(
+                        supervision_embeddings,
+                        [jagged_id_offsets],
+                    )[0],
+                ),
+                (
+                    "supervision_weights",
+                    torch.ops.fbgemm.dense_to_jagged(
+                        supervision_weights.unsqueeze(-1),
+                        [jagged_id_offsets],
+                    )[0].squeeze(1),
+                ),
+                ("negatives_sampler", negatives_sampler),
+            ]
+        )
+        args.update(kwargs)
+        if self._activation_checkpoint:
+            return checkpoint(
+                self.jagged_forward,
+                *args.values(),
+                use_reentrant=False,
+            )
+        else:
+            return self.jagged_forward(
+                output_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                    output_embeddings,
+                    [jagged_id_offsets],
+                )[0],
+                supervision_ids=jagged_supervision_ids,
+                supervision_embeddings=torch.ops.fbgemm.dense_to_jagged(
+                    supervision_embeddings,
+                    [jagged_id_offsets],
+                )[0],
+                supervision_weights=torch.ops.fbgemm.dense_to_jagged(
+                    supervision_weights.unsqueeze(-1),
+                    [jagged_id_offsets],
+                )[0].squeeze(1),
+                negatives_sampler=negatives_sampler,
+                **kwargs,
+            )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py
new file mode 100644
index 000000000..3319dfd93
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+
+import torch
+import torch.nn.functional as F
+
+
+class OutputPostprocessorModule(torch.nn.Module):
+    @abc.abstractmethod
+    def debug_str(self) -> str:
+        pass
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        output_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        pass
+
+
+class L2NormEmbeddingPostprocessor(OutputPostprocessorModule):
+    def __init__(
+        self,
+        embedding_dim: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self._embedding_dim: int = embedding_dim
+        self._eps: float = eps
+
+    def debug_str(self) -> str:
+        return "l2"
+
+    def forward(
+        self,
+        output_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        output_embeddings = output_embeddings[..., : self._embedding_dim]
+        return output_embeddings / torch.clamp(
+            torch.linalg.norm(output_embeddings, ord=None, dim=-1, keepdim=True),
+            min=self._eps,
+        )
+
+
+class LayerNormEmbeddingPostprocessor(OutputPostprocessorModule):
+    def __init__(
+        self,
+        embedding_dim: int,
+        eps: float = 1e-6,
+    ) -> None:
+        super().__init__()
+        self._embedding_dim: int = embedding_dim
+        self._eps: float = eps
+
+    def debug_str(self) -> str:
+        return "ln"
+
+    def forward(
+        self,
+        output_embeddings: torch.Tensor,
+    ) -> torch.Tensor:
+        output_embeddings = output_embeddings[..., : self._embedding_dim]
+        return F.layer_norm(
+            output_embeddings,
+            normalized_shape=(self._embedding_dim,),
+            eps=self._eps,
+        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py
new file mode 100644
index 000000000..2709ddb08
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py
@@ -0,0 +1,316 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Implements SASRec (Self-Attentive Sequential Recommendation, https://arxiv.org/abs/1808.09781, ICDM'18).
+
+Compared with the original paper which used BCE loss, this implementation is modified so that
+we can utilize a Sampled Softmax loss proposed in Revisiting Neural Retrieval on Accelerators
+(https://arxiv.org/abs/2306.04039, KDD'23) and Turning Dross Into Gold Loss: is BERT4Rec really
+better than SASRec? (https://arxiv.org/abs/2309.07602, RecSys'23), where the authors showed
+sampled softmax loss to significantly improved SASRec model quality.
+"""
+
+from typing import Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.modeling.sequential.embedding_modules import (
+    EmbeddingModule,
+)
+from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
+    InputFeaturesPreprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.output_postprocessors import (
+    OutputPostprocessorModule,
+)
+from generative_recommenders.research.modeling.sequential.utils import (
+    get_current_embeddings,
+)
+from generative_recommenders.research.modeling.similarity_module import (
+    SequentialEncoderWithLearnedSimilarityModule,
+)
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+class StandardAttentionFF(torch.nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        hidden_dim: int,
+        activation_fn: str,
+        dropout_rate: float,
+    ) -> None:
+        super().__init__()
+
+        assert activation_fn == "relu" or activation_fn == "gelu", (
+            f"Invalid activation_fn {activation_fn}"
+        )
+
+        self._conv1d = torch.nn.Sequential(
+            torch.nn.Conv1d(
+                in_channels=embedding_dim,
+                out_channels=hidden_dim,
+                kernel_size=1,
+            ),
+            torch.nn.GELU() if activation_fn == "gelu" else torch.nn.ReLU(),
+            torch.nn.Dropout(p=dropout_rate),
+            torch.nn.Conv1d(
+                in_channels=hidden_dim,
+                out_channels=embedding_dim,
+                kernel_size=1,
+            ),
+            torch.nn.Dropout(p=dropout_rate),
+        )
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
+        # Conv1D requires (B, D, N)
+        return self._conv1d(inputs.transpose(-1, -2)).transpose(-1, -2) + inputs
+
+
+class SASRec(SequentialEncoderWithLearnedSimilarityModule):
+    """
+    Implements SASRec (Self-Attentive Sequential Recommendation, https://arxiv.org/abs/1808.09781, ICDM'18).
+
+    Compared with the original paper which used BCE loss, this implementation is modified so that
+    we can utilize a Sampled Softmax loss proposed in Revisiting Neural Retrieval on Accelerators
+    (https://arxiv.org/abs/2306.04039, KDD'23) and Turning Dross Into Gold Loss: is BERT4Rec really
+    better than SASRec? (https://arxiv.org/abs/2309.07602, RecSys'23), where the authors showed
+    sampled softmax loss to significantly improved SASRec model quality.
+    """
+
+    def __init__(
+        self,
+        max_sequence_len: int,
+        max_output_len: int,
+        embedding_dim: int,
+        num_blocks: int,
+        num_heads: int,
+        ffn_hidden_dim: int,
+        ffn_activation_fn: str,
+        ffn_dropout_rate: float,
+        embedding_module: EmbeddingModule,
+        similarity_module: SimilarityModule,
+        input_features_preproc_module: InputFeaturesPreprocessorModule,
+        output_postproc_module: OutputPostprocessorModule,
+        activation_checkpoint: bool = False,
+        verbose: bool = False,
+    ) -> None:
+        super().__init__(ndp_module=similarity_module)
+
+        self._embedding_module: EmbeddingModule = embedding_module
+        self._embedding_dim: int = embedding_dim
+        self._item_embedding_dim: int = embedding_module.item_embedding_dim
+        self._max_sequence_length: int = max_sequence_len + max_output_len
+        self._input_features_preproc: InputFeaturesPreprocessorModule = (
+            input_features_preproc_module
+        )
+        self._output_postproc: OutputPostprocessorModule = output_postproc_module
+        self._activation_checkpoint: bool = activation_checkpoint
+        self._verbose: bool = verbose
+
+        self.attention_layers = torch.nn.ModuleList()
+        self.forward_layers = torch.nn.ModuleList()
+        self._num_blocks: int = num_blocks
+        self._num_heads: int = num_heads
+        self._ffn_hidden_dim: int = ffn_hidden_dim
+        self._ffn_activation_fn: str = ffn_activation_fn
+        self._ffn_dropout_rate: float = ffn_dropout_rate
+
+        for _ in range(num_blocks):
+            self.attention_layers.append(
+                torch.nn.MultiheadAttention(
+                    embed_dim=self._embedding_dim,
+                    num_heads=num_heads,
+                    dropout=ffn_dropout_rate,
+                    batch_first=True,
+                )
+            )
+            self.forward_layers.append(
+                StandardAttentionFF(
+                    embedding_dim=self._embedding_dim,
+                    hidden_dim=ffn_hidden_dim,
+                    activation_fn=ffn_activation_fn,
+                    dropout_rate=self._ffn_dropout_rate,
+                )
+            )
+
+        self.register_buffer(
+            "_attn_mask",
+            torch.triu(
+                torch.ones(
+                    (self._max_sequence_length, self._max_sequence_length),
+                    dtype=torch.bool,
+                ),
+                diagonal=1,
+            ),
+        )
+        self.reset_state()
+
+    def reset_state(self) -> None:
+        for name, params in self.named_parameters():
+            if (
+                "_input_features_preproc" in name
+                or "_embedding_module" in name
+                or "_output_postproc" in name
+            ):
+                if self._verbose:
+                    print(f"Skipping initialization for {name}")
+                continue
+            try:
+                torch.nn.init.xavier_normal_(params.data)
+                if self._verbose:
+                    print(
+                        f"Initialize {name} as xavier normal: {params.data.size()} params"
+                    )
+            except:
+                if self._verbose:
+                    print(f"Failed to initialize {name}: {params.data.size()} params")
+
+    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
+        return self._embedding_module.get_item_embeddings(item_ids)
+
+    def debug_str(self) -> str:
+        return (
+            f"SASRec-d{self._item_embedding_dim}-b{self._num_blocks}-h{self._num_heads}"
+            + "-"
+            + self._input_features_preproc.debug_str()
+            + "-"
+            + self._output_postproc.debug_str()
+            + f"-ffn{self._ffn_hidden_dim}-{self._ffn_activation_fn}-d{self._ffn_dropout_rate}"
+            + f"{'-ac' if self._activation_checkpoint else ''}"
+        )
+
+    def _run_one_layer(
+        self,
+        i: int,
+        user_embeddings: torch.Tensor,
+        valid_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        Q = F.layer_norm(
+            user_embeddings,
+            normalized_shape=(self._embedding_dim,),
+            eps=1e-8,
+        )
+        mha_outputs, _ = self.attention_layers[i](
+            query=Q,
+            key=user_embeddings,
+            value=user_embeddings,
+            attn_mask=self._attn_mask,
+        )
+        user_embeddings = self.forward_layers[i](
+            F.layer_norm(
+                Q + mha_outputs,
+                normalized_shape=(self._embedding_dim,),
+                eps=1e-8,
+            )
+        )
+        user_embeddings *= valid_mask
+        return user_embeddings
+
+    def generate_user_embeddings(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        """
+        Args:
+            past_ids: (B, N,) x int
+
+        Returns:
+            (B, N, D,) x float
+        """
+        past_lengths, user_embeddings, valid_mask = self._input_features_preproc(
+            past_lengths=past_lengths,
+            past_ids=past_ids,
+            past_embeddings=past_embeddings,
+            past_payloads=past_payloads,
+        )
+
+        for i in range(len(self.attention_layers)):
+            if self._activation_checkpoint:
+                user_embeddings = torch.utils.checkpoint.checkpoint(
+                    self._run_one_layer,
+                    i,
+                    user_embeddings,
+                    valid_mask,
+                    use_reentrant=False,
+                )
+            else:
+                user_embeddings = self._run_one_layer(i, user_embeddings, valid_mask)
+
+        return self._output_postproc(user_embeddings)
+
+    def forward(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+        batch_id: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            past_ids: [B, N] x int64 where the latest engaged ids come first. In
+                particular, [:, 0] should correspond to the last engaged values.
+            past_ratings: [B, N] x int64.
+            past_timestamps: [B, N] x int64.
+
+        Returns:
+            encoded_embeddings of [B, N, D].
+        """
+        encoded_embeddings = self.generate_user_embeddings(
+            past_lengths,
+            past_ids,
+            past_embeddings,
+            past_payloads,
+        )
+        return encoded_embeddings
+
+    def encode(
+        self,
+        past_lengths: torch.Tensor,
+        past_ids: torch.Tensor,  # [B, N] x int64
+        past_embeddings: torch.Tensor,
+        past_payloads: Dict[str, torch.Tensor],
+    ) -> torch.Tensor:
+        encoded_seq_embeddings = self.generate_user_embeddings(
+            past_lengths, past_ids, past_embeddings, past_payloads
+        )  # [B, N, D]
+        return get_current_embeddings(
+            lengths=past_lengths, encoded_embeddings=encoded_seq_embeddings
+        )
+
+    def predict(
+        self,
+        past_ids: torch.Tensor,
+        past_ratings: torch.Tensor,
+        past_timestamps: torch.Tensor,
+        next_timestamps: torch.Tensor,
+        target_ids: torch.Tensor,
+        batch_id: Optional[int] = None,
+    ) -> torch.Tensor:
+        return self.interaction(  # pyre-ignore [29]
+            self.encode(
+                past_ids,
+                past_ratings,
+                past_timestamps,
+                next_timestamps,  # pyre-ignore [6]
+            ),
+            target_ids,
+        )  # [B, X]
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py
new file mode 100644
index 000000000..60dfb8e44
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py
@@ -0,0 +1,129 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import torch
+
+
+def batch_gather_embeddings(
+    rowwise_indices: torch.Tensor,
+    embeddings: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Args:
+        rowwise_indices: (B, N) x int, where each entry is in [0, X).
+        embeddings: (B, X, D,) x float.
+
+    Returns:
+        (B, N, D,) x float, embeddings corresponding to rowwise_indices.
+    """
+    _, N = rowwise_indices.size()
+    B, X, D = embeddings.size()
+    flattened_indices = (
+        rowwise_indices
+        + torch.arange(
+            start=0,
+            end=B,
+            step=1,
+            dtype=rowwise_indices.dtype,
+            device=rowwise_indices.device,
+        )
+        .unsqueeze(1)
+        .expand(-1, N)
+        * X
+    )
+    return embeddings.view(-1, D)[flattened_indices, :].reshape(
+        rowwise_indices.size() + (D,)
+    )
+
+
+def batch_scatter_embeddings(
+    dst_embeddings: torch.Tensor,
+    rowwise_indices: torch.Tensor,
+    src_embeddings: torch.Tensor,
+) -> None:
+    """
+    Args:
+        dst_embeddings: (B, N, D,) x float.
+        rowwise_indices: (B,) x int, where each entry is in [0, N - 1).
+        source_embeddings: (B, D,) x float.
+    """
+    B, N, D = dst_embeddings.size()
+    flattened_indices = rowwise_indices + torch.arange(
+        start=0,
+        end=B * N,
+        step=N,
+        dtype=rowwise_indices.dtype,
+        device=rowwise_indices.device,
+    )
+    dst_embeddings.view(B * N, D)[flattened_indices, :] = src_embeddings
+
+
+def get_current_embeddings(
+    lengths: torch.Tensor,
+    encoded_embeddings: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Args:
+        lengths: (B,) x int
+        seq_embeddings: (B, N, D,) x float
+
+    Returns:
+        (B, D,) x float, where [i, :] == encoded_embeddings[i, lengths[i] - 1, :]
+    """
+    B, N, D = encoded_embeddings.size()
+    flattened_offsets = (lengths - 1) + torch.arange(
+        start=0, end=B, step=1, dtype=lengths.dtype, device=lengths.device
+    ) * N
+    return encoded_embeddings.reshape(-1, D)[flattened_offsets, :].reshape(B, D)
+
+
+def jagged_or_dense_repeat_interleave_dim0(
+    x: torch.Tensor, lengths: torch.Tensor, repeats: int
+) -> torch.Tensor:
+    if len(x.size()) == 3:
+        return x.repeat_interleave(repeats, dim=0)
+    else:
+        assert len(x.size()) == 2, f"x.size() = {x.size()}"
+        padded_x = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=x,
+            offsets=[torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)],
+            max_lengths=[lengths.max()],
+            padding_value=0.0,
+        )
+        lengths = lengths.repeat_interleave(repeats, dim=0)
+        return torch.ops.fbgemm.dense_to_jagged(
+            padded_x.repeat_interleave(repeats, dim=0),
+            [torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)],
+        )[0]
+
+
+def jagged_or_dense_index_select_dim0(
+    x: torch.Tensor, lengths: torch.Tensor, indices: torch.Tensor
+) -> torch.Tensor:
+    if len(x.size()) == 3:
+        return x[indices, :, :]
+    else:
+        assert len(x.size()) == 2, f"x.size() = {x.size()}"
+        padded_x = torch.ops.fbgemm.jagged_to_padded_dense(
+            values=x,
+            offsets=[torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)],
+            max_lengths=[lengths.max()],
+            padding_value=0.0,
+        )
+        return torch.ops.fbgemm.dense_to_jagged(
+            padded_x[indices, :],
+            [torch.ops.fbgemm.asynchronous_complete_cumsum(lengths[indices])],
+        )[0]
diff --git a/recommendation_v4/generative_recommenders/research/modeling/similarity_module.py b/recommendation_v4/generative_recommenders/research/modeling/similarity_module.py
new file mode 100644
index 000000000..3ba32d239
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/similarity_module.py
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+from typing import Optional
+
+import torch
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+class SequentialEncoderWithLearnedSimilarityModule(torch.nn.Module):
+    """
+    Interface enabling using various similarity functions (besides inner products)
+    as part of a sequential encoder/decoder.
+
+    See rails/ for more details.
+    """
+
+    def __init__(
+        self,
+        ndp_module: SimilarityModule,
+    ) -> None:
+        super().__init__()
+
+        self._ndp_module: SimilarityModule = ndp_module
+
+    @abc.abstractmethod
+    def debug_str(
+        self,
+    ) -> str:
+        pass
+
+    def similarity_fn(
+        self,
+        query_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+        item_embeddings: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        torch._assert(
+            len(query_embeddings.size()) == 2, "len(query_embeddings.size()) must be 2"
+        )
+        torch._assert(len(item_ids.size()) == 2, "len(item_ids.size()) must be 2")
+        if item_embeddings is None:
+            item_embeddings = self.get_item_embeddings(item_ids)  # pyre-ignore [29]
+        torch._assert(
+            len(item_embeddings.size()) == 3, "len(item_embeddings.size()) must be 3"
+        )
+
+        return self._ndp_module(
+            query_embeddings=query_embeddings,  # (B, query_embedding_dim)
+            item_embeddings=item_embeddings,  # (1/B, X, item_embedding_dim)
+            item_ids=item_ids,
+            **kwargs,
+        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py b/recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py
new file mode 100644
index 000000000..7fd870b4b
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py
@@ -0,0 +1,222 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import List, Optional, Tuple
+
+import gin
+import torch
+from generative_recommenders.research.rails.similarities.dot_product_similarity_fn import (
+    DotProductSimilarity,
+)
+from generative_recommenders.research.rails.similarities.layers import SwiGLU
+from generative_recommenders.research.rails.similarities.mol.item_embeddings_fn import (
+    RecoMoLItemEmbeddingsFn,
+)
+from generative_recommenders.research.rails.similarities.mol.query_embeddings_fn import (
+    RecoMoLQueryEmbeddingsFn,
+)
+from generative_recommenders.research.rails.similarities.mol.similarity_fn import (
+    MoLSimilarity,
+    SoftmaxDropoutCombiner,
+)
+
+
+def init_mlp_xavier_weights_zero_bias(m) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform(m.weight)
+        if getattr(m, "bias", None) is not None:
+            m.bias.data.fill_(0.0)
+
+
+@gin.configurable
+def create_mol_interaction_module(
+    query_embedding_dim: int,
+    item_embedding_dim: int,
+    dot_product_dimension: int,
+    query_dot_product_groups: int,
+    item_dot_product_groups: int,
+    temperature: float,
+    query_dropout_rate: float,
+    query_hidden_dim: int,
+    item_dropout_rate: float,
+    item_hidden_dim: int,
+    gating_query_hidden_dim: int,
+    gating_qi_hidden_dim: int,
+    gating_item_hidden_dim: int,
+    softmax_dropout_rate: float,
+    bf16_training: bool,
+    gating_query_fn: bool = True,
+    gating_item_fn: bool = True,
+    dot_product_l2_norm: bool = True,
+    query_nonlinearity: str = "geglu",
+    item_nonlinearity: str = "geglu",
+    uid_dropout_rate: float = 0.5,
+    uid_embedding_hash_sizes: Optional[List[int]] = None,
+    uid_embedding_level_dropout: bool = False,
+    gating_combination_type: str = "glu_silu",
+    gating_item_dropout_rate: float = 0.0,
+    gating_qi_dropout_rate: float = 0.0,
+    eps: float = 1e-6,
+) -> Tuple[MoLSimilarity, str]:
+    """
+    Gin wrapper for creating MoL learned similarity.
+    """
+    mol_module = MoLSimilarity(
+        query_embedding_dim=query_embedding_dim,
+        item_embedding_dim=item_embedding_dim,
+        dot_product_dimension=dot_product_dimension,
+        query_dot_product_groups=query_dot_product_groups,
+        item_dot_product_groups=item_dot_product_groups,
+        temperature=temperature,
+        dot_product_l2_norm=dot_product_l2_norm,
+        query_embeddings_fn=RecoMoLQueryEmbeddingsFn(
+            query_embedding_dim=query_embedding_dim,
+            query_dot_product_groups=query_dot_product_groups,
+            dot_product_dimension=dot_product_dimension,
+            dot_product_l2_norm=dot_product_l2_norm,
+            proj_fn=lambda input_dim, output_dim: (
+                torch.nn.Sequential(
+                    torch.nn.Dropout(p=query_dropout_rate),
+                    SwiGLU(
+                        in_features=input_dim,
+                        out_features=query_hidden_dim,
+                    ),
+                    torch.nn.Linear(
+                        in_features=query_hidden_dim,
+                        out_features=output_dim,
+                    ),
+                ).apply(init_mlp_xavier_weights_zero_bias)
+            ),
+            eps=eps,
+        ),
+        item_embeddings_fn=RecoMoLItemEmbeddingsFn(
+            item_embedding_dim=item_embedding_dim,
+            item_dot_product_groups=item_dot_product_groups,
+            dot_product_dimension=dot_product_dimension,
+            dot_product_l2_norm=dot_product_l2_norm,
+            proj_fn=lambda input_dim, output_dim: (
+                torch.nn.Sequential(
+                    torch.nn.Dropout(p=item_dropout_rate),
+                    SwiGLU(in_features=input_dim, out_features=item_hidden_dim),
+                    torch.nn.Linear(
+                        in_features=item_hidden_dim,
+                        out_features=output_dim,
+                    ),
+                ).apply(init_mlp_xavier_weights_zero_bias)
+            ),
+            eps=eps,
+        ),
+        gating_query_only_partial_fn=lambda input_dim, output_dim: (  # pyre-ignore [6]
+            torch.nn.Sequential(
+                torch.nn.Linear(
+                    in_features=input_dim,
+                    out_features=gating_query_hidden_dim,
+                ),
+                torch.nn.SiLU(),
+                torch.nn.Linear(
+                    in_features=gating_query_hidden_dim,
+                    out_features=output_dim,
+                    bias=False,
+                ),
+            ).apply(init_mlp_xavier_weights_zero_bias)
+            if gating_query_fn
+            else None
+        ),
+        gating_item_only_partial_fn=lambda input_dim, output_dim: (  # pyre-ignore [6]
+            torch.nn.Sequential(
+                torch.nn.Dropout(p=gating_item_dropout_rate),
+                torch.nn.Linear(
+                    in_features=input_dim,
+                    out_features=gating_item_hidden_dim,
+                ),
+                torch.nn.SiLU(),
+                torch.nn.Linear(
+                    in_features=gating_item_hidden_dim,
+                    out_features=output_dim,
+                    bias=False,
+                ),
+            ).apply(init_mlp_xavier_weights_zero_bias)
+            if gating_item_fn
+            else None
+        ),
+        gating_qi_partial_fn=lambda input_dim, output_dim: (  # pyre-ignore [6]
+            torch.nn.Sequential(
+                torch.nn.Dropout(p=gating_qi_dropout_rate),
+                torch.nn.Linear(
+                    in_features=input_dim,
+                    out_features=gating_qi_hidden_dim,
+                ),
+                torch.nn.SiLU(),
+                torch.nn.Linear(
+                    in_features=gating_qi_hidden_dim,
+                    out_features=output_dim,
+                ),
+            ).apply(init_mlp_xavier_weights_zero_bias)
+            if gating_qi_hidden_dim > 0
+            else torch.nn.Sequential(
+                torch.nn.Dropout(p=gating_qi_dropout_rate),
+                torch.nn.Linear(
+                    in_features=input_dim,
+                    out_features=output_dim,
+                ),
+            ).apply(init_mlp_xavier_weights_zero_bias)
+        ),
+        gating_combination_type=gating_combination_type,
+        gating_normalization_fn=lambda _: SoftmaxDropoutCombiner(
+            dropout_rate=softmax_dropout_rate, eps=1e-6
+        ),
+        eps=eps,
+        autocast_bf16=bf16_training,
+    )
+    interaction_module_debug_str = (
+        f"MoL-{query_dot_product_groups}x{item_dot_product_groups}x{dot_product_dimension}"
+        + f"-t{temperature}-d{softmax_dropout_rate}"
+        + f"{'-l2' if dot_product_l2_norm else ''}"
+        + f"-q{query_hidden_dim}d{query_dropout_rate}{query_nonlinearity}"
+        + f"-i{item_hidden_dim}d{item_dropout_rate}{item_nonlinearity}"
+        + (f"-gq{gating_query_hidden_dim}" if gating_query_fn else "")
+        + (
+            f"-gi{gating_item_hidden_dim}d{gating_item_dropout_rate}"
+            if gating_item_fn
+            else ""
+        )
+        + f"-gqi{gating_qi_hidden_dim}d{gating_qi_dropout_rate}-x-{gating_combination_type}"
+    )
+    return mol_module, interaction_module_debug_str
+
+
+@gin.configurable
+def get_similarity_function(
+    module_type: str,
+    query_embedding_dim: int,
+    item_embedding_dim: int,
+    bf16_training: bool = False,
+    activation_checkpoint: bool = False,
+) -> Tuple[torch.nn.Module, str]:
+    if module_type == "DotProduct":
+        interaction_module = DotProductSimilarity()
+        interaction_module_debug_str = "DotProduct"
+    elif module_type == "MoL":
+        interaction_module, interaction_module_debug_str = (
+            create_mol_interaction_module(
+                query_embedding_dim=query_embedding_dim,
+                item_embedding_dim=item_embedding_dim,
+                bf16_training=bf16_training,
+            )
+        )
+    else:
+        raise ValueError(f"Unknown interaction_module_type {module_type}")
+    return interaction_module, interaction_module_debug_str
diff --git a/recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py b/recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py
new file mode 100644
index 000000000..f628468ce
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+from typing import Tuple
+
+import torch
+
+
+class TopKModule(torch.nn.Module):
+    @abc.abstractmethod
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        k: int,
+        sorted: bool = True,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            query_embeddings: (B, X, ...). Implementation-specific.
+            k: int. top k to return.
+            sorted: bool.
+
+        Returns:
+            Tuple of (top_k_scores, top_k_ids), both of shape (B, K,)
+        """
+        pass
diff --git a/recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py b/recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py
new file mode 100644
index 000000000..810b24c42
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py
@@ -0,0 +1,80 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import Tuple
+
+import torch
+from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
+
+
+class MIPSTopKModule(TopKModule):
+    def __init__(
+        self,
+        item_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+    ) -> None:
+        """
+        Args:
+            item_embeddings: (1, X, D)
+            item_ids: (1, X,)
+        """
+        super().__init__()
+
+        self._item_embeddings: torch.Tensor = item_embeddings
+        self._item_ids: torch.Tensor = item_ids
+
+
+class MIPSBruteForceTopK(MIPSTopKModule):
+    def __init__(
+        self,
+        item_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+    ) -> None:
+        super().__init__(
+            item_embeddings=item_embeddings,
+            item_ids=item_ids,
+        )
+        del self._item_embeddings
+        self._item_embeddings_t: torch.Tensor = item_embeddings.permute(
+            2, 1, 0
+        ).squeeze(2)
+
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        k: int,
+        sorted: bool = True,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            query_embeddings: (B, ...). Implementation-specific.
+            k: int. final top-k to return.
+            sorted: bool. whether to sort final top-k results or not.
+
+        Returns:
+            Tuple of (top_k_scores x float, top_k_ids x int), both of shape (B, K,)
+        """
+        # (B, X,)
+        all_logits = torch.mm(query_embeddings, self._item_embeddings_t)
+        top_k_logits, top_k_indices = torch.topk(
+            all_logits,
+            dim=1,
+            k=k,
+            sorted=sorted,
+            largest=True,
+        )  # (B, k,)
+        return top_k_logits, self._item_ids.squeeze(0)[top_k_indices]
diff --git a/recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py b/recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py
new file mode 100644
index 000000000..fe88ca919
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py
@@ -0,0 +1,132 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines exact- and approximate- Top-K modules for Mixture-of-Logits (MoL),
+discussed in Retrieval with Learned Similarities (https://arxiv.org/abs/2407.15462).
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+from typing import Tuple
+
+import torch
+from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
+from generative_recommenders.research.rails.similarities.mol.similarity_fn import (
+    MoLSimilarity,
+)
+
+
+class MoLTopKModule(TopKModule):
+    def __init__(
+        self,
+        mol_module: MoLSimilarity,
+        item_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+        flatten_item_ids_and_embeddings: bool,
+        keep_component_level_item_embeddings: bool,
+        component_level_item_embeddings_dtype: torch.dtype = torch.bfloat16,
+    ) -> None:
+        """
+        Args:
+            mol_module: MoLSimilarity.
+            item_embeddings: (1, X, D) if mol_module._apply_item_embeddings_fn is True,
+                (1, X, P_X, D_P) otherwise.
+            item_ids: (1, X,) representing the item ids.
+            flatten_item_ids_and_embeddings: bool. If true, do not keep the extra (1,)
+                dimension at size(0).
+            keep_component_level_item_embeddings: bool. If true, keep P_x component-level
+                embeddings in `self._mol_item_embeddings` for downstream applications.
+            component_level_item_embeddings_dtype: torch.dtype. If set, the dtype
+                to keep component-level item embeddings in. By default we use bfloat16.
+        """
+        super().__init__()
+
+        self._mol_module: MoLSimilarity = mol_module
+        self._item_embeddings: torch.Tensor = (
+            item_embeddings
+            if not flatten_item_ids_and_embeddings
+            else item_embeddings.squeeze(0)
+        )
+
+        if keep_component_level_item_embeddings:
+            self._mol_item_embeddings: torch.Tensor = (
+                mol_module.get_item_component_embeddings(
+                    (
+                        self._item_embeddings.squeeze(0)
+                        if not flatten_item_ids_and_embeddings
+                        else self._item_embeddings
+                    ),
+                    decoupled_inference=True,
+                )[0]  # (X, D) -> (X, P_X, D_P)
+            ).to(component_level_item_embeddings_dtype)
+
+        self._item_ids: torch.Tensor = (
+            item_ids if not flatten_item_ids_and_embeddings else item_ids.squeeze(0)
+        )
+
+    @property
+    def mol_module(self) -> MoLSimilarity:
+        return self._mol_module
+
+
+class MoLBruteForceTopK(MoLTopKModule):
+    def __init__(
+        self,
+        mol_module: MoLSimilarity,
+        item_embeddings: torch.Tensor,
+        item_ids: torch.Tensor,
+    ) -> None:
+        super().__init__(
+            mol_module=mol_module,
+            item_embeddings=item_embeddings,
+            item_ids=item_ids,
+            flatten_item_ids_and_embeddings=False,
+            keep_component_level_item_embeddings=False,
+        )
+
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        k: int,
+        sorted: bool = True,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Args:
+            query_embeddings: (B, X, D) if mol_module._apply_query_embeddings_fn is True,
+                (B, X, P_Q, D_P) otherwise.
+            k: int. final top-k to return.
+            sorted: bool. whether to sort final top-k results or not.
+            **kwargs: Implementation-specific keys/values.
+
+        Returns:
+            Tuple of (top_k_scores x float, top_k_ids x int), both of shape (B, K,)
+        """
+        # (B, X,)
+        all_logits, _ = self.mol_module(
+            query_embeddings,
+            self._item_embeddings,
+            **kwargs,
+        )
+        top_k_logits, top_k_indices = torch.topk(
+            all_logits,
+            dim=1,
+            k=k,
+            sorted=sorted,
+            largest=True,
+        )  # (B, k,)
+        return top_k_logits, self._item_ids.squeeze(0)[top_k_indices]
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py
new file mode 100644
index 000000000..9357fd0e4
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from typing import Dict, Tuple
+
+import torch
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+
+
+class DotProductSimilarity(SimilarityModule):
+    def __init__(
+        self,
+    ) -> None:
+        super().__init__()
+
+    def debug_str(self) -> str:
+        return "dp"
+
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            query_embeddings: (B, D,) or (B * r, D) x float.
+            item_embeddings: (1, X, D) or (B, X, D) x float.
+
+        Returns:
+            (B, X) x float.
+        """
+
+        B_I, X, D = item_embeddings.size()
+        if B_I == 1:
+            # [B, D] x ([1, X, D] -> [D, X]) => [B, X]
+            return (
+                torch.mm(query_embeddings, item_embeddings.squeeze(0).t()),
+                {},
+            )  # [B, X]
+        elif query_embeddings.size(0) != B_I:
+            # (B * r, D) x (B, X, D).
+            return (
+                torch.bmm(
+                    query_embeddings.view(B_I, -1, D),
+                    item_embeddings.permute(0, 2, 1),
+                ).view(-1, X),
+                {},
+            )
+        else:
+            # [B, X, D] x ([B, D] -> [B, D, 1]) => [B, X, 1] -> [B, X]
+            return (
+                torch.bmm(item_embeddings, query_embeddings.unsqueeze(2)).squeeze(2),
+                {},
+            )
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/layers.py b/recommendation_v4/generative_recommenders/research/rails/similarities/layers.py
new file mode 100644
index 000000000..3f838bc48
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/layers.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines network architectures used in constructing various learned similarities.
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+import torch
+import torch.nn.functional as F
+
+
+class GeGLU(torch.nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+    ) -> None:
+        super().__init__()
+
+        self._in_features = in_features
+        self._out_features = out_features
+        self._w = torch.nn.Parameter(
+            torch.empty((in_features, out_features * 2)).normal_(mean=0, std=0.02),
+        )
+        self._b = torch.nn.Parameter(
+            torch.zeros((1, out_features * 2)),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        bs = x.size()[:-1]
+        lhs, rhs = torch.split(
+            torch.mm(x.reshape(-1, self._in_features), self._w) + self._b,
+            [self._out_features, self._out_features],
+            dim=-1,
+        )
+        return (F.gelu(lhs) * rhs).reshape(bs + (self._out_features,))
+
+
+class SwiGLU(torch.nn.Module):
+    """
+    SwiGLU from https://arxiv.org/abs/2002.05202.
+    """
+
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+    ) -> None:
+        super().__init__()
+
+        self._in_features = in_features
+        self._out_features = out_features
+        self._w = torch.nn.Parameter(
+            torch.empty((in_features, out_features * 2)).normal_(mean=0, std=0.02),
+        )
+        self._b = torch.nn.Parameter(
+            torch.zeros((1, out_features * 2)),
+        )
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        bs = x.size()[:-1]
+        lhs, rhs = torch.split(
+            torch.mm(x.reshape(-1, self._in_features), self._w) + self._b,
+            [self._out_features, self._out_features],
+            dim=-1,
+        )
+        return (F.silu(lhs) * rhs).reshape(bs + (self._out_features,))
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/module.py b/recommendation_v4/generative_recommenders/research/rails/similarities/module.py
new file mode 100644
index 000000000..e4061fa74
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/module.py
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import abc
+from typing import Dict, Tuple
+
+import torch
+
+
+class SimilarityModule(torch.nn.Module):
+    """
+    Interface enabling interfacing with various similarity functions.
+
+    While the discussions in our initial ICML'24 paper are based on inner products
+    for simplicity, we provide this interface (SimilarityModule) to support various
+    learned similarities at the retrieval stage, such as MLPs, Factorization Machines
+    (FMs), and Mixture-of-Logits (MoL), which we discussed in
+    - Revisiting Neural Retrieval on Accelerators (KDD'23), and
+    - Retrieval with Learned Similarities (https://arxiv.org/abs/2407.15462).
+    """
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            query_embeddings: (B, input_embedding_dim) x float.
+            item_embeddings: (1/B, X, item_embedding_dim) x float.
+            **kwargs: Implementation-specific keys/values (e.g.,
+                item ids / sideinfo, etc.)
+
+        Returns:
+            A tuple of (
+                (B, X,) similarity values,
+                keyed outputs representing auxiliary losses at training time.
+            ).
+        """
+        pass
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py
new file mode 100644
index 000000000..fd94e6f22
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py
@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines interface for generating query- and item-side embeddings for MoL.
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+import abc
+from typing import Dict, Tuple
+
+import torch
+
+
+class MoLEmbeddingsFn(torch.nn.Module):
+    """
+    Generates K_Q query-side (K_I item-side) embeddings for MoL based on
+    input embeddings and other optional implementation-specific tensors.
+    """
+
+    @abc.abstractmethod
+    def forward(
+        self,
+        input_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (B, ...) x float where B is the batch size.
+            kwargs: implementation-specific.
+
+        Returns:
+            Tuple of (
+                (B, query_dot_product_groups/item_dot_product_groups, dot_product_embedding_dim) x float,
+                str-keyed auxiliary losses.
+            ).
+        """
+        pass
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py
new file mode 100644
index 000000000..237cd8942
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py
@@ -0,0 +1,99 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines functions to generate item-side embeddings for MoL.
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+from typing import Callable, Dict, Tuple
+
+import torch
+from generative_recommenders.research.rails.similarities.mol.embeddings_fn import (
+    MoLEmbeddingsFn,
+)
+
+
+def init_mlp_xavier_weights_zero_bias(m) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform_(m.weight)
+        if getattr(m, "bias", None) is not None:
+            m.bias.data.fill_(0.0)
+
+
+class RecoMoLItemEmbeddingsFn(MoLEmbeddingsFn):
+    """
+    Generates P_X query-side embeddings for MoL based on input embeddings and other
+    optional tensors for recommendation models. Tested for sequential retrieval
+    scenarios.
+    """
+
+    def __init__(
+        self,
+        item_embedding_dim: int,
+        item_dot_product_groups: int,
+        dot_product_dimension: int,
+        dot_product_l2_norm: bool,
+        proj_fn: Callable[[int, int], torch.nn.Module],
+        eps: float,
+    ) -> None:
+        super().__init__()
+
+        self._item_emb_based_dot_product_groups: int = item_dot_product_groups
+        self._item_emb_proj_module: torch.nn.Module = proj_fn(
+            item_embedding_dim,
+            dot_product_dimension * self._item_emb_based_dot_product_groups,
+        )
+        self._dot_product_dimension: int = dot_product_dimension
+        self._dot_product_l2_norm: bool = dot_product_l2_norm
+        self._eps: float = eps
+
+    def forward(
+        self,
+        input_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (B, item_embedding_dim,) x float where B is the batch size.
+            kwargs: str-keyed tensors. Implementation-specific.
+
+        Returns:
+            Tuple of (
+                (B, item_dot_product_groups, dot_product_embedding_dim) x float,
+                str-keyed aux_losses,
+            ).
+        """
+        split_item_embeddings = self._item_emb_proj_module(input_embeddings).reshape(
+            input_embeddings.size()[:-1]
+            + (
+                self._item_emb_based_dot_product_groups,
+                self._dot_product_dimension,
+            )
+        )
+
+        if self._dot_product_l2_norm:
+            split_item_embeddings = split_item_embeddings / torch.clamp(
+                torch.linalg.norm(
+                    split_item_embeddings,
+                    ord=None,
+                    dim=-1,
+                    keepdim=True,
+                ),
+                min=self._eps,
+            )
+        return split_item_embeddings, {}
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py
new file mode 100644
index 000000000..8fe28ee11
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py
@@ -0,0 +1,164 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Defines functions to generate query-side embeddings for MoL.
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+from typing import Callable, Dict, List, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.rails.similarities.mol.embeddings_fn import (
+    MoLEmbeddingsFn,
+)
+
+
+def init_mlp_xavier_weights_zero_bias(m) -> None:
+    if isinstance(m, torch.nn.Linear):
+        torch.nn.init.xavier_uniform_(m.weight)
+        if getattr(m, "bias", None) is not None:
+            m.bias.data.fill_(0.0)
+
+
+class RecoMoLQueryEmbeddingsFn(MoLEmbeddingsFn):
+    """
+    Generates P_Q query-side embeddings for MoL based on input embeddings and other
+    optional tensors for recommendation models. Tested for sequential retrieval
+    scenarios.
+
+    The current implementation accesses user_ids associated with the query from
+    `user_ids' in kwargs.
+    """
+
+    def __init__(
+        self,
+        query_embedding_dim: int,
+        query_dot_product_groups: int,
+        dot_product_dimension: int,
+        dot_product_l2_norm: bool,
+        proj_fn: Callable[[int, int], torch.nn.Module],
+        eps: float,
+        uid_embedding_hash_sizes: Optional[List[int]] = None,
+        uid_dropout_rate: float = 0.0,
+        uid_embedding_level_dropout: bool = False,
+    ) -> None:
+        super().__init__()
+        self._uid_embedding_hash_sizes: List[int] = uid_embedding_hash_sizes or []
+        self._query_emb_based_dot_product_groups: int = query_dot_product_groups - len(
+            self._uid_embedding_hash_sizes
+        )
+        self._query_emb_proj_module: torch.nn.Module = proj_fn(
+            query_embedding_dim,
+            dot_product_dimension * self._query_emb_based_dot_product_groups,
+        )
+        self._dot_product_dimension: int = dot_product_dimension
+        self._dot_product_l2_norm: bool = dot_product_l2_norm
+        if len(self._uid_embedding_hash_sizes) > 0:
+            for i, hash_size in enumerate(self._uid_embedding_hash_sizes):
+                setattr(
+                    self,
+                    f"_uid_embeddings_{i}",
+                    torch.nn.Embedding(
+                        hash_size + 1, dot_product_dimension, padding_idx=0
+                    ),
+                )
+        self._uid_dropout_rate: float = uid_dropout_rate
+        self._uid_embedding_level_dropout: bool = uid_embedding_level_dropout
+        self._eps: float = eps
+
+    def forward(
+        self,
+        input_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (B, query_embedding_dim,) x float where B is the batch size.
+            kwargs: str-keyed tensors. Implementation-specific.
+
+        Returns:
+            Tuple of (
+                (B, query_dot_product_groups, dot_product_embedding_dim) x float,
+                str-keyed aux_losses,
+            ).
+        """
+        split_query_embeddings = self._query_emb_proj_module(input_embeddings).reshape(
+            (
+                input_embeddings.size(0),
+                self._query_emb_based_dot_product_groups,
+                self._dot_product_dimension,
+            )
+        )
+
+        aux_losses: Dict[str, torch.Tensor] = {}
+
+        if len(self._uid_embedding_hash_sizes) > 0:
+            all_uid_embeddings = []
+            for i, hash_size in enumerate(self._uid_embedding_hash_sizes):
+                # TODO: decouple this from MoLQueryEmbeddingFn.
+                uid_embeddings = getattr(self, f"_uid_embeddings_{i}")(
+                    (kwargs["user_ids"] % hash_size) + 1
+                )
+                if self.training:
+                    l2_norm = (uid_embeddings * uid_embeddings).sum(-1).mean()
+                    if i == 0:
+                        aux_losses["uid_embedding_l2_norm"] = l2_norm
+                    else:
+                        aux_losses["uid_embedding_l2_norm"] = (
+                            aux_losses["uid_embedding_l2_norm"] + l2_norm
+                        )
+
+                if self._uid_dropout_rate > 0.0:
+                    if self._uid_embedding_level_dropout:
+                        # conditionally dropout the entire embedding.
+                        if self.training:
+                            uid_dropout_mask = (
+                                torch.rand(
+                                    uid_embeddings.size()[:-1],
+                                    device=uid_embeddings.device,
+                                )
+                                > self._uid_dropout_rate
+                            )
+                            uid_embeddings = (
+                                uid_embeddings
+                                * uid_dropout_mask.unsqueeze(-1)
+                                / (1.0 - self._uid_dropout_rate)
+                            )
+                    else:
+                        uid_embeddings = F.dropout(
+                            uid_embeddings,
+                            p=self._uid_dropout_rate,
+                            training=self.training,
+                        )
+                all_uid_embeddings.append(uid_embeddings.unsqueeze(1))
+            split_query_embeddings = torch.cat(
+                [split_query_embeddings] + all_uid_embeddings, dim=1
+            )
+
+        if self._dot_product_l2_norm:
+            split_query_embeddings = split_query_embeddings / torch.clamp(
+                torch.linalg.norm(
+                    split_query_embeddings,
+                    ord=None,
+                    dim=-1,
+                    keepdim=True,
+                ),
+                min=self._eps,
+            )
+        return split_query_embeddings, aux_losses
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py
new file mode 100644
index 000000000..34e4c4a23
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py
@@ -0,0 +1,388 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Implements MoL (Mixture-of-Logits) with load balancing regularization loss, as discussed in:
+- Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
+- Retrieval with Learned Similarities (https://arxiv.org/abs/2407.15462).
+
+Forked from bailuding/rails @ 664fdb9.
+"""
+
+from typing import Callable, Dict, Optional, Tuple
+
+import torch
+import torch.nn.functional as F
+from generative_recommenders.research.rails.similarities.module import SimilarityModule
+from generative_recommenders.research.rails.similarities.mol.embeddings_fn import (
+    MoLEmbeddingsFn,
+)
+
+
+@torch.compile(dynamic=True)
+def _softmax_dropout_combiner_fn(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    dropout_pr: float,
+    eps: float,
+    training: bool,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Computes (_softmax_dropout_fn(x) * y).sum(-1).
+    """
+    x = F.softmax(x, dim=-1)
+    if dropout_pr > 0.0:
+        x = F.dropout(x, p=dropout_pr, training=training)
+        x = x / torch.clamp(x.sum(-1, keepdims=True), min=eps)  # pyre-ignore [19]
+    return x, (x * y).sum(-1)
+
+
+@torch.compile
+def _load_balancing_mi_loss_fn(
+    gating_prs: torch.Tensor,
+    eps: float,
+) -> torch.Tensor:
+    """
+    See Retrieval with Learned Similarities (RAILS, https://arxiv.org/abs/2407.15462) for discussions.
+    """
+    B, X, E = gating_prs.size()
+    expert_util_prs = gating_prs.view(B * X, E).sum(0, keepdim=False) / (1.0 * B * X)
+    expert_util_entropy = -(expert_util_prs * torch.log(expert_util_prs + eps)).sum()
+    per_example_expert_entropy = -(gating_prs * torch.log(gating_prs + eps)).sum() / (
+        1.0 * B * X
+    )
+    return -expert_util_entropy + per_example_expert_entropy
+
+
+class SoftmaxDropoutCombiner(torch.nn.Module):
+    def __init__(
+        self,
+        dropout_rate: float,
+        eps: float,
+    ) -> None:
+        super().__init__()
+
+        self._dropout_rate: float = dropout_rate
+        self._eps: float = eps
+
+    def forward(
+        self,
+        gating_weights: torch.Tensor,
+        x: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        gating_prs, combined_logits = _softmax_dropout_combiner_fn(
+            x=gating_weights,
+            y=x,
+            dropout_pr=self._dropout_rate,
+            eps=self._eps,
+            training=self.training,
+        )
+
+        aux_losses = {}
+        if self.training:
+            aux_losses["mi_loss"] = _load_balancing_mi_loss_fn(
+                gating_prs, eps=self._eps
+            )
+
+        return combined_logits, aux_losses
+
+
+class MoLGatingFn(torch.nn.Module):
+    """
+    Implements the gating function for MoL, used to compute pi_p(q, x) for a given (p, x) pair.
+    """
+
+    def __init__(
+        self,
+        num_logits: int,
+        query_embedding_dim: int,
+        item_embedding_dim: int,
+        query_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        item_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        qi_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        combination_type: str,
+        normalization_fn: Callable[[int], torch.nn.Module],
+    ) -> None:
+        super().__init__()
+
+        self._query_only_partial_module: Optional[torch.nn.Module] = (
+            query_only_partial_fn(query_embedding_dim, num_logits)
+            if query_only_partial_fn
+            else None
+        )
+        self._item_only_partial_module: Optional[torch.nn.Module] = (
+            item_only_partial_fn(item_embedding_dim, num_logits)
+            if item_only_partial_fn
+            else None
+        )
+        self._qi_partial_module: Optional[torch.nn.Module] = (
+            qi_partial_fn(
+                num_logits,
+                num_logits,
+            )
+            if qi_partial_fn is not None
+            else None
+        )
+        if (
+            self._query_only_partial_module is None
+            and self._item_only_partial_module is None
+            and self._qi_partial_module is None
+        ):
+            raise ValueError(
+                "At least one of query_only_partial_fn, item_only_partial_fn, "
+                "and qi_partial_fn must not be None."
+            )
+        self._num_logits: int = num_logits
+        self._combination_type: str = combination_type
+        self._normalization_fn: torch.nn.Module = normalization_fn(num_logits)
+
+    def forward(
+        self,
+        logits: torch.Tensor,
+        query_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            logits: (B, X, P_Q * P_X) x float;
+            query_embeddings: (B, D) x float;
+            item_embeddings: (1/B, X, D') x float;
+
+        Returns:
+            (B, X) x float, Dict[str, Tensor] representing auxiliary losses.
+        """
+        B, X, _ = logits.size()
+        # [B, 1, F], [1/B, X, F], [B, X, F]
+        query_partial_inputs, item_partial_inputs, qi_partial_inputs = None, None, None
+        if self._query_only_partial_module is not None:
+            query_partial_inputs = self._query_only_partial_module(
+                query_embeddings
+            ).unsqueeze(1)
+        if self._item_only_partial_module is not None:
+            item_partial_inputs = self._item_only_partial_module(item_embeddings)
+        if self._qi_partial_module is not None:
+            qi_partial_inputs = self._qi_partial_module(logits)
+
+        if self._combination_type == "glu_silu":
+            gating_inputs = (
+                query_partial_inputs * item_partial_inputs + qi_partial_inputs
+            )
+            gating_weights = gating_inputs * F.sigmoid(gating_inputs)
+        elif self._combination_type == "glu_silu_ln":
+            gating_inputs = (
+                query_partial_inputs * item_partial_inputs + qi_partial_inputs
+            )
+            gating_weights = gating_inputs * F.sigmoid(
+                F.layer_norm(gating_inputs, normalized_shape=[self._num_logits])
+            )
+        elif self._combination_type == "none":
+            gating_inputs = query_partial_inputs
+            if gating_inputs is None:
+                gating_inputs = item_partial_inputs
+            elif item_partial_inputs is not None:
+                gating_inputs += item_partial_inputs
+            if gating_inputs is None:
+                gating_inputs = qi_partial_inputs
+            elif qi_partial_inputs is not None:
+                gating_inputs += qi_partial_inputs
+            gating_weights = gating_inputs
+        else:
+            raise ValueError(f"Unknown combination_type {self._combination_type}")
+
+        return self._normalization_fn(gating_weights, logits)
+
+
+class MoLSimilarity(SimilarityModule):
+    def __init__(
+        self,
+        query_embedding_dim: int,
+        item_embedding_dim: int,
+        dot_product_dimension: int,
+        query_dot_product_groups: int,
+        item_dot_product_groups: int,
+        temperature: float,
+        dot_product_l2_norm: bool,
+        query_embeddings_fn: MoLEmbeddingsFn,
+        item_embeddings_fn: Optional[MoLEmbeddingsFn],
+        gating_query_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        gating_item_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
+        gating_qi_partial_fn: Optional[Callable[[int], torch.nn.Module]],
+        gating_combination_type: str,
+        gating_normalization_fn: Callable[[int], torch.nn.Module],
+        eps: float,
+        apply_query_embeddings_fn: bool = True,
+        apply_item_embeddings_fn: bool = True,
+        autocast_bf16: bool = False,
+    ) -> None:
+        """
+        Args:
+            apply_query_embeddings_fn: bool. If true, compute query_embeddings_fn
+                to input during forward(). Otherwise, we assume the caller will
+                invoke get_query_component_embeddings() separately before
+                calling forward().
+            apply_item_embeddings_fn: bool. If true, compute item_embeddings_fn
+                to input during forward(). Otherwise, we assume the caller will
+                invoke get_item_component_embeddings() separately before
+                calling forward().
+        """
+        super().__init__()
+
+        self._gating_fn: MoLGatingFn = MoLGatingFn(
+            num_logits=query_dot_product_groups * item_dot_product_groups,
+            query_embedding_dim=query_embedding_dim,
+            item_embedding_dim=item_embedding_dim,
+            query_only_partial_fn=gating_query_only_partial_fn,
+            item_only_partial_fn=gating_item_only_partial_fn,
+            qi_partial_fn=gating_qi_partial_fn,  # pyre-ignore [6]
+            combination_type=gating_combination_type,
+            normalization_fn=gating_normalization_fn,
+        )
+        self._query_embeddings_fn: MoLEmbeddingsFn = query_embeddings_fn
+        self._item_embeddings_fn: MoLEmbeddingsFn = (  # pyre-ignore [8]
+            item_embeddings_fn
+        )
+        self._apply_query_embeddings_fn: bool = apply_query_embeddings_fn
+        self._apply_item_embeddings_fn: bool = apply_item_embeddings_fn
+        self._dot_product_l2_norm: bool = dot_product_l2_norm
+        self._query_dot_product_groups: int = query_dot_product_groups
+        self._item_dot_product_groups: int = item_dot_product_groups
+        self._dot_product_dimension: int = dot_product_dimension
+        self._temperature: float = temperature
+        self._eps: float = eps
+        self._autocast_bf16: bool = autocast_bf16
+
+    def get_query_component_embeddings(
+        self,
+        input_embeddings: torch.Tensor,
+        decoupled_inference: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (B, self._input_embedding_dim,) x float
+                or (B, P_Q, self._dot_product_dimension) x float.
+            decoupled_inference: bool. If true, the call represents an attempt to run
+                forward() in decoupled mode at inference time (e.g., to pre-compute
+                component-level query embeddings for filtering, etc.). We simulate
+                the logic in forward() in this case (e.g., if forward() doesn't apply
+                query_embeddings_fn, then this call won't either).
+            kwargs: additional implementation-specific arguments.
+
+        Returns:
+            (B, query_dot_product_groups, dot_product_embedding_dim) x float.
+        """
+        if decoupled_inference and not self._apply_query_embeddings_fn:
+            return input_embeddings, {}
+        return self._query_embeddings_fn(input_embeddings, **kwargs)
+
+    def get_item_component_embeddings(
+        self,
+        input_embeddings: torch.Tensor,
+        decoupled_inference: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            input_embeddings: (..., self._input_embedding_dim,) x float
+                or (..., P_X, self._dot_product_dimension) x float.
+            decoupled_inference: bool. If true, the call represents an attempt to run
+                forward() in decoupled mode at inference time (e.g., to pre-compute
+                component-level item embeddings for filtering, etc.). We simulate
+                the logic in forward() in this case (e.g., if forward() doesn't apply
+                item_embeddings_fn, then this call won't either).
+            kwargs: additional implementation-specific arguments.
+
+        Returns:
+            (..., item_dot_product_groups, dot_product_embedding_dim) x float.
+        """
+        if decoupled_inference and not self._apply_item_embeddings_fn:
+            return input_embeddings, {}
+
+        return self._item_embeddings_fn(input_embeddings, **kwargs)
+
+    def forward(
+        self,
+        query_embeddings: torch.Tensor,
+        item_embeddings: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Args:
+            query_embeddings: (B, self._input_embedding_dim) x float or
+                (B, P_Q, self._dot_product_dimension) x float (when query_embeddings_fn
+                is applied externally).
+            item_embeddings: (1/B, X, self._item_embedding_dim) x float or
+                (1/B, X, P_X, self._dot_product_dimension) x float (when item_embeddings_fn
+                is applied externally).
+            kwargs: additional implementation-specific arguments.
+
+        Returns:
+            (B, X) x float, Dict[str, Tensor] representing auxiliary losses.
+        """
+        with torch.autocast(
+            enabled=self._autocast_bf16, dtype=torch.bfloat16, device_type="cuda"
+        ):
+            B = query_embeddings.size(0)
+            B_prime = item_embeddings.shape[0]  # 1 or B
+            X = item_embeddings.shape[1]
+
+            if self._apply_query_embeddings_fn:
+                (
+                    split_query_embeddings,
+                    query_aux_losses,
+                ) = self.get_query_component_embeddings(
+                    query_embeddings,
+                    **kwargs,
+                )
+            else:
+                split_query_embeddings, query_aux_losses = query_embeddings, {}
+
+            if self._apply_item_embeddings_fn:
+                (
+                    split_item_embeddings,
+                    item_aux_losses,
+                ) = self.get_item_component_embeddings(
+                    input_embeddings=item_embeddings,
+                    **kwargs,
+                )
+            else:
+                split_item_embeddings, item_aux_losses = item_embeddings, {}
+
+            if B_prime == 1:
+                logits = torch.einsum(
+                    "bnd,xmd->bxnm",
+                    split_query_embeddings,
+                    split_item_embeddings.squeeze(0),
+                ).reshape(
+                    B, X, self._query_dot_product_groups * self._item_dot_product_groups
+                )
+            else:
+                logits = torch.einsum(
+                    "bnd,bxmd->bxnm", split_query_embeddings, split_item_embeddings
+                ).reshape(
+                    B, X, self._query_dot_product_groups * self._item_dot_product_groups
+                )
+
+            gated_outputs, gating_aux_losses = self._gating_fn(
+                logits=logits / self._temperature,  # [B, X, L]
+                query_embeddings=query_embeddings,  # [B, D]
+                item_embeddings=item_embeddings,  # [1/B, X, D']
+            )
+            return gated_outputs, {
+                **gating_aux_losses,
+                **query_aux_losses,
+                **item_aux_losses,
+            }
diff --git a/recommendation_v4/generative_recommenders/research/trainer/data_loader.py b/recommendation_v4/generative_recommenders/research/trainer/data_loader.py
new file mode 100644
index 000000000..390b04bdb
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/trainer/data_loader.py
@@ -0,0 +1,57 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import os
+from typing import Optional, Tuple
+
+import gin
+import torch
+
+
+@gin.configurable
+def create_data_loader(
+    dataset: torch.utils.data.Dataset,
+    batch_size: int,
+    world_size: int,
+    rank: int,
+    shuffle: bool,
+    prefetch_factor: int = 128,
+    num_workers: Optional[int] = os.cpu_count(),
+    drop_last: bool = False,
+) -> Tuple[
+    Optional[torch.utils.data.distributed.DistributedSampler[torch.utils.data.Dataset]],
+    torch.utils.data.DataLoader,
+]:
+    if shuffle:
+        sampler = torch.utils.data.distributed.DistributedSampler(
+            dataset,
+            num_replicas=world_size,
+            rank=rank,
+            shuffle=True,
+            seed=0,
+            drop_last=drop_last,
+        )
+    else:
+        sampler = None
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=batch_size,
+        # shuffle=True, cannot use with sampler
+        num_workers=num_workers or 0,
+        sampler=sampler,
+        prefetch_factor=prefetch_factor,
+    )
+    return sampler, data_loader
diff --git a/recommendation_v4/generative_recommenders/research/trainer/train.py b/recommendation_v4/generative_recommenders/research/trainer/train.py
new file mode 100644
index 000000000..6d2da5be7
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/research/trainer/train.py
@@ -0,0 +1,532 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+import logging
+import os
+import random
+import time
+from datetime import date
+from typing import Dict, Optional
+
+import gin
+import torch
+import torch.distributed as dist
+from generative_recommenders.research.data.eval import (
+    _avg,
+    add_to_summary_writer,
+    eval_metrics_v2_from_tensors,
+    get_eval_state,
+)
+from generative_recommenders.research.data.reco_dataset import get_reco_dataset
+from generative_recommenders.research.indexing.utils import get_top_k_module
+from generative_recommenders.research.modeling.sequential.autoregressive_losses import (
+    BCELoss,
+    InBatchNegativesSampler,
+    LocalNegativesSampler,
+)
+from generative_recommenders.research.modeling.sequential.embedding_modules import (
+    EmbeddingModule,
+    LocalEmbeddingModule,
+)
+from generative_recommenders.research.modeling.sequential.encoder_utils import (
+    get_sequential_encoder,
+)
+from generative_recommenders.research.modeling.sequential.features import (
+    movielens_seq_features_from_row,
+)
+from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
+    LearnablePositionalEmbeddingInputFeaturesPreprocessor,
+)
+from generative_recommenders.research.modeling.sequential.losses.sampled_softmax import (
+    SampledSoftmaxLoss,
+)
+from generative_recommenders.research.modeling.sequential.output_postprocessors import (
+    L2NormEmbeddingPostprocessor,
+    LayerNormEmbeddingPostprocessor,
+)
+from generative_recommenders.research.modeling.similarity_utils import (
+    get_similarity_function,
+)
+from generative_recommenders.research.trainer.data_loader import create_data_loader
+from torch.nn.parallel import DistributedDataParallel as DDP
+from torch.utils.tensorboard import SummaryWriter
+
+
+def setup(rank: int, world_size: int, master_port: int) -> None:
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = str(master_port)
+
+    # initialize the process group
+    dist.init_process_group("nccl", rank=rank, world_size=world_size)
+
+
+def cleanup() -> None:
+    dist.destroy_process_group()
+
+
+@gin.configurable
+def get_weighted_loss(
+    main_loss: torch.Tensor,
+    aux_losses: Dict[str, torch.Tensor],
+    weights: Dict[str, float],
+) -> torch.Tensor:
+    weighted_loss = main_loss
+    for key, weight in weights.items():
+        cur_weighted_loss = aux_losses[key] * weight
+        weighted_loss = weighted_loss + cur_weighted_loss
+    return weighted_loss
+
+
+@gin.configurable
+def train_fn(
+    rank: int,
+    world_size: int,
+    master_port: int,
+    dataset_name: str = "ml-20m",
+    max_sequence_length: int = 200,
+    positional_sampling_ratio: float = 1.0,
+    local_batch_size: int = 128,
+    eval_batch_size: int = 128,
+    eval_user_max_batch_size: Optional[int] = None,
+    main_module: str = "SASRec",
+    main_module_bf16: bool = False,
+    dropout_rate: float = 0.2,
+    user_embedding_norm: str = "l2_norm",
+    sampling_strategy: str = "in-batch",
+    loss_module: str = "SampledSoftmaxLoss",
+    loss_weights: Optional[Dict[str, float]] = {},
+    num_negatives: int = 1,
+    loss_activation_checkpoint: bool = False,
+    item_l2_norm: bool = False,
+    temperature: float = 0.05,
+    num_epochs: int = 101,
+    learning_rate: float = 1e-3,
+    num_warmup_steps: int = 0,
+    weight_decay: float = 1e-3,
+    top_k_method: str = "MIPSBruteForceTopK",
+    eval_interval: int = 100,
+    full_eval_every_n: int = 1,
+    save_ckpt_every_n: int = 1000,
+    partial_eval_num_iters: int = 32,
+    embedding_module_type: str = "local",
+    item_embedding_dim: int = 240,
+    interaction_module_type: str = "",
+    gr_output_length: int = 10,
+    l2_norm_eps: float = 1e-6,
+    enable_tf32: bool = False,
+    random_seed: int = 42,
+) -> None:
+    # to enable more deterministic results.
+    random.seed(random_seed)
+    torch.backends.cuda.matmul.allow_tf32 = enable_tf32
+    torch.backends.cudnn.allow_tf32 = enable_tf32
+    logging.info(f"cuda.matmul.allow_tf32: {enable_tf32}")
+    logging.info(f"cudnn.allow_tf32: {enable_tf32}")
+    logging.info(f"Training model on rank {rank}.")
+    setup(rank, world_size, master_port)
+
+    dataset = get_reco_dataset(
+        dataset_name=dataset_name,
+        max_sequence_length=max_sequence_length,
+        chronological=True,
+        positional_sampling_ratio=positional_sampling_ratio,
+    )
+
+    train_data_sampler, train_data_loader = create_data_loader(
+        dataset.train_dataset,
+        batch_size=local_batch_size,
+        world_size=world_size,
+        rank=rank,
+        shuffle=True,
+        drop_last=world_size > 1,
+    )
+    eval_data_sampler, eval_data_loader = create_data_loader(
+        dataset.eval_dataset,
+        batch_size=eval_batch_size,
+        world_size=world_size,
+        rank=rank,
+        shuffle=True,  # needed for partial eval
+        drop_last=world_size > 1,
+    )
+
+    model_debug_str = main_module
+    if embedding_module_type == "local":
+        embedding_module: EmbeddingModule = LocalEmbeddingModule(
+            num_items=dataset.max_item_id,
+            item_embedding_dim=item_embedding_dim,
+        )
+    else:
+        raise ValueError(f"Unknown embedding_module_type {embedding_module_type}")
+    model_debug_str += f"-{embedding_module.debug_str()}"
+
+    interaction_module, interaction_module_debug_str = get_similarity_function(
+        module_type=interaction_module_type,
+        query_embedding_dim=item_embedding_dim,
+        item_embedding_dim=item_embedding_dim,
+    )
+
+    assert user_embedding_norm == "l2_norm" or user_embedding_norm == "layer_norm", (
+        f"Not implemented for {user_embedding_norm}"
+    )
+    output_postproc_module = (
+        L2NormEmbeddingPostprocessor(
+            embedding_dim=item_embedding_dim,
+            eps=1e-6,
+        )
+        if user_embedding_norm == "l2_norm"
+        else LayerNormEmbeddingPostprocessor(
+            embedding_dim=item_embedding_dim,
+            eps=1e-6,
+        )
+    )
+    input_preproc_module = LearnablePositionalEmbeddingInputFeaturesPreprocessor(
+        max_sequence_len=dataset.max_sequence_length + gr_output_length + 1,
+        embedding_dim=item_embedding_dim,
+        dropout_rate=dropout_rate,
+    )
+
+    model = get_sequential_encoder(
+        module_type=main_module,
+        max_sequence_length=dataset.max_sequence_length,
+        max_output_length=gr_output_length + 1,
+        embedding_module=embedding_module,
+        interaction_module=interaction_module,
+        input_preproc_module=input_preproc_module,
+        output_postproc_module=output_postproc_module,
+        verbose=True,
+    )
+    model_debug_str = model.debug_str()
+
+    # loss
+    loss_debug_str = loss_module
+    if loss_module == "BCELoss":
+        loss_debug_str = loss_debug_str[:-4]
+        assert temperature == 1.0
+        ar_loss = BCELoss(temperature=temperature, model=model)
+    elif loss_module == "SampledSoftmaxLoss":
+        loss_debug_str = "ssl"
+        if temperature != 1.0:
+            loss_debug_str += f"-t{temperature}"
+        ar_loss = SampledSoftmaxLoss(
+            num_to_sample=num_negatives,
+            softmax_temperature=temperature,
+            model=model,
+            activation_checkpoint=loss_activation_checkpoint,
+        )
+        loss_debug_str += (
+            f"-n{num_negatives}{'-ac' if loss_activation_checkpoint else ''}"
+        )
+    else:
+        raise ValueError(f"Unrecognized loss module {loss_module}.")
+
+    # sampling
+    if sampling_strategy == "in-batch":
+        negatives_sampler = InBatchNegativesSampler(
+            l2_norm=item_l2_norm,
+            l2_norm_eps=l2_norm_eps,
+            dedup_embeddings=True,
+        )
+        sampling_debug_str = (
+            f"in-batch{f'-l2-eps{l2_norm_eps}' if item_l2_norm else ''}-dedup"
+        )
+    elif sampling_strategy == "local":
+        negatives_sampler = LocalNegativesSampler(
+            num_items=dataset.max_item_id,
+            item_emb=model._embedding_module._item_emb,
+            all_item_ids=dataset.all_item_ids,
+            l2_norm=item_l2_norm,
+            l2_norm_eps=l2_norm_eps,
+        )
+    else:
+        raise ValueError(f"Unrecognized sampling strategy {sampling_strategy}.")
+    sampling_debug_str = negatives_sampler.debug_str()
+
+    # Creates model and moves it to GPU with id rank
+    device = rank
+    if main_module_bf16:
+        model = model.to(torch.bfloat16)
+    model = model.to(device)
+    ar_loss = ar_loss.to(device)
+    negatives_sampler = negatives_sampler.to(device)
+    model = DDP(model, device_ids=[rank], broadcast_buffers=False)
+
+    # TODO: wrap in create_optimizer.
+    opt = torch.optim.AdamW(
+        model.parameters(),
+        lr=learning_rate,
+        betas=(0.9, 0.98),
+        weight_decay=weight_decay,
+    )
+
+    date_str = date.today().strftime("%Y-%m-%d")
+    model_subfolder = f"{dataset_name}-l{max_sequence_length}"
+    model_desc = (
+        f"{model_subfolder}"
+        + f"/{model_debug_str}_{interaction_module_debug_str}_{sampling_debug_str}_{loss_debug_str}"
+        + f"{f'-ddp{world_size}' if world_size > 1 else ''}-b{local_batch_size}-lr{learning_rate}-wu{num_warmup_steps}-wd{weight_decay}{'' if enable_tf32 else '-notf32'}-{date_str}"
+    )
+    if full_eval_every_n > 1:
+        model_desc += f"-fe{full_eval_every_n}"
+    if positional_sampling_ratio is not None and positional_sampling_ratio < 1:
+        model_desc += f"-d{positional_sampling_ratio}"
+    # creates subfolders.
+    os.makedirs(f"./exps/{model_subfolder}", exist_ok=True)
+    os.makedirs(f"./ckpts/{model_subfolder}", exist_ok=True)
+    log_dir = f"./exps/{model_desc}"
+    if rank == 0:
+        writer = SummaryWriter(log_dir=log_dir)
+        logging.info(f"Rank {rank}: writing logs to {log_dir}")
+    else:
+        writer = None
+        logging.info(f"Rank {rank}: disabling summary writer")
+
+    last_training_time = time.time()
+    torch.autograd.set_detect_anomaly(True)
+
+    batch_id = 0
+    epoch = 0
+    for epoch in range(num_epochs):
+        if train_data_sampler is not None:
+            train_data_sampler.set_epoch(epoch)
+        if eval_data_sampler is not None:
+            eval_data_sampler.set_epoch(epoch)
+        model.train()
+        for row in iter(train_data_loader):
+            seq_features, target_ids, target_ratings = movielens_seq_features_from_row(
+                row,
+                device=device,
+                max_output_length=gr_output_length + 1,
+            )
+
+            if (batch_id % eval_interval) == 0:
+                model.eval()
+
+                eval_state = get_eval_state(
+                    model=model.module,
+                    all_item_ids=dataset.all_item_ids,
+                    negatives_sampler=negatives_sampler,
+                    top_k_module_fn=lambda item_embeddings, item_ids: get_top_k_module(
+                        top_k_method=top_k_method,
+                        model=model.module,
+                        item_embeddings=item_embeddings,
+                        item_ids=item_ids,
+                    ),
+                    device=device,
+                    float_dtype=torch.bfloat16 if main_module_bf16 else None,
+                )
+                eval_dict = eval_metrics_v2_from_tensors(
+                    eval_state,
+                    model.module,
+                    seq_features,
+                    target_ids=target_ids,
+                    target_ratings=target_ratings,
+                    user_max_batch_size=eval_user_max_batch_size,
+                    dtype=torch.bfloat16 if main_module_bf16 else None,
+                )
+                add_to_summary_writer(
+                    writer, batch_id, eval_dict, prefix="eval", world_size=world_size
+                )
+                logging.info(
+                    f"rank {rank}:  batch-stat (eval): iter {batch_id} (epoch {epoch}): "
+                    + f"NDCG@10 {_avg(eval_dict['ndcg@10'], world_size):.4f}, "
+                    f"HR@10 {_avg(eval_dict['hr@10'], world_size):.4f}, "
+                    f"HR@50 {_avg(eval_dict['hr@50'], world_size):.4f}, "
+                    + f"MRR {_avg(eval_dict['mrr'], world_size):.4f} "
+                )
+                model.train()
+
+            # TODO: consider separating this out?
+            B, N = seq_features.past_ids.shape
+            seq_features.past_ids.scatter_(
+                dim=1,
+                index=seq_features.past_lengths.view(-1, 1),
+                src=target_ids.view(-1, 1),
+            )
+
+            opt.zero_grad()
+            input_embeddings = model.module.get_item_embeddings(seq_features.past_ids)
+            seq_embeddings = model(
+                past_lengths=seq_features.past_lengths,
+                past_ids=seq_features.past_ids,
+                past_embeddings=input_embeddings,
+                past_payloads=seq_features.past_payloads,
+            )  # [B, X]
+
+            supervision_ids = seq_features.past_ids
+
+            if sampling_strategy == "in-batch":
+                # get_item_embeddings currently assume 1-d tensor.
+                in_batch_ids = supervision_ids.view(-1)
+                negatives_sampler.process_batch(
+                    ids=in_batch_ids,
+                    presences=(in_batch_ids != 0),
+                    embeddings=model.module.get_item_embeddings(in_batch_ids),
+                )
+            else:
+                # pyre-fixme[16]: `InBatchNegativesSampler` has no attribute
+                #  `_item_emb`.
+                negatives_sampler._item_emb = model.module._embedding_module._item_emb
+
+            ar_mask = supervision_ids[:, 1:] != 0
+            loss, aux_losses = ar_loss(
+                lengths=seq_features.past_lengths,  # [B],
+                output_embeddings=seq_embeddings[:, :-1, :],  # [B, N-1, D]
+                supervision_ids=supervision_ids[:, 1:],  # [B, N-1]
+                supervision_embeddings=input_embeddings[:, 1:, :],  # [B, N - 1, D]
+                supervision_weights=ar_mask.float(),
+                negatives_sampler=negatives_sampler,
+                **seq_features.past_payloads,
+            )  # [B, N]
+
+            main_loss = loss.detach().clone()
+            loss = get_weighted_loss(loss, aux_losses, weights=loss_weights or {})
+
+            if rank == 0:
+                assert writer is not None
+                writer.add_scalar("losses/ar_loss", loss, batch_id)
+                writer.add_scalar("losses/main_loss", main_loss, batch_id)
+
+            loss.backward()
+
+            # Optional linear warmup.
+            if batch_id < num_warmup_steps:
+                lr_scalar = min(1.0, float(batch_id + 1) / num_warmup_steps)
+                for pg in opt.param_groups:
+                    pg["lr"] = lr_scalar * learning_rate
+                lr = lr_scalar * learning_rate
+            else:
+                lr = learning_rate
+
+            if (batch_id % eval_interval) == 0:
+                logging.info(
+                    f" rank: {rank}, batch-stat (train): step {batch_id} "
+                    f"(epoch {epoch} in {time.time() - last_training_time:.2f}s): {loss:.6f}"
+                )
+                last_training_time = time.time()
+                if rank == 0:
+                    assert writer is not None
+                    writer.add_scalar("loss/train", loss, batch_id)
+                    writer.add_scalar("lr", lr, batch_id)
+
+            opt.step()
+
+            batch_id += 1
+
+        def is_full_eval(epoch: int) -> bool:
+            return (epoch % full_eval_every_n) == 0
+
+        # eval per epoch
+        eval_dict_all = None
+        eval_start_time = time.time()
+        model.eval()
+        eval_state = get_eval_state(
+            model=model.module,
+            all_item_ids=dataset.all_item_ids,
+            negatives_sampler=negatives_sampler,
+            top_k_module_fn=lambda item_embeddings, item_ids: get_top_k_module(
+                top_k_method=top_k_method,
+                model=model.module,
+                item_embeddings=item_embeddings,
+                item_ids=item_ids,
+            ),
+            device=device,
+            float_dtype=torch.bfloat16 if main_module_bf16 else None,
+        )
+        for eval_iter, row in enumerate(iter(eval_data_loader)):
+            seq_features, target_ids, target_ratings = movielens_seq_features_from_row(
+                row, device=device, max_output_length=gr_output_length + 1
+            )
+            eval_dict = eval_metrics_v2_from_tensors(
+                eval_state,
+                model.module,
+                seq_features,
+                target_ids=target_ids,
+                target_ratings=target_ratings,
+                user_max_batch_size=eval_user_max_batch_size,
+                dtype=torch.bfloat16 if main_module_bf16 else None,
+            )
+
+            if eval_dict_all is None:
+                eval_dict_all = {}
+                for k, v in eval_dict.items():
+                    eval_dict_all[k] = []
+
+            for k, v in eval_dict.items():
+                eval_dict_all[k] = eval_dict_all[k] + [v]
+            del eval_dict
+
+            if (eval_iter + 1 >= partial_eval_num_iters) and (not is_full_eval(epoch)):
+                logging.info(
+                    f"Truncating epoch {epoch} eval to {eval_iter + 1} iters to save cost.."
+                )
+                break
+
+        assert eval_dict_all is not None
+        for k, v in eval_dict_all.items():
+            eval_dict_all[k] = torch.cat(v, dim=-1)
+
+        ndcg_10 = _avg(eval_dict_all["ndcg@10"], world_size=world_size)
+        ndcg_50 = _avg(eval_dict_all["ndcg@50"], world_size=world_size)
+        hr_10 = _avg(eval_dict_all["hr@10"], world_size=world_size)
+        hr_50 = _avg(eval_dict_all["hr@50"], world_size=world_size)
+        mrr = _avg(eval_dict_all["mrr"], world_size=world_size)
+
+        add_to_summary_writer(
+            writer,
+            batch_id=epoch,
+            metrics=eval_dict_all,
+            prefix="eval_epoch",
+            world_size=world_size,
+        )
+        if full_eval_every_n > 1 and is_full_eval(epoch):
+            add_to_summary_writer(
+                writer,
+                batch_id=epoch,
+                metrics=eval_dict_all,
+                prefix="eval_epoch_full",
+                world_size=world_size,
+            )
+        if rank == 0 and epoch > 0 and (epoch % save_ckpt_every_n) == 0:
+            torch.save(
+                {
+                    "epoch": epoch,
+                    "model_state_dict": model.state_dict(),
+                    "optimizer_state_dict": opt.state_dict(),
+                },
+                f"./ckpts/{model_desc}_ep{epoch}",
+            )
+
+        logging.info(
+            f"rank {rank}: eval @ epoch {epoch} in {time.time() - eval_start_time:.2f}s: "
+            f"NDCG@10 {ndcg_10:.4f}, NDCG@50 {ndcg_50:.4f}, HR@10 {hr_10:.4f}, HR@50 {hr_50:.4f}, MRR {mrr:.4f}"
+        )
+        last_training_time = time.time()
+
+    if rank == 0:
+        if writer is not None:
+            writer.flush()
+            writer.close()
+
+        torch.save(
+            {
+                "epoch": epoch,
+                "model_state_dict": model.state_dict(),
+                "optimizer_state_dict": opt.state_dict(),
+            },
+            f"./ckpts/{model_desc}_ep{epoch}",
+        )
+
+    cleanup()
diff --git a/recommendation_v4/generative_recommenders/tests/test_common.py b/recommendation_v4/generative_recommenders/tests/test_common.py
new file mode 100644
index 000000000..be3823d67
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/tests/test_common.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#!/usr/bin/env python3
+
+# pyre-strict
+
+import unittest
+
+import torch
+from generative_recommenders.common import switch_to_contiguous_if_needed
+
+
+class SwitchToContiguousIfNeededTest(unittest.TestCase):
+    def test_torchscript_does_not_compile_fx_tracing_helper(self) -> None:
+        class ContiguousModule(torch.nn.Module):
+            def forward(self, x: torch.Tensor) -> torch.Tensor:
+                return switch_to_contiguous_if_needed(x)
+
+        scripted = torch.jit.script(ContiguousModule())
+        x = torch.arange(12).reshape(3, 4).transpose(0, 1)
+
+        out = scripted(x)
+
+        self.assertTrue(torch.equal(out, x))
+        self.assertTrue(out.is_contiguous())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/main.py b/recommendation_v4/main.py
new file mode 100644
index 000000000..445f25820
--- /dev/null
+++ b/recommendation_v4/main.py
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Main entry point for model training. Please refer to README.md for usage instructions.
+"""
+
+import logging
+import os
+from typing import List, Optional
+
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"  # Hide excessive tensorflow debug messages
+import sys
+
+import fbgemm_gpu  # noqa: F401, E402
+import gin
+import torch
+import torch.multiprocessing as mp
+from absl import app, flags
+from generative_recommenders.research.trainer.train import train_fn
+
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+def delete_flags(FLAGS, keys_to_delete: List[str]) -> None:  # pyre-ignore [2]
+    keys = [key for key in FLAGS._flags()]
+    for key in keys:
+        if key in keys_to_delete:
+            delattr(FLAGS, key)
+
+
+delete_flags(flags.FLAGS, ["gin_config_file", "master_port"])
+flags.DEFINE_string("gin_config_file", None, "Path to the config file.")
+flags.DEFINE_integer("master_port", 12355, "Master port.")
+FLAGS = flags.FLAGS  # pyre-ignore [5]
+
+
+def mp_train_fn(
+    rank: int,
+    world_size: int,
+    master_port: int,
+    gin_config_file: Optional[str],
+) -> None:
+    if gin_config_file is not None:
+        # Hack as absl doesn't support flag parsing inside multiprocessing.
+        logging.info(f"Rank {rank}: loading gin config from {gin_config_file}")
+        gin.parse_config_file(gin_config_file)
+
+    train_fn(rank, world_size, master_port)
+
+
+def _main(argv) -> None:  # pyre-ignore [2]
+    world_size = torch.cuda.device_count()
+
+    mp.set_start_method("forkserver")
+    mp.spawn(
+        mp_train_fn,
+        args=(world_size, FLAGS.master_port, FLAGS.gin_config_file),
+        nprocs=world_size,
+        join=True,
+    )
+
+
+def main() -> None:
+    app.run(_main)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/preprocess_public_data.py b/recommendation_v4/preprocess_public_data.py
new file mode 100644
index 000000000..927ccf4c6
--- /dev/null
+++ b/recommendation_v4/preprocess_public_data.py
@@ -0,0 +1,32 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Usage: mkdir -p tmp/ && python3 preprocess_public_data.py
+"""
+
+from generative_recommenders.research.data.preprocessor import get_common_preprocessors
+
+
+def main() -> None:
+    get_common_preprocessors()["ml-1m"].preprocess_rating()
+    get_common_preprocessors()["ml-20m"].preprocess_rating()
+    # get_common_preprocessors()["ml-1b"].preprocess_rating()
+    get_common_preprocessors()["amzn-books"].preprocess_rating()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/requirements.txt b/recommendation_v4/requirements.txt
new file mode 100644
index 000000000..023c22332
--- /dev/null
+++ b/recommendation_v4/requirements.txt
@@ -0,0 +1,7 @@
+torch>=2.6.0
+fbgemm_gpu>=1.1.0
+torchrec>=1.1.0
+gin_config>=0.5.0
+pandas>=2.2.0
+tensorboard>=2.19.0
+pybind11
diff --git a/recommendation_v4/run_fractal_expansion.py b/recommendation_v4/run_fractal_expansion.py
new file mode 100644
index 000000000..308eadea2
--- /dev/null
+++ b/recommendation_v4/run_fractal_expansion.py
@@ -0,0 +1,588 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+"""
+Run fractal expansion introduced in https://arxiv.org/abs/1901.08910.
+Implementation adapted from the scripts used to generate MovieLens-1B
+(https://grouplens.org/datasets/movielens/movielens-1b/).
+"""
+
+# Generate a 3B dataset (takes around 50 minutes):
+# python run_fractal_expansion.py --input-csv-file ~/data/ml-20m/ratings.csv --write-dataset True --output-prefix ~/data/ml-3b/
+# Generate a 13B dataset with 440M item size:
+# python run_fractal_expansion.py --input-csv-file ~/data/ml-20m/ratings.csv --write-dataset True --output-prefix ~/data/ml-13b/ --num-row-multiplier 16 --num-col-multiplier 16384 --element-sample-rate 0.2 --block-sample-rate 0.05
+# Generate a 18B dataset with 1B item size:
+# python run_fractal_expansion.py --input-csv-file ~/data/ml-20m/ratings.csv --write-dataset True --output-prefix ~/data/ml-18b/ --num-row-multiplier 20 --num-col-multiplier 36864 --element-sample-rate 0.08 --block-sample-rate 0.05
+
+import csv
+import linecache
+import logging
+import os
+import pickle
+from dataclasses import dataclass
+
+import click
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import scipy.linalg
+import skimage.transform as transform
+from scipy import sparse
+from scipy.sparse import linalg
+from sklearn.utils import shuffle
+from tqdm import tqdm
+
+
+logging.basicConfig()
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+@dataclass
+class SparseMatrixMetadata:
+    num_interactions: int = 0
+    num_rows: int = 0
+    num_cols: int = 0
+
+
+def _dropout_sparse_coo_matrix(
+    sparse_matrix, rate, min_dropout_rate=0.005, max_dropout_rate=0.999
+):
+    assert min_dropout_rate <= max_dropout_rate
+    sampling_rate = 1.0 - rate
+
+    sampled_fraction = min(
+        max(sampling_rate, 1.0 - max_dropout_rate), 1.0 - min_dropout_rate
+    )
+    if sampled_fraction != sampling_rate:
+        logger.warning(
+            f"Desired sampling rate {sampling_rate} clipped to {sampled_fraction}."
+        )
+    num_sampled = min(
+        max(int(sparse_matrix.nnz * sampled_fraction), 1), sparse_matrix.nnz
+    )
+    sampled_indices = np.random.choice(
+        sparse_matrix.nnz, size=num_sampled, replace=False
+    )
+    return sparse.coo_matrix(
+        (
+            sparse_matrix.data[sampled_indices],
+            (sparse_matrix.row[sampled_indices], sparse_matrix.col[sampled_indices]),
+        ),
+        shape=sparse_matrix.shape,
+    )
+
+
+def shuffle_sparse_matrix(
+    sparse_matrix, dropout_rate=0.0, min_dropout_rate=0.005, max_dropout_rate=0.999
+):
+    """
+    Shuffle sparse matrix encoded as a SciPy csr matrix.
+    """
+
+    assert dropout_rate >= 0.0 and dropout_rate <= 1.0
+    (num_rows, num_cols) = sparse_matrix.shape
+    shuffled_rows = shuffle(np.arange(num_rows))
+    shuffled_cols = shuffle(np.arange(num_cols))
+    sparse_matrix = _dropout_sparse_coo_matrix(
+        sparse_matrix, dropout_rate, min_dropout_rate, max_dropout_rate
+    )
+    new_row = np.take(shuffled_rows, sparse_matrix.row)
+    new_col = np.take(shuffled_cols, sparse_matrix.col)
+    return sparse.csr_matrix(
+        (sparse_matrix.data, (new_row, new_col)), shape=(num_rows, num_cols)
+    )
+
+
+def graph_reduce(usv, num_rows, num_cols):
+    """Apply algorithm 2 in https://arxiv.org/pdf/1901.08910.pdf."""
+
+    def _closest_column_orthogonal_matrix(matrix):
+        return np.matmul(
+            matrix, np.linalg.inv(scipy.linalg.sqrtm(np.matmul(matrix.T, matrix)))
+        )
+
+    u, s, v = usv
+    k = min(num_rows, num_cols)
+    u_random_proj = transform.resize(u[:, :k], (num_rows, k))
+    v_random_proj = transform.resize(v[:k, :], (k, num_cols))
+    u_random_proj_orth = _closest_column_orthogonal_matrix(u_random_proj)
+    v_random_proj_orth = _closest_column_orthogonal_matrix(v_random_proj.T).T
+    return np.matmul(u_random_proj_orth, np.matmul(np.diag(s[:k]), v_random_proj_orth))
+
+
+def rescale(matrix, rescale_w_abs=False, element_sample_rate=1.0):
+    """Rescale all values of the matrix into [0, 1]."""
+    if rescale_w_abs:
+        abs_matrix = np.abs(matrix.copy())
+        out = abs_matrix / abs_matrix.max()
+    else:
+        out = (matrix - matrix.min()) / (matrix.max() - matrix.min())
+        assert out.min() >= 0 and out.max() <= 1
+    return out * element_sample_rate
+
+
+def _compute_row_block(
+    i, left_matrix, right_matrix, block_sample_rate, indices_out_path, remove_empty_rows
+):
+    """Compute row block of expansion for row i of the left_matrix."""
+
+    kron_blocks = []
+    num_rows = 0
+    num_removed_rows = 0
+    num_interactions = 0
+
+    for j in range(left_matrix.shape[1]):
+        if np.random.random() <= block_sample_rate:
+            dropout_rate = 1.0 - left_matrix[i, j]
+            kron_block = shuffle_sparse_matrix(right_matrix, dropout_rate).tocsr()
+            num_interactions += kron_block.nnz
+            kron_blocks.append(kron_block)
+            logger.info(f"Kronecker block ({i}, {j}) processed.")
+        else:
+            kron_blocks.append(sparse.csr_matrix(right_matrix.shape))
+            logger.info(f"Kronecker block ({i}, {j}) skipped.")
+
+    rows_to_write = sparse.hstack(kron_blocks).tocsr()
+    logger.info("Writing dataset row by row.")
+
+    # Write Kronecker product line per line.
+    filepath = f"{indices_out_path}_{i}.csv"
+    os.makedirs(os.path.dirname(filepath), exist_ok=True)
+    with open(filepath, "w", newline="") as file:
+        writer = csv.writer(file)
+        for k in range(right_matrix.shape[0]):
+            items_to_write = rows_to_write.getrow(k).indices
+            ratings_to_write = rows_to_write.getrow(k).data
+            num = items_to_write.shape[0]
+            if remove_empty_rows and (not num):
+                logger.info(f"Removed empty output row {i * left_matrix.shape[0] + k}.")
+                num_removed_rows += 1
+                continue
+            num_rows += 1
+            writer.writerow(
+                [
+                    i * right_matrix.shape[0] + k,
+                    ",".join([str(x) for x in items_to_write]),
+                    ",".join([str(x) for x in ratings_to_write]),
+                ]
+            )
+            if k % 100000 == 0:
+                logger.info(f"Done producing data set row {k}.")
+
+    num_cols = rows_to_write.shape[1]
+    metadata = SparseMatrixMetadata(
+        num_interactions=num_interactions, num_rows=num_rows, num_cols=num_cols
+    )
+    logger.info(
+        f"Done with left matrix row {i}, {num_interactions} interactions written in shard, {num_removed_rows} rows removed in shard."
+    )
+    return (num_removed_rows, metadata)
+
+
+def visualize_samples(
+    right_matrix,
+    visualize_num_samples,
+    expanded_file_name,
+    output_prefix,
+):
+    # Note: only the rows of the first Kronecker block are visualized.
+    logger.info("visualize dataset row by row.")
+    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
+    axs[0].set_title("Original data Histogram")
+    axs[0].set_xlabel("Value")
+    axs[0].set_ylabel("Frequency")
+    axs[1].set_title("Expended Row Histogram")
+    axs[1].set_xlabel("Value")
+    axs[1].set_ylabel("Frequency")
+    for k in range(visualize_num_samples):
+        original_row = right_matrix.getrow(k).data
+        line = linecache.getline(expanded_file_name, k + 1)
+        reader = csv.reader([line])
+        parsed_line = next(reader)
+        expended_row = eval(parsed_line[2])
+        original_hist_counts, original_bin_edges = np.histogram(original_row, bins=9)
+        expended_hist_counts, expended_bin_edges = np.histogram(expended_row, bins=9)
+        axs[0].plot(original_bin_edges[:-1], original_hist_counts, alpha=0.2)
+        axs[1].plot(expended_bin_edges[:-1], expended_hist_counts, alpha=0.2)
+        axs[0].fill_between(original_bin_edges[:-1], original_hist_counts, alpha=0.2)
+        axs[1].fill_between(expended_bin_edges[:-1], expended_hist_counts, alpha=0.2)
+    plt.tight_layout()
+    plt.savefig(f"{output_prefix}_sample_distribution.png")
+    logger.info("Sample visualization finished.")
+
+
+def build_randomized_kronecker(
+    left_matrix,
+    right_matrix,
+    block_sample_rate,
+    indices_out_path,
+    metadata_out_path=None,
+    remove_empty_rows=True,
+):
+    """Compute randomized Kronecker product and dump it on the fly based on https://arxiv.org/pdf/1901.08910.pdf."""
+    logger.info(f"Writing item sequences to pickle files {metadata_out_path}.")
+
+    num_rows = 0
+    num_removed_rows = 0
+    num_cols = left_matrix.shape[1] * right_matrix.shape[1]
+    num_interactions = 0
+
+    filepath = f"{indices_out_path}_users.csv"
+    os.makedirs(os.path.dirname(filepath), exist_ok=True)
+    with open(filepath, "w", newline="") as file:
+        writer = csv.writer(file)
+        for i in tqdm(range(left_matrix.shape[0])):
+            (shard_num_removed_rows, shard_metadata) = _compute_row_block(
+                i,
+                left_matrix,
+                right_matrix,
+                block_sample_rate,
+                indices_out_path,
+                remove_empty_rows,
+            )
+            writer.writerow([i, shard_metadata.num_rows])
+            file.flush()
+            num_rows += shard_metadata.num_rows
+            num_removed_rows += shard_num_removed_rows
+            num_interactions += shard_metadata.num_interactions
+
+    logger.info(f"{num_interactions / num_rows} average sequence length")
+    logger.info(f"{num_interactions} total interactions written.")
+    logger.info(f"{num_removed_rows} total rows removed.")
+
+    metadata = SparseMatrixMetadata(
+        num_interactions=num_interactions, num_rows=num_rows, num_cols=num_cols
+    )
+    if metadata_out_path is not None:
+        logger.info(f"Writing metadata file to {metadata_out_path}")
+        with open(metadata_out_path, "wb") as output_file:
+            pickle.dump(metadata, output_file)
+    return metadata
+
+
+def _preprocess_movie_lens(ratings_df, binary=False):
+    """
+    Filters out users with less than three distinct timestamps.
+    """
+
+    def _create_index(df, colname):
+        value_set = sorted(set(df[colname].values))
+        num_unique = len(value_set)
+        return dict(zip(value_set, range(num_unique)))
+
+    if not binary:
+        ratings_df["data"] = ratings_df["rating"]
+    else:
+        ratings_df["data"] = 1.0
+    ratings_df["binary_data"] = 1.0
+    num_timestamps = ratings_df[["userId", "timestamp"]].groupby("userId").nunique()
+    ratings_df["numberOfTimestamps"] = ratings_df["userId"].apply(
+        lambda x: num_timestamps["timestamp"][x]
+    )
+    ratings_df = ratings_df[ratings_df["numberOfTimestamps"] > 2]
+    user_id_to_user_idx = _create_index(ratings_df, "userId")
+    item_id_to_item_idx = _create_index(ratings_df, "movieId")
+    ratings_df["row"] = ratings_df["userId"].apply(lambda x: user_id_to_user_idx[x])
+    ratings_df["col"] = ratings_df["movieId"].apply(lambda x: item_id_to_item_idx[x])
+    return ratings_df
+
+
+def normalize(matrix):
+    norm_matrix = matrix.copy()
+    if isinstance(norm_matrix, np.ndarray):
+        norm_matrix -= norm_matrix.mean()
+    else:
+        norm_matrix.data -= norm_matrix.mean()
+    max_val = norm_matrix.max()
+    min_val = norm_matrix.min()
+    if isinstance(norm_matrix, np.ndarray):
+        norm_matrix /= max(abs(max_val), abs(min_val))
+    else:
+        norm_matrix.data /= max(abs(max_val), abs(min_val))
+    return norm_matrix
+
+
+def plot_distribution(user_wise_sum, item_wise_sum, s, title_prefix, normalized=False):
+    y_label = "rating sums" if normalized else "number of ratings"
+    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
+    ax1.loglog(
+        np.arange(len(user_wise_sum)) + 1,
+        np.sort(user_wise_sum)[::-1],
+        linestyle="-",
+        color="blue",
+        marker="",
+    )
+    ax1.set_title(f"{title_prefix} matrix user-wise rating sums")
+    ax1.set_xlabel("User rank")
+    ax1.set_ylabel(y_label)
+    ax1.grid(True)
+    ax2.loglog(
+        np.arange(len(item_wise_sum)) + 1,
+        np.sort(item_wise_sum)[::-1],
+        linestyle="-",
+        color="green",
+        marker="",
+    )
+    ax2.set_title(f"{title_prefix} matrix item-wise rating sums")
+    ax2.set_xlabel("Item rank")
+    ax2.set_ylabel(y_label)
+    ax2.grid(True)
+    ax3.loglog(
+        np.arange(len(s)) + 1, np.sort(s)[::-1], linestyle="-", color="red", marker=""
+    )
+    ax3.set_title(f"{title_prefix} matrix singular values")
+    ax3.set_xlabel("Singular value Rank")
+    ax3.set_ylabel("Magnitude")
+    ax3.grid(True)
+    plt.tight_layout()
+    plt.savefig(f"{title_prefix}_distribution.png")
+
+
+def visualize_distribution(mat, reduced_mat, s, reduced_s, normalized=False, title=""):
+    user_wise_sum = np.asarray(mat.sum(axis=1)).flatten()
+    item_wise_sum = np.asarray(mat.sum(axis=0)).flatten()
+    assert len(user_wise_sum) == mat.shape[0]
+    assert len(item_wise_sum) == mat.shape[1]
+    plot_distribution(
+        user_wise_sum,
+        item_wise_sum,
+        s,
+        title_prefix=f"{title}_Original",
+        normalized=normalized,
+    )
+
+    reduced_user_wise_sum = np.asarray(reduced_mat.sum(axis=1)).flatten()
+    reduced_item_wise_sum = np.asarray(reduced_mat.sum(axis=0)).flatten()
+    assert len(reduced_user_wise_sum) == reduced_mat.shape[0]
+    assert len(reduced_item_wise_sum) == reduced_mat.shape[1]
+    plot_distribution(
+        reduced_user_wise_sum,
+        reduced_item_wise_sum,
+        reduced_s,
+        title_prefix=f"{title}_Reduced",
+        normalized=normalized,
+    )
+
+    expanded_s = np.einsum("i,j->ij", reduced_s, s).flatten()
+    expanded_user_wise_sum = np.einsum("ij,k->ik", reduced_mat, user_wise_sum).flatten()
+    expanded_item_wise_sum = np.einsum("ij,k->jk", reduced_mat, item_wise_sum).flatten()
+    assert len(expanded_user_wise_sum) == reduced_mat.shape[0] * mat.shape[0]
+    assert len(expanded_item_wise_sum) == reduced_mat.shape[1] * mat.shape[1]
+    plot_distribution(
+        expanded_user_wise_sum,
+        expanded_item_wise_sum,
+        expanded_s,
+        title_prefix=f"{title}_Expanded",
+        normalized=normalized,
+    )
+
+
+def expand_dataset(
+    ratings_matrix,
+    binary_ratings_matrix,
+    num_users,
+    num_items,
+    reduced_num_rows,
+    reduced_num_cols,
+    rescale_w_abs,
+    element_sample_rate,
+    block_sample_rate,
+    visualize,
+    write_dataset,
+    output_prefix,
+):
+    k = min(reduced_num_rows, reduced_num_cols)
+    norm_rating_matrix = normalize(ratings_matrix)
+    (u, s, v) = linalg.svds(
+        norm_rating_matrix, k=k, maxiter=None, return_singular_vectors=True
+    )
+
+    logger.info(
+        f"Creating reduced rating matrix (size {reduced_num_rows}, {reduced_num_cols})"
+    )
+    reduced_matrix = graph_reduce((u, s, v), reduced_num_rows, reduced_num_cols)
+    norm_reduced_matrix = normalize(reduced_matrix)
+    (_, s_reduce, _) = linalg.svds(
+        norm_reduced_matrix, k=k - 1, maxiter=None, return_singular_vectors=True
+    )
+    reduced_matrix = rescale(
+        reduced_matrix,
+        rescale_w_abs=rescale_w_abs,
+        element_sample_rate=element_sample_rate,
+    )
+    logger.info(f"largest singular value of the reduced matrix is {s_reduce[-1]}")
+    logger.info(
+        f"Sampling rate mean is {reduced_matrix.mean()}, var is {reduced_matrix.var()}, min is {reduced_matrix.min()}, max is {reduced_matrix.max()}"
+    )
+    samples = reduced_matrix.sum() * ratings_matrix.nnz * block_sample_rate
+    logger.info(
+        f"Expected number of synthetic samples: {samples}, sparsity is {samples / (num_users * num_items * reduced_num_rows * reduced_num_cols)}, average seqlen is {samples / (num_users * reduced_num_rows)}"
+    )
+
+    if visualize:
+        s = linalg.svds(
+            norm_rating_matrix, k=20 * k, maxiter=None, return_singular_vectors=False
+        )
+        visualize_distribution(
+            norm_rating_matrix,
+            norm_reduced_matrix,
+            s,
+            s_reduce,
+            normalized=True,
+            title="Normalized",
+        )
+        visualize_distribution(
+            binary_ratings_matrix,
+            reduced_matrix,
+            s,
+            s_reduce,
+            normalized=False,
+            title="Binary",
+        )
+    if write_dataset:
+        output_file = (
+            output_prefix + str(reduced_num_rows) + "x" + str(reduced_num_cols)
+        )
+        output_file_metadata = None
+
+        logger.info(f"Creating synthetic dataset and dumping to {output_file}.")
+        build_randomized_kronecker(
+            left_matrix=reduced_matrix,
+            right_matrix=ratings_matrix.tocoo(),
+            block_sample_rate=block_sample_rate,
+            indices_out_path=output_file,
+            metadata_out_path=output_file_metadata,
+        )
+
+
+@click.command()
+@click.option(
+    "--random-seed",
+    type=int,
+    default=0,
+)
+@click.option(
+    "--input-csv-file",
+    type=str,
+    default="ratings.csv",
+)
+@click.option(
+    "--output-prefix",
+    type=str,
+    default="",
+)
+@click.option(
+    "--num-row-multiplier",
+    type=int,
+    default=16,
+)
+@click.option(
+    "--num-col-multiplier",
+    type=int,
+    default=32,
+)
+@click.option(
+    "--element-sample-rate",
+    type=float,
+    default=1.0,
+)
+@click.option(
+    "--block-sample-rate",
+    type=float,
+    default=1.0,
+)
+@click.option(
+    "--visualize",
+    type=bool,
+    default=False,
+)
+@click.option(
+    "--write-dataset",
+    type=bool,
+    default=False,
+)
+@click.option(
+    "--visualize-num-samples",
+    type=int,
+    default=0,
+)
+def main(
+    random_seed: int,
+    input_csv_file: str,
+    output_prefix: str,
+    num_row_multiplier: int,
+    num_col_multiplier: int,
+    element_sample_rate: float,
+    block_sample_rate: float,
+    visualize: bool,
+    write_dataset: bool,
+    visualize_num_samples: int,
+):
+    np.random.seed(random_seed)
+
+    logger.info(f"Loading and preprocessing MovieLens-20m from {input_csv_file}")
+    with open(input_csv_file, "r") as infile:
+        ratings_df = pd.read_csv(infile, sep=",", header=0)
+    ratings_df = _preprocess_movie_lens(ratings_df, binary=False)
+    num_ratings = len(ratings_df)
+    num_users = len(set(ratings_df["row"].values))
+    num_items = len(set(ratings_df["col"].values))
+    logger.info(
+        f"number of ratings of input dataset is {num_ratings}, number of users is {num_users}, number of items is {num_items}, sparsity is {num_ratings / (num_users * num_items)}, average seqlen is {num_ratings / num_users}"
+    )
+
+    ratings_matrix = sparse.csr_matrix(
+        (
+            ratings_df["data"].values,
+            (ratings_df["row"].values, ratings_df["col"].values),
+        ),
+        shape=(num_users, num_items),
+    )
+    binary_ratings_matrix = sparse.csr_matrix(
+        (
+            ratings_df["binary_data"].values,
+            (ratings_df["row"].values, ratings_df["col"].values),
+        ),
+        shape=(num_users, num_items),
+    )
+    if write_dataset or visualize:
+        expand_dataset(
+            ratings_matrix=ratings_matrix,
+            binary_ratings_matrix=binary_ratings_matrix,
+            num_users=num_users,
+            num_items=num_items,
+            reduced_num_rows=num_row_multiplier,
+            reduced_num_cols=num_col_multiplier,
+            rescale_w_abs=False,
+            element_sample_rate=element_sample_rate,
+            block_sample_rate=block_sample_rate,
+            visualize=visualize,
+            write_dataset=write_dataset,
+            output_prefix=output_prefix,
+        )
+    if visualize_num_samples > 0:
+        logger.info(f"Visualizing {visualize_num_samples} samples.")
+        visualize_samples(
+            right_matrix=ratings_matrix.tocoo(),
+            visualize_num_samples=visualize_num_samples,
+            expanded_file_name=f"{output_prefix}{num_row_multiplier}x{num_col_multiplier}_0.csv",
+            output_prefix="Sample_Histogram",
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/recommendation_v4/scripts/launch_smoke_8gpu.sh b/recommendation_v4/scripts/launch_smoke_8gpu.sh
new file mode 100755
index 000000000..92daa6ef8
--- /dev/null
+++ b/recommendation_v4/scripts/launch_smoke_8gpu.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# 8-GPU yambda-5b run. Resolves the package root from this script's location,
+# so it works from any container mount point. Dataset path is in the gin file
+# (generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin).
+set -uo pipefail
+
+REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
+cd "$REPO_ROOT"
+
+LOG=${LOG:-/apps/chcai/yambda_5b_8gpu.log}
+echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee "$LOG"
+
+# polars-u64-idx (NOT stock polars) — yambda parquet's flat-explode overruns
+# 32-bit row index. Reserved node has no outbound DNS, so we install from a
+# pre-staged tarball under /apps/chcai/. Override PIP_LOCAL_TGZ for other hosts.
+PIP_LOCAL_TGZ=${PIP_LOCAL_TGZ:-/apps/chcai/pip_local_yambda.tgz}
+PIP_LOCAL_DIR=${PIP_LOCAL_DIR:-/tmp/pip_local}
+if [ ! -f "$PIP_LOCAL_DIR/lib/python3.12/site-packages/polars/__init__.py" ]; then
+  rm -rf "$PIP_LOCAL_DIR"
+  mkdir -p "$PIP_LOCAL_DIR" && tar xzf "$PIP_LOCAL_TGZ" -C "$(dirname "$PIP_LOCAL_DIR")" 2>&1 | tail -3 | tee -a "$LOG"
+fi
+
+export PYTHONPATH="$PIP_LOCAL_DIR/lib/python3.12/site-packages:$REPO_ROOT:${PYTHONPATH:-}"
+export HOME=${HOME:-/tmp}
+echo "[$(date)] PYTHONPATH=$PYTHONPATH" | tee -a "$LOG"
+python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print('imports OK,', torch.__version__, torch.cuda.device_count(),'gpus')" 2>&1 | tee -a "$LOG"
+
+export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
+export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
+export WORLD_SIZE=$(python -c "import torch; print(torch.cuda.device_count())")
+# AMD/ROCm: Triton HSTU kernel hits PassManager errors on some shapes; force
+# PYTORCH backend. On CUDA, unset this to default to TRITON for ~3-5x speedup.
+export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-PYTORCH}
+export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
+echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
+
+python -m generative_recommenders.dlrm_v3.train.train_ranker \
+    --dataset yambda-5b --mode train-eval 2>&1 | tee -a "$LOG"
diff --git a/recommendation_v4/setup.py b/recommendation_v4/setup.py
new file mode 100644
index 000000000..bdab528f4
--- /dev/null
+++ b/recommendation_v4/setup.py
@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+
+from setuptools import find_packages, setup
+
+setup(
+    name="generative_recommenders",
+    version="0.1.0",
+    description="Library for generative recommendation algorithms.",
+    packages=find_packages(exclude=["configs"]),
+    python_requires=">=3.10",
+    install_requires=[
+        "torch>=2.6.0",
+        "fbgemm_gpu>=1.1.0",
+        "torchrec>=1.1.0",
+        "gin_config>=0.5.0",
+        "pandas>=2.2.0",
+        "tensorboard>=2.19.0",
+        "pybind11",
+        "click",
+        "pandas",
+        "matplotlib",
+    ],
+    long_description=open("README.md").read(),
+    long_description_content_type="text/markdown",
+    url="https://github.com/meta-recsys/generative-recommenders",
+    license="Apache-2.0",
+)

From 9b56d4f35dec4a7799e7cbf0d1ec9c927506c732 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 29 May 2026 17:54:39 -0500
Subject: [PATCH 002/113] Enable Triton HSTU kernels on AMD/ROCm (gfx950
 MI350X)

Four fixes unlocking the HSTU_HAMMER_KERNEL=TRITON path on MI350X:

1. triton_hstu_attention.py _should_enable_tma(): add HIP early-out.
   torch.cuda.get_device_capability() on gfx950 returns (9, 5) which would
   pass the major==9 Hopper check and trick the kernel into the TMA path,
   producing kernels that don't compile on ROCm.

2. triton_hstu_attention.py _get_fw_configs(): hoist the USE_TLX/NUM_BUFFERS/
   NUM_MMA_WARPS_PER_GROUP/NUM_MMA_GROUPS defaults loop out of the CUDA-only
   else: branch. The _hstu_attn_fwd signature requires these constexprs
   regardless of backend; missing them on HIP triggered TypeError:
   dynamic_func() missing N required positional arguments at autotune.
   Also gate the H100 TLX configs append on `not torch.version.hip`.

3. triton_jagged_tensors.py concat/split dispatch: route AMD/ROCm through
   *_2D_jagged_multirow instead of the basic _concat_2D_jagged /
   _split_2D_jagged kernels. The basic kernels fail PassManager::run at
   make_ttgir (TritonAMDGPUCanonicalizePointers pass) on ROCm; multirow
   compiles fine. NVIDIA non-Blackwell paths (H100/A100) are unchanged.

4. triton_jagged_tensors.py _Concat2DJaggedFunction.backward: replace the
   raw _split_2D_jagged[grid] call with _triton_split_2D_jagged_internal
   so the backward pass benefits from the same AMD multirow routing as
   the forward.

Verified end-to-end on 8x MI350X: yambda-5b bs=32 seq=4k at 782 global_sps
vs PYTORCH backend 547 sps -- 1.43x throughput, 75% peak VRAM vs 92%.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../ops/triton/triton_hstu_attention.py       | 47 +++++++++++++----
 .../ops/triton/triton_jagged_tensors.py       | 51 +++++++++++++------
 2 files changed, 74 insertions(+), 24 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
index ac667e139..03a1f8f67 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
@@ -129,6 +129,19 @@ def _should_enable_tma() -> bool:
         return False
     if not torch.cuda.is_available():
         return False
+    # NVIDIA-only gate: TMA (Tensor Memory Accelerator) is Hopper-specific
+    # hardware. On ROCm/HIP, `torch.cuda.get_device_capability()` mirrors the
+    # gfx name into a major.minor tuple — gfx950 (MI350X) returns (9, 5), which
+    # would otherwise pass the `device_capability == 9` check below and trick
+    # the kernel into taking the TMA path. The TMA path uses
+    # `triton.tools.tensor_descriptor.TensorDescriptor` and `TensorDescriptor.load`
+    # which lower to PTX `cp.async.bulk.tensor.*`; on AMD this either fails to
+    # compile or produces a kernel with mismatched reduction-dim shapes for
+    # `tl.dot(silu, v)` in `_hstu_attn_fwd_one_block` (see WARNING in
+    # `_hstu_attn_fwd` for the cascade). Bail out early on HIP so the
+    # non-TMA path is selected and AMD gets a working kernel.
+    if torch.version.hip:
+        return False
     try:
         device_capability = torch.cuda.get_device_capability()[0]
     except (RuntimeError, AssertionError):
@@ -335,15 +348,31 @@ def _get_fw_configs() -> List[triton.Config]:  # noqa: C901
             ),
         ]
 
-        # Add 'USE_TLX' : False, 'NUM_BUFFERS': 1, 'NUM_MMA_WARPS_PER_GROUP': 1, 'NUM_MMA_GROUPS': 1 to non-TLX configs
-        for config in configs:
-            if not config.kwargs.get("USE_TLX", False):
-                config.kwargs["USE_TLX"] = False
-                config.kwargs["NUM_BUFFERS"] = 1
-                config.kwargs["NUM_MMA_WARPS_PER_GROUP"] = 1
-                config.kwargs["NUM_MMA_GROUPS"] = 1
-
-        # Add TLX configs if TLX is available
+    # The `_hstu_attn_fwd` kernel signature unconditionally declares the four
+    # constexprs `USE_TLX`, `NUM_BUFFERS`, `NUM_MMA_WARPS_PER_GROUP`,
+    # `NUM_MMA_GROUPS` (introduced for the Hopper TLX warp-specialized variant).
+    # Triton requires every constexpr be bound at autotune time; missing any one
+    # of them triggers `TypeError: dynamic_func() missing N required positional
+    # arguments` during kernel dispatch. This loop populates the non-TLX defaults
+    # so the kernel call site doesn't have to know about TLX at all.
+    #
+    # IMPORTANT: this loop must apply to BOTH the HIP branch and the CUDA branch
+    # above. It used to live inside the CUDA `else:` block which meant HIP
+    # configs reached `_hstu_attn_fwd[grid](...)` without these defaults and
+    # crashed at dispatch. Keep this hoisted (outside the if/else) when
+    # editing — see commit message for the symptom.
+    for config in configs:
+        if not config.kwargs.get("USE_TLX", False):
+            config.kwargs["USE_TLX"] = False
+            config.kwargs["NUM_BUFFERS"] = 1
+            config.kwargs["NUM_MMA_WARPS_PER_GROUP"] = 1
+            config.kwargs["NUM_MMA_GROUPS"] = 1
+
+    # TLX (Triton Language Extension) warp-specialized configs are Hopper-only.
+    # Guard with `not torch.version.hip` so AMD never sees them — the TLX code
+    # path inside `_hstu_attn_fwd` calls `tlx.async_descriptor_load(...)` which
+    # requires real TMA tensor descriptors and only compiles on CUDA.
+    if not torch.version.hip:
         if HAS_TLX:
             try:
                 device_capability = torch.cuda.get_device_capability()[0]
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py
index 3488e308a..2c3728c0a 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged_tensors.py
@@ -49,7 +49,19 @@ def _triton_concat_2D_jagged_internal(
     is_dense_b: bool,
     BLOCK_D: int,
 ) -> None:
-    if is_sm100_plus():
+    if is_sm100_plus() or (torch.cuda.is_available() and torch.version.hip):
+        # Route AMD/ROCm through the multirow kernel.
+        #
+        # The basic `_concat_2D_jagged` kernel below issues one program per
+        # output row (grid = `(max_seq_len, B)`). On ROCm Triton this fails to
+        # lower in the `TritonAMDGPUCanonicalizePointers` pass with
+        # `RuntimeError: PassManager::run failed` at `make_ttgir`. The
+        # multirow variant tiles rows with a tunable `BLOCK_N` (grid =
+        # `(cdiv(max_seq_len, BLOCK_N), B)`) and compiles cleanly on ROCm.
+        # The original `is_sm100_plus()` gate was conservative — only Blackwell
+        # was opted in. Adding HIP keeps NVIDIA H100/A100 on the basic kernel
+        # they were validated against and unblocks AMD without behavior change
+        # on existing NVIDIA paths.
 
         def grid(meta):
             return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
@@ -107,7 +119,11 @@ def _triton_split_2D_jagged_internal(
     is_dense_b: bool,
     BLOCK_D: int,
 ) -> None:
-    if is_sm100_plus():
+    if is_sm100_plus() or (torch.cuda.is_available() and torch.version.hip):
+        # Route AMD/ROCm through the multirow kernel for the same reason as
+        # `_triton_concat_2D_jagged_internal` above: basic `_split_2D_jagged`
+        # hits `PassManager::run failed` in `TritonAMDGPUCanonicalizePointers`
+        # on ROCm; multirow lowers cleanly.
 
         def grid(meta):
             return (triton.cdiv(max_seq_len, meta["BLOCK_N"]), B)
@@ -608,22 +624,27 @@ def backward(
         d_values_b = torch.empty(
             (ctx.total_len_b, D), device=d_out.device, dtype=d_out.dtype
         )
-        _split_2D_jagged[(ctx.max_seq_len, ctx.B)](
-            JaggedIn=d_out,
-            OffsetsA=offsets_a,
-            OffsetsB=offsets_b,
-            MaxLenA=ctx.max_len_a,
-            MaxLenB=ctx.max_len_b,
-            OutA=d_values_a,
-            OutB=d_values_b,
+        # Go through `_triton_split_2D_jagged_internal` (not raw
+        # `_split_2D_jagged[grid]`) so this backward pass benefits from the same
+        # AMD-routing-through-multirow workaround as the forward. Calling the
+        # raw kernel directly would hit `PassManager::run failed` on ROCm at
+        # `TritonAMDGPUCanonicalizePointers`. If you refactor this, do not
+        # collapse it back to `_split_2D_jagged[(ctx.max_seq_len, ctx.B)](...)`.
+        _triton_split_2D_jagged_internal(
+            jagged_in=d_out,
+            max_seq_len=ctx.max_seq_len,
+            B=ctx.B,
+            offsets_a=offsets_a,
+            offsets_b=offsets_b,
+            max_len_a=ctx.max_len_a,
+            max_len_b=ctx.max_len_b,
+            out_a=d_values_a,
+            out_b=d_values_b,
             D=D,
-            stride_id=d_out.stride(-2),
-            stride_ad=d_values_a.stride(-2),
-            stride_bd=d_values_b.stride(-2),
             n_prefix_to_B=ctx.n_prefix_from_B,
+            is_dense_a=ctx.is_dense_a,
+            is_dense_b=ctx.is_dense_b,
             BLOCK_D=BLOCK_D,
-            IS_DENSE_A=ctx.is_dense_a,
-            IS_DENSE_B=ctx.is_dense_b,
         )
         return None, d_values_a, d_values_b, None, None, None, None, None
 

From 751f2f0ec3c14bc7a41bb71993c2bc0c67c190e4 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 29 May 2026 23:14:14 +0000
Subject: [PATCH 003/113] Fix AttributeError on triton.knobs.nvidia.use_meta_ws

The attribute is absent in some Triton builds (e.g. nvcr.io/nvidia/pytorch:26.01-py3),
causing import-time AttributeError before any training step runs. Use getattr with
a False default so _use_meta_ws() gracefully reports disabled on those builds.
---
 .../generative_recommenders/ops/triton/triton_addmm.py        | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py b/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py
index 915d85742..487aae189 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_addmm.py
@@ -65,7 +65,9 @@ def _use_meta_ws() -> bool:
         is_sm100_plus()
         and hasattr(triton, "knobs")
         and hasattr(triton.knobs, "nvidia")
-        and triton.knobs.nvidia.use_meta_ws
+        # `use_meta_ws` is absent in some Triton builds (e.g. nvcr.io/nvidia/pytorch:26.01-py3);
+        # use getattr so import doesn't crash on AttributeError before any step runs.
+        and getattr(triton.knobs.nvidia, "use_meta_ws", False)
     )
 
 

From 0f1fcf05b6fa7c3d69f4bb45df1d2887ec76a683 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 29 May 2026 19:12:08 -0500
Subject: [PATCH 004/113] Make HSTU model arch + dataset history_length
 gin-tunable

Three small changes so you can sweep model size and per-sample sequence
length from a gin file without editing configs.py.

configs.py:
  - get_hstu_configs is now @gin.configurable. Accepts optional overrides
    for max_seq_len, max_num_candidates, hstu_embedding_table_dim,
    hstu_transducer_embedding_dim, hstu_num_heads, hstu_attn_num_layers,
    hstu_attn_linear_dim, hstu_attn_qk_dim, hstu_input_dropout_ratio,
    hstu_linear_dropout_rate. Per-dataset defaults still apply unless
    explicitly overridden in gin.
  - get_embedding_table_config is now @gin.configurable with an
    embedding_dim override that uniformly sets the dim for all tables
    of the chosen dataset.
  - Drop the YAMBDA_EMBEDDING_DIM constant (was a duplicate of
    HSTU_EMBEDDING_DIM=512). Yambda branch now uses HSTU_EMBEDDING_DIM
    directly. Add a comment noting the model+table dim must stay aligned
    when overriding either via gin.

utils.py:
  - get_dataset accepts an optional history_length kwarg that wins over
    the yambda dataset's hardcoded default of 4096. Caches are still
    keyed on disk under hstu_cache_L<N>/ so switching L between previously
    built values is free.

train/gin/yambda_5b.gin:
  - Pin history_length=2048 and max_seq_len=2048 for the seq-2k smoke
    config. Both lines have inline comments explaining the +9 overhead
    (uid + 7 cross + 1 candidate) so total per-sample seq is ~2046,
    within the 2048 budget.

Verified: default codepath unchanged, gin overrides apply consistently
to both get_hstu_configs (model) and get_embedding_table_config (tables).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../dlrm_v3/configs.py                        | 116 +++++++++++++-----
 .../dlrm_v3/train/gin/yambda_5b.gin           |   8 ++
 .../generative_recommenders/dlrm_v3/utils.py  |   6 +-
 3 files changed, 94 insertions(+), 36 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/configs.py b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
index 2981f01e3..1fd7f07a9 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
@@ -19,7 +19,9 @@
 This module provides configuration functions for the HSTU model architecture and embedding table configurations.
 """
 
-from typing import Dict
+from typing import Dict, Optional
+
+import gin
 
 from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
 from generative_recommenders.modules.multitask_module import (
@@ -32,8 +34,6 @@
 HASH_SIZE = 10_000_000
 HASH_SIZE_1B = 1_000_000_000
 
-YAMBDA_EMBEDDING_DIM = 512
-
 # (name, keys, num_embeddings, salt) — single source of truth for both
 # get_embedding_table_config("yambda-5b") and the dataset's cross-hash inputs.
 # Sizes mirror Primus-DLRM/configs/bench_onetrans_large_5b_cross_feat_shampoo.yaml.
@@ -48,7 +48,20 @@
 ]
 
 
-def get_hstu_configs(dataset: str = "debug") -> DlrmHSTUConfig:
+@gin.configurable
+def get_hstu_configs(
+    dataset: str = "debug",
+    max_seq_len: Optional[int] = None,
+    max_num_candidates: Optional[int] = None,
+    hstu_embedding_table_dim: Optional[int] = None,
+    hstu_transducer_embedding_dim: Optional[int] = None,
+    hstu_num_heads: Optional[int] = None,
+    hstu_attn_num_layers: Optional[int] = None,
+    hstu_attn_linear_dim: Optional[int] = None,
+    hstu_attn_qk_dim: Optional[int] = None,
+    hstu_input_dropout_ratio: Optional[float] = None,
+    hstu_linear_dropout_rate: Optional[float] = None,
+) -> DlrmHSTUConfig:
     """
     Create and return HSTU model configuration.
 
@@ -333,9 +346,11 @@ def get_hstu_configs(dataset: str = "debug") -> DlrmHSTUConfig:
     elif "yambda" in dataset:
         assert dataset in ["yambda-5b"]
         cross_names = [name for (name, _k, _n, _s) in YAMBDA_5B_CROSS_SPECS]
-        # Smaller per-table dim for yambda (see YAMBDA_EMBEDDING_DIM); transducer
-        # still projects to 512.
-        hstu_config.hstu_embedding_table_dim = YAMBDA_EMBEDDING_DIM
+        # Per-table dim defaults to HSTU_EMBEDDING_DIM (512); override via the
+        # `get_hstu_configs.hstu_embedding_table_dim = N` gin binding if needed.
+        # Note: the embedding tables in get_embedding_table_config also use
+        # HSTU_EMBEDDING_DIM and must stay aligned with this value.
+        hstu_config.hstu_embedding_table_dim = HSTU_EMBEDDING_DIM
         hstu_config.hstu_transducer_embedding_dim = 512
         hstu_config.max_seq_len = 8192
         hstu_config.max_num_candidates = 1
@@ -468,10 +483,37 @@ def get_hstu_configs(dataset: str = "debug") -> DlrmHSTUConfig:
                 task_type=MultitaskTaskType.BINARY_CLASSIFICATION,
             )
         ]
+
+    # Apply gin overrides last so a value set in the gin file wins over the
+    # per-dataset defaults above. Anything left as None inherits the default
+    # the dataset branch (or DlrmHSTUConfig) chose. Example in a gin file:
+    #   get_hstu_configs.max_seq_len = 4096
+    #   get_hstu_configs.hstu_embedding_table_dim = 256
+    _gin_overrides = {
+        "max_seq_len": max_seq_len,
+        "max_num_candidates": max_num_candidates,
+        "max_num_candidates_inference": max_num_candidates,
+        "hstu_embedding_table_dim": hstu_embedding_table_dim,
+        "hstu_transducer_embedding_dim": hstu_transducer_embedding_dim,
+        "hstu_num_heads": hstu_num_heads,
+        "hstu_attn_num_layers": hstu_attn_num_layers,
+        "hstu_attn_linear_dim": hstu_attn_linear_dim,
+        "hstu_attn_qk_dim": hstu_attn_qk_dim,
+        "hstu_input_dropout_ratio": hstu_input_dropout_ratio,
+        "hstu_linear_dropout_rate": hstu_linear_dropout_rate,
+    }
+    for _name, _val in _gin_overrides.items():
+        if _val is not None:
+            setattr(hstu_config, _name, _val)
+
     return hstu_config
 
 
-def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingConfig]:
+@gin.configurable
+def get_embedding_table_config(
+    dataset: str = "debug",
+    embedding_dim: Optional[int] = None,
+) -> Dict[str, EmbeddingConfig]:
     """
     Create and return embedding table configurations.
 
@@ -480,10 +522,16 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
 
     Args:
         dataset: Dataset identifier (currently unused, reserved for dataset-specific configs).
+        embedding_dim: Per-table embedding width override. When set via gin
+            (e.g. `get_embedding_table_config.embedding_dim = 256`), wins over
+            `HSTU_EMBEDDING_DIM`. Keep in sync with the matching gin override on
+            `get_hstu_configs.hstu_embedding_table_dim` — the model and the
+            tables must agree on dim or sharding will reject the plan.
 
     Returns:
         Dict mapping table names to their EmbeddingConfig objects.
     """
+    DIM = embedding_dim if embedding_dim is not None else HSTU_EMBEDDING_DIM
     if "movielens" in dataset:
         assert dataset in [
             "movielens-1m",
@@ -495,42 +543,42 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
             {
                 "movie_id": EmbeddingConfig(
                     num_embeddings=HASH_SIZE,
-                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    embedding_dim=DIM,
                     name="movie_id",
                     data_type=DataType.FP16,
                     feature_names=["movie_id", "item_movie_id"],
                 ),
                 "user_id": EmbeddingConfig(
                     num_embeddings=HASH_SIZE,
-                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    embedding_dim=DIM,
                     name="user_id",
                     data_type=DataType.FP16,
                     feature_names=["user_id"],
                 ),
                 "sex": EmbeddingConfig(
                     num_embeddings=HASH_SIZE,
-                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    embedding_dim=DIM,
                     name="sex",
                     data_type=DataType.FP16,
                     feature_names=["sex"],
                 ),
                 "age_group": EmbeddingConfig(
                     num_embeddings=HASH_SIZE,
-                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    embedding_dim=DIM,
                     name="age_group",
                     data_type=DataType.FP16,
                     feature_names=["age_group"],
                 ),
                 "occupation": EmbeddingConfig(
                     num_embeddings=HASH_SIZE,
-                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    embedding_dim=DIM,
                     name="occupation",
                     data_type=DataType.FP16,
                     feature_names=["occupation"],
                 ),
                 "zip_code": EmbeddingConfig(
                     num_embeddings=HASH_SIZE,
-                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    embedding_dim=DIM,
                     name="zip_code",
                     data_type=DataType.FP16,
                     feature_names=["zip_code"],
@@ -540,14 +588,14 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
             else {
                 "movie_id": EmbeddingConfig(
                     num_embeddings=HASH_SIZE_1B,
-                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    embedding_dim=DIM,
                     name="movie_id",
                     data_type=DataType.FP16,
                     feature_names=["movie_id", "item_movie_id"],
                 ),
                 "user_id": EmbeddingConfig(
                     num_embeddings=3_000_000,
-                    embedding_dim=HSTU_EMBEDDING_DIM,
+                    embedding_dim=DIM,
                     name="user_id",
                     data_type=DataType.FP16,
                     feature_names=["user_id"],
@@ -558,14 +606,14 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
         return {
             "item_id": EmbeddingConfig(
                 num_embeddings=HASH_SIZE_1B,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="item_id",
                 data_type=DataType.FP16,
                 feature_names=["item_id", "item_candidate_id"],
             ),
             "item_category_id": EmbeddingConfig(
                 num_embeddings=128,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="item_category_id",
                 data_type=DataType.FP16,
                 weight_init_max=1.0,
@@ -574,7 +622,7 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
             ),
             "user_id": EmbeddingConfig(
                 num_embeddings=10_000_000,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="user_id",
                 data_type=DataType.FP16,
                 feature_names=["user_id"],
@@ -584,49 +632,49 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
         return {
             "video_id": EmbeddingConfig(
                 num_embeddings=HASH_SIZE,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="video_id",
                 data_type=DataType.FP16,
                 feature_names=["video_id", "item_video_id"],
             ),
             "user_id": EmbeddingConfig(
                 num_embeddings=HASH_SIZE,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="user_id",
                 data_type=DataType.FP16,
                 feature_names=["user_id"],
             ),
             "user_active_degree": EmbeddingConfig(
                 num_embeddings=8,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="user_active_degree",
                 data_type=DataType.FP16,
                 feature_names=["user_active_degree"],
             ),
             "follow_user_num_range": EmbeddingConfig(
                 num_embeddings=9,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="follow_user_num_range",
                 data_type=DataType.FP16,
                 feature_names=["follow_user_num_range"],
             ),
             "fans_user_num_range": EmbeddingConfig(
                 num_embeddings=9,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="fans_user_num_range",
                 data_type=DataType.FP16,
                 feature_names=["fans_user_num_range"],
             ),
             "friend_user_num_range": EmbeddingConfig(
                 num_embeddings=8,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="friend_user_num_range",
                 data_type=DataType.FP16,
                 feature_names=["friend_user_num_range"],
             ),
             "register_days_range": EmbeddingConfig(
                 num_embeddings=8,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="register_days_range",
                 data_type=DataType.FP16,
                 feature_names=["register_days_range"],
@@ -637,28 +685,28 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
         tables: Dict[str, EmbeddingConfig] = {
             "item_id": EmbeddingConfig(
                 num_embeddings=9_390_000,
-                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="item_id",
                 data_type=DataType.FP32,
                 feature_names=["item_id", "item_candidate_id"],
             ),
             "artist_id": EmbeddingConfig(
                 num_embeddings=1_290_000,
-                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="artist_id",
                 data_type=DataType.FP32,
                 feature_names=["artist_id", "item_candidate_artist_id"],
             ),
             "album_id": EmbeddingConfig(
                 num_embeddings=3_370_000,
-                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="album_id",
                 data_type=DataType.FP32,
                 feature_names=["album_id", "item_candidate_album_id"],
             ),
             "uid": EmbeddingConfig(
                 num_embeddings=1_000_000,
-                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="uid",
                 data_type=DataType.FP32,
                 feature_names=["uid"],
@@ -667,7 +715,7 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
         for name, _keys, num_embeddings, _salt in YAMBDA_5B_CROSS_SPECS:
             tables[name] = EmbeddingConfig(
                 num_embeddings=num_embeddings,
-                embedding_dim=YAMBDA_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name=name,
                 data_type=DataType.FP32,
                 feature_names=[name],
@@ -677,7 +725,7 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
         return {
             "post_id": EmbeddingConfig(
                 num_embeddings=HASH_SIZE,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="post_id",
                 data_type=DataType.FP16,
                 feature_names=[
@@ -689,14 +737,14 @@ def get_embedding_table_config(dataset: str = "debug") -> Dict[str, EmbeddingCon
             ),
             "viewer_id": EmbeddingConfig(
                 num_embeddings=HASH_SIZE,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="viewer_id",
                 data_type=DataType.FP16,
                 feature_names=["viewer_id"],
             ),
             "dummy_contexual": EmbeddingConfig(
                 num_embeddings=HASH_SIZE,
-                embedding_dim=HSTU_EMBEDDING_DIM,
+                embedding_dim=DIM,
                 name="dummy_contexual",
                 data_type=DataType.FP16,
                 feature_names=["dummy_contexual"],
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index a483f8766..0a8da31ab 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -34,6 +34,14 @@ make_train_test_dataloaders.num_blocks = 1
 
 get_dataset.name = %dataset
 get_dataset.new_path_prefix = "/apps/chcai/dlrm_data"
+# Per-pool truncation cap. Cache is keyed by L on disk under
+# <processed_dir>/hstu_cache_L<N>/; switching L reuses an existing cache
+# (L=2048 was built in a prior session, no rebuild needed).
+get_dataset.history_length = 2048
+
+# Model-side attention budget. Dataset truncates UIH to fit this value if
+# `history_length + contextual + candidate` would overflow.
+get_hstu_configs.max_seq_len = 2048
 
 # train-eval loop variables (yambda is non-streaming)
 train_eval_loop.num_epochs = 1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 52091d8dd..43d641a0c 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -509,7 +509,7 @@ def reset(self, mode: str = "train"):
 
 
 @gin.configurable
-def get_dataset(name: str, new_path_prefix: str = ""):
+def get_dataset(name: str, new_path_prefix: str = "", history_length: Optional[int] = None):
     """
     Get dataset class and configuration by name.
 
@@ -630,7 +630,9 @@ def get_dataset(name: str, new_path_prefix: str = ""):
                 # all ranks on a node share the same physical pages.
                 "processed_dir": os.path.join(new_path_prefix, "processed_5b"),
                 "metadata_dir": os.path.join(new_path_prefix, "shared_metadata"),
-                "history_length": 4096,
+                # Per-pool truncation cap; total interleaved UIH ~ 3*L/3 = L.
+                # Override via `get_dataset.history_length = N` in gin.
+                "history_length": history_length if history_length is not None else 4096,
                 "scan_window": 20000,
                 "cross_specs": YAMBDA_5B_CROSS_SPECS,
             },

From b7f8b2d56cdc5c58774c9d02233be648648650fb Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 29 May 2026 22:43:21 -0500
Subject: [PATCH 005/113] Make EmbeddingShardingPlanner hbm_cap_gb gin-tunable,
 set ddr_cap=0

make_optimizer_and_shard now accepts hbm_cap_gb (default 260, the MI350X
value) via @gin.configurable. The yambda gin pins the same default so
sweeps just change the number in the gin file instead of editing utils.py.

ddr_cap dropped from 32 GiB to 0: with all 11 yambda 5b embedding tables
fitting on 8x MI350X HBM, allowing host DRAM offload only invites the
planner to pick slower per-lookup-PCIe-traffic plans.

Verified gin binding flows through to the Topology: a probe with
hbm_cap_gb=100 produced Topology(hbm_cap=107374182400) and the planner
correctly raised insufficient-storage error at that tightness.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin                      | 3 +++
 .../generative_recommenders/dlrm_v3/train/utils.py       | 9 ++++-----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 0a8da31ab..939b42b78 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -32,6 +32,9 @@ make_train_test_dataloaders.num_workers = %num_workers
 make_train_test_dataloaders.prefetch_factor = %prefetch_factor
 make_train_test_dataloaders.num_blocks = 1
 
+# embedding planner
+make_optimizer_and_shard.hbm_cap_gb = 260
+
 get_dataset.name = %dataset
 get_dataset.new_path_prefix = "/apps/chcai/dlrm_data"
 # Per-pool truncation cap. Cache is keyed by L on disk under
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 21d2baa6e..90669c1a6 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -295,10 +295,12 @@ def sparse_optimizer_factory_and_class(
     return optimizer_cls, kwargs, optimizer_factory
 
 
+@gin.configurable
 def make_optimizer_and_shard(
     model: torch.nn.Module,
     device: torch.device,
     world_size: int,
+    hbm_cap_gb: int = 260,
 ) -> Tuple[DistributedModelParallel, torch.optim.Optimizer]:
     dense_opt_cls, dense_opt_args, dense_opt_factory = (
         dense_optimizer_factory_and_class()
@@ -316,16 +318,13 @@ def make_optimizer_and_shard(
                         sparse_opt_cls, [param], sparse_opt_args
                     )
     sharders = get_default_sharders()
-    # MI350X has 288 GiB HBM3e per GPU; the 160 GiB cap was sized for older parts.
-    # Matches Primus-DLRM (hbm_cap_gb: 260) which runs the same 5b cross-feat
-    # table set on the same hardware without host materialization.
     planner = EmbeddingShardingPlanner(
         topology=Topology(
             local_world_size=world_size,
             world_size=world_size,
             compute_device="cuda",
-            hbm_cap=260 * 1024 * 1024 * 1024,
-            ddr_cap=32 * 1024 * 1024 * 1024,
+            hbm_cap=hbm_cap_gb * 1024 * 1024 * 1024,
+            ddr_cap=0,
         )
     )
     pg = dist.GroupMember.WORLD

From 5ba9b074c7eccb74b58e8104cffca9b0bfbbb092 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Sat, 30 May 2026 00:13:27 -0500
Subject: [PATCH 006/113] Add yambda 50m/500m/5b preprocessor + DLRM_DATA_PATH
 env override

preprocess_public_data.py:
  - Add DLRMYambdaProcessor: downloads Yambda multi_event + catalog
    metadata from the yandex/yambda HuggingFace repo, then runs a
    temporal split (300 train days / 30 min gap / 1 test day),
    builds per-user sessions (1800s inactivity threshold), and
    writes the layout DLRMv3YambdaDataset expects:
      <data-path>/raw/<size>/multi_event.parquet
      <data-path>/shared_metadata/{artist,album,embeddings}.parquet
      <data-path>/processed_<size>/{train_sessions,test_events,
                                    session_index}.parquet
      <data-path>/processed_<size>/item_popularity.npy
      <data-path>/processed_<size>/split_meta.json
  - 5b variant uses chunked polars load (10M rows/chunk) to keep
    peak RAM under control (single-shot read of the 50 GB parquet
    OOMs ~150 GB systems).
  - SUPPORTED_DATASETS adds yambda-50m, yambda-500m, yambda-5b.
  - main() takes --data-path for custom output root.
  - Verified end-to-end: 50m run completes in ~2 min, 5b in ~53 min
    (download dominates), output is byte-compatible with the dataset
    cache builder; TRITON training reaches steady state on the
    fresh data at 2050 sps.

utils.py:
  - Add env_path(key, default) @gin.configurable helper. Used as a
    gin macro so any string-valued binding can be overridden by an
    env var without editing the gin file.

train/gin/yambda_5b.gin:
  - Declare DATA_PATH = @env_path() macro with key="DLRM_DATA_PATH"
    and default="/apps/chcai/dlrm_data". Both new_path_prefix
    bindings (make_train_test_dataloaders and get_dataset) now
    consume %DATA_PATH. Setting DLRM_DATA_PATH=/some/path at run
    time redirects the dataset without a gin edit.

datasets/yambda.py:
  - Strip stale references to upstream-internal preprocessing in
    docstrings/comments; point at preprocess_public_data.py instead.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../dlrm_v3/datasets/yambda.py                |  14 +-
 .../dlrm_v3/preprocess_public_data.py         | 352 +++++++++++++++++-
 .../dlrm_v3/train/gin/yambda_5b.gin           |  10 +-
 .../generative_recommenders/dlrm_v3/utils.py  |  15 +
 4 files changed, 368 insertions(+), 23 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
index fb8b212b1..5a13ac034 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
@@ -10,15 +10,12 @@
 """
 Yambda dataset for the DLRMv3 HSTU `modules/` path.
 
-Reads the same parquets produced by Primus-DLRM's preprocessing (no runtime
-dep on Primus). Each sample is one anchor LISTEN event with:
+Reads the parquets produced by `dlrm_v3/preprocess_public_data.py
+--dataset yambda-<size>`. Each sample is one anchor LISTEN event with:
   * label = (played_ratio >= LISTEN_PLUS_THRESHOLD) — the listen_plus bit
   * a chronologically interleaved 3-pool history (listen+/like/skip), with
     pool identity tagged per-position in `action_weight` (bits 1/2/4)
   * 7 pre-hashed cross-feature ids exposed as length-1 contextual entries
-
-Hash formula is byte-identical to `primus_dlrm.data.hashing.cross_hash_nway`
-so embedding rows are interchangeable.
 """
 
 import logging
@@ -60,7 +57,7 @@ def _load_npy_readonly(path: Union[str, Path]) -> np.ndarray:
     arr.flags.writeable = False
     return arr
 
-# Match primus_dlrm.data.preprocessing.EVENT_TYPE_MAP / dataset.LISTEN_PLUS_THRESHOLD
+# Yambda event-type encoding written by preprocess_public_data.py.
 LISTEN_TYPE = 0
 LIKE_TYPE = 1
 LISTEN_PLUS_THRESHOLD = 50
@@ -72,15 +69,14 @@ def _load_npy_readonly(path: Union[str, Path]) -> np.ndarray:
 
 
 class _FlatEventStore:
-    """Minimal port of Primus-DLRM's FlatEventStore.
+    """Per-user flat event index built from the preprocessed sessions parquet.
 
     Reads `train_sessions.parquet` and explodes per-session arrays into flat
     numpy columns + per-user `(start, end)` index arrays. Cache-compatible
     layout, but writes nothing (rebuilds from parquet each construction).
     """
 
-    # On-disk column layout written by Primus-DLRM's FlatEventStore.save_mmap.
-    # Bit-identical to that schema so the cache is interchangeable.
+    # On-disk column layout.
     _MMAP_COLS = (
         "flat_uid", "flat_item_ids", "flat_timestamps",
         "flat_event_types", "flat_played_ratio",
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py b/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py
index 488af712d..06b43aa48 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py
@@ -14,10 +14,12 @@
 
 # pyre-unsafe
 import argparse
+import json
 import logging
 import os
 import tarfile
-from typing import Dict, List
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
 from urllib.request import urlretrieve
 
 import numpy as np
@@ -28,10 +30,20 @@
 log = logging.getLogger("main")
 
 """
-Usage: mkdir -p data/ && python3 preprocess_public_data.py --dataset kuairand-1k
+Usage:
+    mkdir -p data/ && python3 preprocess_public_data.py --dataset kuairand-1k
+    python3 preprocess_public_data.py --dataset yambda-5b   --data-path <root>
+    python3 preprocess_public_data.py --dataset yambda-500m --data-path <root>
+    python3 preprocess_public_data.py --dataset yambda-50m  --data-path <root>
 """
 
-SUPPORTED_DATASETS = ["kuairand-1k", "kuairand-27k"]
+SUPPORTED_DATASETS = [
+    "kuairand-1k",
+    "kuairand-27k",
+    "yambda-50m",
+    "yambda-500m",
+    "yambda-5b",
+]
 
 
 def get_feature_merge_weights(dataset: str = "debug") -> Dict[str, int]:
@@ -185,26 +197,342 @@ def _one_hot_encode(row):
         log.info(f"Processed file saved to {self._output_file}")
 
 
+# ----------------------------------------------------------------------------
+# Yambda processor
+# ----------------------------------------------------------------------------
+#
+# Yambda is hosted on HuggingFace at `yandex/yambda` and comes in three sizes:
+# 50m, 500m, 5b. Each size shares the same catalog metadata (embeddings,
+# artist/album mappings); only the interaction stream differs.
+#
+# This processor:
+#   1) Downloads `multi_event.parquet` for the chosen size + the catalog
+#      metadata files via the `datasets` library.
+#   2) Encodes event_type strings into uint8.
+#   3) Splits temporally into train + test (Global Temporal Split, GTS).
+#   4) Builds per-user sessions by inactivity gap.
+#   5) Computes item popularity counts.
+#   6) Writes the layout expected by `DLRMv3YambdaDataset`:
+#
+#      <data_path>/processed_<size>/
+#          train_sessions.parquet
+#          test_events.parquet
+#          session_index.parquet
+#          item_popularity.npy
+#          split_meta.json
+#
+#      <data_path>/shared_metadata/
+#          artist_item_mapping.parquet
+#          album_item_mapping.parquet
+#          embeddings.parquet           (optional; not used by HSTU training)
+#
+# The HSTU training path then auto-builds an `hstu_cache_L<N>/` mmap under
+# `processed_<size>/` on first use.
+# ----------------------------------------------------------------------------
+
+YAMBDA_HF_REPO = "yandex/yambda"
+YAMBDA_SIZES = {"yambda-50m": "50m", "yambda-500m": "500m", "yambda-5b": "5b"}
+YAMBDA_METADATA_FILES = (
+    "artist_item_mapping",
+    "album_item_mapping",
+    "embeddings",
+)
+
+# Yambda timestamps are seconds (rounded to 5s boundaries).
+SECONDS_PER_DAY = 86400
+# Polars chunk size for streaming the 5b parquet (~150 GB on disk).
+YAMBDA_CHUNK_SIZE = 10_000_000
+EVENT_TYPE_MAP = {"listen": 0, "like": 1, "dislike": 2, "unlike": 3, "undislike": 4}
+
+
+class DLRMYambdaProcessor(DataProcessor):
+    """Download + preprocess Yambda (50m / 500m / 5b) for DLRMv3YambdaDataset."""
+
+    def __init__(
+        self,
+        data_path: str,
+        size: str,
+        session_gap_seconds: int = 1800,
+        train_days: int = 300,
+        gap_minutes: int = 30,
+        test_days: int = 1,
+    ) -> None:
+        assert size in {"50m", "500m", "5b"}, f"unknown yambda size {size}"
+        super().__init__(
+            download_url="",  # download is via HuggingFace `datasets` lib
+            data_path=data_path.rstrip("/") + "/",
+            file_name=f"{size}/multi_event.parquet",
+            prefix=f"yambda-{size}",
+        )
+        self._size: str = size
+        self._raw_dir: Path = Path(self._data_path) / "raw"
+        self._processed_dir: Path = Path(self._data_path) / f"processed_{size}"
+        self._shared_dir: Path = Path(self._data_path) / "shared_metadata"
+        self._session_gap_seconds: int = session_gap_seconds
+        self._train_days: int = train_days
+        self._gap_minutes: int = gap_minutes
+        self._test_days: int = test_days
+
+    def download(self) -> None:
+        try:
+            from datasets import DatasetDict, load_dataset
+        except ImportError as e:
+            raise ImportError(
+                "Downloading Yambda requires the `datasets` package "
+                "(`pip install datasets`)."
+            ) from e
+
+        self._raw_dir.mkdir(parents=True, exist_ok=True)
+        self._shared_dir.mkdir(parents=True, exist_ok=True)
+
+        # Size-specific interaction stream.
+        event_path = self._raw_dir / self._size / "multi_event.parquet"
+        if not event_path.exists():
+            event_path.parent.mkdir(parents=True, exist_ok=True)
+            log.info(
+                f"Downloading multi_event.parquet for {self._size} "
+                f"from {YAMBDA_HF_REPO} ..."
+            )
+            ds = load_dataset(
+                YAMBDA_HF_REPO,
+                data_dir=f"flat/{self._size}",
+                data_files="multi_event.parquet",
+            )
+            assert isinstance(ds, DatasetDict)
+            ds["train"].to_parquet(str(event_path))
+            log.info(f"Saved {event_path}")
+        else:
+            log.info(f"Already exists: {event_path}")
+
+        # Catalog metadata files (shared across sizes).
+        for name in YAMBDA_METADATA_FILES:
+            shared_path = self._shared_dir / f"{name}.parquet"
+            if shared_path.exists():
+                log.info(f"Already exists: {shared_path}")
+                continue
+            log.info(f"Downloading {name}.parquet from {YAMBDA_HF_REPO} ...")
+            ds = load_dataset(YAMBDA_HF_REPO, data_files=f"{name}.parquet")
+            assert isinstance(ds, DatasetDict)
+            ds["train"].to_parquet(str(shared_path))
+            log.info(f"Saved {shared_path}")
+
+    def preprocess(self) -> None:
+        self.download()
+        try:
+            import polars as pl
+        except ImportError as e:
+            raise ImportError(
+                "Yambda preprocessing requires polars "
+                "(`pip install polars-u64-idx` is recommended for the 5b "
+                "variant — stock polars overflows its 32-bit row index)."
+            ) from e
+
+        self._processed_dir.mkdir(parents=True, exist_ok=True)
+        event_path = self._raw_dir / self._size / "multi_event.parquet"
+
+        log.info(f"Loading multi_event from {event_path} ...")
+        events = self._load_events(pl, event_path)
+        log.info(f"Loaded {len(events):,} events")
+
+        events = self._encode_event_types(pl, events)
+        t_min = int(events["timestamp"].min())
+        t_max = int(events["timestamp"].max())
+        log.info(
+            f"Timestamp range: {t_min}..{t_max} "
+            f"({(t_max - t_min) / SECONDS_PER_DAY:.1f} days)"
+        )
+
+        train_start, train_end, test_start, test_end = self._split_boundaries(t_max)
+        log.info(
+            f"GTS train=[{train_start},{train_end}) gap=[{train_end},{test_start}) "
+            f"test=[{test_start},{test_end})"
+        )
+        train_events, test_events = self._temporal_split(
+            pl, events, train_start, train_end, test_start, test_end
+        )
+        log.info(
+            f"Train: {len(train_events):,} events, Test: {len(test_events):,} events"
+        )
+
+        gap_units = self._session_gap_seconds  # 1 unit = 1 second
+        sessions = self._build_sessions(pl, train_events, gap_units)
+        log.info(f"Built {len(sessions):,} sessions")
+
+        session_index = self._build_session_index(pl, sessions)
+        log.info(f"Session index covers {len(session_index):,} users")
+
+        item_popularity = self._compute_item_popularity(train_events)
+
+        sessions.write_parquet(str(self._processed_dir / "train_sessions.parquet"))
+        test_events.write_parquet(str(self._processed_dir / "test_events.parquet"))
+        session_index.write_parquet(str(self._processed_dir / "session_index.parquet"))
+        np.save(self._processed_dir / "item_popularity.npy", item_popularity)
+
+        with open(self._processed_dir / "split_meta.json", "w") as f:
+            json.dump(
+                {
+                    "size": self._size,
+                    "t_min": t_min,
+                    "t_max": t_max,
+                    "train_start": train_start,
+                    "train_end": train_end,
+                    "test_start": test_start,
+                    "test_end": test_end,
+                    "train_days": self._train_days,
+                    "gap_minutes": self._gap_minutes,
+                    "test_days": self._test_days,
+                    "session_gap_seconds": self._session_gap_seconds,
+                    "num_train_events": int(len(train_events)),
+                    "num_test_events": int(len(test_events)),
+                    "num_sessions": int(len(sessions)),
+                    "num_users": int(len(session_index)),
+                },
+                f,
+                indent=2,
+            )
+        log.info(f"Preprocessing complete: {self._processed_dir}")
+
+    # ------- helpers --------
+
+    def _load_events(self, pl, parquet_path: Path):
+        # 5b is too large to load in one polars pass on most boxes (~150 GB
+        # peak in-RAM with eager read). Stream in 10M-row chunks for safety.
+        if self._size == "5b":
+            log.info(f"Streaming load (chunk_size={YAMBDA_CHUNK_SIZE:,})...")
+            lf = pl.scan_parquet(parquet_path)
+            n = lf.select(pl.len()).collect().item()
+            log.info(f"Total rows: {n:,}")
+            chunks = []
+            for off in range(0, n, YAMBDA_CHUNK_SIZE):
+                chunk = lf.slice(off, YAMBDA_CHUNK_SIZE).collect()
+                chunks.append(chunk)
+                log.info(f"  loaded {off:,}..{off + len(chunk):,}")
+            return pl.concat(chunks)
+        return pl.read_parquet(parquet_path)
+
+    def _encode_event_types(self, pl, events):
+        dt = events["event_type"].dtype
+        if dt == pl.Utf8 or isinstance(dt, (pl.Categorical, pl.Enum)):
+            events = events.with_columns(
+                pl.col("event_type")
+                .cast(pl.Utf8)
+                .replace_strict(EVENT_TYPE_MAP)
+                .cast(pl.UInt8)
+                .alias("event_type")
+            )
+        return events
+
+    def _split_boundaries(self, t_max: int) -> Tuple[int, int, int, int]:
+        test_end = t_max
+        test_start = test_end - self._test_days * SECONDS_PER_DAY
+        train_end = test_start - self._gap_minutes * 60
+        train_start = train_end - self._train_days * SECONDS_PER_DAY
+        return train_start, train_end, test_start, test_end
+
+    def _temporal_split(self, pl, events, train_start, train_end, test_start, test_end):
+        train = events.filter(
+            (pl.col("timestamp") >= train_start) & (pl.col("timestamp") < train_end)
+        )
+        test_all = events.filter(
+            (pl.col("timestamp") >= test_start) & (pl.col("timestamp") < test_end)
+        )
+        # Test users must also appear in train (next-item prediction setup).
+        train_users = train.select("uid").unique()
+        test = test_all.join(train_users, on="uid", how="inner")
+        return train, test
+
+    def _build_sessions(self, pl, events, session_gap_units: int):
+        sorted_events = events.sort(["uid", "timestamp"])
+        return (
+            sorted_events
+            .with_columns(
+                (
+                    (pl.col("timestamp").diff().fill_null(0) > session_gap_units)
+                    .cast(pl.UInt32)
+                    .cum_sum()
+                )
+                .over("uid")
+                .alias("session_id")
+            )
+            .group_by(["uid", "session_id"])
+            .agg(
+                pl.col("item_id").alias("item_ids"),
+                pl.col("timestamp").alias("timestamps"),
+                pl.col("event_type").alias("event_types"),
+                pl.col("is_organic").alias("is_organic"),
+                pl.col("played_ratio_pct").alias("played_ratio_pct"),
+                pl.col("track_length_seconds").alias("track_length_seconds"),
+            )
+            .sort(["uid", "session_id"])
+        )
+
+    def _build_session_index(self, pl, sessions):
+        return (
+            sessions
+            .with_columns(pl.col("item_ids").list.len().alias("session_len"))
+            .group_by("uid")
+            .agg(
+                pl.col("session_id").alias("session_ids"),
+                pl.col("session_len").alias("session_lens"),
+                pl.col("session_len").cum_sum().alias("session_offsets"),
+            )
+            .sort("uid")
+        )
+
+    def _compute_item_popularity(self, train_events) -> np.ndarray:
+        counts = (
+            train_events
+            .group_by("item_id")
+            .len()
+            .sort("item_id")
+        )
+        max_item = int(counts["item_id"].max())
+        popularity = np.zeros(max_item + 1, dtype=np.int64)
+        popularity[counts["item_id"].to_numpy()] = counts["len"].to_numpy()
+        return popularity
+
+
 def main() -> None:
     parser = argparse.ArgumentParser()
-    parser.add_argument("--dataset", choices=SUPPORTED_DATASETS, help="dataset")
+    parser.add_argument(
+        "--dataset",
+        choices=SUPPORTED_DATASETS,
+        required=True,
+        help="dataset",
+    )
+    parser.add_argument(
+        "--data-path",
+        default="data/",
+        help=(
+            "Root directory for raw + processed data. KuaiRand defaults to "
+            "the existing `data/` convention; Yambda defaults to `data/` too "
+            "but is commonly overridden to a shared filesystem location with "
+            "enough space for the 5b variant (~500 GB)."
+        ),
+    )
     args = parser.parse_args()
+
+    data_path = args.data_path.rstrip("/") + "/"
+
     if args.dataset == "kuairand-1k":
-        kuairand_processor = DLRMKuaiRandProcessor(
+        DLRMKuaiRandProcessor(
             download_url="https://zenodo.org/records/10439422/files/KuaiRand-1K.tar.gz",
-            data_path="data/",
+            data_path=data_path,
             file_name="KuaiRand-1K.tar.gz",
             prefix="KuaiRand-1K",
-        )
-        kuairand_processor.preprocess()
+        ).preprocess()
     elif args.dataset == "kuairand-27k":
-        kuairand_processor = DLRMKuaiRandProcessor(
+        DLRMKuaiRandProcessor(
             download_url="https://zenodo.org/records/10439422/files/KuaiRand-27K.tar.gz",
-            data_path="data/",
+            data_path=data_path,
             file_name="KuaiRand-27K.tar.gz",
             prefix="KuaiRand-27K",
-        )
-        kuairand_processor.preprocess()
+        ).preprocess()
+    elif args.dataset in YAMBDA_SIZES:
+        DLRMYambdaProcessor(
+            data_path=data_path,
+            size=YAMBDA_SIZES[args.dataset],
+        ).preprocess()
 
 
 if __name__ == "__main__":
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 939b42b78..afc7581ee 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -22,12 +22,18 @@ sparse_optimizer_factory_and_class.weight_decay = 0
 sparse_optimizer_factory_and_class.eps = 1e-8
 sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
 
+# Data root: resolved at runtime from $DLRM_DATA_PATH if set, else the literal
+# below. Used by both make_train_test_dataloaders and get_dataset.
+DATA_PATH = @env_path()
+env_path.key = "DLRM_DATA_PATH"
+env_path.default = "/apps/chcai/dlrm_data"
+
 # dataloader configs
 make_train_test_dataloaders.batch_size = %batch_size
 make_train_test_dataloaders.eval_batch_size = 32
 make_train_test_dataloaders.dataset_type = %dataset
 make_train_test_dataloaders.train_split_percentage = 0.90
-make_train_test_dataloaders.new_path_prefix = "/apps/chcai/dlrm_data"
+make_train_test_dataloaders.new_path_prefix = %DATA_PATH
 make_train_test_dataloaders.num_workers = %num_workers
 make_train_test_dataloaders.prefetch_factor = %prefetch_factor
 make_train_test_dataloaders.num_blocks = 1
@@ -36,7 +42,7 @@ make_train_test_dataloaders.num_blocks = 1
 make_optimizer_and_shard.hbm_cap_gb = 260
 
 get_dataset.name = %dataset
-get_dataset.new_path_prefix = "/apps/chcai/dlrm_data"
+get_dataset.new_path_prefix = %DATA_PATH
 # Per-pool truncation cap. Cache is keyed by L on disk under
 # <processed_dir>/hstu_cache_L<N>/; switching L reuses an existing cache
 # (L=2048 was built in a prior session, no rebuild needed).
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 43d641a0c..9f60c162a 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -508,6 +508,21 @@ def reset(self, mode: str = "train"):
 ]
 
 
+@gin.configurable
+def env_path(key: str = "", default: str = "") -> str:
+    """Resolve a path from os.environ[key], falling back to `default`.
+
+    Intended as a gin macro so paths can be overridden via env vars without
+    editing the gin file. Example gin usage:
+
+        DATA_PATH = @env_path()
+        env_path.key = "DLRM_DATA_PATH"
+        env_path.default = "/some/default/path"
+        make_train_test_dataloaders.new_path_prefix = %DATA_PATH
+    """
+    return os.environ.get(key, default) if key else default
+
+
 @gin.configurable
 def get_dataset(name: str, new_path_prefix: str = "", history_length: Optional[int] = None):
     """

From c639f6a9de5093300540f657f1e7fe9e28137af8 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Sat, 30 May 2026 05:37:25 +0000
Subject: [PATCH 007/113] Fix NCCL init: set CUDA device before
 init_process_group

Every rank's first CUDA context was landing on GPU 0 (the default
device), so NCCL bound its communicators there before set_device
switched to the correct GPU. This leaked allocations on GPU 0 across
all 8 ranks and caused spurious OOMs during embedding-table init at
high HBM caps. Moving set_device above init_process_group and passing
device_id ensures each rank's NCCL state is created on its own GPU.
---
 .../generative_recommenders/dlrm_v3/train/utils.py   | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 90669c1a6..acf2f14dd 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -79,18 +79,22 @@ def setup(
     BACKEND = dist.Backend.NCCL
     TIMEOUT = 1800
 
+    # set device BEFORE init_process_group so NCCL binds this rank to its
+    # own GPU; otherwise every rank's first CUDA context lands on GPU 0,
+    # leaving stale allocations and triggering OOMs on rank 0.
+    torch.cuda.set_device(device)
+
     # initialize the process group
     if not dist.is_initialized():
-        dist.init_process_group("nccl", rank=rank, world_size=world_size)
+        dist.init_process_group(
+            "nccl", rank=rank, world_size=world_size, device_id=device
+        )
 
     pg = dist.new_group(
         backend=BACKEND,
         timeout=timedelta(seconds=TIMEOUT),
     )
 
-    # set device
-    torch.cuda.set_device(device)
-
     return pg
 
 

From d76f96b6cac67569829fee37e962655253edae27 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Sat, 30 May 2026 23:04:25 -0500
Subject: [PATCH 008/113] Profiler: write traces to local disk under
 <repo>/results/<run_name>/

dlrm_v3/utils.py:
  - Replace the hardcoded manifold:// URL in _on_trace_ready_fn with a
    local trace_dir (default /tmp/dlrm_v3_traces). Filename now follows
    trace_step{step}_rank{rank}.json so per-rank captures don't collide.
  - Add _multi_window_schedule helper: a torch.profiler schedule that
    fires around each step in trace_steps=[...] (warmup before, active
    after, RECORD_AND_SAVE at the last active step). Lets one run
    capture multiple windows (e.g. early-step + steady-state) without
    re-running.
  - Make Profiler @gin.configurable. New knobs: trace_dir, trace_steps,
    wait, warmup, repeat, record_shapes, profile_memory, with_stack,
    with_flops, with_modules. Defaults preserve the prior single-window
    behavior (wait=10, warmup=20, active=50, repeat=1) so existing
    callers are unaffected.
  - Add run_results_dir(run_name) gin macro: resolves to
    <recommendation_v4>/results/<run_name>/. Used as the canonical
    output prefix for traces (and any future per-run artifacts).
    recommendation_v4/ is bind-mounted into the training container, so
    files written through this helper persist on the host.

train/gin/yambda_5b.gin:
  - Wire RUN_NAME env override -> run_results_dir(run_name=%RUN_NAME)
    -> Profiler.trace_dir. Sets trace_steps=[52], warmup=5, active=5
    (capture the 5-step window 52-56 on every rank).
  - Toggle train_eval_loop.output_trace = True so the profiler actually
    instantiates.

.gitignore:
  - Add results/ alongside the existing tmp/exps/ckpts/ runtime
    directories so per-run trace dumps don't show up in git status.

Verified: 8x MI350X TRITON yambda-5b run at bs=32 seq=2k drops
8 well-formed trace_step62_rank{0..7}.json files (~37 MB each) into
recommendation_v4/results/default/; visible on the host immediately.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 recommendation_v4/.gitignore                  |   1 +
 .../dlrm_v3/train/gin/yambda_5b.gin           |  13 ++
 .../generative_recommenders/dlrm_v3/utils.py  | 157 ++++++++++++------
 3 files changed, 122 insertions(+), 49 deletions(-)

diff --git a/recommendation_v4/.gitignore b/recommendation_v4/.gitignore
index 560f823c4..5edddc5b3 100644
--- a/recommendation_v4/.gitignore
+++ b/recommendation_v4/.gitignore
@@ -2,6 +2,7 @@
 tmp/
 exps/
 ckpts/
+results/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index afc7581ee..01cb92ed6 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -59,6 +59,19 @@ train_eval_loop.metric_log_frequency = 50
 train_eval_loop.eval_frequency = 5000
 train_eval_loop.num_eval_batches = 500
 train_eval_loop.checkpoint_frequency = 1000000000  # disable mid-training checkpoints (disk-full guard)
+train_eval_loop.output_trace = True
+
+# Run name → recommendation_v4/results/<run_name>/ (override via $RUN_NAME env).
+RUN_NAME = @env_path()
+env_path.key     = "RUN_NAME"
+env_path.default = "default"
+run_results_dir.run_name = %RUN_NAME
+
+# profiler: capture a 5-step window starting at step 52, every rank.
+Profiler.trace_steps = [52]
+Profiler.warmup = 5
+Profiler.active = 5
+Profiler.trace_dir = @run_results_dir()
 
 # logger variables
 MetricsLogger.tensorboard_log_path = "/tmp/tb/yambda_5b/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 9f60c162a..3c534dafc 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -21,6 +21,7 @@
 import logging
 import os
 import time
+from pathlib import Path
 from typing import Callable, Dict, List, Optional
 
 import gin
@@ -81,54 +82,41 @@ def _compute(self) -> List[MetricComputationReport]:
 
 def _on_trace_ready_fn(
     rank: Optional[int] = None,
+    trace_dir: str = "/tmp/dlrm_v3_traces",
 ) -> Callable[[torch.profiler.profile], None]:
-    """
-    Create a callback function for handling profiler trace output.
-
-    Args:
-        rank: Optional process rank for distributed training (included in filename).
+    """Create the on_trace_ready callback that exports a chrome trace to disk.
 
-    Returns:
-        A callback function that exports profiler traces to Manifold storage.
+    Filename follows the convention ``trace_step{step}_rank{rank}.json`` so
+    multi-rank captures don't collide and ``scripts/stitch_traces.py`` can
+    merge them by step number.
     """
 
     def handle_fn(p: torch.profiler.profile) -> None:
-        bucket_name = "hammer_gpu_traces"
-        pid = os.getpid()
-        rank_str = f"_rank_{rank}" if rank is not None else ""
-        file_name = f"libkineto_activities_{pid}_{rank_str}.json"
-        manifold_path = "tree/dlrm_v3_bench"
-        target_object_name = manifold_path + "/" + file_name + ".gz"
-        path = f"manifold://{bucket_name}/{manifold_path}/{file_name}"
+        os.makedirs(trace_dir, exist_ok=True)
+        step = getattr(p, "step_num", 0)
+        rank_str = f"_rank{rank}" if rank is not None else ""
+        file_name = f"trace_step{step}{rank_str}.json"
+        path = os.path.join(trace_dir, file_name)
         logger.warning(
             p.key_averages(group_by_input_shape=True).table(
                 sort_by="self_cuda_time_total"
             )
         )
-        logger.warning(
-            f"trace url: https://www.internalfb.com/intern/perfdoctor/trace_view?filepath={target_object_name}&bucket={bucket_name}"
-        )
         p.export_chrome_trace(path)
+        logger.warning(f"Trace written to: {path}")
 
     return handle_fn
 
 
-def profiler_or_nullcontext(enabled: bool, with_stack: bool):
-    """
-    Create a profiler context manager or null context based on enabled flag.
-
-    Args:
-        enabled: Whether to enable profiling.
-        with_stack: Whether to include stack traces in profile.
-
-    Returns:
-        Either a torch.profiler.profile context manager or nullcontext.
-    """
+def profiler_or_nullcontext(
+    enabled: bool, with_stack: bool, trace_dir: str = "/tmp/dlrm_v3_traces"
+):
+    """One-shot profile context for ad-hoc captures (no scheduling)."""
     return (
         profile(
             # pyre-fixme[16]: Module `profiler` has no attribute `ProfilerActivity`.
             activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-            on_trace_ready=_on_trace_ready_fn(),
+            on_trace_ready=_on_trace_ready_fn(trace_dir=trace_dir),
             with_stack=with_stack,
         )
         if enabled
@@ -136,35 +124,85 @@ def profiler_or_nullcontext(enabled: bool, with_stack: bool):
     )
 
 
-class Profiler:
+def _multi_window_schedule(
+    trace_steps,
+    warmup: int,
+    active: int,
+):
+    """Custom schedule that profiles around each step in ``trace_steps``.
+
+    Step s gets:
+        [s - warmup, s)        -> WARMUP
+        [s, s + active - 1)    -> RECORD
+        s + active - 1         -> RECORD_AND_SAVE
     """
-    Wrapper around PyTorch profiler with scheduled profiling.
+    windows = [(s - warmup, s, s + active) for s in sorted(trace_steps)]
 
-    Implements a wait-warmup-active schedule for controlled profiling that
-    avoids startup noise and captures representative performance data.
+    def schedule_fn(step: int) -> torch.profiler.ProfilerAction:
+        for warmup_start, active_start, active_end in windows:
+            if warmup_start <= step < active_start:
+                return torch.profiler.ProfilerAction.WARMUP
+            if active_start <= step < active_end - 1:
+                return torch.profiler.ProfilerAction.RECORD
+            if step == active_end - 1:
+                return torch.profiler.ProfilerAction.RECORD_AND_SAVE
+        return torch.profiler.ProfilerAction.NONE
 
-    Args:
-        rank: Process rank for trace file naming.
-        active: Number of active profiling steps (default: 50).
+    return schedule_fn
+
+
+@gin.configurable
+class Profiler:
+    """Scheduled torch.profiler wrapper that writes Chrome traces to disk.
+
+    Two modes (set via gin):
+
+    * Single window (default): ``wait=10, warmup=20, active=50, repeat=1``.
+      Captures one contiguous window starting after ``wait`` steps.
+    * Multi-window: ``trace_steps=[500, 1000, 5000]`` (overrides wait+repeat).
+      Captures a separate window around each listed step.
+
+    All knobs are gin-tunable, e.g. in a gin file::
+
+        Profiler.trace_dir = "/apps/chcai/dlrm_runs/exp42/trace"
+        Profiler.trace_steps = [500, 1000, 5000]
+        Profiler.warmup = 5
+        Profiler.active = 10
     """
 
-    def __init__(self, rank, active: int = 50) -> None:
+    def __init__(
+        self,
+        rank: int,
+        active: int = 50,
+        wait: int = 10,
+        warmup: int = 20,
+        repeat: int = 1,
+        trace_steps: Optional[List[int]] = None,
+        trace_dir: str = "/tmp/dlrm_v3_traces",
+        record_shapes: bool = True,
+        profile_memory: bool = False,
+        with_stack: bool = False,
+        with_flops: bool = False,
+        with_modules: bool = False,
+    ) -> None:
         self.rank = rank
+        self.trace_dir = trace_dir
+        if trace_steps:
+            sched = _multi_window_schedule(trace_steps, warmup, active)
+        else:
+            sched = torch.profiler.schedule(
+                wait=wait, warmup=warmup, active=active, repeat=repeat
+            )
         self._profiler: profiler.profile = torch.profiler.profile(
-            schedule=torch.profiler.schedule(
-                wait=10,
-                warmup=20,
-                active=active,
-                repeat=1,
-            ),
-            on_trace_ready=_on_trace_ready_fn(self.rank),
+            schedule=sched,
+            on_trace_ready=_on_trace_ready_fn(self.rank, trace_dir),
             # pyre-fixme[16]: Module `profiler` has no attribute `ProfilerActivity`.
             activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
-            record_shapes=True,
-            profile_memory=False,
-            with_stack=False,
-            with_flops=False,
-            with_modules=False,
+            record_shapes=record_shapes,
+            profile_memory=profile_memory,
+            with_stack=with_stack,
+            with_flops=with_flops,
+            with_modules=with_modules,
         )
 
     def step(self) -> None:
@@ -523,6 +561,27 @@ def env_path(key: str = "", default: str = "") -> str:
     return os.environ.get(key, default) if key else default
 
 
+@gin.configurable
+def run_results_dir(run_name: str = "default", subdir: str = "results") -> str:
+    """Resolve ``<recommendation_v4>/<subdir>/<run_name>`` from this file's location.
+
+    Used as a gin macro to give per-run output directories that persist on the
+    host (recommendation_v4 is bind-mounted into the training container).
+
+    Example gin usage::
+
+        RUN_NAME = @env_path()
+        env_path.key     = "RUN_NAME"
+        env_path.default = "default"
+        run_results_dir.run_name = %RUN_NAME
+        Profiler.trace_dir = @run_results_dir()
+    """
+    # utils.py lives at <recommendation_v4>/generative_recommenders/dlrm_v3/utils.py;
+    # parents[2] climbs to <recommendation_v4>/.
+    repo_root = Path(__file__).resolve().parents[2]
+    return str(repo_root / subdir / run_name)
+
+
 @gin.configurable
 def get_dataset(name: str, new_path_prefix: str = "", history_length: Optional[int] = None):
     """

From 4a2a7bde225191e28f66817762a127fe139d8793 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Sun, 31 May 2026 00:09:01 -0500
Subject: [PATCH 009/113] Tracing fixes: gin scoping, drop active=10 override,
 intuitive filenames, trim_warmup

dlrm_v3/utils.py
  * Add run_results_dir(run_name) gin macro (resolves to
    <recommendation_v4>/results/<run_name>/) so trace artifacts persist on
    the host via the bind-mount.
  * Add _trim_warmup_from_trace post-processor: dedupes ProfilerStep spans
    by name first, then keeps only the last N unique steps' worth of
    events. Drops WARMUP-phase events that torch.profiler otherwise
    includes in the chrome trace.
  * Add trim_warmup kwarg (default True) on Profiler; auto-invokes the
    trimmer with N=active so the exported file matches the user-requested
    active window.
  * Filename now uses trace_steps[i] (the user-requested step) as the
    {step} label when multi-window mode is in use, instead of
    torch.profiler's internal step_num (which is off by ~warmup+active
    from the schedule trigger and confused everyone).

train/utils.py
  * Drop hardcoded `active=10` from the four `Profiler(rank, active=10)`
    call sites in train_loop / train_eval_loop. Positional args block
    gin overrides; once removed, Profiler.active in gin (default 50) and
    user gin bindings actually take effect.

train/gin/yambda_5b.gin
  * Fix env_path scoping collision: both DATA_PATH and RUN_NAME used the
    unscoped @env_path() configurable, which made the second binding's
    `env_path.key = "RUN_NAME"` overwrite the first's
    `env_path.key = "DLRM_DATA_PATH"`. Both names then resolved via the
    same env var (whichever was last), pointing DATA_PATH at trace_run2/
    and breaking dataset loads.
    Fixed by giving each call site its own scope: @data/env_path() and
    @run/env_path(), each with independent .key/.default bindings.
  * Set Profiler.trace_steps=[52], warmup=1, active=5; let trim_warmup
    default to True so the exported trace contains exactly 5 active
    ProfilerStep events.

Verified end-to-end:
  - Run with RUN_NAME=trace_run2 writes results/trace_run2/trace_step52_
    rank{0..7}.json (~19 MB each), step labels match trace_steps gin.
  - Triton cache persisted across runs: cold start ~6 min -> warm start
    ~2 min for autotune-to-first-step.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           |  20 ++--
 .../dlrm_v3/train/utils.py                    |   8 +-
 .../generative_recommenders/dlrm_v3/utils.py  | 104 +++++++++++++++++-
 3 files changed, 115 insertions(+), 17 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 01cb92ed6..b090695dc 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -24,9 +24,12 @@ sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
 
 # Data root: resolved at runtime from $DLRM_DATA_PATH if set, else the literal
 # below. Used by both make_train_test_dataloaders and get_dataset.
-DATA_PATH = @env_path()
-env_path.key = "DLRM_DATA_PATH"
-env_path.default = "/apps/chcai/dlrm_data"
+# Scoped (`data/env_path`) so this binding doesn't collide with the RUN_NAME
+# env_path binding below — every distinct env_path() call site needs its own
+# scope or the later `env_path.key=...` overrides earlier ones.
+DATA_PATH = @data/env_path()
+data/env_path.key = "DLRM_DATA_PATH"
+data/env_path.default = "/apps/chcai/dlrm_data"
 
 # dataloader configs
 make_train_test_dataloaders.batch_size = %batch_size
@@ -62,14 +65,17 @@ train_eval_loop.checkpoint_frequency = 1000000000  # disable mid-training checkp
 train_eval_loop.output_trace = True
 
 # Run name → recommendation_v4/results/<run_name>/ (override via $RUN_NAME env).
-RUN_NAME = @env_path()
-env_path.key     = "RUN_NAME"
-env_path.default = "default"
+RUN_NAME = @run/env_path()
+run/env_path.key     = "RUN_NAME"
+run/env_path.default = "default"
 run_results_dir.run_name = %RUN_NAME
 
 # profiler: capture a 5-step window starting at step 52, every rank.
+# `trim_warmup = True` (default) post-filters the chrome trace so only the 5
+# `active` steps appear; the 1 WARMUP step still runs (lets CUPTI/ROCprof
+# settle) but its events are dropped from the output file.
 Profiler.trace_steps = [52]
-Profiler.warmup = 5
+Profiler.warmup = 1
 Profiler.active = 5
 Profiler.trace_dir = @run_results_dir()
 
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index acf2f14dd..2ff025394 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -498,7 +498,7 @@ def train_loop(
 ) -> None:
     model.train()
     batch_idx: int = start_batch_idx
-    profiler = Profiler(rank, active=10) if output_trace else None
+    profiler = Profiler(rank) if output_trace else None
 
     for epoch in range(num_epochs):
         dataloader.sampler.set_epoch(epoch)  # pyre-ignore [16]
@@ -567,7 +567,7 @@ def eval_loop(
 ) -> None:
     model.eval()
     batch_idx: int = 0
-    profiler = Profiler(rank, active=10) if output_trace else None
+    profiler = Profiler(rank) if output_trace else None
     metric_logger.reset(mode="eval")
     with torch.no_grad():
         for sample in dataloader:
@@ -627,7 +627,7 @@ def train_eval_loop(
 ) -> None:
     train_batch_idx: int = start_train_batch_idx
     eval_batch_idx: int = start_eval_batch_idx
-    profiler = Profiler(rank, active=10) if output_trace else None
+    profiler = Profiler(rank) if output_trace else None
     assert train_dataloader is not None and eval_dataloader is not None
 
     eval_data_iterator = iter(eval_dataloader)
@@ -752,7 +752,7 @@ def streaming_train_eval_loop(
     metric_log_frequency: int = 1,
     checkpoint_frequency: int = 100,
 ) -> None:
-    profiler = Profiler(rank, active=10) if output_trace else None
+    profiler = Profiler(rank) if output_trace else None
     dataset_class, kwargs = get_dataset()
     kwargs["embedding_config"] = embedding_table_configs
     dataset = HammerToTorchDataset(
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 3c534dafc..508f894a9 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -80,22 +80,108 @@ def _compute(self) -> List[MetricComputationReport]:
 logger = logging.getLogger("utils")
 
 
+def _trim_warmup_from_trace(path: str, keep_n_active: int) -> None:
+    """Post-process a chrome trace to drop events from WARMUP-phase steps.
+
+    torch.profiler captures events during BOTH the WARMUP and RECORD phases
+    of a schedule and writes them all to the exported trace. There is no
+    built-in flag to exclude WARMUP from the export. We approximate it by:
+
+      1) Finding all ``ProfilerStep#N`` spans in the file.
+      2) Keeping only the last ``keep_n_active`` of them (sorted by start
+         timestamp) as the "active" range.
+      3) Filtering ``traceEvents`` to events whose ``ts`` falls inside that
+         range. Metadata events (``ph='M'``) are always preserved.
+
+    Mutates the file in place.
+    """
+    import json as _json
+    with open(path) as f:
+        d = _json.load(f)
+    events = d.get("traceEvents", [])
+
+    # ProfilerStep spans mark training-step boundaries; we filter by their
+    # time ranges rather than by name index because step numbering can offset
+    # between schedule_fn argument and the value printed in the trace.
+    # torch.profiler emits one ProfilerStep#N span per CPU thread that ran
+    # during that step, so dedupe by name first so "5 active steps" means
+    # 5 distinct step numbers, not 5 spans.
+    name_to_span: Dict[str, tuple] = {}
+    for e in events:
+        nm = e.get("name", "")
+        if "ProfilerStep" not in nm or e.get("ph") != "X" or "ts" not in e:
+            continue
+        ts = e["ts"]
+        end = ts + e.get("dur", 0)
+        prev = name_to_span.get(nm)
+        if prev is None:
+            name_to_span[nm] = (ts, end)
+        else:
+            name_to_span[nm] = (min(prev[0], ts), max(prev[1], end))
+    if len(name_to_span) <= keep_n_active:
+        return
+    sorted_spans = sorted(name_to_span.values())
+    active = sorted_spans[-keep_n_active:]
+    t_start = min(s for s, _ in active)
+    t_end = max(e for _, e in active)
+
+    def _keep(e: dict) -> bool:
+        if e.get("ph") == "M":
+            return True
+        ts = e.get("ts")
+        if ts is None:
+            return True
+        return t_start <= ts < t_end
+
+    kept = [e for e in events if _keep(e)]
+    d["traceEvents"] = kept
+    with open(path, "w") as f:
+        _json.dump(d, f)
+    logger.warning(
+        f"Trimmed WARMUP events from {path}: {len(events):,} -> {len(kept):,} "
+        f"(kept active range [{t_start:.0f}, {t_end:.0f}] us)"
+    )
+
+
 def _on_trace_ready_fn(
     rank: Optional[int] = None,
     trace_dir: str = "/tmp/dlrm_v3_traces",
+    keep_n_active: Optional[int] = None,
+    trace_steps: Optional[List[int]] = None,
 ) -> Callable[[torch.profiler.profile], None]:
     """Create the on_trace_ready callback that exports a chrome trace to disk.
 
-    Filename follows the convention ``trace_step{step}_rank{rank}.json`` so
-    multi-rank captures don't collide and ``scripts/stitch_traces.py`` can
-    merge them by step number.
+    Filename follows ``trace_step{step}_rank{rank}.json`` so multi-rank
+    captures don't collide and ``scripts/stitch_traces.py`` can merge them
+    by step number.
+
+    The ``{step}`` label:
+
+    * If ``trace_steps`` is provided (multi-window mode), the Nth callback
+      invocation labels its file with ``trace_steps[N]`` -- i.e. the
+      user-requested step that triggered the window. This is the most
+      intuitive labelling.
+    * Otherwise falls back to ``p.step_num`` (torch.profiler's internal
+      counter at trigger time, off by ~warmup+active from the schedule
+      arg).
+
+    If ``keep_n_active`` is set, the exported file is post-processed to keep
+    only the last N ProfilerStep-spans worth of events (i.e. drop WARMUP).
     """
+    state = {"fire_count": 0}
 
     def handle_fn(p: torch.profiler.profile) -> None:
         os.makedirs(trace_dir, exist_ok=True)
-        step = getattr(p, "step_num", 0)
+        if trace_steps:
+            i = state["fire_count"]
+            step_label = (
+                trace_steps[i] if i < len(trace_steps) else getattr(p, "step_num", 0)
+            )
+        else:
+            step_label = getattr(p, "step_num", 0)
+        state["fire_count"] += 1
         rank_str = f"_rank{rank}" if rank is not None else ""
-        file_name = f"trace_step{step}{rank_str}.json"
+        file_name = f"trace_step{step_label}{rank_str}.json"
         path = os.path.join(trace_dir, file_name)
         logger.warning(
             p.key_averages(group_by_input_shape=True).table(
@@ -104,6 +190,8 @@ def handle_fn(p: torch.profiler.profile) -> None:
         )
         p.export_chrome_trace(path)
         logger.warning(f"Trace written to: {path}")
+        if keep_n_active is not None and keep_n_active > 0:
+            _trim_warmup_from_trace(path, keep_n_active)
 
     return handle_fn
 
@@ -179,6 +267,7 @@ def __init__(
         repeat: int = 1,
         trace_steps: Optional[List[int]] = None,
         trace_dir: str = "/tmp/dlrm_v3_traces",
+        trim_warmup: bool = True,
         record_shapes: bool = True,
         profile_memory: bool = False,
         with_stack: bool = False,
@@ -193,9 +282,12 @@ def __init__(
             sched = torch.profiler.schedule(
                 wait=wait, warmup=warmup, active=active, repeat=repeat
             )
+        keep_n = active if trim_warmup else None
         self._profiler: profiler.profile = torch.profiler.profile(
             schedule=sched,
-            on_trace_ready=_on_trace_ready_fn(self.rank, trace_dir),
+            on_trace_ready=_on_trace_ready_fn(
+                self.rank, trace_dir, keep_n, trace_steps
+            ),
             # pyre-fixme[16]: Module `profiler` has no attribute `ProfilerActivity`.
             activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=record_shapes,

From 9b0ae8b3607e1f306c3c19a173e65029c437930e Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 02:04:49 -0500
Subject: [PATCH 010/113] =?UTF-8?q?gin:=20history=5Flength=202048=20?=
 =?UTF-8?q?=E2=86=92=202039=20+=20expanded=20per-pool=20comment?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

2048 was chosen for "round number near max_seq_len" but it slightly
overflows the per-sample budget: 3 * (2048//3) + 9 = 2055 > 2048, so
the dataset truncates ~7 UIH events to fit. 2039 makes the math exact
(3 * 679 + 9 = 2046 ≤ 2048) so no truncation.

Comment block expanded to document:
  - The 3-pool gather semantic (L//3 events per pool, interleaved
    chronologically).
  - The like-pool under-fill observation: like events are only 1.9%
    of yambda corpus and max user lifetime is ~28k events, so the
    like pool fills to ~105 events per anchor on average (not 679).
    TRITON's jagged attention skips the unfilled slots, so under-fill
    costs sequence budget but not GPU compute.

No code change. Cache for L=2039 already built and reused.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin            | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index b090695dc..f9b451f33 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -46,10 +46,20 @@ make_optimizer_and_shard.hbm_cap_gb = 260
 
 get_dataset.name = %dataset
 get_dataset.new_path_prefix = %DATA_PATH
-# Per-pool truncation cap. Cache is keyed by L on disk under
-# <processed_dir>/hstu_cache_L<N>/; switching L reuses an existing cache
-# (L=2048 was built in a prior session, no rebuild needed).
-get_dataset.history_length = 2048
+# Total user-interaction-history (UIH) budget per sample, distributed evenly
+# across 3 behaviour pools (listen+ / like / skip) at L//3 events each.
+# Per-sample sequence the model sees =
+#     3 × (L // 3) + 8 contextual + 1 candidate
+# Choosing 2039 makes 3 × 679 + 9 = 2046, the largest value that fits
+# get_hstu_configs.max_seq_len = 2048 with no dataset-side truncation.
+# Larger L overflows the budget; the dataset truncates UIH events to fit.
+# Note: like events are only 1.9% of the yambda corpus and max user lifetime
+# is ~28k events, so the like pool fills to ~105 events per anchor on
+# average (not 679) — TRITON's jagged attention skips the unfilled slots,
+# so the under-fill costs sequence budget but not GPU compute.
+# Cache is keyed by L on disk under <processed_dir>/hstu_cache_L<N>/;
+# switching L reuses an existing cache or builds a new one (~5 min).
+get_dataset.history_length = 2039
 
 # Model-side attention budget. Dataset truncates UIH to fit this value if
 # `history_length + contextual + candidate` would overflow.

From 2b58a63f113869a3d78f98bfc0fef415b10ffaa8 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 02:17:35 -0500
Subject: [PATCH 011/113] =?UTF-8?q?README:=20rewrite=20for=20yambda-5b=20f?=
 =?UTF-8?q?ork=20=E2=80=94=20upstream=20link,=20data=20prep,=20per-pool=20?=
 =?UTF-8?q?gather?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Documents the fork's scope (yambda-5b on HSTU dlrm_v3 path), per-pool gather
strategy with effective fill table, and dataset statistics. Sections indexed
1–5 for navigation.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 recommendation_v4/README.MD | 185 ++++++++++++++++++------------------
 1 file changed, 93 insertions(+), 92 deletions(-)

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index a60d0d3d7..f22f1e165 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -1,135 +1,136 @@
-# Generative Recommenders
+# Recommendation v4 — HSTU + Yambda-5b
 
-Repository hosting code for ``Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations`` ([ICML'24 paper](https://proceedings.mlr.press/v235/zhai24a.html)) and related code, where we demonstrate that the ubiquitously used classical deep learning recommendation paradigm (DLRMs) can be reformulated as a generative modeling problem (Generative Recommenders or GRs) to overcome known compute scaling bottlenecks, propose efficient algorithms such as HSTU and M-FALCON to accelerate training and inference for large-scale sequential models by 10x-1000x, and demonstrate scaling law for the first-time in deployed, billion-user scale recommendation systems.
+This is a fork of [meta-recsys/generative-recommenders](https://github.com/meta-recsys/generative-recommenders) extended to train HSTU (Hierarchical Sequential Transducer Units) on the [Yambda-5b](https://huggingface.co/datasets/yandex/yambda) music-recommendation dataset, sized as an MLPerf-style training benchmark inside the `mlcommons/training` tree.
 
-## Getting started
+For the original repository and the underlying ICML'24 paper (*Actions Speak Louder than Words*), see the upstream README at the link above. This README focuses on what this fork adds: the Yambda data pipeline, the per-pool gather strategy, and how the data feeds into the HSTU `modules/` (dlrm_v3) path.
 
-We recommend using `requirements.txt`. This has been tested with Ubuntu 22.04, CUDA 12.4, and Python 3.10.
+## 1. Quick start (8-GPU Yambda)
 
 ```bash
-pip3 install -r requirements.txt
+docker exec yambda_8gpu bash -c \
+  'cd /workspace/recommendation_v4 && bash scripts/launch_smoke_8gpu.sh'
 ```
 
-Alternatively, you can manually install PyTorch based on official instructions. Then,
+Override the data path or run name without editing the gin:
 
 ```bash
-pip3 install gin-config pandas fbgemm_gpu torchrec tensorboard
+DLRM_DATA_PATH=/apps/chcai/dlrm_data \
+RUN_NAME=my_experiment \
+bash scripts/launch_smoke_8gpu.sh
 ```
 
-## Experiments
+Data path resolves at runtime via `env_path` gin macros (see [yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)). Traces and any per-run outputs land in `results/<RUN_NAME>/`.
 
-### Public Experiments
-
-To reproduce the public experiments in our paper (traditional sequential recommender setting, Section 4.1.1) on MovieLens and Amazon Reviews in the paper, please follow these steps:
-
-#### Download and preprocess data.
-
-```bash
-mkdir -p tmp/ && python3 preprocess_public_data.py
-```
-
-A GPU with 24GB or more HBM should work for most datasets.
+## 2. Data preparation
 
 ```bash
-CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin --master_port=12345
+python3 -m generative_recommenders.dlrm_v3.preprocess_public_data \
+  --dataset yambda-5b --data-path /apps/chcai/dlrm_data
 ```
 
-Other configurations are included in configs/ml-1m, configs/ml-20m, and configs/amzn-books to make reproducing these experiments easier.
-
-#### Verify results.
+This downloads the 5b variant of [yandex/yambda](https://huggingface.co/datasets/yandex/yambda) from HuggingFace, then:
 
-By default we write experimental logs to exps/. We can launch tensorboard with something like the following:
+1. **Encodes** the raw `event_type` string column into a uint8 lookup (listen=0, like=1, dislike=2, unlike=3, undislike=4).
+2. **Splits** events temporally — 300 train days, 30-min gap, 1 test day — by `Global Temporal Split` (GTS).
+3. **Segments** per-user event timelines into sessions on a 30-min inactivity gap.
+4. **Computes** per-item popularity for downstream metric weighting.
+5. **Writes** the layout `DLRMv3YambdaDataset` expects:
 
-```bash
-tensorboard --logdir ~/generative-recommenders/exps/ml-1m-l200/ --port 24001 --bind_all
-tensorboard --logdir ~/generative-recommenders/exps/ml-20m-l200/ --port 24001 --bind_all
-tensorboard --logdir ~/generative-recommenders/exps/amzn-books-l50/ --port 24001 --bind_all
+```
+<data-path>/
+├── raw/5b/multi_event.parquet                    50 GB (downloaded)
+├── shared_metadata/
+│   ├── artist_item_mapping.parquet               60 MB
+│   ├── album_item_mapping.parquet                76 MB
+│   └── embeddings.parquet                        18 GB (unused by HSTU training)
+└── processed_5b/
+    ├── train_sessions.parquet                    47 GB  ← main training input
+    ├── test_events.parquet                       152 MB
+    ├── session_index.parquet                     600 MB
+    ├── item_popularity.npy                       75 MB
+    └── split_meta.json                           anchor + boundary stats
 ```
 
-With the provided configuration (.gin) files, you should be able to reproduce the following results (verified as of 04/15/2024):
-
-**MovieLens-1M (ML-1M)**:
-
-| Method        | HR@10            | NDCG@10         | HR@50           | NDCG@50         | HR@200          | NDCG@200        |
-| ------------- | ---------------- | ----------------| --------------- | --------------- | --------------- | --------------- |
-| SASRec        | 0.2853           | 0.1603          | 0.5474          | 0.2185          | 0.7528          | 0.2498          |
-| BERT4Rec      | 0.2843 (-0.4%)   | 0.1537 (-4.1%)  |                 |                 |                 |                 |
-| GRU4Rec       | 0.2811 (-1.5%)   | 0.1648 (+2.8%)  |                 |                 |                 |                 |
-| HSTU          | 0.3097 (+8.6%)   | 0.1720 (+7.3%)  | 0.5754 (+5.1%)  | 0.2307 (+5.6%)  | 0.7716 (+2.5%)  | 0.2606 (+4.3%)  |
-| HSTU-large    | **0.3294 (+15.5%)**  | **0.1893 (+18.1%)** | **0.5935 (+8.4%)**  | **0.2481 (+13.5%)** | **0.7839 (+4.1%)**  | **0.2771 (+10.9%)** |
+For smaller variants (yambda-50m / yambda-500m) substitute the dataset name. The preprocessor takes ~2 min for 50m and ~53 min for 5b end-to-end.
 
-**MovieLens-20M (ML-20M)**:
+## 3. Yambda dataset statistics
 
-| Method        | HR@10            | NDCG@10         | HR@50           | NDCG@50         | HR@200          | NDCG@200        |
-| ------------- | ---------------- | --------------- | --------------- | --------------- | --------------- | --------------- |
-| SASRec        | 0.2889           | 0.1621          | 0.5503          | 0.2199          | 0.7661          | 0.2527          |
-| BERT4Rec      | 0.2816 (-2.5%)   | 0.1703 (+5.1%)  |                 |                 |                 |                 |
-| GRU4Rec       | 0.2813 (-2.6%)   | 0.1730 (+6.7%)  |                 |                 |                 |                 |
-| HSTU          | 0.3273 (+13.3%)  | 0.1895 (+16.9%) | 0.5889 (+7.0%)  | 0.2473 (+12.5%) | 0.7952 (+3.8%)  | 0.2787 (+10.3%) |
-| HSTU-large    | **0.3556 (+23.1%)**  | **0.2098 (+29.4%)** | **0.6143 (+11.6%)** | **0.2671 (+21.5%)** | **0.8074 (+5.4%)**  | **0.2965 (+17.4%)** |
+Numbers from the 5b variant, after preprocessing:
 
-**Amazon Reviews (Books)**:
+| | |
+|---|---|
+| Total interaction events | **4.76 B** |
+| Unique users | **1.00 M** |
+| Max events per user | 27,738 |
+| Median events per user | 2,695 |
+| Mean events per user | 4,763 |
+| Train events (300d) | 4.76 B |
+| Test events (1d) | 22.4 M |
+| Training positions (≥2039 prior events filter) | **3.23 B** |
+| Item catalog size | 9.39 M |
 
-| Method        | HR@10            | NDCG@10         | HR@50           | NDCG@50         | HR@200          | NDCG@200        |
-| ------------- | ---------------- | ----------------|---------------- | --------------- | --------------- | --------------- |
-| SASRec        | 0.0306           | 0.0164          | 0.0754          | 0.0260          | 0.1431          | 0.0362          |
-| HSTU          | 0.0416 (+36.4%)  | 0.0227 (+39.3%) | 0.0957 (+27.1%) | 0.0344 (+32.3%) | 0.1735 (+21.3%) | 0.0461 (+27.7%) |
-| HSTU-large    | **0.0478 (+56.7%)**  | **0.0262 (+60.7%)** | **0.1082 (+43.7%)** | **0.0393 (+51.2%)** | **0.1908 (+33.4%)** | **0.0517 (+43.2%)** |
+### 3.1 Per-event-type distribution (across the full 4.76 B corpus)
 
-for all three tables above, the ``SASRec`` rows are based on [Self-Attentive Sequential Recommendation](https://arxiv.org/abs/1808.09781) but with the original binary cross entropy loss
-replaced with sampled softmax losses proposed in [Revisiting Neural Retrieval on Accelerators](https://arxiv.org/abs/2306.04039). These rows are reproducible with ``configs/*/sasrec-*-final.gin``.
-The ``BERT4Rec`` and ``GRU4Rec`` rows are based on results reported by [Turning Dross Into Gold Loss: is BERT4Rec really better than SASRec?](https://arxiv.org/abs/2309.07602) -
-note that the comparison slightly favors these two, due to them using full negatives whereas the other rows used 128/512 sampled negatives. The ``HSTU`` and ``HSTU-large`` rows are based on [Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations](https://arxiv.org/abs/2402.17152); in particular, HSTU rows utilize identical configurations as SASRec. ``HSTU`` and ``HSTU-large`` results can be reproduced with ``configs/*/hstu-*-final.gin``.
+| Pool | Definition | Count | Share |
+|---|---|---|---|
+| **listen_plus (lp)** | `is_listen AND played_ratio ≥ 50%` | 2.92 B | **61.3%** |
+| **skip** | `is_listen AND played_ratio < 50%` | 1.71 B | **35.9%** |
+| **like** | explicit thumbs-up action | 89 M | **1.9%** |
+| other | dislike / unlike / undislike | 47 M | 1.0% |
 
-### Synthetic Dataset / MovieLens-3B
+The `like` pool is roughly **30× rarer** than `lp` — important context for the gather strategy below.
 
-We support generating synthetic dataset with fractal expansion introduced in https://arxiv.org/abs/1901.08910. This allows us to expand the current 20 million real-world ratings in ML-20M to 3 billion.
+## 4. How data is fed to HSTU
 
-To download the pre-generated synthetic dataset:
+For every training anchor (a LISTEN event with ≥ `history_length` prior events), the dataset builds a `(uih_kjt, candidate_kjt)` pair:
 
-```bash
-pip3 install gdown
-mkdir -p tmp/ && cd tmp/
-gdown https://drive.google.com/uc?id=1-jZ6k0el7e7PyFnwqMLfqUTRh_Qdumt-
-unzip ml-3b.zip && rm ml-3b.zip
 ```
-
-To generate the synthetic dataset on your own:
-
-```bash
-python3 run_fractal_expansion.py --input-csv-file tmp/ml-20m/ratings.csv --write-dataset True --output-prefix tmp/ml-3b/
+UIH (User Interaction History):
+  ┌─ Sequence features (chronologically interleaved across 3 pools)
+  │    item_id, artist_id, album_id            ← per-position
+  │    action_weight                            ← per-position (LP_BIT/LIKE_BIT/SKIP_BIT)
+  │    action_timestamp, dummy_watch_time      ← per-position
+  └─ Contextual features (length 1 each)
+       uid + 7 cross-feature hashes (user_x_artist, item_x_hour, …)
+                                                = 8 contextual entries
+
+CANDIDATE (the LISTEN event at the anchor):
+  item_id, artist_id, album_id, item_query_time,
+  item_action_weight (LP_BIT if listen_plus, else 0),
+  item_dummy_watchtime
 ```
 
-### Efficiency experiments
+The candidate's `action_weight` is **the supervision label**: HSTU's `_get_supervision_labels_and_weights` masks BCE training to `(supervision_bitmask & task_weight) > 0`, with `task_weight = 1` (LP bit) for the single `listen_plus` task — so only listen_plus candidates supervise.
 
-``ops/triton`` contains triton kernels needed for efficiency experiments. ``ops/cpp`` contains efficient CUDA kernels. In particular, ``ops/cpp/hstu_attention`` contains the attention implementation based on [FlashAttention V3](https://github.com/Dao-AILab/flash-attention) with state-of-the-art efficiency on H100 GPUs.
+### 4.1 Per-pool gather (the cap = L // 3 strategy)
 
-## DLRM-v3
+The UIH is built by `DLRMv3YambdaDataset._gather_interleaved_history`. For each anchor, it:
 
-We have created a DLRM model using HSTU and have developed benchmarks for both training and inference to faciliate production RecSys use cases.
+1. Scans the most recent `scan_window` (default 20,000) events of any type before the anchor, **clipped to user_start** so users with shorter history get a smaller window.
+2. From those, takes **the last `L // 3` events** from each of the three pools (lp, like, skip) independently.
+3. Concatenates the three streams and **re-sorts chronologically** to produce an interleaved sequence.
+4. Tags each event's pool identity into `action_weight` via OR'd bitmask (LP=1, LIKE=2, SKIP=4).
 
-#### Run model training with 4 GPUs
+With `L = 2039` and `max_seq_len = 2048`:
+- Per-pool cap = `L // 3 = 679`
+- Maximum total UIH = `3 × 679 = 2037` events
+- Plus `8 contextual + 1 candidate = 9` overhead → 2046 ≤ 2048 model budget (no truncation)
 
-```bash
-LOCAL_WORLD_SIZE=4 WORLD_SIZE=4 python3 generative_recommenders/dlrm_v3/train/train_ranker.py --dataset debug --mode train
-```
+### 4.2 Effective per-anchor fill on real data
 
-#### Run model inference with 4 GPUs
+Because the `like` pool is rare (1.9% of events) and the average user has only ~4,763 lifetime events:
 
-```bash
-git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
-cd mlperf_inference/loadgen
-CFLAGS="-std=c++14 -O3" python -m pip install .
-
-LOCAL_WORLD_SIZE=4 WORLD_SIZE=4 python3 generative_recommenders/dlrm_v3/inference/main.py --dataset debug
-```
+| Pool | per-pool cap (L//3) | actual avg fill per anchor | fill rate |
+|---|---|---|---|
+| lp | 679 | ~673 | **99%** |
+| like | 679 | ~105 | **15%** (data-bounded, not cap-bounded) |
+| skip | 679 | ~624 | 92% |
+| **total UIH** | 2037 max | **~1402** | 69% |
 
-## License
-This codebase is Apache 2.0 licensed, as found in the [LICENSE](LICENSE) file.
+The `like` cap of 679 is unreachable for yambda data — at the 1.9% global like rate, filling 679 likes would require a user to have ~36k prior events, but the **longest user in the dataset has only 27,738 events total** (and the median user has 2,695). So under-fill on `like` is fundamental to the data.
 
-## Contributors
-The overall project is made possible thanks to the joint work from many technical contributors (listed in alphabetical order):
+This means the model sees on average ~1,402 UIH events per sample, not the theoretical 2,037. With the TRITON jagged-attention backend the GPU only does work for the actual events, so the under-fill costs **sequence budget but not GPU compute** — no wasted attention work, just less context per sample than the budget suggests.
 
-Adnan Akhundov, Bugra Akyildiz, Shabab Ayub, Alex Bao, Renqin Cai, Jennifer Cao, Xuan Cao, Guoqiang Jerry Chen, Lei Chen, Li Chen, Sean Chen, Xianjie Chen, Huihui Cheng, Weiwei Chu, Ted Cui, Shiyan Deng, Nimit Desai, Fei Ding, Shilin Ding, Francois Fagan, Lu Fang, Leon Gao, Zhaojie Gong, Fangda Gu, Liang Guo, Liz Guo, Jeevan Gyawali, Yuchen Hao, Daisy Shi He, Michael Jiayuan He, Yu He, Samuel Hsia, Jie Hua, Yanzun Huang, Hongyi Jia, Rui Jian, Jian Jin, Rafay Khurram, Rahul Kindi, Changkyu Kim, Yejin Lee, Fu Li, Han Li, Hong Li, Shen Li, Rui Li, Wei Li, Zhijing Li, Lucy Liao, Xueting Liao, Emma Lin, Hao Lin, Chloe Liu, Jingzhou Liu, Xing Liu, Xingyu Liu, Kai Londenberg, Yinghai Lu, Liang Luo, Linjian Ma, Matt Ma, Yun Mao, Bert Maher, Ajit Mathews, Matthew Murphy, Satish Nadathur, Min Ni, Jongsoo Park, Colin Peppler, Jing Qian, Lijing Qin, Jing Shan, Alex Singh, Timothy Shi,  Yu Shi, Dennis van der Staay, Xiao Sun, Colin Taylor, Shin-Yeh Tsai, Rohan Varma, Omkar Vichare, Alyssa Wang, Pengchao Wang, Shengzhi Wang, Wenting Wang, Xiaolong Wang, Yueming Wang, Zhiyong Wang, Wei Wei, Bin Wen, Carole-Jean Wu, Yanhong Wu, Eric Xu, Bi Xue, Hong Yan, Zheng Yan, Chao Yang, Junjie Yang, Wen-Yun Yang, Ze Yang, Zimeng Yang, Yuanjun Yao, Chunxing Yin, Daniel Yin, Yiling You, Jiaqi Zhai, Keke Zhai, Yanli Zhao, Zhuoran Zhao, Hui Zhang, Jingjing Zhang, Lu Zhang, Lujia Zhang, Na Zhang, Rui Zhang, Xiong Zhang, Ying Zhang, Zhiyun Zhang, Charles Zheng, Erheng Zhong, Zhao Zhu, Xin Zhuang.
+## 5. License
 
-For the initial paper describing the Generative Recommender problem formulation and the algorithms used, including HSTU and M-FALCON, please refer to ``Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations``([ICML'24 paper](https://dl.acm.org/doi/10.5555/3692070.3694484), [slides](https://icml.cc/media/icml-2024/Slides/32684.pdf)).
+Apache 2.0 (inherited from upstream).

From 03662de7ed538b7189915fd8981fa397389b6b16 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 02:39:55 -0500
Subject: [PATCH 012/113] gin: make hbm_cap_gb overridable via \$HBM_CAP_GB

Adds env_int gin macro (companion to env_path) and wires
make_optimizer_and_shard.hbm_cap_gb through it so the per-rank HBM
ceiling can be tuned without editing the gin file.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin                |  7 +++++--
 .../generative_recommenders/dlrm_v3/utils.py       | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index f9b451f33..cc2c1000d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -41,8 +41,11 @@ make_train_test_dataloaders.num_workers = %num_workers
 make_train_test_dataloaders.prefetch_factor = %prefetch_factor
 make_train_test_dataloaders.num_blocks = 1
 
-# embedding planner
-make_optimizer_and_shard.hbm_cap_gb = 260
+# embedding planner: per-rank HBM ceiling the torchrec sharder targets.
+# Override via $HBM_CAP_GB (e.g. lower to 150 to force more CW sharding).
+make_optimizer_and_shard.hbm_cap_gb = @env_int()
+env_int.key = "HBM_CAP_GB"
+env_int.default = 260
 
 get_dataset.name = %dataset
 get_dataset.new_path_prefix = %DATA_PATH
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 508f894a9..f276780c2 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -653,6 +653,20 @@ def env_path(key: str = "", default: str = "") -> str:
     return os.environ.get(key, default) if key else default
 
 
+@gin.configurable
+def env_int(key: str = "", default: int = 0) -> int:
+    """Resolve an int from os.environ[key], falling back to `default`.
+
+    Companion to `env_path` for numeric overrides. Example gin usage:
+
+        make_optimizer_and_shard.hbm_cap_gb = @env_int()
+        env_int.key = "HBM_CAP_GB"
+        env_int.default = 260
+    """
+    raw = os.environ.get(key) if key else None
+    return int(raw) if raw else default
+
+
 @gin.configurable
 def run_results_dir(run_name: str = "default", subdir: str = "results") -> str:
     """Resolve ``<recommendation_v4>/<subdir>/<run_name>`` from this file's location.

From bb012a2cfb301a1aa388171d5bce496bec9f2216 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 19:17:02 +0000
Subject: [PATCH 013/113] docs: add B200 training recipe for yambda-5b

Document the container image, dependency versions (native NGC torch 2.10,
triton 3.6, source-built fbgemm_gpu, torchrec 1.4.0, polars-u64-idx), gin
training configuration, and env vars needed to reproduce the 8x B200 run.
---
 recommendation_v4/docs/training_recipe.md | 93 +++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 recommendation_v4/docs/training_recipe.md

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
new file mode 100644
index 000000000..155dc9afd
--- /dev/null
+++ b/recommendation_v4/docs/training_recipe.md
@@ -0,0 +1,93 @@
+# Training Recipe
+
+Reproducible environment + configuration for training HSTU / DLRM-v3 on the
+`yambda-5b` dataset.
+
+---
+
+## B200
+
+Single-node, 8× NVIDIA **B200** (Blackwell, `sm_100`, ~183 GiB HBM each), HSTU
+ranker on `yambda-5b` with the **TRITON** HSTU kernel and **bf16** mixed-precision
+training.
+
+### Hardware / host
+
+| item | value |
+|---|---|
+| GPUs | 8× NVIDIA B200 (`sm_100`, compute capability 10.0) |
+| Host driver | 580.159.03 (reports CUDA 13.2) |
+| Forward-compat userspace driver | `libcuda.so.595.45.04` (engaged automatically by the NGC image) |
+
+### Container image
+
+```
+nvcr.io/nvidia/pytorch:26.01-py3
+```
+
+Digest: `sha256:38ed2ecb2c16d10677006d73fb0a150855d6ec81db8fc66e800b5ae92741007e`
+
+The image's native PyTorch is kept as-is and must not be reinstalled (so CUPTI
+stays matched to the driver and `sm_100` support is preserved).
+
+### Dependency versions
+
+| package | version | notes |
+|---|---|---|
+| **torch** | `2.10.0a0+a36e1d39eb.nv26.01.42222806` (CUDA 13.1) | native to the image; not reinstalled |
+| **triton** | `3.6.0` | native to the image; provides `triton.language.make_tensor_descriptor` (required by the TRITON HSTU path) |
+| **fbgemm_gpu** | `fbgemm_gpu_nightly-2026.6.1` (CUDA 13.1, `sm_100`) | built from source against the native torch, from FBGEMM commit `939f2da156b05d2f1bcba8c037d613c1098d0db5` (2026-04-29); public wheels are ABI-incompatible with the NGC torch |
+| **torchrec** | `1.4.0` | installed with `--no-deps` |
+| **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows (overflows stock polars' 32-bit index) |
+| CUPTI (for `torch.profiler`) | 13.1 (native) | matches the driver; the `+cu128` stack's CUPTI 12.8 fails on B200 (`CUPTI_ERROR_INVALID_DEVICE`) |
+
+Additional Python deps:
+`xxhash`, `gin-config`, `absl-py`, `pandas`, `tensorboard`, `pyarrow`, `pyyaml`,
+`tqdm`, `psutil`, `torchmetrics==1.0.3`, `tensordict`, `pyre-extensions`,
+`iopath`, `typing-inspect`.
+
+### Training configuration
+
+From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`:
+
+| parameter | value | gin binding |
+|---|---|---|
+| batch_size (train) | 32 | `make_train_test_dataloaders.batch_size` |
+| eval_batch_size | 32 | `make_train_test_dataloaders.eval_batch_size` |
+| num_workers (dataloader) | 4 | `make_train_test_dataloaders.num_workers` |
+| prefetch_factor | 8 | `make_train_test_dataloaders.prefetch_factor` |
+| num_blocks | 1 | `make_train_test_dataloaders.num_blocks` |
+| train_split_percentage | 0.90 | `make_train_test_dataloaders.train_split_percentage` |
+| history_length (per-sample UIH budget) | 2039 | `get_dataset.history_length` |
+| max_seq_len (attention budget) | 2048 | `get_hstu_configs.max_seq_len` |
+| bf16 training | True | `make_model.bf16_training` |
+| HBM cap (per GPU) | 150 GiB | `make_optimizer_and_shard.hbm_cap_gb` (env `HBM_CAP_GB`) |
+| dense optimizer | Adam, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `dense_optimizer_factory_and_class.*` |
+| sparse optimizer | RowWiseAdagrad, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `sparse_optimizer_factory_and_class.*` |
+| world_size | 8 | `MetricsLogger.world_size` |
+
+Effective global batch = `batch_size × world_size = 32 × 8 = 256` samples/step.
+
+### Environment variables
+
+| var | value | purpose |
+|---|---|---|
+| `HSTU_HAMMER_KERNEL` | `TRITON` | fast HSTU kernel (vs `PYTORCH` fallback) |
+| `TORCH_CUDA_ARCH_LIST` | `10.0` | target `sm_100` for JIT / Triton compilation |
+| `DLRM_DATA_PATH` | dataset root | overrides gin default `/apps/chcai/dlrm_data` |
+| `HBM_CAP_GB` | `150` | embedding planner HBM budget per GPU |
+| `RUN_NAME` | run id | results dir → `results/<RUN_NAME>/` |
+| `PYTORCH_CUDA_ALLOC_CONF` | `expandable_segments:True` | allocator headroom |
+| `TRITON_CACHE_DIR` | cache path | persist compiled Triton kernels across runs |
+| `WORLD_SIZE` / `LOCAL_WORLD_SIZE` | `8` | mp.spawn rank count |
+
+### Known pitfalls
+
+- Never reinstall torch in this image — a cu12x wheel breaks CUPTI and may drop
+  `sm_100`.
+- The `+cu128` stack (`torch==2.7.1+cu128` + `fbgemm-gpu==1.2.0+cu128` +
+  `torchrec==1.2.0+cu128`) runs on B200 but cannot profile GPU activity (CUPTI
+  12.8 vs the 13.2 driver).
+- Stock `polars` silently overflows on `yambda-5b` (> 4.29 B rows); always use
+  `polars-u64-idx`.
+- `EmbeddingBoundsCheck ... Setting idx to zero` warnings are benign data clamps.

From 8d8844b4d8ab35fbbd8aad632e76c873de1a8784 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 16:37:22 -0500
Subject: [PATCH 014/113] bf16 + triton autotune pinning with gin-driven
 full-tune override
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds three knobs, all driven from the gin file:
- make_model.bf16_training: enable bf16 autocast for the DlrmHSTU model.
- env_int macro: lets numeric gin values come from env vars (used by the
  existing hbm_cap_gb binding).
- apply_env_bootstrap.TRITON_FULL_AUTOTUNE: when False (default), three
  layer-norm/jagged triton kernels are pinned to a single Config so cold
  starts land at the same steady-state deterministically. When True, the
  full autotune search runs again — use this when changing shape, GPU,
  or triton/torch version, then re-pin from the discovered winners.

train_ranker._main_func now parses gin in two phases (skip_unknown=True
early, full pass after the heavy imports) so the bootstrap env var is set
BEFORE the triton kernel modules evaluate their @triton.autotune
decorators at module load time.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../dlrm_v3/train/_env_bootstrap.py           | 28 +++++++++++
 .../dlrm_v3/train/gin/yambda_5b.gin           | 13 ++++-
 .../dlrm_v3/train/train_ranker.py             | 50 +++++++++++++------
 .../dlrm_v3/train/utils.py                    |  7 ++-
 .../ops/triton/_autotune_pinning.py           | 27 ++++++++++
 .../ops/triton/triton_jagged.py               |  6 ++-
 .../ops/triton/triton_layer_norm.py           | 11 +++-
 7 files changed, 122 insertions(+), 20 deletions(-)
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py
 create mode 100644 recommendation_v4/generative_recommenders/ops/triton/_autotune_pinning.py

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py
new file mode 100644
index 000000000..2890851de
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py
@@ -0,0 +1,28 @@
+"""Gin-driven env-var bootstrap.
+
+Some env vars must be set *before* certain modules import (e.g. Triton's
+`@triton.autotune` decorator reads `TRITON_FULL_AUTOTUNE` at module load
+time, well before `gin.parse_config_file` runs in the default ordering).
+
+`apply_env_bootstrap()` is `@gin.configurable`, so the gin file becomes the
+canonical source of truth. `train_ranker.py` parses gin with
+`skip_unknown=True` early in `_main_func`, calls this function to push the
+bindings into `os.environ`, then does the heavy imports.
+"""
+
+import logging
+import os
+from typing import Optional
+
+import gin
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+@gin.configurable
+def apply_env_bootstrap(
+    TRITON_FULL_AUTOTUNE: Optional[bool] = None,
+) -> None:
+    if TRITON_FULL_AUTOTUNE is not None:
+        os.environ["TRITON_FULL_AUTOTUNE"] = "1" if TRITON_FULL_AUTOTUNE else "0"
+        logger.info("env bootstrap: TRITON_FULL_AUTOTUNE=%s", os.environ["TRITON_FULL_AUTOTUNE"])
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index cc2c1000d..da5727de3 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -1,10 +1,19 @@
 batch_size = 32
-num_workers = 1
-prefetch_factor = 2
+num_workers = 4
+prefetch_factor = 8
 dataset = "yambda-5b"
 
 # model parameters
 make_model.dataset = %dataset
+make_model.bf16_training = True
+
+# False = use pinned triton kernel configs (deterministic; whether that's
+# the fast or slow equilibrium depends on which config was pinned for the
+# current training shape + GPU). For a NEW training config (new shape,
+# new GPU, new triton/torch version), set True and run with
+# TRITON_PRINT_AUTOTUNING=1 to discover the fast configs, then update the
+# pinned constants in ops/triton/_autotune_pinning.py call sites.
+apply_env_bootstrap.TRITON_FULL_AUTOTUNE = False
 
 # dense model optimizer
 dense_optimizer_factory_and_class.learning_rate = 0.001
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index d17e2992d..c6a90c2b7 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -23,22 +23,14 @@
 
 import gin
 import torch
-from generative_recommenders.dlrm_v3.checkpoint import load_dmp_checkpoint
-from generative_recommenders.dlrm_v3.train.utils import (
-    cleanup,
-    eval_loop,
-    make_model,
-    make_optimizer_and_shard,
-    make_train_test_dataloaders,
-    setup,
-    streaming_train_eval_loop,
-    train_eval_loop,
-    train_loop,
-)
-from generative_recommenders.dlrm_v3.utils import MetricsLogger
 from torch import multiprocessing as mp
 from torchrec.test_utils import get_free_port
 
+# NOTE: heavy imports of generative_recommenders.dlrm_v3.* are deferred to
+# inside _main_func so that gin-driven env-var bootstrap (see
+# _env_bootstrap.apply_env_bootstrap) can run BEFORE the triton kernel
+# modules evaluate their `@triton.autotune` decorators at module-load time.
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 
@@ -65,13 +57,43 @@ def _main_func(
 ) -> None:
     device = torch.device(f"cuda:{rank}")
     logger.info(f"rank: {rank}, world_size: {world_size}, device: {device}")
+    # Phase 1: parse gin early with skip_unknown=True so env-bootstrap
+    # bindings take effect BEFORE any module-level @gin.configurable
+    # discovers itself. This is required because triton @triton.autotune
+    # decorators in generative_recommenders.ops.triton.* read env vars at
+    # module import time, and the heavy imports below pull those in.
+    from generative_recommenders.dlrm_v3.train._env_bootstrap import apply_env_bootstrap
+
+    gin.parse_config_file(gin_file, skip_unknown=True)
+    apply_env_bootstrap()
+
+    # Phase 2: heavy imports. Triton kernel modules evaluate their autotune
+    # decorators here, using the env vars set above.
+    from generative_recommenders.dlrm_v3.checkpoint import load_dmp_checkpoint
+    from generative_recommenders.dlrm_v3.train.utils import (
+        cleanup,
+        eval_loop,
+        make_model,
+        make_optimizer_and_shard,
+        make_train_test_dataloaders,
+        setup,
+        streaming_train_eval_loop,
+        train_eval_loop,
+        train_loop,
+    )
+    from generative_recommenders.dlrm_v3.utils import MetricsLogger
+
     setup(
         rank=rank,
         world_size=world_size,
         master_port=master_port,
         device=device,
     )
-    # parse all arguments
+    # Phase 3: re-parse to bind the @gin.configurables now that they are
+    # registered. The earlier skip_unknown pass already consumed the
+    # env-bootstrap binding, but bindings are idempotent so re-applying is
+    # fine, and this pass is the one that actually wires up make_model,
+    # make_train_test_dataloaders, etc.
     gin.parse_config_file(gin_file)
 
     model, model_configs, embedding_table_configs = make_model()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 2ff025394..a957122a3 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -204,15 +204,20 @@ def set_epoch(self, epoch: int) -> None:
 @gin.configurable
 def make_model(
     dataset: str,
+    bf16_training: bool = False,
 ) -> Tuple[torch.nn.Module, DlrmHSTUConfig, Dict[str, EmbeddingConfig]]:
     hstu_config = get_hstu_configs(dataset)
     table_config = get_embedding_table_config(dataset)
 
+    # bf16 autocast is off by default: on the PYTORCH attn backend the
+    # pt_hstu_attention QK einsum backward overflows in bf16 at long
+    # sequences (NaN at step 1 when N>1k). Safe with TRITON; flip via
+    # `make_model.bf16_training = True` in the gin.
     model = DlrmHSTU(
         hstu_configs=hstu_config,
         embedding_tables=table_config,
         is_inference=False,
-        bf16_training=False,
+        bf16_training=bf16_training,
     )
 
     # Triton on ROCm fails to compile some jagged kernels at our shapes
diff --git a/recommendation_v4/generative_recommenders/ops/triton/_autotune_pinning.py b/recommendation_v4/generative_recommenders/ops/triton/_autotune_pinning.py
new file mode 100644
index 000000000..5aa24eb85
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/ops/triton/_autotune_pinning.py
@@ -0,0 +1,27 @@
+"""Triton autotune pinning helper.
+
+A handful of Triton kernels in this directory have two stable autotune
+equilibria on MI350X gfx950 at our yambda bs=32 L=2039 shape: a fast one
+(~52 ms/step) and a slow one (~71 ms/step). The autotuner's measurement
+noise puts the choice on a coin flip per cold start. We pin the winning
+config for these kernels so every cold start lands at the fast equilibrium
+deterministically.
+
+Set `TRITON_FULL_AUTOTUNE=1` to bypass the pin and re-enable the full
+autotune search (useful when validating a new shape, GPU, or Triton version
+before re-capturing winners).
+"""
+
+import os
+from typing import Callable, List
+
+import triton
+
+
+def pinned_or_full(
+    pinned: List[triton.Config],
+    full_configs_fn: Callable[[], List[triton.Config]],
+) -> List[triton.Config]:
+    if os.environ.get("TRITON_FULL_AUTOTUNE", "0") == "1":
+        return full_configs_fn()
+    return pinned
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
index 7a4e82cf4..8172a0f7b 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
@@ -33,6 +33,7 @@
     switch_to_contiguous_if_needed,
     triton_autotune,
 )
+from generative_recommenders.ops.triton._autotune_pinning import pinned_or_full
 from generative_recommenders.ops.utils import is_sm100_plus, is_sm90
 from torch._inductor.runtime import triton_helpers
 
@@ -2150,7 +2151,10 @@ def split_2D_jagged_w_prefix_multirow(
 
 
 @triton_autotune(
-    configs=_get_split_concat_2d_jagged_multirow_configs_wrapper(),
+    configs=pinned_or_full(
+        [triton.Config({"BLOCK_N": 1}, num_warps=2)],
+        _get_split_concat_2d_jagged_multirow_configs_wrapper,
+    ),
     key=["BLOCK_D"],
 )
 @triton.jit
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
index 1e997fd40..5ed508108 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
@@ -30,6 +30,7 @@
     switch_to_contiguous_if_needed,
     triton_autotune,
 )
+from generative_recommenders.ops.triton._autotune_pinning import pinned_or_full
 from generative_recommenders.ops.utils import (
     is_sm100_plus,
     is_sm90,
@@ -306,7 +307,10 @@ def _layer_norm_bwd_dx(
 
 
 @triton_autotune(
-    configs=_get_layer_norm_fwd_configs(),
+    configs=pinned_or_full(
+        [triton.Config({"BLOCK_N": 8}, num_warps=1)],
+        _get_layer_norm_fwd_configs,
+    ),
     key=["BLOCK_D"],
 )
 @triton.jit
@@ -463,7 +467,10 @@ def _get_bwd_dwdb_configs() -> List[triton.Config]:
 
 
 @triton_autotune(
-    configs=_get_bwd_dwdb_configs(),
+    configs=pinned_or_full(
+        [triton.Config({"BLOCK_N": 128}, num_warps=8)],
+        _get_bwd_dwdb_configs,
+    ),
     key=["D"],
 )
 @triton.jit

From 1c9315ae9eb5136584ed2238105b93f932d8535c Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 16:40:18 -0500
Subject: [PATCH 015/113] docs: add MI350X training recipe section

Mirrors the B200 layout with MI350X (gfx950, ROCm 7.2.1) specifics:
container image (rocm/primus:v26.3), fbgemm_gpu rebuild requirement (HEAD
nightly_rocm-2026.6.1 for ~30% step-time win over the shipped 2026.5.14),
the gin-driven TRITON_FULL_AUTOTUNE knob, and the measured perf ladder
from fp32/PYTORCH baseline (~28 d/epoch) down to the pinned bf16/TRITON
fast equilibrium (~7.6 d/epoch).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 recommendation_v4/docs/training_recipe.md | 103 ++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
index 155dc9afd..f6744be39 100644
--- a/recommendation_v4/docs/training_recipe.md
+++ b/recommendation_v4/docs/training_recipe.md
@@ -5,6 +5,109 @@ Reproducible environment + configuration for training HSTU / DLRM-v3 on the
 
 ---
 
+## MI350X
+
+Single-node, 8× AMD **Instinct MI350X** (`gfx950`, ~288 GiB HBM3e each), HSTU
+ranker on `yambda-5b` with the **TRITON** HSTU kernel and **bf16**
+mixed-precision training.
+
+### Hardware / host
+
+| item | value |
+|---|---|
+| GPUs | 8× AMD Instinct MI350X (`gfx950`, ROCm 7.2.1) |
+| Host CPU | AMD EPYC 9655 96-Core (192 cores × 2 threads) |
+
+### Container image
+
+```
+rocm/primus:v26.3
+```
+
+The image's native PyTorch is kept as-is and must not be reinstalled — it is
+the ROCm-matched build used by triton/fbgemm.
+
+### Dependency versions
+
+| package | version | notes |
+|---|---|---|
+| **torch** | `2.10.0+git94c6e04` | native to the image; not reinstalled |
+| **triton** | `3.6.0` | native to the image; same major as B200 path |
+| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.1` (built from FBGEMM commit `1509423`, 2026-06-01) for `gfx950` | image ships `2026.5.14`; rebuild from source gives a measurable boost from the TBE-forward V2 grid-striding (#5669) + warpSize 32/64 unified build (#5739) + `__syncthreads` cleanup (#5744). Build command: `python setup.py -j 32 bdist_wheel --build-target=default --build-variant=rocm -DHIP_ROOT_DIR=/opt/rocm -DAMDGPU_TARGETS=gfx950` |
+| **torchrec** | `1.4.0` | matches B200 |
+| **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows. Installed from a pre-staged local tarball by `scripts/launch_smoke_8gpu.sh` (reserved nodes have no outbound DNS) |
+
+### Training configuration
+
+From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`:
+
+| parameter | value | gin binding |
+|---|---|---|
+| batch_size (train) | 32 | `make_train_test_dataloaders.batch_size` |
+| eval_batch_size | 32 | `make_train_test_dataloaders.eval_batch_size` |
+| num_workers (dataloader) | 4 | `make_train_test_dataloaders.num_workers` |
+| prefetch_factor | 8 | `make_train_test_dataloaders.prefetch_factor` |
+| num_blocks | 1 | `make_train_test_dataloaders.num_blocks` |
+| train_split_percentage | 0.90 | `make_train_test_dataloaders.train_split_percentage` |
+| history_length (per-sample UIH budget) | 2039 | `get_dataset.history_length` |
+| max_seq_len (attention budget) | 2048 | `get_hstu_configs.max_seq_len` |
+| bf16 training | True | `make_model.bf16_training` |
+| HBM cap (per GPU) | 260 GiB | `make_optimizer_and_shard.hbm_cap_gb` (env `HBM_CAP_GB`) |
+| **triton autotune pinning** | **False (pinned)** | `apply_env_bootstrap.TRITON_FULL_AUTOTUNE` |
+| dense optimizer | Adam, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `dense_optimizer_factory_and_class.*` |
+| sparse optimizer | RowWiseAdagrad, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `sparse_optimizer_factory_and_class.*` |
+| world_size | 8 | `MetricsLogger.world_size` |
+
+Effective global batch = `batch_size × world_size = 32 × 8 = 256` samples/step.
+
+### Environment variables
+
+| var | value | purpose |
+|---|---|---|
+| `HSTU_HAMMER_KERNEL` | `TRITON` | fast HSTU kernel (vs `PYTORCH` fallback) |
+| `DLRM_DATA_PATH` | dataset root | overrides gin default `/apps/chcai/dlrm_data` |
+| `HBM_CAP_GB` | (optional) | embedding planner HBM budget per GPU |
+| `RUN_NAME` | run id | results dir → `results/<RUN_NAME>/` |
+| `PYTORCH_CUDA_ALLOC_CONF` | `expandable_segments:True` | allocator headroom |
+| `HIP_VISIBLE_DEVICES` / `CUDA_VISIBLE_DEVICES` | `0,1,2,3,4,5,6,7` | rank visibility |
+
+`TRITON_FULL_AUTOTUNE` is set automatically by the gin-driven bootstrap
+(`generative_recommenders.dlrm_v3.train._env_bootstrap.apply_env_bootstrap`),
+which runs in `train_ranker._main_func` BEFORE the triton kernel modules
+import — so the gin file is the source of truth.
+
+### Measured performance
+
+| variant | steady-state ms/step | global sps | epoch ETA (3.23B anchors) |
+|---|---|---|---|
+| nightly + fp32 + PYTORCH attn (baseline) | ~190 | ~1340 | ~28 d |
+| nightly + bf16 + TRITON attn | ~93 | ~2787 | ~13.4 d |
+| primus + bf16 + TRITON attn | ~67.5 | ~3793 | ~9.9 d |
+| primus + fbgemm HEAD + bf16 + TRITON, autotune drift | ~53 fast / ~70 slow | 3700–4860 | 7.7–10.2 d |
+| **primus + fbgemm HEAD + bf16 + TRITON + pinning (default)** | **~52** | **~4970** | **~7.6 d** |
+
+The "pinning" line is the deterministic per-cold-start equilibrium —
+three layer-norm / jagged triton kernels have two stable autotune winners
+and the pin forces the fast one every run.
+
+### Known pitfalls
+
+- The image ships `fbgemm_gpu==2026.5.14`. The wheel built from FBGEMM HEAD
+  (`2026.6.1`) is required for the 70 → 52 ms step. Build inside the
+  container so the wheel links against the image's native torch.
+- Stock `polars` silently overflows on `yambda-5b` (> 4.29 B rows); always
+  use `polars-u64-idx`.
+- When changing shape (batch size, history length), GPU, or triton/torch
+  version, flip `apply_env_bootstrap.TRITON_FULL_AUTOTUNE = True` and run
+  with `TRITON_PRINT_AUTOTUNING=1` to re-capture winners, then update the
+  pinned configs at the `pinned_or_full(...)` call sites in
+  `generative_recommenders/ops/triton/`.
+- Do not run with bf16 on the `PYTORCH` HSTU attention backend at our
+  sequence length — `pt_hstu_attention`'s QK einsum backward overflows in
+  bf16 at N > 1k and produces NaN at step 1. bf16 is only safe with TRITON.
+
+---
+
 ## B200
 
 Single-node, 8× NVIDIA **B200** (Blackwell, `sm_100`, ~183 GiB HBM each), HSTU

From 30275194c1e3c0a735b4a0161360eedf6c3bd54b Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 16:48:54 -0500
Subject: [PATCH 016/113] scripts: add stitch_traces.py

Merges per-rank chrome traces (results/<run>/trace_step{N}_rank{R}.json)
into a single Perfetto-loadable file, remapping pid/flow ids so
cross-rank events land on distinct tracks instead of collapsing onto one.
Used to produce the bf16 + pinned-autotune step-52 trace
(results/verify_rename/trace_step52.json.gz).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 recommendation_v4/scripts/stitch_traces.py | 329 +++++++++++++++++++++
 1 file changed, 329 insertions(+)
 create mode 100644 recommendation_v4/scripts/stitch_traces.py

diff --git a/recommendation_v4/scripts/stitch_traces.py b/recommendation_v4/scripts/stitch_traces.py
new file mode 100644
index 000000000..54d7963d6
--- /dev/null
+++ b/recommendation_v4/scripts/stitch_traces.py
@@ -0,0 +1,329 @@
+#!/usr/bin/env python3
+"""Stitch per-rank Chrome traces from a dlrm_v3 run into one merged file.
+
+When ``Profiler`` runs on multiple ranks, each rank writes its own file:
+
+    <trace_dir>/trace_step{step}_rank{rank}.json
+
+Each per-rank trace uses overlapping ``pid`` namespaces (CPU pid = OS pid;
+GPU streams pid = 0..N), so concatenating the raw event lists would collapse
+multiple ranks onto the same Perfetto track. This script:
+
+* Identifies each pid as ``CPU`` / ``GPU`` / ``Spans`` (and other torch.profiler
+  string-pid tracks) using the per-rank ``process_labels`` metadata events.
+* Always drops the ``Spans`` track (low-signal in this codebase, large in
+  visual clutter).
+* Optionally filters to just ``cpu`` or ``gpu`` events via ``--include``.
+* Sorts the surviving tracks into contiguous Perfetto sections:
+  **all CPU tracks (rank 0..N) first, then all GPU tracks (rank 0..N, stream
+  0..K)**.
+* Remaps every event's ``pid`` and flow ``id`` so cross-rank events never
+  collide on the same track or flow arrow.
+
+Because torch.profiler emits ``baseTimeNanoseconds`` from the same node clock,
+timestamps line up directly across ranks — no time-shift needed for single-node
+runs (multi-node would need clock-skew correction, not implemented here).
+
+Examples
+--------
+Stitch step 52, default (CPU + GPU, drop Spans), gzip output::
+
+    python scripts/stitch_traces.py <trace_dir> --step 52 --gzip
+
+GPU-only view (skip CPU thread tree entirely — useful for kernel-level analysis)::
+
+    python scripts/stitch_traces.py <trace_dir> --step 52 --include gpu --gzip
+
+CPU-only view (host-side ops, profiler annotations, comm scheduling)::
+
+    python scripts/stitch_traces.py <trace_dir> --step 52 --include cpu --gzip
+"""
+from __future__ import annotations
+
+import argparse
+import gzip
+import json
+import re
+import sys
+from collections import defaultdict
+from pathlib import Path
+
+# trace_step52_rank3.json or trace_3_rank0.json (legacy filename)
+_RANK_RE = re.compile(r"trace_(?:step)?(\d+)_rank(\d+)\.json$")
+_KEY_RE = re.compile(r"trace_(.+?)_rank\d+\.json$")
+
+# Per-rank pid offset. Picked large enough that no real OS pid collides
+# (Linux pids fit in 22 bits; 1e6 per rank gives ~10 ranks of headroom).
+_PID_STRIDE = 1_000_000
+
+# Per-rank flow-id offset. torch.profiler flow ids are int32/int64 — pack rank
+# into the high bits so cross-rank flows can never link by accident.
+_FLOW_ID_STRIDE = 1 << 40
+
+# Sort-index sections in Perfetto. Lower = appears higher in the timeline UI.
+# Each section reserves a wide range so within-section ordering (rank, stream)
+# fits comfortably without overlapping the next section.
+_SORT_BASE = {
+    "cpu":   0,
+    "gpu":   1_000_000,
+    "other": 10_000_000,   # Traces / "" misc string-pid tracks
+}
+
+# `Spans` carries no useful content in our workloads (one X event per trace)
+# and clutters the timeline — always dropped.
+_ALWAYS_DROP_PIDS_STR = {"Spans"}
+
+
+def _classify_pid(pid_to_label: dict, pid_to_name: dict) -> dict:
+    """Map original pid -> ('cpu'|'gpu'|'spans'|'other', stream_idx_or_0).
+
+    Classification order, first match wins:
+      1. pid (as a string) is in the always-drop set        -> 'spans'
+      2. process_name is in the always-drop set             -> 'spans'
+      3. process_labels == 'CPU'                            -> 'cpu'
+      4. process_labels starts with 'GPU '                  -> 'gpu', stream id
+      5. anything else (including unlabeled pids)           -> 'other'
+    """
+    all_pids = set(pid_to_label) | set(pid_to_name)
+    out: dict = {}
+    for pid in all_pids:
+        label = pid_to_label.get(pid, "")
+        name = pid_to_name.get(pid, "")
+        if isinstance(pid, str) and pid in _ALWAYS_DROP_PIDS_STR:
+            out[pid] = ("spans", 0)
+            continue
+        if name in _ALWAYS_DROP_PIDS_STR:
+            out[pid] = ("spans", 0)
+            continue
+        if label == "CPU":
+            out[pid] = ("cpu", 0)
+        elif label.startswith("GPU"):
+            try:
+                stream_idx = int(label.split()[1])
+            except (IndexError, ValueError):
+                stream_idx = 0
+            out[pid] = ("gpu", stream_idx)
+        else:
+            out[pid] = ("other", 0)
+    return out
+
+
+def _scan_pid_metadata(events: list[dict]) -> tuple[dict, dict]:
+    """First pass: collect per-pid label and name from ``ph='M'`` events."""
+    label: dict = {}
+    name: dict = {}
+    for e in events:
+        if e.get("ph") != "M":
+            continue
+        pid = e.get("pid")
+        if pid is None:
+            continue
+        if e.get("name") == "process_labels":
+            label[pid] = e.get("args", {}).get("labels", "")
+        elif e.get("name") == "process_name":
+            name[pid] = e.get("args", {}).get("name", "")
+    return label, name
+
+
+def _new_sort_index(kind: str, rank: int, stream_idx: int) -> int:
+    """Compute Perfetto sort_index so tracks group as: CPU(rank0..N), GPU(rank0..N, stream0..K), other."""
+    base = _SORT_BASE.get(kind, _SORT_BASE["other"])
+    return base + rank * 100 + stream_idx
+
+
+def _new_pid(orig_pid, rank: int) -> object:
+    """Remap a single pid into a per-rank namespace, preserving int vs str."""
+    if isinstance(orig_pid, int):
+        return orig_pid + rank * _PID_STRIDE
+    if isinstance(orig_pid, str):
+        try:
+            return int(orig_pid) + rank * _PID_STRIDE
+        except ValueError:
+            return f"rank{rank}_{orig_pid}" if orig_pid else f"rank{rank}_misc"
+    return orig_pid
+
+
+def _process_one_rank(
+    events: list[dict],
+    rank: int,
+    include: set[str],
+) -> list[dict]:
+    """Filter + remap one rank's events. ``include`` is a subset of {'cpu','gpu','other'}."""
+    label, name = _scan_pid_metadata(events)
+    classify = _classify_pid(label, name)
+
+    out: list[dict] = []
+    for e in events:
+        pid = e.get("pid")
+        if pid is None:
+            out.append(e)
+            continue
+        # Always-drop check on the raw pid value first - Spans events in our
+        # workloads have NO process_name/process_labels metadata, so the
+        # classifier table doesn't list them. Catch them here directly.
+        if isinstance(pid, str) and pid in _ALWAYS_DROP_PIDS_STR:
+            continue
+        kind, stream_idx = classify.get(pid, ("other", 0))
+        if kind == "spans":          # always dropped
+            continue
+        if kind not in include:      # filtered by --include
+            continue
+
+        # Remap pid + flow id (per-rank namespace).
+        e["pid"] = _new_pid(pid, rank)
+        if "id" in e and e.get("ph") in ("s", "t", "f"):
+            try:
+                e["id"] = int(e["id"]) + rank * _FLOW_ID_STRIDE
+            except (TypeError, ValueError):
+                pass
+
+        # Rewrite metadata: section-aware sort_index + rank-prefixed name.
+        if e.get("ph") == "M":
+            args = e.setdefault("args", {})
+            if e.get("name") == "process_sort_index":
+                args["sort_index"] = _new_sort_index(kind, rank, stream_idx)
+            elif e.get("name") == "process_name":
+                orig = args.get("name", "python")
+                args["name"] = f"[Rank {rank}] {orig}"
+
+        out.append(e)
+
+    return out
+
+
+def _group_by_step(trace_dir: Path) -> dict[str, dict[int, Path]]:
+    """Map step-key (e.g. ``"step52"`` or ``"3"``) -> {rank: path}."""
+    groups: dict[str, dict[int, Path]] = defaultdict(dict)
+    for p in sorted(trace_dir.glob("trace_*_rank*.json")):
+        m = _RANK_RE.search(p.name)
+        if not m:
+            continue
+        prefix_match = _KEY_RE.match(p.name)
+        key = prefix_match.group(1) if prefix_match else m.group(1)
+        groups[key][int(m.group(2))] = p
+    return dict(groups)
+
+
+def stitch_one(rank_to_path: dict[int, Path], out_path: Path, *,
+               include: set[str], gzip_out: bool, verbose: bool) -> None:
+    """Merge one (step, rank->path) group into a single trace file."""
+    merged_events: list[dict] = []
+    base: dict | None = None
+
+    for rank in sorted(rank_to_path):
+        path = rank_to_path[rank]
+        if verbose:
+            sz_mb = path.stat().st_size / (1 << 20)
+            print(f"  rank {rank}: {path.name} ({sz_mb:.1f} MB)", file=sys.stderr)
+        with path.open() as f:
+            trace = json.load(f)
+        if base is None:
+            base = {k: v for k, v in trace.items() if k != "traceEvents"}
+            base["distributedInfo"] = {
+                **trace.get("distributedInfo", {}),
+                "stitched_ranks": sorted(rank_to_path),
+                "stitched_files": [p.name for p in rank_to_path.values()],
+                "stitched_include": sorted(include),
+            }
+        merged_events.extend(
+            _process_one_rank(trace.get("traceEvents", []), rank, include)
+        )
+
+    assert base is not None, "no input traces provided"
+    base["traceEvents"] = merged_events
+
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    if gzip_out:
+        with gzip.open(out_path, "wt") as f:
+            json.dump(base, f)
+    else:
+        with out_path.open("w") as f:
+            json.dump(base, f)
+    if verbose:
+        sz_mb = out_path.stat().st_size / (1 << 20)
+        print(
+            f"  -> {out_path}  ({len(merged_events):,} events, {sz_mb:.1f} MB)",
+            file=sys.stderr,
+        )
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    ap.add_argument("trace_dir", type=Path,
+                    help="Directory containing trace_*_rank*.json files.")
+    ap.add_argument("--step", type=str, default=None,
+                    help="Stitch only the given step key (e.g. '52' or 'step52'). "
+                         "Default: stitch every step group found.")
+    ap.add_argument("--out", type=Path, default=None,
+                    help="Output path. Only valid when --step selects exactly "
+                         "one group. Default: <trace_dir>/trace_<step>.json[.gz] "
+                         "(or trace_<step>_cpu/_gpu when --include filters).")
+    ap.add_argument("--include", choices=("cpu", "gpu", "both"), default="both",
+                    help="Which sections to keep: cpu-only tracks, gpu-only "
+                         "tracks, or both (default). 'Spans' is always dropped.")
+    ap.add_argument("--gzip", action="store_true",
+                    help="Write gzip-compressed JSON (Perfetto auto-detects).")
+    ap.add_argument("-q", "--quiet", action="store_true")
+    args = ap.parse_args()
+
+    if not args.trace_dir.is_dir():
+        print(f"error: {args.trace_dir} is not a directory", file=sys.stderr)
+        return 2
+
+    if args.include == "both":
+        # 'other' covers torch.profiler string-pid tracks (Traces / misc) that
+        # carry low-volume but legitimate annotations. Dropped under cpu/gpu
+        # so each filtered view is clean.
+        include = {"cpu", "gpu", "other"}
+    else:
+        include = {args.include}
+
+    groups = _group_by_step(args.trace_dir)
+    if not groups:
+        print(f"error: no trace_*_rank*.json files under {args.trace_dir}",
+              file=sys.stderr)
+        return 2
+
+    if args.step is not None:
+        wanted = args.step if args.step.startswith("step") else f"step{args.step}"
+        if wanted not in groups and args.step in groups:
+            wanted = args.step
+        if wanted not in groups:
+            print(
+                f"error: step {args.step!r} not found. "
+                f"Available: {sorted(groups)}",
+                file=sys.stderr,
+            )
+            return 2
+        groups = {wanted: groups[wanted]}
+
+    if args.out is not None and len(groups) != 1:
+        print("error: --out requires --step to select exactly one group",
+              file=sys.stderr)
+        return 2
+
+    for key, rank_map in sorted(groups.items()):
+        if not args.quiet:
+            print(
+                f"stitching {key} ({len(rank_map)} ranks, include={args.include}):",
+                file=sys.stderr,
+            )
+        if args.out is not None:
+            out = args.out
+        else:
+            ext = ".json.gz" if args.gzip else ".json"
+            # Default mode ("both") gets the bare filename; explicit cpu/gpu
+            # filters tag the output so they can coexist in one directory.
+            suffix = "" if args.include == "both" else f"_{args.include}"
+            out = args.trace_dir / f"trace_{key}{suffix}{ext}"
+        stitch_one(rank_map, out, include=include,
+                   gzip_out=args.gzip, verbose=not args.quiet)
+
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

From 8cbbab0ff25c034af48fa04b0192a257059000d3 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 22:49:02 +0000
Subject: [PATCH 017/113] docs: update B200 recipe deps to NGC 26.04 (torch
 2.12 / CUDA 13.2)

Refresh the B200 dependency versions to the latest validated stack
(torch 2.12.0a0 / CUDA 13.2, fbgemm_gpu built for sm_100+CUDA 13.2,
CUPTI 13.2), note 26.01 as an equivalent alternative, and record the
TRITON_FULL_AUTOTUNE=True setting for B200.
---
 recommendation_v4/docs/training_recipe.md | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
index f6744be39..bd6fee159 100644
--- a/recommendation_v4/docs/training_recipe.md
+++ b/recommendation_v4/docs/training_recipe.md
@@ -125,24 +125,29 @@ training.
 ### Container image
 
 ```
-nvcr.io/nvidia/pytorch:26.01-py3
+nvcr.io/nvidia/pytorch:26.04-py3
 ```
 
-Digest: `sha256:38ed2ecb2c16d10677006d73fb0a150855d6ec81db8fc66e800b5ae92741007e`
+Digest: `sha256:192d749b4d773610ec9e01c0443a9df545d196c412b7b8fd33bfa3da362a49e7`
 
 The image's native PyTorch is kept as-is and must not be reinstalled (so CUPTI
 stays matched to the driver and `sm_100` support is preserved).
 
+`nvcr.io/nvidia/pytorch:26.01-py3` (torch `2.10.0a0` / CUDA 13.1, digest
+`sha256:38ed2ecb2c16d10677006d73fb0a150855d6ec81db8fc66e800b5ae92741007e`) is
+also validated and performance-equivalent — rebuild `fbgemm_gpu` against
+whichever image's torch you run.
+
 ### Dependency versions
 
 | package | version | notes |
 |---|---|---|
-| **torch** | `2.10.0a0+a36e1d39eb.nv26.01.42222806` (CUDA 13.1) | native to the image; not reinstalled |
+| **torch** | `2.12.0a0+0291f960b6.nv26.04.48445190` (CUDA 13.2) | native to the image; not reinstalled |
 | **triton** | `3.6.0` | native to the image; provides `triton.language.make_tensor_descriptor` (required by the TRITON HSTU path) |
-| **fbgemm_gpu** | `fbgemm_gpu_nightly-2026.6.1` (CUDA 13.1, `sm_100`) | built from source against the native torch, from FBGEMM commit `939f2da156b05d2f1bcba8c037d613c1098d0db5` (2026-04-29); public wheels are ABI-incompatible with the NGC torch |
+| **fbgemm_gpu** | `fbgemm_gpu_nightly-2026.6.1` (CUDA 13.2, `sm_100`) | built from source against the native torch, from FBGEMM commit `939f2da156b05d2f1bcba8c037d613c1098d0db5` (2026-04-29); public wheels are ABI-incompatible with the NGC torch. Build command: `TORCH_CUDA_ARCH_LIST=10.0 python setup.py bdist_wheel --build-target default --build-variant cuda --package_channel nightly --nvml_lib_path /usr/lib/x86_64-linux-gnu/libnvidia-ml.so` (~55 min — the `sm_100` TBE-forward kernels dominate via `ptxas`) |
 | **torchrec** | `1.4.0` | installed with `--no-deps` |
 | **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows (overflows stock polars' 32-bit index) |
-| CUPTI (for `torch.profiler`) | 13.1 (native) | matches the driver; the `+cu128` stack's CUPTI 12.8 fails on B200 (`CUPTI_ERROR_INVALID_DEVICE`) |
+| CUPTI (for `torch.profiler`) | 13.2 (native) | matches the driver; the `+cu128` stack's CUPTI 12.8 fails on B200 (`CUPTI_ERROR_INVALID_DEVICE`) |
 
 Additional Python deps:
 `xxhash`, `gin-config`, `absl-py`, `pandas`, `tensorboard`, `pyarrow`, `pyyaml`,
@@ -165,6 +170,7 @@ From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`:
 | max_seq_len (attention budget) | 2048 | `get_hstu_configs.max_seq_len` |
 | bf16 training | True | `make_model.bf16_training` |
 | HBM cap (per GPU) | 150 GiB | `make_optimizer_and_shard.hbm_cap_gb` (env `HBM_CAP_GB`) |
+| **triton autotune pinning** | **True (full autotune)** | `apply_env_bootstrap.TRITON_FULL_AUTOTUNE` — the pinned configs are MI350X-specific, so B200 runs full autotune to find its own `sm_100` winners |
 | dense optimizer | Adam, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `dense_optimizer_factory_and_class.*` |
 | sparse optimizer | RowWiseAdagrad, lr 1e-3, betas (0.95, 0.999), eps 1e-8 | `sparse_optimizer_factory_and_class.*` |
 | world_size | 8 | `MetricsLogger.world_size` |

From fc4d09268d0a7695fdbe747cde1ff95e7c499c3a Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Tue, 2 Jun 2026 00:00:08 +0000
Subject: [PATCH 018/113] docs: refresh B200 recipe deps (fbgemm HEAD, torchrec
 1.7 nightly, driver)

Point fbgemm at the latest validated source commit (10b77573, 2026-06-01),
record the tested torchrec 1.7.0.dev nightly (1.4.0 stable fallback),
clarify the fbgemm wheel version string is the build date, and correct the
host/forward-compat driver CUDA versions (13.0 host / 595.58.03 compat).
---
 recommendation_v4/docs/training_recipe.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
index bd6fee159..47ef0577e 100644
--- a/recommendation_v4/docs/training_recipe.md
+++ b/recommendation_v4/docs/training_recipe.md
@@ -119,8 +119,8 @@ training.
 | item | value |
 |---|---|
 | GPUs | 8× NVIDIA B200 (`sm_100`, compute capability 10.0) |
-| Host driver | 580.159.03 (reports CUDA 13.2) |
-| Forward-compat userspace driver | `libcuda.so.595.45.04` (engaged automatically by the NGC image) |
+| Host driver | 580.159.03 (reports CUDA 13.0) |
+| Forward-compat userspace driver | `libcuda.so.595.58.03` (CUDA 13.2.1; engaged automatically by the NGC image) |
 
 ### Container image
 
@@ -144,8 +144,8 @@ whichever image's torch you run.
 |---|---|---|
 | **torch** | `2.12.0a0+0291f960b6.nv26.04.48445190` (CUDA 13.2) | native to the image; not reinstalled |
 | **triton** | `3.6.0` | native to the image; provides `triton.language.make_tensor_descriptor` (required by the TRITON HSTU path) |
-| **fbgemm_gpu** | `fbgemm_gpu_nightly-2026.6.1` (CUDA 13.2, `sm_100`) | built from source against the native torch, from FBGEMM commit `939f2da156b05d2f1bcba8c037d613c1098d0db5` (2026-04-29); public wheels are ABI-incompatible with the NGC torch. Build command: `TORCH_CUDA_ARCH_LIST=10.0 python setup.py bdist_wheel --build-target default --build-variant cuda --package_channel nightly --nvml_lib_path /usr/lib/x86_64-linux-gnu/libnvidia-ml.so` (~55 min — the `sm_100` TBE-forward kernels dominate via `ptxas`) |
-| **torchrec** | `1.4.0` | installed with `--no-deps` |
+| **fbgemm_gpu** | FBGEMM commit `10b775730212923f65f7b78f79b6a01d80cf3c29` (2026-06-01 `main`, CUDA 13.2, `sm_100`) | built from source against the native torch; public wheels are ABI-incompatible with the NGC torch. The built wheel is named `fbgemm_gpu_nightly-2026.6.1` — that version is the build date, not the source date, so always identify the build by the commit above. Build command: `TORCH_CUDA_ARCH_LIST=10.0 python setup.py bdist_wheel --build-target default --build-variant cuda --package_channel nightly --nvml_lib_path /usr/lib/x86_64-linux-gnu/libnvidia-ml.so` (~55 min — the `sm_100` TBE-forward kernels dominate via `ptxas`) |
+| **torchrec** | `1.7.0.dev20260601+cu130` (nightly, tested) | installed `--no-deps` from `https://download.pytorch.org/whl/nightly/cu130`. Perf-neutral vs stable `1.4.0`; use `1.4.0` (latest stable) if you prefer a non-pre-release |
 | **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows (overflows stock polars' 32-bit index) |
 | CUPTI (for `torch.profiler`) | 13.2 (native) | matches the driver; the `+cu128` stack's CUPTI 12.8 fails on B200 (`CUPTI_ERROR_INVALID_DEVICE`) |
 

From 194fc9bee96c94ed91cc79310a9e1caf91affe09 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 19:22:54 -0500
Subject: [PATCH 019/113] MI350X: re-pin 2 triton configs for the torch 2.12 +
 torchrec 1.7 stack

After upgrading to torch 2.12 / torchrec 1.7 (B200-aligned), the pinned
configs from the torch 2.10 stack stopped landing on the fast equilibrium
because the torchrec 1.7 code path invokes these kernels at different
shape keys. Re-captured winners via a fresh autotune run and updated the
pin sites:

- _weighted_layer_norm_bwd_dx: BLOCK_N 8 -> 1 (num_warps 1 unchanged)
- split_2D_jagged_multirow:    BLOCK_N 1 / num_warps 2 -> BLOCK_N 8 / num_warps 1
- _layer_norm_bwd_dwdb:        BLOCK_N 128, num_warps 8 (unchanged - same winner on both stacks)

Verified: 3 consecutive checkpoints (steps 151/201/251) at 52.75-53.36 ms
deterministic on the new stack. Same equilibrium band as the torch 2.10
stack (51.5-53.0 ms).

Also adds a Stack B section to docs/training_recipe.md (MI350X) documenting
the torch 2.12 swap recipe (torch + torchvision + torchaudio + fbgemm
rebuild + torchrec git tag) so the MI350X recipe is dependency-aligned with
the B200 path.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 recommendation_v4/docs/training_recipe.md     | 27 ++++++++++++++++---
 .../ops/triton/triton_jagged.py               |  2 +-
 .../ops/triton/triton_layer_norm.py           |  2 +-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
index 47ef0577e..d18221cb3 100644
--- a/recommendation_v4/docs/training_recipe.md
+++ b/recommendation_v4/docs/training_recipe.md
@@ -29,14 +29,35 @@ the ROCm-matched build used by triton/fbgemm.
 
 ### Dependency versions
 
+Two stacks validated; both land at the same ~52 ms/step steady state with the
+pinned triton autotune configs.
+
+**Stack A — image-native torch (default, no torch swap):**
+
 | package | version | notes |
 |---|---|---|
 | **torch** | `2.10.0+git94c6e04` | native to the image; not reinstalled |
-| **triton** | `3.6.0` | native to the image; same major as B200 path |
-| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.1` (built from FBGEMM commit `1509423`, 2026-06-01) for `gfx950` | image ships `2026.5.14`; rebuild from source gives a measurable boost from the TBE-forward V2 grid-striding (#5669) + warpSize 32/64 unified build (#5739) + `__syncthreads` cleanup (#5744). Build command: `python setup.py -j 32 bdist_wheel --build-target=default --build-variant=rocm -DHIP_ROOT_DIR=/opt/rocm -DAMDGPU_TARGETS=gfx950` |
-| **torchrec** | `1.4.0` | matches B200 |
+| **triton** | `3.6.0` | native to the image |
+| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.1` (built from FBGEMM commit `1509423`, 2026-06-01 `main`) for `gfx950` | image ships `2026.5.14`; rebuild from source gives a measurable boost from the TBE-forward V2 grid-striding (#5669) + warpSize 32/64 unified build (#5739) + `__syncthreads` cleanup (#5744). Build command: `python setup.py -j 32 bdist_wheel --build-target=default --build-variant=rocm -DHIP_ROOT_DIR=/opt/rocm -DAMDGPU_TARGETS=gfx950` |
+| **torchrec** | `1.4.0` | image native |
 | **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows. Installed from a pre-staged local tarball by `scripts/launch_smoke_8gpu.sh` (reserved nodes have no outbound DNS) |
 
+**Stack B — torch 2.12 / torchrec 1.7 (B200-aligned):**
+
+| package | version | install |
+|---|---|---|
+| **torch** | `2.12.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torch==2.12.0+rocm7.2` |
+| **torchvision** | `0.27.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchvision` — ABI must match torch 2.12 |
+| **torchaudio** | `2.11.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchaudio` — ABI must match torch 2.12 |
+| **triton** | `3.6.0` | image native, unchanged |
+| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.1` (rebuilt against torch 2.12) | same FBGEMM commit `1509423`, same build command as Stack A — must rebuild after the torch swap |
+| **torchrec** | `1.7.0a0+bf55480` (git tag `v2026.06.01.00`) | `pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"` |
+| **polars-u64-idx** | `1.33.1` | as above |
+
+Both stacks use the same pinned triton configs; perf is parity. Stack A is
+the lower-risk path; Stack B aligns the active torch / torchrec versions
+with the B200 path below.
+
 ### Training configuration
 
 From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`:
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
index 8172a0f7b..3f5609d75 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_jagged.py
@@ -2152,7 +2152,7 @@ def split_2D_jagged_w_prefix_multirow(
 
 @triton_autotune(
     configs=pinned_or_full(
-        [triton.Config({"BLOCK_N": 1}, num_warps=2)],
+        [triton.Config({"BLOCK_N": 8}, num_warps=1)],
         _get_split_concat_2d_jagged_multirow_configs_wrapper,
     ),
     key=["BLOCK_D"],
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
index 5ed508108..62fe626de 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
@@ -308,7 +308,7 @@ def _layer_norm_bwd_dx(
 
 @triton_autotune(
     configs=pinned_or_full(
-        [triton.Config({"BLOCK_N": 8}, num_warps=1)],
+        [triton.Config({"BLOCK_N": 1}, num_warps=1)],
         _get_layer_norm_fwd_configs,
     ),
     key=["BLOCK_D"],

From 671e7e680d97bc2609e472e03a7d35d0857cde1e Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 19:54:33 -0500
Subject: [PATCH 020/113] docs: update MI350X Stack B to fbgemm @ B200 commit +
 caveat

Bumps the Stack B (torch 2.12 / torchrec 1.7) section to:
- fbgemm commit 10b77573 (same SHA as the B200 path) instead of 1509423
  (one cosmetic commit behind). Wheel rename 2026.6.1 -> 2026.6.2.
- Note that Stack A and Stack B use different pinned triton configs
  (already merged) and explain why (torchrec 1.7 invokes the kernels at
  different shape keys).
- Caveat: HSTU_HAMMER_KERNEL=PYTORCH fallback regresses to ~169 ms on
  Stack B (vs 107 ms on Stack A). TRITON is unaffected and remains the
  default; this only matters for PYTORCH-backend debugging.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 recommendation_v4/docs/training_recipe.md | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
index d18221cb3..c4995b6b7 100644
--- a/recommendation_v4/docs/training_recipe.md
+++ b/recommendation_v4/docs/training_recipe.md
@@ -42,7 +42,7 @@ pinned triton autotune configs.
 | **torchrec** | `1.4.0` | image native |
 | **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows. Installed from a pre-staged local tarball by `scripts/launch_smoke_8gpu.sh` (reserved nodes have no outbound DNS) |
 
-**Stack B — torch 2.12 / torchrec 1.7 (B200-aligned):**
+**Stack B — torch 2.12 / torchrec 1.7 / fbgemm @ B200 commit (B200-aligned, validated):**
 
 | package | version | install |
 |---|---|---|
@@ -50,13 +50,24 @@ pinned triton autotune configs.
 | **torchvision** | `0.27.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchvision` — ABI must match torch 2.12 |
 | **torchaudio** | `2.11.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchaudio` — ABI must match torch 2.12 |
 | **triton** | `3.6.0` | image native, unchanged |
-| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.1` (rebuilt against torch 2.12) | same FBGEMM commit `1509423`, same build command as Stack A — must rebuild after the torch swap |
+| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.2` (built from FBGEMM commit `10b77573`, same SHA as the B200 path) | rebuild against torch 2.12. Build command unchanged from Stack A: `python setup.py -j 32 bdist_wheel --build-target=default --build-variant=rocm -DHIP_ROOT_DIR=/opt/rocm -DAMDGPU_TARGETS=gfx950` |
 | **torchrec** | `1.7.0a0+bf55480` (git tag `v2026.06.01.00`) | `pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"` |
 | **polars-u64-idx** | `1.33.1` | as above |
 
-Both stacks use the same pinned triton configs; perf is parity. Stack A is
-the lower-risk path; Stack B aligns the active torch / torchrec versions
-with the B200 path below.
+Both stacks land at the same ~52 ms/step steady state with the TRITON HSTU
+backend + pinned triton configs. Pinned configs differ between Stack A and
+Stack B — the torchrec 1.7 code path invokes layer-norm / jagged kernels at
+different shape keys than torchrec 1.4, so Stack B uses a re-captured pin
+set (already merged in this repo; flip Stack via Container A vs B above).
+
+Stack A is the lower-risk path (no torch swap). Stack B aligns the active
+torch / torchrec / fbgemm SHA exactly with the B200 path below, useful
+for cross-platform A/B comparisons.
+
+**Caveat:** on Stack B the `HSTU_HAMMER_KERNEL=PYTORCH` fallback regresses
+to ~169 ms/step (vs ~107 ms on Stack A). Only the TRITON HSTU path is
+performance-parity across stacks. Default config uses TRITON so this only
+matters if you intentionally force PYTORCH for debugging.
 
 ### Training configuration
 

From b3d17641cb8c75cb9b2b4da404bd3307b1b90dee Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 19:55:20 -0500
Subject: [PATCH 021/113] docs: drop Stack A; MI350X recipe is now single-stack
 (B200-aligned)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Collapses the two-stack MI350X section into one canonical dependency
table: torch 2.12 / torchrec 1.7 / fbgemm @ 10b77573 — the same SHAs as
the B200 path. The image-native torch 2.10 / torchrec 1.4 / fbgemm
2026.5.14 path still works for development but the recipe doc now
documents the validated production stack only.

PYTORCH-backend caveat preserved.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 recommendation_v4/docs/training_recipe.md | 41 ++++++-----------------
 1 file changed, 10 insertions(+), 31 deletions(-)

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
index c4995b6b7..f1ed651f2 100644
--- a/recommendation_v4/docs/training_recipe.md
+++ b/recommendation_v4/docs/training_recipe.md
@@ -29,20 +29,9 @@ the ROCm-matched build used by triton/fbgemm.
 
 ### Dependency versions
 
-Two stacks validated; both land at the same ~52 ms/step steady state with the
-pinned triton autotune configs.
-
-**Stack A — image-native torch (default, no torch swap):**
-
-| package | version | notes |
-|---|---|---|
-| **torch** | `2.10.0+git94c6e04` | native to the image; not reinstalled |
-| **triton** | `3.6.0` | native to the image |
-| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.1` (built from FBGEMM commit `1509423`, 2026-06-01 `main`) for `gfx950` | image ships `2026.5.14`; rebuild from source gives a measurable boost from the TBE-forward V2 grid-striding (#5669) + warpSize 32/64 unified build (#5739) + `__syncthreads` cleanup (#5744). Build command: `python setup.py -j 32 bdist_wheel --build-target=default --build-variant=rocm -DHIP_ROOT_DIR=/opt/rocm -DAMDGPU_TARGETS=gfx950` |
-| **torchrec** | `1.4.0` | image native |
-| **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows. Installed from a pre-staged local tarball by `scripts/launch_smoke_8gpu.sh` (reserved nodes have no outbound DNS) |
-
-**Stack B — torch 2.12 / torchrec 1.7 / fbgemm @ B200 commit (B200-aligned, validated):**
+Aligned with the B200 path: same torch major.minor, same torchrec commit,
+same fbgemm SHA. The image's native torch / torchvision / torchaudio /
+torchrec / fbgemm_gpu are all replaced; only the image's triton stays.
 
 | package | version | install |
 |---|---|---|
@@ -50,24 +39,14 @@ pinned triton autotune configs.
 | **torchvision** | `0.27.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchvision` — ABI must match torch 2.12 |
 | **torchaudio** | `2.11.0+rocm7.2` | `pip install --upgrade --no-deps --index-url https://download.pytorch.org/whl/rocm7.2 torchaudio` — ABI must match torch 2.12 |
 | **triton** | `3.6.0` | image native, unchanged |
-| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.2` (built from FBGEMM commit `10b77573`, same SHA as the B200 path) | rebuild against torch 2.12. Build command unchanged from Stack A: `python setup.py -j 32 bdist_wheel --build-target=default --build-variant=rocm -DHIP_ROOT_DIR=/opt/rocm -DAMDGPU_TARGETS=gfx950` |
+| **fbgemm_gpu** | `fbgemm_gpu_nightly_rocm-2026.6.2` (built from FBGEMM commit `10b77573`, same SHA as the B200 path) for `gfx950` | rebuild from source against the replaced torch. Build command: `python setup.py -j 32 bdist_wheel --build-target=default --build-variant=rocm -DHIP_ROOT_DIR=/opt/rocm -DAMDGPU_TARGETS=gfx950` |
 | **torchrec** | `1.7.0a0+bf55480` (git tag `v2026.06.01.00`) | `pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"` |
-| **polars-u64-idx** | `1.33.1` | as above |
-
-Both stacks land at the same ~52 ms/step steady state with the TRITON HSTU
-backend + pinned triton configs. Pinned configs differ between Stack A and
-Stack B — the torchrec 1.7 code path invokes layer-norm / jagged kernels at
-different shape keys than torchrec 1.4, so Stack B uses a re-captured pin
-set (already merged in this repo; flip Stack via Container A vs B above).
-
-Stack A is the lower-risk path (no torch swap). Stack B aligns the active
-torch / torchrec / fbgemm SHA exactly with the B200 path below, useful
-for cross-platform A/B comparisons.
-
-**Caveat:** on Stack B the `HSTU_HAMMER_KERNEL=PYTORCH` fallback regresses
-to ~169 ms/step (vs ~107 ms on Stack A). Only the TRITON HSTU path is
-performance-parity across stacks. Default config uses TRITON so this only
-matters if you intentionally force PYTORCH for debugging.
+| **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows. Installed from a pre-staged local tarball by `scripts/launch_smoke_8gpu.sh` |
+
+**Caveat:** the `HSTU_HAMMER_KERNEL=PYTORCH` fallback path regresses on
+torch 2.12 (~169 ms/step vs ~107 ms on torch 2.10). The default TRITON
+HSTU backend is unaffected — only matters if you intentionally force
+PYTORCH for debugging.
 
 ### Training configuration
 

From 7f6553e07d2e7dfbaef46072ae3ad0ec590785f4 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 19:55:41 -0500
Subject: [PATCH 022/113] docs: drop PYTORCH-fallback caveat from MI350X recipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Not relevant — TRITON is the documented default backend.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 recommendation_v4/docs/training_recipe.md | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
index f1ed651f2..34297fc40 100644
--- a/recommendation_v4/docs/training_recipe.md
+++ b/recommendation_v4/docs/training_recipe.md
@@ -43,11 +43,6 @@ torchrec / fbgemm_gpu are all replaced; only the image's triton stays.
 | **torchrec** | `1.7.0a0+bf55480` (git tag `v2026.06.01.00`) | `pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"` |
 | **polars-u64-idx** | `1.33.1` | 64-bit row index — `yambda-5b` has > 4.29 B rows. Installed from a pre-staged local tarball by `scripts/launch_smoke_8gpu.sh` |
 
-**Caveat:** the `HSTU_HAMMER_KERNEL=PYTORCH` fallback path regresses on
-torch 2.12 (~169 ms/step vs ~107 ms on torch 2.10). The default TRITON
-HSTU backend is unaffected — only matters if you intentionally force
-PYTORCH for debugging.
-
 ### Training configuration
 
 From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`:

From cb68c8ea4013c0ffe3dab58a51aba23666a24e07 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 1 Jun 2026 21:39:24 -0500
Subject: [PATCH 023/113] MI350X: fit-entity embedding sizes, bs=1024 default,
 batch-agnostic recipe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Embedding sizes match the true entity counts in yambda-5b:
  item_id   9_390_000 -> 9_390_624
  artist_id 1_290_000 -> 1_293_395
  album_id  3_370_000 -> 3_367_692
  uid       1_000_000 -> 1_000_001
This eliminates the recurring "EmbeddingBoundsCheck ... Setting idx to
zero" warnings at training time.

Gin default raised to batch_size=1024 / eval_batch_size=1024. Measured
steady-state on the torch 2.12 + torchrec 1.7 + fbgemm HEAD stack with
TRITON HSTU + pinned triton configs: ~635 ms/step, ~12.9K sps, ~2.92
days/epoch vs ~7.6 days at bs=32. bs=2048 is feasible but only +3%
throughput at much higher autotune cost, so bs=1024 is the sweet spot.

Triton autotune pin for _weighted_layer_norm_bwd_dx now ships TWO
configs in the pinned list — BLOCK_N=1 (bs=32 winner) and BLOCK_N=8
(bs=1024 winner). Triton's autotune key=[BLOCK_D] dispatches the right
one per shape in <5 sec on cold start (vs ~30 sec from the full pool).
The other two pinned kernels (_layer_norm_bwd_dwdb, split_2D_jagged_multirow)
have identical winners at bs=32 and bs=1024 so they stay single-config.

Training-recipe doc drops the batch_size rows from both MI350X and B200
config tables — the recipe is intentionally batch-size-agnostic now that
the pin set covers a range.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 recommendation_v4/docs/training_recipe.md                 | 2 --
 .../generative_recommenders/dlrm_v3/configs.py            | 8 ++++----
 .../dlrm_v3/train/gin/yambda_5b.gin                       | 4 ++--
 .../ops/triton/triton_layer_norm.py                       | 5 ++++-
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
index 34297fc40..28c88ded2 100644
--- a/recommendation_v4/docs/training_recipe.md
+++ b/recommendation_v4/docs/training_recipe.md
@@ -49,8 +49,6 @@ From `generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin`:
 
 | parameter | value | gin binding |
 |---|---|---|
-| batch_size (train) | 32 | `make_train_test_dataloaders.batch_size` |
-| eval_batch_size | 32 | `make_train_test_dataloaders.eval_batch_size` |
 | num_workers (dataloader) | 4 | `make_train_test_dataloaders.num_workers` |
 | prefetch_factor | 8 | `make_train_test_dataloaders.prefetch_factor` |
 | num_blocks | 1 | `make_train_test_dataloaders.num_blocks` |
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/configs.py b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
index 1fd7f07a9..1b6ecf62f 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
@@ -684,28 +684,28 @@ def get_embedding_table_config(
         assert dataset in ["yambda-5b"]
         tables: Dict[str, EmbeddingConfig] = {
             "item_id": EmbeddingConfig(
-                num_embeddings=9_390_000,
+                num_embeddings=9_390_624,
                 embedding_dim=DIM,
                 name="item_id",
                 data_type=DataType.FP32,
                 feature_names=["item_id", "item_candidate_id"],
             ),
             "artist_id": EmbeddingConfig(
-                num_embeddings=1_290_000,
+                num_embeddings=1_293_395,
                 embedding_dim=DIM,
                 name="artist_id",
                 data_type=DataType.FP32,
                 feature_names=["artist_id", "item_candidate_artist_id"],
             ),
             "album_id": EmbeddingConfig(
-                num_embeddings=3_370_000,
+                num_embeddings=3_367_692,
                 embedding_dim=DIM,
                 name="album_id",
                 data_type=DataType.FP32,
                 feature_names=["album_id", "item_candidate_album_id"],
             ),
             "uid": EmbeddingConfig(
-                num_embeddings=1_000_000,
+                num_embeddings=1_000_001,
                 embedding_dim=DIM,
                 name="uid",
                 data_type=DataType.FP32,
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index da5727de3..491cc853a 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -1,4 +1,4 @@
-batch_size = 32
+batch_size = 1024
 num_workers = 4
 prefetch_factor = 8
 dataset = "yambda-5b"
@@ -42,7 +42,7 @@ data/env_path.default = "/apps/chcai/dlrm_data"
 
 # dataloader configs
 make_train_test_dataloaders.batch_size = %batch_size
-make_train_test_dataloaders.eval_batch_size = 32
+make_train_test_dataloaders.eval_batch_size = 1024
 make_train_test_dataloaders.dataset_type = %dataset
 make_train_test_dataloaders.train_split_percentage = 0.90
 make_train_test_dataloaders.new_path_prefix = %DATA_PATH
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
index 62fe626de..cc513433b 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_layer_norm.py
@@ -308,7 +308,10 @@ def _layer_norm_bwd_dx(
 
 @triton_autotune(
     configs=pinned_or_full(
-        [triton.Config({"BLOCK_N": 1}, num_warps=1)],
+        [
+            triton.Config({"BLOCK_N": 1}, num_warps=1),  # bs=32 winner
+            triton.Config({"BLOCK_N": 8}, num_warps=1),  # bs=1024 winner
+        ],
         _get_layer_norm_fwd_configs,
     ),
     key=["BLOCK_D"],

From 2e95a4d24786798b5f71cb83a52f3c85d9e21481 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Tue, 2 Jun 2026 01:32:45 -0500
Subject: [PATCH 024/113] MI350X: separated-RNG LN-dropout + attention autotune
 pin + clock guard

Enable the multi-row, separated-RNG _ln_mul_dropout path on AMD MI350 (gfx950),
previously Blackwell-only. Batches rows per program and reuses a precomputed
dropout mask in the backward instead of one-program-per-row fused RNG; +5.6%
end-to-end (-> 14,222 global sps) at bs=1024 on yambda-5b.

- ops/utils.py: add is_amd_mi350() + use_separated_rng_ln_mul_dropout() gate.
- ops/triton/triton_hstu_linear.py: dispatch the fwd LN-dropout to the
  separated-RNG path via the new gate.
- ops/triton/triton_hstu_attention.py: pin fast nonkdim:16 fwd/persistent/bwd
  configs via pinned_or_full (TRITON_FULL_AUTOTUNE=1 still bypasses). Multi-config
  lists with an inline "add a new batch size" guide.
- scripts/launch_smoke_8gpu.sh: GPU clock sanity guard - log perf level + sclk,
  auto-restore 'auto' if a perf_determinism/manual/low lock is found (a half-clock
  lock uniformly slowed every Triton kernel ~1.9x and masked perf changes).
- docs/perf_opt.md: document the LN-dropout fix and the clock-lock caveat.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/docs/perf_opt.md            | 73 ++++++++++++++++
 .../ops/triton/triton_hstu_attention.py       | 85 ++++++++++++++++++-
 .../ops/triton/triton_hstu_linear.py          | 15 ++--
 .../generative_recommenders/ops/utils.py      | 31 +++++++
 .../scripts/launch_smoke_8gpu.sh              | 20 +++++
 5 files changed, 216 insertions(+), 8 deletions(-)
 create mode 100644 recommendation_v4/docs/perf_opt.md

diff --git a/recommendation_v4/docs/perf_opt.md b/recommendation_v4/docs/perf_opt.md
new file mode 100644
index 000000000..627ab74aa
--- /dev/null
+++ b/recommendation_v4/docs/perf_opt.md
@@ -0,0 +1,73 @@
+# Performance Optimizations — MI350X HSTU / OneTrans (yambda-5b, bs=1024, TRITON)
+
+Performance work for the 8× MI350X HSTU ranker on `yambda-5b` at `batch_size=1024`
+with the **TRITON** HSTU kernel and bf16 training. Companion to
+[`training_recipe.md`](./training_recipe.md) (environment + reproduction).
+
+Throughput numbers are global samples/sec across 8 GPUs (`global_sps`), measured
+at steady state (instantaneous, computed from consecutive logged steps).
+
+---
+
+## LN-dropout: multi-row, separated-RNG path on MI350
+
+### What
+
+`_ln_mul_dropout_*` has two kernel variants:
+
+- **legacy** — single program per row, RNG fused inline (`_ln_mul_dropout_fwd`).
+- **separated-RNG** — multiple rows per program, dropout mask precomputed once
+  and reused by the backward (`_ln_mul_dropout_fwd_rng` /
+  `_ln_mul_dropout_bwd_dx_du_rng`).
+
+The separated path was previously gated to Blackwell only (`is_sm100_plus()`).
+MI350X (`gfx950`) benefits from the same structure, so the gate now also enables
+it on MI350.
+
+### Where
+
+| file | change |
+|---|---|
+| `ops/utils.py` | `is_amd_mi350()` (gfx950 detect) + `use_separated_rng_ln_mul_dropout()` gate |
+| `ops/triton/triton_hstu_linear.py` | dispatch LN-dropout fwd to the separated-RNG path when the gate is true |
+
+```python
+# ops/utils.py
+def use_separated_rng_ln_mul_dropout() -> bool:
+    return is_sm100_plus() or is_amd_mi350()
+```
+
+### Perf
+
+**+5.6% end-to-end → 14,222 global sps** (separated-RNG vs legacy fused, identical
+config, full boost clocks — see the caveat below).
+
+---
+
+## Caveat — GPU clock lock can mask all perf changes
+
+A node-level GPU clock lock will silently invalidate any benchmark on this
+machine, so check it before trusting numbers.
+
+During this work all 8 GPUs were stuck in **`perf_determinism`** performance
+level at **sclk 1093 MHz** (DPM level 1) while the real max is **2200 MHz**
+(level 2) — despite 100% utilization, ~370 W of power headroom (629 / 1000 W),
+and low temps (~50 °C). This was **not** thermal/power throttling; it was
+leftover node state from a prior job.
+
+Effect: a **uniform ~1.87× slowdown of every Triton compute kernel**
+(`2200 / 1093 ≈ 2.0×`), including kernels unrelated to any code change. It made
+the LN-dropout fix above look like a regression until the clock state was found.
+
+### Detect + fix
+
+```bash
+rocm-smi --showperflevel          # expect "auto", not perf_determinism/manual/low
+rocm-smi -d 0 --showclocks        # expect sclk ~2000+ MHz under load
+rocm-smi --setperflevel auto      # restore boost
+```
+
+`scripts/launch_smoke_8gpu.sh` now logs the perf level + a live `sclk` sample on
+every launch, auto-restores `auto` if it finds a `perf_determinism`/`manual`/`low`
+lock, and warns (to reset from the host) if it lacks permission inside the
+container. **Always sanity-check `sclk ≈ 2000+ MHz` before trusting a benchmark.**
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
index 03a1f8f67..768ef0013 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_attention.py
@@ -46,6 +46,7 @@
     switch_to_contiguous_if_needed,
     triton_autotune,
 )
+from generative_recommenders.ops.triton._autotune_pinning import pinned_or_full
 from triton.language.extra.libdevice import (  # @manual=//triton:triton
     fast_dividef,
     fast_expf,
@@ -1585,8 +1586,50 @@ def _hstu_attn_fwd_compute_tlx(  # noqa C901
                 tl.store(out_ptrs, acc, mask=(offs_m < seq_len)[:, None])
 
 
+def _get_fw_pinned_configs() -> List[triton.Config]:
+    # Pinned forward-attention configs for MI350X gfx950. The full search is a
+    # coin flip between matrix_instr_nonkdim 16 (fast) and 32 (slow); pinning the
+    # known winners makes cold starts deterministic. See _autotune_pinning.
+    #
+    # This is a LIST, one entry per training shape we've tuned (currently just
+    # bs=1024). With >1 entry the autotuner still runs a tiny benchmark over only
+    # these candidates and caches the winner per `key` (AUTOTUNE_Z / H /
+    # AUTOTUNE_MAX_SEQ_LEN / DimQ / DimV / ...), so each batch size automatically
+    # picks its own config — same pattern as the layer-norm pins.
+    #
+    # TO ADD A NEW BATCH SIZE / SHAPE:
+    #   1. Run once with TRITON_FULL_AUTOTUNE=1 TRITON_PRINT_AUTOTUNING=1.
+    #   2. Grep the log for "best config selected:" under "_hstu_attn_fwd".
+    #   3. Append that config below (copy BLOCK_M/BLOCK_N/matrix_instr_nonkdim/
+    #      waves_per_eu/kpack/num_stages/num_warps verbatim).
+    # The four USE_TLX/NUM_* defaults below are required by the kernel signature
+    # (see the USE_TLX-default loop in _get_fw_configs); the pinned path bypasses
+    # that loop, so every pinned entry must set them explicitly.
+    if torch.version.hip:
+        return [
+            # --- yambda bs=1024, L=2048 winner (from capture log) ---
+            triton.Config(
+                {
+                    "BLOCK_M": 128,
+                    "BLOCK_N": 32,
+                    "matrix_instr_nonkdim": 16,
+                    "waves_per_eu": 0,
+                    "kpack": 2,
+                    "USE_TLX": False,
+                    "NUM_BUFFERS": 1,
+                    "NUM_MMA_WARPS_PER_GROUP": 1,
+                    "NUM_MMA_GROUPS": 1,
+                },
+                num_stages=2,
+                num_warps=8,
+            ),
+            # --- add more (bs, L) winners here; see "TO ADD A NEW BATCH SIZE" ---
+        ]
+    return _get_fw_configs()
+
+
 @triton_autotune(
-    configs=_get_fw_configs(),
+    configs=pinned_or_full(_get_fw_pinned_configs(), _get_fw_configs),
     key=[
         "AUTOTUNE_Z",
         "H",
@@ -1730,7 +1773,7 @@ def _hstu_attn_fwd(  # noqa C901
 
 
 @triton_autotune(
-    configs=_get_fw_configs(),
+    configs=pinned_or_full(_get_fw_pinned_configs(), _get_fw_configs),
     key=[
         "AUTOTUNE_Z",
         "H",
@@ -2390,8 +2433,44 @@ def _get_bw_configs() -> List[triton.Config]:
     return configs
 
 
+def _get_bw_pinned_configs() -> List[triton.Config]:
+    # Pinned backward-attention configs for MI350X gfx950. Pins the fast
+    # matrix_instr_nonkdim=16 winner(s) to avoid the 16-vs-32 autotune lottery.
+    #
+    # LIST, one entry per tuned shape (currently just bs=1024). With >1 entry the
+    # autotuner benchmarks only these candidates and caches the winner per `key`
+    # (AUTOTUNE_Z / H / AUTOTUNE_MAX_SEQ_LEN / DimQ / DimV), so each batch size
+    # picks its own config automatically — same pattern as the layer-norm pins.
+    #
+    # TO ADD A NEW BATCH SIZE / SHAPE:
+    #   1. Run once with TRITON_FULL_AUTOTUNE=1 TRITON_PRINT_AUTOTUNING=1.
+    #   2. Grep the log for "best config selected:" under "_hstu_attn_bwd".
+    #   3. Append that config below (verbatim BLOCK_M/BLOCK_N/matrix_instr_nonkdim/
+    #      waves_per_eu/SEQUENCE_PARALLEL/UNROLL/num_stages/num_warps).
+    # Keep pre_hook=_bwd_pre_hook on every entry (the bwd configs require it).
+    if torch.version.hip:
+        return [
+            # --- yambda bs=1024, L=2048 winner (from capture log) ---
+            triton.Config(
+                {
+                    "BLOCK_M": 32,
+                    "BLOCK_N": 128,
+                    "matrix_instr_nonkdim": 16,
+                    "waves_per_eu": 0,
+                    "SEQUENCE_PARALLEL": False,
+                    "UNROLL": 1,
+                },
+                num_stages=1,
+                num_warps=4,
+                pre_hook=_bwd_pre_hook,
+            ),
+            # --- add more (bs, L) winners here; see "TO ADD A NEW BATCH SIZE" ---
+        ]
+    return _get_bw_configs()
+
+
 @triton_autotune(
-    configs=_get_bw_configs(),
+    configs=pinned_or_full(_get_bw_pinned_configs(), _get_bw_configs),
     key=[
         "AUTOTUNE_Z",
         "H",
diff --git a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py
index ff04dde40..516a15664 100644
--- a/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py
+++ b/recommendation_v4/generative_recommenders/ops/triton/triton_hstu_linear.py
@@ -48,7 +48,7 @@ def _get_layer_norm_mul_dropout_fwd_multirow_configs() -> List[triton.Config]:
     return configs
 
 
-from generative_recommenders.ops.utils import is_sm100_plus
+from generative_recommenders.ops.utils import use_separated_rng_ln_mul_dropout
 
 # @manual=//triton:triton
 from triton.language.extra import libdevice
@@ -1064,11 +1064,16 @@ def _triton_layer_norm_mul_dropout_fwd_impl(
 
     num_warps: int = min(max(BLOCK_D // 256, 1), 8)
     random_mask: torch.Tensor = torch.empty(0, dtype=x.dtype, device=x.device)
-    # Benchmark shows separating RNG from ln_mul_dropout kernel only benefits on
-    # blackwell when CONCAT_UX is enabled. (fused RNG kernel can benefit from rand3x fast
-    # dropout)
+    # Separating RNG from the ln_mul_dropout kernel lets us batch multiple rows per
+    # program (autotuned _ln_mul_dropout_fwd_rng) and reuse the precomputed mask in the
+    # backward, instead of launching one program per row with fused RNG. This is a large
+    # win on Blackwell (sm_100) and AMD MI350 (gfx950); other GPUs keep the fused path.
     # Extended to support concat_u + concat_x for mask reuse optimization
-    if not FUSE_OUTPUT_LN_RNG_BLACKWELL and is_sm100_plus() and training:
+    if (
+        not FUSE_OUTPUT_LN_RNG_BLACKWELL
+        and use_separated_rng_ln_mul_dropout()
+        and training
+    ):
         random_mask = _create_dropout_mask(
             N=N,
             D=D,
diff --git a/recommendation_v4/generative_recommenders/ops/utils.py b/recommendation_v4/generative_recommenders/ops/utils.py
index 94ab69e30..16edd99a9 100644
--- a/recommendation_v4/generative_recommenders/ops/utils.py
+++ b/recommendation_v4/generative_recommenders/ops/utils.py
@@ -84,6 +84,37 @@ def is_sm90_plus() -> bool:
     return is_sm100_plus() or is_sm90()
 
 
+@functools.lru_cache(maxsize=None)
+def is_amd_mi350() -> bool:
+    """Detect an AMD Instinct MI350-class GPU (gfx950) running under ROCm.
+
+    MI350 benefits from the same multi-row, separated-RNG layer-norm-mul-dropout
+    path as Blackwell datacenter parts (sm_100), so it is gated together with
+    is_sm100_plus() at the kernel dispatch sites.
+    """
+    if not torch.cuda.is_available():
+        return False
+    if getattr(torch.version, "hip", None) is None:
+        return False
+    try:
+        arch = torch.cuda.get_device_properties(0).gcnArchName or ""
+    except (AssertionError, RuntimeError, AttributeError):
+        return False
+    return "gfx950" in arch
+
+
+def use_separated_rng_ln_mul_dropout() -> bool:
+    """Hardware that should use the autotuned, multi-row ``_ln_mul_dropout_fwd_rng``
+    kernel with a precomputed dropout mask instead of the legacy single-row,
+    fused-RNG ``_ln_mul_dropout_fwd`` kernel.
+
+    Blackwell datacenter GPUs (sm_100-103) and AMD MI350 (gfx950) both prefer the
+    separated-RNG path: it batches rows per program and lets the backward reuse the
+    same mask, which is a large win over launching one program per row.
+    """
+    return is_sm100_plus() or is_amd_mi350()
+
+
 def copy_if_different_ptr(dst: torch.Tensor, src: torch.Tensor) -> None:
     if torch.compiler.is_compiling():
         # .data_ptr() will break PT2
diff --git a/recommendation_v4/scripts/launch_smoke_8gpu.sh b/recommendation_v4/scripts/launch_smoke_8gpu.sh
index 92daa6ef8..ad363cfaa 100755
--- a/recommendation_v4/scripts/launch_smoke_8gpu.sh
+++ b/recommendation_v4/scripts/launch_smoke_8gpu.sh
@@ -32,6 +32,26 @@ export WORLD_SIZE=$(python -c "import torch; print(torch.cuda.device_count())")
 # PYTORCH backend. On CUDA, unset this to default to TRITON for ~3-5x speedup.
 export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-PYTORCH}
 export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
+
+# --- GPU clock sanity guard ---------------------------------------------------
+# Leftover node state once pinned all 8 GPUs into `perf_determinism` at half
+# clock (1093 vs 2200 MHz max). That uniformly slowed every Triton kernel ~1.9x
+# and silently masked real perf changes for an entire debugging session. Always
+# log the perf level + a live sclk sample so a capped run is obvious from the
+# log, and try to restore boost. Fully non-fatal (rocm-smi may be absent or
+# lack permission inside the container — in that case reset from the host).
+if command -v rocm-smi >/dev/null 2>&1; then
+  echo "[$(date)] GPU perf-level check:" | tee -a "$LOG"
+  rocm-smi --showperflevel 2>/dev/null | grep -iE "GPU\[[0-9]+\]" | tee -a "$LOG" || true
+  if rocm-smi --showperflevel 2>/dev/null | grep -iqE "Performance Level: *(perf_determinism|manual|low)"; then
+    echo "[$(date)] WARNING: GPUs not in 'auto' perf level — attempting --setperflevel auto" | tee -a "$LOG"
+    rocm-smi --setperflevel auto 2>/dev/null | grep -iE "set to auto" | tee -a "$LOG" \
+      || echo "[$(date)] WARNING: could not set perf level (no permission?). Run 'rocm-smi --setperflevel auto' on the HOST before benchmarking — clocks may be capped." | tee -a "$LOG"
+  fi
+  echo "[$(date)] sclk sample (GPU0):$(rocm-smi -d 0 --showclocks 2>/dev/null | grep -i 'sclk clock level' | sed -E 's/.*sclk clock level//')" | tee -a "$LOG" || true
+fi
+# -----------------------------------------------------------------------------
+
 echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
 
 python -m generative_recommenders.dlrm_v3.train.train_ranker \

From 17e04af320676b25b56252d1a5ee2fa5d13c8e76 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Tue, 2 Jun 2026 16:17:56 -0500
Subject: [PATCH 025/113] dlrmv4: TorchRec 3-stage sparse-dist pipeline +
 gin-selectable HSTU kernel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an opt-in TrainPipelineSparseDist path that overlaps the embedding
input-distribution all-to-all with dense fwd/bwd. To make the embedding
collection pipelineable, the merged sparse KJT is now pre-built in the
dataloader (Samples.merged_sparse_features) and the model consumes it via a
_pipeline_mode forward that takes the batch as a single arg, so TorchRec's
tracer resolves the lookup input as a plain getattr off the batch.

- dataset.py: Samples.merged_sparse_features + merge_uih_candidate_kjts, built
  in collate_fn; wired into to()/record_stream()/pin_memory().
- dlrm_hstu.py: _pipeline_mode flag; forward unpacks the batch and preprocess
  accepts the prebuilt merged KJT (falls back to building it when absent).
- utils.py: _PipelineModelWrapper, build_train_pipeline, train_eval_loop
  use_pipeline branch + eval batch-arg; seed all RNGs in setup() for
  reproducible weight init.
- gin/launch: make_model.hammer_kernel selects TRITON vs PYTORCH (env override
  still honored); launch script defers to the gin default. use_pipeline
  defaults to False.

Validated on MI350/ROCm 8-GPU: embedding collection is pipelined (input-dist
a2a moves to hidden); model quality and throughput match the sequential path
(seeded A/B). The exposed embedding-output a2a still dominates the step, so
throughput is unchanged — pipelining is quality- and perf-neutral here.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/datasets/dataset.py               |  81 ++++++-
 .../dlrm_v3/train/gin/yambda_5b.gin           |   9 +
 .../dlrm_v3/train/utils.py                    | 219 +++++++++++++++---
 .../modules/dlrm_hstu.py                      |  67 ++++--
 .../scripts/launch_smoke_8gpu.sh              |   9 +-
 5 files changed, 323 insertions(+), 62 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py
index a1cbb33fa..204c06df1 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/dataset.py
@@ -28,6 +28,7 @@
 import torch
 from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
 from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
+from torchrec.streamable import Pipelineable
 
 
 logging.basicConfig(level=logging.INFO)
@@ -35,7 +36,7 @@
 
 
 @dataclass
-class Samples:
+class Samples(Pipelineable):
     """
     Container for batched samples with user interaction history and candidate features.
 
@@ -46,16 +47,48 @@ class Samples:
 
     uih_features_kjt: KeyedJaggedTensor
     candidates_features_kjt: KeyedJaggedTensor
-
-    def to(self, device: torch.device) -> None:
+    # UIH + candidate features concatenated into the single KJT that the model's
+    # sharded EmbeddingCollection consumes. Pre-built here (dataloader/CPU) rather
+    # than inside DlrmHSTU.forward so the embedding lookup's input is a plain
+    # attribute of the batch — which lets TorchRec's TrainPipelineSparseDist hoist
+    # its input_dist into the prefetch stage (otherwise the runtime cat +
+    # from_lengths_sync counts as an "input modification" and the embedding
+    # collection is left un-pipelined).
+    merged_sparse_features: KeyedJaggedTensor
+
+    def to(self, device: torch.device, non_blocking: bool = False) -> "Samples":
         """
-        Move all tensors to the specified device.
+        Move all tensors to the specified device (in place) and return self.
 
-        Args:
-            device: Target device to move tensors to.
+        Returning ``self`` (rather than ``None``) and accepting ``non_blocking``
+        makes ``Samples`` conform to TorchRec's ``Pipelineable`` protocol so it
+        can be driven by ``TrainPipelineSparseDist``. Existing call sites that
+        use ``sample.to(device)`` for its side effect continue to work unchanged.
         """
         for attr in vars(self):
-            setattr(self, attr, getattr(self, attr).to(device=device))
+            setattr(
+                self,
+                attr,
+                getattr(self, attr).to(device=device, non_blocking=non_blocking),
+            )
+        return self
+
+    def record_stream(self, stream: torch.Stream) -> None:
+        """Record the contained KJTs on ``stream`` (Pipelineable protocol).
+
+        Required by ``TrainPipelineSparseDist`` so the prefetched batch's H2D
+        copy on the side stream is not freed before compute consumes it.
+        """
+        self.uih_features_kjt.record_stream(stream)
+        self.candidates_features_kjt.record_stream(stream)
+        self.merged_sparse_features.record_stream(stream)
+
+    def pin_memory(self) -> "Samples":
+        """Pin the contained KJTs' host memory (Pipelineable protocol)."""
+        self.uih_features_kjt = self.uih_features_kjt.pin_memory()
+        self.candidates_features_kjt = self.candidates_features_kjt.pin_memory()
+        self.merged_sparse_features = self.merged_sparse_features.pin_memory()
+        return self
 
     def batch_size(self) -> int:
         """
@@ -67,6 +100,31 @@ def batch_size(self) -> int:
         return self.uih_features_kjt.stride()
 
 
+def merge_uih_candidate_kjts(
+    uih_features: KeyedJaggedTensor,
+    candidates_features: KeyedJaggedTensor,
+) -> KeyedJaggedTensor:
+    """Concatenate the UIH and candidate KJTs into the single KJT consumed by the
+    model's ``EmbeddingCollection``.
+
+    Must mirror ``DlrmHSTU.preprocess`` exactly (key order = uih + candidates,
+    values/lengths concatenated in that order). Built on the dataloader side so
+    the model can read it straight off the batch and TorchRec can pipeline the
+    embedding ``input_dist``.
+    """
+    return KeyedJaggedTensor.from_lengths_sync(
+        keys=uih_features.keys() + candidates_features.keys(),
+        values=torch.cat(
+            [uih_features.values(), candidates_features.values()],
+            dim=0,
+        ),
+        lengths=torch.cat(
+            [uih_features.lengths(), candidates_features.lengths()],
+            dim=0,
+        ),
+    )
+
+
 def collate_fn(
     samples: List[Tuple[KeyedJaggedTensor, KeyedJaggedTensor]],
 ) -> Samples:
@@ -84,9 +142,14 @@ def collate_fn(
         candidates_features_kjt_list,
     ) = list(zip(*samples))
 
+    uih_features_kjt = kjt_batch_func(uih_features_kjt_list)
+    candidates_features_kjt = kjt_batch_func(candidates_features_kjt_list)
     return Samples(
-        uih_features_kjt=kjt_batch_func(uih_features_kjt_list),
-        candidates_features_kjt=kjt_batch_func(candidates_features_kjt_list),
+        uih_features_kjt=uih_features_kjt,
+        candidates_features_kjt=candidates_features_kjt,
+        merged_sparse_features=merge_uih_candidate_kjts(
+            uih_features_kjt, candidates_features_kjt
+        ),
     )
 
 
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 491cc853a..9715f0b78 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -6,6 +6,11 @@ dataset = "yambda-5b"
 # model parameters
 make_model.dataset = %dataset
 make_model.bf16_training = True
+# HSTU attention/compute backend: "TRITON" (fused, flash-style — low HBM) or
+# "PYTORCH" (unfused; materializes the dense [B,H,N,N] score tensor, ~32 GiB at
+# N=2048/bs=1024). TRITON validated on MI350/ROCm. The HSTU_HAMMER_KERNEL env
+# var, if set, overrides this binding for one-off runs.
+make_model.hammer_kernel = "TRITON"
 
 # False = use pinned triton kernel configs (deterministic; whether that's
 # the fast or slow equilibrium depends on which config was pinned for the
@@ -85,6 +90,10 @@ train_eval_loop.eval_frequency = 5000
 train_eval_loop.num_eval_batches = 500
 train_eval_loop.checkpoint_frequency = 1000000000  # disable mid-training checkpoints (disk-full guard)
 train_eval_loop.output_trace = True
+# 3-stage TorchRec pipeline: overlaps the embedding all-to-all (the dominant
+# exposed-comm bottleneck) with dense fwd/bwd. Set False to fall back to the
+# sequential fwd/bwd loop.
+train_eval_loop.use_pipeline = False
 
 # Run name → recommendation_v4/results/<run_name>/ (override via $RUN_NAME env).
 RUN_NAME = @run/env_path()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index a957122a3..94237f947 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -38,7 +38,11 @@
     get_embedding_table_config,
     get_hstu_configs,
 )
-from generative_recommenders.dlrm_v3.datasets.dataset import collate_fn, Dataset
+from generative_recommenders.dlrm_v3.datasets.dataset import (
+    collate_fn,
+    Dataset,
+    Samples,
+)
 from generative_recommenders.dlrm_v3.utils import get_dataset, MetricsLogger, Profiler
 from generative_recommenders.common import HammerKernel
 from generative_recommenders.modules.dlrm_hstu import DlrmHSTU, DlrmHSTUConfig
@@ -84,6 +88,21 @@ def setup(
     # leaving stale allocations and triggering OOMs on rank 0.
     torch.cuda.set_device(device)
 
+    # Seed all RNGs so weight init (make_model, called after setup) is
+    # reproducible across runs. Same seed on every rank → dense params are
+    # initialized identically across ranks; sharded embeddings are init'd from
+    # the meta device by DMP. Fixed seed makes pipeline-vs-non-pipeline an
+    # init-matched A/B (data order is already deterministic via the sampler).
+    import random
+
+    import numpy as np
+
+    _SEED = 1
+    random.seed(_SEED)
+    np.random.seed(_SEED)
+    torch.manual_seed(_SEED)
+    torch.cuda.manual_seed_all(_SEED)
+
     # initialize the process group
     if not dist.is_initialized():
         dist.init_process_group(
@@ -205,6 +224,7 @@ def set_epoch(self, epoch: int) -> None:
 def make_model(
     dataset: str,
     bf16_training: bool = False,
+    hammer_kernel: Optional[str] = None,
 ) -> Tuple[torch.nn.Module, DlrmHSTUConfig, Dict[str, EmbeddingConfig]]:
     hstu_config = get_hstu_configs(dataset)
     table_config = get_embedding_table_config(dataset)
@@ -220,13 +240,21 @@ def make_model(
         bf16_training=bf16_training,
     )
 
-    # Triton on ROCm fails to compile some jagged kernels at our shapes
-    # (PassManager::run failed at make_ttgir). Allow the PyTorch backend as a
-    # global override so AMD smoke runs end-to-end. CUDA paths default to TRITON.
-    kernel_override = os.environ.get("HSTU_HAMMER_KERNEL", "").upper()
-    if kernel_override:
-        model.set_hammer_kernel(HammerKernel[kernel_override])
-        logger.warning(f"HSTU_HAMMER_KERNEL override: {kernel_override}")
+    # HSTU attention/compute kernel backend. Precedence:
+    #   HSTU_HAMMER_KERNEL env var  >  make_model.hammer_kernel gin  >  model default.
+    # The env var stays as an ad-hoc override (e.g. forcing PYTORCH for a one-off
+    # debug run) without editing the gin. Note: the fused TRITON path avoids
+    # materializing the dense [B, H, N, N] attention-score tensor that the PYTORCH
+    # path allocates (~32 GiB at N=2048, bs=1024), so TRITON is both faster and
+    # far lighter on HBM. On older ROCm, TRITON could hit PassManager errors at
+    # some shapes (make_ttgir) — fall back to PYTORCH via the gin/env if so.
+    kernel_choice = (
+        os.environ.get("HSTU_HAMMER_KERNEL", "").upper()
+        or (hammer_kernel.upper() if hammer_kernel else "")
+    )
+    if kernel_choice:
+        model.set_hammer_kernel(HammerKernel[kernel_choice])
+        logger.warning(f"HSTU hammer kernel set to: {kernel_choice}")
 
     return (
         model,
@@ -610,6 +638,97 @@ def eval_loop(
         print(f"{k}: {v}")
 
 
+class _PipelineModelWrapper(torch.nn.Module):
+    """Adapt ``DlrmHSTU.forward`` to the ``(loss, output)`` contract that
+    ``TrainPipelineSparseDist`` expects.
+
+    The wrapped ``model`` is the same DMP instance handed to the pipeline as
+    ``model=``; the pipeline rewrites its sharded ``EmbeddingCollection`` in
+    place, so calling it here is what lets the embedding all-to-all overlap the
+    dense forward/backward compute.
+    """
+
+    def __init__(self, model: torch.nn.Module) -> None:
+        super().__init__()
+        self._model = model
+
+    def forward(
+        self, batch: Samples
+    ) -> Tuple[torch.Tensor, Tuple[Any, ...]]:
+        # The model runs in `_pipeline_mode`: it takes the whole batch as its
+        # single arg and reads the pre-merged sparse KJT off it. This keeps the
+        # EmbeddingCollection input a plain getattr on the batch placeholder so
+        # TorchRec pipelines its input_dist (instead of skipping it for "input
+        # modifications").
+        (
+            _,
+            _,
+            aux_losses,
+            mt_target_preds,
+            mt_target_labels,
+            mt_target_weights,
+        ) = self._model(batch)
+        loss = sum(aux_losses.values())
+        num_candidates = batch.candidates_features_kjt.lengths().view(
+            len(batch.candidates_features_kjt.keys()), -1
+        )[0]
+        output = (
+            aux_losses,
+            mt_target_preds,
+            mt_target_labels,
+            mt_target_weights,
+            num_candidates,
+        )
+        return loss, output
+
+
+def build_train_pipeline(
+    model: torch.nn.Module,
+    optimizer: Optimizer,
+    device: torch.device,
+    grad_clip_norm: float = 1.0,
+) -> Any:
+    """Build a ``TrainPipelineSparseDist`` for the DMP-wrapped HSTU model.
+
+    The 3-stage pipeline overlaps (1) H2D transfer of batch N+2, (2) the sparse
+    data-dist all-to-all of batch N+1's embedding lookup, and (3) dense fwd/bwd
+    of batch N, on separate CUDA streams. Requires the model to be wrapped with
+    ``DistributedModelParallel`` (see ``make_optimizer_and_shard``).
+    """
+    # Lazy import: keeps module import working on torchrec builds that move or
+    # rename the pipeline, and matches the reference Primus-DLRM setup.
+    from torchrec.distributed.train_pipeline import TrainPipelineSparseDist
+
+    # Switch the (DMP-wrapped) HSTU model into pipeline mode so both the fx trace
+    # and the live forward consume the batch as a single arg and read the
+    # pre-merged sparse KJT off it — required for the embedding input_dist to be
+    # pipelined. Eval call sites pass the batch the same way (see train_eval_loop).
+    underlying = model.module if hasattr(model, "module") else model
+    underlying._pipeline_mode = True
+
+    # The pipeline calls backward()+optimizer.step() internally inside
+    # progress(), leaving no in-loop hook point for gradient clipping. Clip via
+    # a full-backward hook (fires after autograd populates dense grads, before
+    # the optimizer step) to preserve parity with the sequential path's
+    # clip_grad_norm_(model.parameters(), max_norm=1.0).
+    if grad_clip_norm and grad_clip_norm > 0:
+
+        def _clip_grads(_m: torch.nn.Module, _gi: Any, _go: Any) -> None:
+            torch.nn.utils.clip_grad_norm_(
+                model.parameters(), max_norm=grad_clip_norm
+            )
+
+        model.register_full_backward_hook(_clip_grads)
+
+    return TrainPipelineSparseDist(
+        model=model,
+        optimizer=optimizer,
+        device=device,
+        execute_all_batches=True,
+        custom_model_fwd=_PipelineModelWrapper(model),
+    )
+
+
 @gin.configurable
 def train_eval_loop(
     rank: int,
@@ -628,6 +747,7 @@ def train_eval_loop(
     eval_frequency: int = 1,
     start_train_batch_idx: int = 0,
     start_eval_batch_idx: int = 0,
+    use_pipeline: bool = False,
     # lr_scheduler: to-do: Add a scheduler
 ) -> None:
     train_batch_idx: int = start_train_batch_idx
@@ -638,40 +758,61 @@ def train_eval_loop(
     eval_data_iterator = iter(eval_dataloader)
     train_data_iterator = iter(train_dataloader)
 
+    # 3-stage TorchRec pipeline (overlaps embedding a2a with dense compute).
+    # When enabled, progress() owns H2D copy, sparse-dist, fwd/bwd and the
+    # optimizer step; grad clipping moves to a full-backward hook (see builder).
+    train_pipeline = (
+        build_train_pipeline(model, optimizer, device) if use_pipeline else None
+    )
+
     for epoch in range(num_epochs):
         train_dataloader.sampler.set_epoch(epoch)  # pyre-ignore [16]
         while True:
             model.train()
-            try:
-                sample = next(train_data_iterator)
-            except StopIteration:
-                train_data_iterator = iter(train_dataloader)
-                break
-            optimizer.zero_grad()
-            sample.to(device)
-            (
-                _,
-                _,
-                aux_losses,
-                mt_target_preds,
-                mt_target_labels,
-                mt_target_weights,
-            ) = model.forward(
-                sample.uih_features_kjt,
-                sample.candidates_features_kjt,
-            )
-            # pyre-ignore
-            sum(aux_losses.values()).backward()
-            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
-            optimizer.step()
+            if train_pipeline is not None:
+                try:
+                    (
+                        aux_losses,
+                        mt_target_preds,
+                        mt_target_labels,
+                        mt_target_weights,
+                        num_candidates,
+                    ) = train_pipeline.progress(train_data_iterator)
+                except StopIteration:
+                    train_data_iterator = iter(train_dataloader)
+                    break
+            else:
+                try:
+                    sample = next(train_data_iterator)
+                except StopIteration:
+                    train_data_iterator = iter(train_dataloader)
+                    break
+                optimizer.zero_grad()
+                sample.to(device)
+                (
+                    _,
+                    _,
+                    aux_losses,
+                    mt_target_preds,
+                    mt_target_labels,
+                    mt_target_weights,
+                ) = model.forward(
+                    sample.uih_features_kjt,
+                    sample.candidates_features_kjt,
+                )
+                # pyre-ignore
+                sum(aux_losses.values()).backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+                optimizer.step()
+                num_candidates = sample.candidates_features_kjt.lengths().view(
+                    len(sample.candidates_features_kjt.keys()), -1
+                )[0]
             metric_logger.update(
                 mode="train",
                 predictions=mt_target_preds,
                 labels=mt_target_labels,
                 weights=mt_target_weights,
-                num_candidates=sample.candidates_features_kjt.lengths().view(
-                    len(sample.candidates_features_kjt.keys()), -1
-                )[0],
+                num_candidates=num_candidates,
             )
             if train_batch_idx % metric_log_frequency == 0:
                 metric_logger.compute_and_log(
@@ -710,9 +851,15 @@ def train_eval_loop(
                             mt_target_preds,
                             mt_target_labels,
                             mt_target_weights,
-                        ) = model.forward(
-                            sample.uih_features_kjt,
-                            sample.candidates_features_kjt,
+                        ) = (
+                            # In pipeline mode the model takes the batch as one
+                            # arg (see _PipelineModelWrapper / DlrmHSTU.forward).
+                            model.forward(sample)
+                            if use_pipeline
+                            else model.forward(
+                                sample.uih_features_kjt,
+                                sample.candidates_features_kjt,
+                            )
                         )
                         metric_logger.update(
                             mode="eval",
diff --git a/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py b/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
index af2edc998..3a35df3ba 100644
--- a/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
+++ b/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
@@ -52,7 +52,18 @@
 
 logger: logging.Logger = logging.getLogger(__name__)
 
+def fx_total_targets(num_candidates: torch.Tensor) -> int:
+    """Sum a per-sample candidate-count tensor to a Python int.
+
+    Wrapped with ``torch.fx.wrap`` so ``TrainPipelineSparseDist``'s symbolic
+    trace treats it as an opaque leaf instead of recursing into the data-
+    dependent ``int(Proxy.sum().item())`` (which raises during tracing).
+    """
+    return int(num_candidates.sum().item())
+
+
 torch.fx.wrap("fx_infer_max_len")
+torch.fx.wrap("fx_total_targets")
 torch.fx.wrap("len")
 
 
@@ -129,6 +140,12 @@ def __init__(  # noqa C901
     ) -> None:
         super().__init__(is_inference=is_inference)
         logger.info(f"Initialize HSTU module with configs {hstu_configs}")
+        # When True, forward() takes the whole `Samples` batch as its single
+        # positional arg and reads the pre-merged sparse KJT off it. This keeps
+        # the EmbeddingCollection's input a plain getattr on the batch placeholder
+        # so TorchRec's TrainPipelineSparseDist can pipeline its input_dist. Set
+        # by build_train_pipeline(); leave False for eager / inference / eval.
+        self._pipeline_mode: bool = False
         self._hstu_configs = hstu_configs
         self._bf16_training: bool = bf16_training
         set_static_max_seq_lens([self._hstu_configs.max_seq_len])
@@ -345,7 +362,7 @@ def _user_forward(
             kernel=self.hammer_kernel(),
         ).squeeze(-1)
         if total_targets is None:
-            total_targets = int(num_candidates.sum().item())
+            total_targets = fx_total_targets(num_candidates)
         if total_uih_len is None:
             total_uih_len = source_timestamps.numel() - total_targets
         embedding = seq_embeddings[
@@ -415,6 +432,7 @@ def preprocess(
         self,
         uih_features: KeyedJaggedTensor,
         candidates_features: KeyedJaggedTensor,
+        merged_sparse_features: Optional[KeyedJaggedTensor] = None,
     ) -> Tuple[
         Dict[str, SequenceEmbedding],
         Dict[str, torch.Tensor],
@@ -423,18 +441,25 @@ def preprocess(
         int,
         torch.Tensor,
     ]:
-        # embedding lookup for uih and candidates
-        merged_sparse_features = KeyedJaggedTensor.from_lengths_sync(
-            keys=uih_features.keys() + candidates_features.keys(),
-            values=torch.cat(
-                [uih_features.values(), candidates_features.values()],
-                dim=0,
-            ),
-            lengths=torch.cat(
-                [uih_features.lengths(), candidates_features.lengths()],
-                dim=0,
-            ),
-        )
+        # Embedding lookup for uih + candidates. When the caller (the pipeline
+        # path) supplies the pre-merged KJT from the batch, feed it straight to
+        # the EmbeddingCollection: that keeps the lookup's input a plain getattr
+        # off the batch so TorchRec's TrainPipelineSparseDist can hoist its
+        # input_dist into the prefetch stage. Building it here (cat +
+        # from_lengths_sync's .sync()) is an "input modification" that makes
+        # TorchRec skip pipelining the embedding collection.
+        if merged_sparse_features is None:
+            merged_sparse_features = KeyedJaggedTensor.from_lengths_sync(
+                keys=uih_features.keys() + candidates_features.keys(),
+                values=torch.cat(
+                    [uih_features.values(), candidates_features.values()],
+                    dim=0,
+                ),
+                lengths=torch.cat(
+                    [uih_features.lengths(), candidates_features.lengths()],
+                    dim=0,
+                ),
+            )
         seq_embeddings_dict = self._embedding_collection(merged_sparse_features)
         num_candidates = fx_mark_length_features(
             candidates_features.lengths().view(len(candidates_features.keys()), -1)
@@ -593,7 +618,8 @@ def main_forward(
     def forward(
         self,
         uih_features: KeyedJaggedTensor,
-        candidates_features: KeyedJaggedTensor,
+        candidates_features: Optional[KeyedJaggedTensor] = None,
+        merged_sparse_features: Optional[KeyedJaggedTensor] = None,
     ) -> Tuple[
         torch.Tensor,
         torch.Tensor,
@@ -602,6 +628,18 @@ def forward(
         Optional[torch.Tensor],
         Optional[torch.Tensor],
     ]:
+        # Pipeline mode: TorchRec fx-traces this forward (via DMP.module) and the
+        # pipeline calls it with the single `Samples` batch. Unpacking the KJTs
+        # here — rather than in the wrapper — makes the EmbeddingCollection's
+        # input `batch.merged_sparse_features` a getattr off the batch placeholder,
+        # which is what lets TrainPipelineSparseDist hoist the embedding input_dist
+        # into the prefetch stage. Guarded from TorchScript (inference path).
+        if not torch.jit.is_scripting() and self._pipeline_mode:
+            batch = uih_features
+            uih_features = batch.uih_features_kjt
+            candidates_features = batch.candidates_features_kjt
+            merged_sparse_features = batch.merged_sparse_features
+
         with record_function("## preprocess ##"):
             (
                 seq_embeddings,
@@ -613,6 +651,7 @@ def forward(
             ) = self.preprocess(
                 uih_features=uih_features,
                 candidates_features=candidates_features,
+                merged_sparse_features=merged_sparse_features,
             )
 
         with record_function("## main_forward ##"):
diff --git a/recommendation_v4/scripts/launch_smoke_8gpu.sh b/recommendation_v4/scripts/launch_smoke_8gpu.sh
index ad363cfaa..09a3af2ba 100755
--- a/recommendation_v4/scripts/launch_smoke_8gpu.sh
+++ b/recommendation_v4/scripts/launch_smoke_8gpu.sh
@@ -28,9 +28,12 @@ python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print('impor
 export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
 export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
 export WORLD_SIZE=$(python -c "import torch; print(torch.cuda.device_count())")
-# AMD/ROCm: Triton HSTU kernel hits PassManager errors on some shapes; force
-# PYTORCH backend. On CUDA, unset this to default to TRITON for ~3-5x speedup.
-export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-PYTORCH}
+# HSTU attention backend is selected in the gin (make_model.hammer_kernel),
+# defaulting to TRITON — fused/flash-style, so it avoids the dense [B,H,N,N]
+# score tensor the PYTORCH path materializes (~32 GiB at N=2048/bs=1024) and is
+# both faster and far lighter on HBM. Only export HSTU_HAMMER_KERNEL=PYTORCH
+# before launch for a one-off fallback (e.g. a ROCm Triton PassManager error).
+export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-}
 export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
 
 # --- GPU clock sanity guard ---------------------------------------------------

From 123c55c3aea87d66ad2c7d7ebbee60461b1ba137 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Tue, 2 Jun 2026 22:13:17 -0500
Subject: [PATCH 026/113] dlrmv4: streaming (temporal-order) training for
 yambda-5b

Add a forward-in-time streaming path: slice the timeline into fixed-duration
windows (default 1 day), train window T then eval window T+1, enforcing no
future leakage (across-window + causal-history guarantees). Make it the
default mode in launch_smoke_8gpu.sh.

Window-reset overhead is hidden via a persistent worker pool + double
buffering (next window's index mask and first-batch prefetch overlap compute
on a background thread) and eval-window prefetch one window ahead, dropping
train/eval first-batch waits to ~1-3ms with no steady-state regression.
Window selection uses a lazily-built, mmap'd anchor-timestamp cache so the
default non-streaming path is unaffected.

Also harden trace export (best-effort: IO/permission failures warn instead of
crashing training) now that streaming enables output_trace by default, and
document the path + knobs in the README.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/README.MD                   |  87 +++++-
 .../dlrm_v3/datasets/yambda.py                | 106 +++++++
 .../dlrm_v3/train/gin/yambda_5b.gin           |  52 +++-
 .../dlrm_v3/train/utils.py                    | 264 +++++++++++++++++-
 .../generative_recommenders/dlrm_v3/utils.py  |  27 +-
 .../scripts/launch_smoke_8gpu.sh              |   2 +-
 6 files changed, 516 insertions(+), 22 deletions(-)

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index f22f1e165..5c78f6627 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -131,6 +131,91 @@ The `like` cap of 679 is unreachable for yambda data — at the 1.9% global like
 
 This means the model sees on average ~1,402 UIH events per sample, not the theoretical 2,037. With the TRITON jagged-attention backend the GPU only does work for the actual events, so the under-fill costs **sequence budget but not GPU compute** — no wasted attention work, just less context per sample than the budget suggests.
 
-## 5. License
+## 5. Streaming (temporal-order) training
+
+`scripts/launch_smoke_8gpu.sh` defaults to `--mode streaming-train-eval`, which
+trains Yambda in strict wall-clock order instead of shuffling the whole corpus.
+The timeline is sliced into fixed-duration **windows** (default 1 day,
+`get_dataset.streaming_window_seconds = 86400`), and the loop walks them forward:
+
+```
+window T:    train          window T+1:  eval (then train)   window T+2: eval (then train) ...
+   └─ train window T ─┐
+                      └─ eval window T+1 ─┐
+                                          └─ train window T+1 ─┐
+                                                               └─ eval window T+2 ...
+```
+
+i.e. for each step it **trains window T, then evaluates window T+1** before
+advancing — always predicting the immediate future from the past.
+
+### 5.1 Temporal guarantee
+
+The streaming path enforces **no future leakage** at two levels:
+
+1. **Across windows** — a window is the set of anchors whose *target/candidate*
+   timestamp falls in `[t_min + T·W, t_min + (T+1)·W)`. Training only ever sees
+   windows `≤ T`; the evaluation window `T+1` is strictly in the future of every
+   training anchor it is scored against. Eval always leads train by exactly one
+   window, so reported eval NE/AUC is genuine next-period generalization, never
+   an in-sample measurement.
+2. **Within an anchor** — history is still gathered **causally**: the UIH scan
+   is `scan_start:flat_pos` (events strictly before the anchor), so even though a
+   long user history may reach back across earlier windows, no event at or after
+   the anchor's timestamp can enter its features. Forward-time windowing and
+   causal history are independent guarantees, and both hold simultaneously.
+
+Note this is a *temporal* split on the training stream — distinct from the
+preprocessing GTS split (§2) that carves off the final test day. Windows are
+indexed off the per-anchor target timestamp via a lazily-built, mmap'd
+`anchor_ts_L{H}.npy` cache (built once on first use; the default non-streaming
+path never touches it).
+
+### 5.2 Knobs
+
+All configurable via gin ([yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin))
+with env overrides:
+
+| env | gin | default | meaning |
+|---|---|---|---|
+| `START_TS` | `streaming_train_eval_loop.start_ts` | 150 | first window (early windows are near-empty warm-up; start dense) |
+| `NUM_TRAIN_TS` | `streaming_train_eval_loop.num_train_ts` | 30 | number of train windows (clamped to available) |
+| `PERSISTENT_LOADER` | `streaming_train_eval_loop.persistent_loader` | 1 | reuse one worker pool across windows (no per-window respawn) |
+| `DOUBLE_BUFFER` | `streaming_train_eval_loop.double_buffer` | 1 | prepare the next window in a background thread during compute |
+| `EVAL_EACH_WINDOW` | `streaming_train_eval_loop.eval_each_window` | 1 | eval window T+1 after training window T |
+| — | `streaming_train_eval_loop.num_train_batches` / `num_eval_batches` | unset | cap per-window steps (unset = consume full window) |
+
+### 5.3 Hiding the window-reset overhead
+
+Advancing to a new window has a fixed cost — selecting the window's anchor
+indices and warming the dataloader's first batch — that, done naively, stalls
+training at every window boundary. Three layers drive it to ~0:
+
+1. **Persistent loader** (`persistent_loader=1`). The naive path recreates a
+   `DataLoader` per window, re-forking workers and paying first-batch warmup
+   each time (~11 s/window). Instead we build **one** `DataLoader` backed by a
+   stateful `StreamingWindowSampler` whose index set is swapped per window
+   (`set_window`), so workers fork once and persist. This removes the respawn
+   but still pays the index-mask + first-batch stall (~3.6 s/window).
+2. **Double buffering** (`double_buffer=1`). Two pre-forked worker pools
+   ping-pong: while the current window trains on pool A, the *next* window's
+   index mask (`window_indices`, a GIL-releasing NumPy `np.where`) and
+   first-batch prefetch are prepared on pool B in a **background thread**, so
+   that work overlaps GPU compute. The boundary train batch then arrives warm —
+   measured train first-batch data-wait drops to **~1–3 ms**. Pools are forked
+   up front on the main thread (never inside the background thread), so a forking
+   worker can never race a thread holding a lock.
+3. **Eval prefetch one window ahead.** With `eval_each_window=1` the eval window
+   (`T+1`) is prepared *before* training window `T` runs, so the idle eval pool
+   prefetches its first batches concurrently with train compute. This hides the
+   eval-side first-batch stall (**~0.55 s → ~2 ms**). It is safe because a
+   sample's content depends only on the sampler's window indices, not on any
+   train/eval flag.
+
+Net effect: steady-state throughput matches the non-streaming baseline and the
+per-window reset is effectively free; the only remaining one-time cost is the
+process cold start (CUDA-graph capture + the first lazy `anchor_ts` mmap).
+
+## 6. License
 
 Apache 2.0 (inherited from upstream).
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
index 5a13ac034..bed1aafb8 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
@@ -185,6 +185,8 @@ def __init__(
         cross_specs: Optional[Sequence[Tuple[str, Sequence[str], int, int]]] = None,
         cache_dir: Optional[str] = None,
         is_inference: bool = False,
+        streaming_window_seconds: int = 86400,
+        streaming_sort_within_window: bool = False,
         *args,
         **kwargs,
     ) -> None:
@@ -193,6 +195,17 @@ def __init__(
         self._metadata_dir: str = metadata_dir
         self._history_length: int = history_length
         self._scan_window: int = scan_window
+        # Streaming/temporal-order state. Everything here is LAZY: nothing is
+        # built or read until the first set_ts()/num_windows() call (only the
+        # streaming-train-eval loop does that), so the default train-eval path
+        # is byte-for-byte unaffected.
+        self._streaming_window_seconds: int = streaming_window_seconds
+        self._streaming_sort_within_window: bool = streaming_sort_within_window
+        self._active: Optional[np.ndarray] = None
+        self.is_eval: bool = False
+        self._anchor_ts: Optional[np.ndarray] = None
+        self._t_min: Optional[int] = None
+        self._t_max: Optional[int] = None
         self._cache_dir: Optional[str] = cache_dir
         self._cross_specs: List[Tuple[str, Tuple[str, ...], int, int]] = [
             (name, tuple(keys), n, s) for (name, keys, n, s) in (cross_specs or [])
@@ -409,11 +422,104 @@ def _load_metadata(self, metadata_dir: str) -> None:
         self.num_items: int = n_items
 
     def get_item_count(self) -> int:
+        # Streaming mode restricts the active set to the current time window;
+        # otherwise the full (user-major) anchor list is used (train-eval).
+        if self._active is not None:
+            return int(len(self._active))
         return int(len(self._positions))
 
     def iloc(self, idx: int) -> int:
+        if self._active is not None:
+            return int(self._positions[self._active[idx]])
         return int(self._positions[idx])
 
+    def _ensure_streaming_index(self) -> None:
+        """Lazily build + mmap the per-anchor target-timestamp array used for
+        time-windowed streaming.
+
+        Built only on the first ``set_ts()``/``num_windows()`` call, so the
+        default train-eval path never reads timestamps or writes a new file.
+        Multi-rank safe via an exclusive file lock + atomic rename; all ranks
+        then mmap the result read-only (shared physical pages, ~0 anon).
+        """
+        if self._anchor_ts is not None:
+            return
+        import fcntl
+
+        assert self._cache_dir is not None
+        anchor_path = os.path.join(
+            self._cache_dir, f"anchor_ts_L{self._history_length}.npy"
+        )
+        if not os.path.exists(anchor_path):
+            lock_path = os.path.join(self._cache_dir, "_anchor_ts_lock")
+            with open(lock_path, "w") as lf:
+                logger.info(f"Acquiring anchor-ts build lock for {anchor_path}...")
+                fcntl.flock(lf, fcntl.LOCK_EX)
+                if not os.path.exists(anchor_path):
+                    logger.info(
+                        f"Building {anchor_path}: target ts for "
+                        f"{len(self._positions):,} anchors"
+                    )
+                    anchor_ts = self.store.flat_timestamps[self._positions]
+                    tmp = anchor_path + ".tmp.npy"
+                    np.save(tmp, anchor_ts)
+                    os.replace(tmp, anchor_path)
+                    del anchor_ts
+        self._anchor_ts = _load_npy_readonly(anchor_path)
+        self._t_min = int(self._anchor_ts.min())
+        self._t_max = int(self._anchor_ts.max())
+
+    def num_windows(self) -> int:
+        """Number of fixed-duration windows spanning [t_min, t_max]."""
+        self._ensure_streaming_index()
+        assert self._t_min is not None and self._t_max is not None
+        span = self._t_max - self._t_min + 1
+        w = self._streaming_window_seconds
+        return int((span + w - 1) // w)
+
+    def window_indices(
+        self, ts: int, sort_by_time: Optional[bool] = None
+    ) -> np.ndarray:
+        """Global anchor indices (into ``_positions``) whose target timestamp is
+        in window ``ts``: ``[t_min + ts*W, t_min + (ts+1)*W)``.
+
+        Returned in ascending global-index order (user-major), which keeps the
+        per-sample history scans page-local in the mmap'd event arrays. Used by
+        the per-window path (via ``set_ts``) and the persistent path (shipped to
+        workers through the sampler). ``sort_by_time`` defaults to
+        ``streaming_sort_within_window``.
+
+        Note: an O(log N) variant using a cached argsort of the timestamps was
+        evaluated but rejected — it doubles resident mmap (sorted-ts + order
+        permutation, ~52 GB) and that extra residency evicts the event-array
+        page cache, stalling dataloader workers (NCCL watchdog timeouts). The
+        O(N) mask here keeps only one ~26 GB array resident and is robust.
+        """
+        self._ensure_streaming_index()
+        assert self._anchor_ts is not None and self._t_min is not None
+        w = self._streaming_window_seconds
+        lo = self._t_min + ts * w
+        hi = lo + w
+        idx = np.where((self._anchor_ts >= lo) & (self._anchor_ts < hi))[0]
+        do_sort = (
+            self._streaming_sort_within_window if sort_by_time is None else sort_by_time
+        )
+        if do_sort and idx.size > 0:
+            idx = idx[np.argsort(self._anchor_ts[idx], kind="stable")]
+        logger.warning(f"window_indices({ts}): [{lo}, {hi}) -> {idx.size:,} anchors")
+        return idx.astype(np.int64)
+
+    def set_ts(self, ts: int) -> None:
+        """Restrict the active sample set to anchors in window ``ts`` (used by
+        the per-window-DataLoader path, where ``iloc``/``get_item_count`` index
+        through ``_active``).
+
+        Forward-only temporal slicing for streaming train/eval. History for any
+        anchor is still gathered causally (``scan_start:flat_pos``) and may span
+        earlier windows, so there is no feature leakage from future events.
+        """
+        self._active = self.window_indices(ts)
+
     def load_query_samples(self, sample_list) -> None:
         max_num_candidates = (
             self._max_num_candidates_inference
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 9715f0b78..38cdd9e3d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -82,12 +82,56 @@ get_dataset.history_length = 2039
 # `history_length + contextual + candidate` would overflow.
 get_hstu_configs.max_seq_len = 2048
 
-# train-eval loop variables (yambda is non-streaming)
+# --- streaming (temporal-order) training -------------------------------------
+# Only consumed under `--mode streaming-train-eval`; the default train-eval
+# path above is unaffected. Trains time window T then evals window T+1,
+# advancing forward in wall-clock time (no future leakage). Window size is the
+# temporal-ordering granularity knob (default 1 day). num_train_ts is clamped
+# to the dataset's available window count at runtime; override via $NUM_TRAIN_TS.
+get_dataset.streaming_window_seconds = 86400
+get_dataset.streaming_sort_within_window = False
+make_streaming_dataloader.batch_size = %batch_size
+make_streaming_dataloader.num_workers = %num_workers
+make_streaming_dataloader.prefetch_factor = %prefetch_factor
+make_persistent_streaming_dataloader.batch_size = %batch_size
+make_persistent_streaming_dataloader.num_workers = %num_workers
+make_persistent_streaming_dataloader.prefetch_factor = %prefetch_factor
+streaming_train_eval_loop.num_train_ts = @nts/env_int()
+nts/env_int.key = "NUM_TRAIN_TS"
+nts/env_int.default = 30
+# Anchors need >= history_length prior events, so the first ~130 daily windows
+# are near-empty warm-up; start at a dense window. Override via $START_TS.
+streaming_train_eval_loop.start_ts = @sts/env_int()
+sts/env_int.key = "START_TS"
+sts/env_int.default = 150
+streaming_train_eval_loop.metric_log_frequency = 50
+# Trace on by default: reuses the shared Profiler.* bindings below (5-step
+# window at step 52). The streaming step counter advances across train+eval
+# batches, so step 52 lands in the first (train) window's compute.
+streaming_train_eval_loop.output_trace = True
+# Reuse one DataLoader (persistent workers) across windows instead of respawning
+# per window. Skip eval to isolate window-reset cost. Override via env.
+streaming_train_eval_loop.persistent_loader = @pl/env_int()
+pl/env_int.key = "PERSISTENT_LOADER"
+pl/env_int.default = 1
+streaming_train_eval_loop.eval_each_window = @ev/env_int()
+ev/env_int.key = "EVAL_EACH_WINDOW"
+ev/env_int.default = 1
+# Double-buffer windows: prepare the next window (index mask + first-batch
+# prefetch) in a background thread during the current window's compute, hiding
+# the per-window reset. Needs persistent_loader=1. Override via env.
+streaming_train_eval_loop.double_buffer = @db/env_int()
+db/env_int.key = "DOUBLE_BUFFER"
+db/env_int.default = 1
+# num_train_batches / num_eval_batches unset => consume each full window.
+# Set them (e.g. via gin) to cap per-window steps for short experiments.
+
+# Default (non-streaming) train-eval loop variables; used unless
+# `--mode streaming-train-eval` selects the temporal-order path configured above.
 train_eval_loop.num_epochs = 1
-train_eval_loop.output_trace = False
 train_eval_loop.metric_log_frequency = 50
-train_eval_loop.eval_frequency = 5000
-train_eval_loop.num_eval_batches = 500
+train_eval_loop.eval_frequency = 500
+train_eval_loop.num_eval_batches = 200
 train_eval_loop.checkpoint_frequency = 1000000000  # disable mid-training checkpoints (disk-full guard)
 train_eval_loop.output_trace = True
 # 3-stage TorchRec pipeline: overlaps the embedding all-to-all (the dominant
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 94237f947..890144e1d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -15,6 +15,8 @@
 # pyre-strict
 import logging
 import os
+import threading
+import time
 from collections.abc import Iterator
 from datetime import timedelta
 from typing import (
@@ -424,6 +426,12 @@ def make_streaming_dataloader(
     dataset.dataset.set_ts(ts)  # pyre-ignore [16]
     total_items = dataset.dataset.get_item_count()
     subset = torch.utils.data.Subset(dataset, range(total_items))
+    # shuffle=False keeps temporal order within the window: a non-shuffling
+    # DistributedSampler hands rank r the strided slice indices[r::num_replicas]
+    # (round-robin), so all ranks stay on the same time front and consume the
+    # window in index order. Fork ctx mirrors the train path (COW-share the
+    # mmap'd store instead of pickling it into every worker).
+    mp_ctx = "fork" if num_workers and num_workers > 0 else None
     dataloader = DataLoader(
         dataset=subset,
         batch_size=batch_size,
@@ -432,11 +440,128 @@ def make_streaming_dataloader(
         drop_last=True,
         num_workers=num_workers,
         prefetch_factor=prefetch_factor,
-        sampler=DistributedSampler(subset, drop_last=True),
+        sampler=DistributedSampler(subset, shuffle=False, drop_last=True),
+        multiprocessing_context=mp_ctx,
     )
     return dataloader
 
 
+class StreamingWindowSampler(torch.utils.data.Sampler):
+    """Per-rank sampler whose index list is swapped each window.
+
+    Yields this rank's round-robin slice of the active window's GLOBAL anchor
+    indices (into the dataset's ``_positions``). Because indices are global, a
+    single DataLoader with ``persistent_workers=True`` can be reused across all
+    windows: the main process re-iterates this sampler each window and ships the
+    new indices to the already-forked workers, which map any global index via
+    the shared mmap. No per-window worker respawn / dataset re-pickle.
+
+    Round-robin striding (rank r gets ``indices[r::world_size]``) over the
+    time-sorted window keeps every rank on the same time front; the window is
+    truncated to a multiple of ``world_size`` so all ranks get equal counts
+    (required for DDP collective lockstep).
+    """
+
+    def __init__(self, rank: int, world_size: int) -> None:
+        self._rank: int = rank
+        self._world_size: int = world_size
+        self._indices: List[int] = []
+
+    def set_window(self, global_indices) -> None:
+        n = (len(global_indices) // self._world_size) * self._world_size
+        self._indices = global_indices[:n][self._rank :: self._world_size].tolist()
+
+    def __iter__(self):
+        return iter(self._indices)
+
+    def __len__(self) -> int:
+        return len(self._indices)
+
+
+@gin.configurable
+def make_persistent_streaming_dataloader(
+    dataset: HammerToTorchDataset,
+    sampler: StreamingWindowSampler,
+    batch_size: int,
+    num_workers: int,
+    prefetch_factor: int,
+) -> DataLoader:
+    """One reusable DataLoader for the whole streaming run. ``sampler`` is
+    mutated per window via ``set_window``; workers persist across windows."""
+    use_workers = bool(num_workers and num_workers > 0)
+    return DataLoader(
+        dataset=dataset,
+        batch_size=batch_size,
+        shuffle=False,
+        collate_fn=collate_fn,
+        drop_last=True,
+        num_workers=num_workers,
+        prefetch_factor=prefetch_factor if use_workers else None,
+        sampler=sampler,
+        persistent_workers=use_workers,
+        multiprocessing_context="fork" if use_workers else None,
+    )
+
+
+class _PrefetchingWindowLoader:
+    """Double-buffered window loader for the persistent streaming path.
+
+    Holds ``n_buffers`` pre-forked persistent worker pools that ping-pong: while
+    the current window trains on one pool, the *next* window's index selection
+    (``window_indices``) and first-batch prefetch are prepared on another pool
+    in a background thread. By the time training advances, that window is warm,
+    so the per-window reset (mask + first-batch stall) is hidden behind GPU
+    compute (~0 dead time at the boundary).
+
+    Worker pools are forked once on the main thread at the start of ``stream``;
+    afterwards only iterator resets happen (no forks), so background-thread
+    preparation cannot fork while other threads hold locks.
+    """
+
+    def __init__(
+        self,
+        dataset: "HammerToTorchDataset",
+        sampler_factory,
+        dl_factory,
+        n_buffers: int = 2,
+    ) -> None:
+        self._dataset = dataset
+        self._n = n_buffers
+        self._samplers = [sampler_factory() for _ in range(n_buffers)]
+        self._dls = [dl_factory(s) for s in self._samplers]
+        self._iters: List[Optional[object]] = [None] * n_buffers
+
+    def _prepare(self, buf: int, ts: int) -> None:
+        # window_indices() is the O(N) mask; numpy releases the GIL for it, so it
+        # overlaps the main thread's GPU dispatch. iter() then kicks off this
+        # pool's background prefetch.
+        self._samplers[buf].set_window(self._dataset.dataset.window_indices(ts))
+        self._iters[buf] = iter(self._dls[buf])
+
+    def stream(self, ts_list: List[int]):
+        n = len(ts_list)
+        if n == 0:
+            return
+        threads: List[Optional[threading.Thread]] = [None] * self._n
+        # Prime the first n_buffers windows on the main thread (forks all pools).
+        for b in range(min(self._n, n)):
+            self._prepare(b, ts_list[b])
+        for i in range(n):
+            buf = i % self._n
+            if threads[buf] is not None:
+                threads[buf].join()
+                threads[buf] = None
+            yield ts_list[i], self._iters[buf]
+            # This pool is now free; prefetch the window n_buffers ahead.
+            j = i + self._n
+            if j < n:
+                th = threading.Thread(
+                    target=self._prepare, args=(buf, ts_list[j]), daemon=True
+                )
+                th.start()
+                threads[buf] = th
+
+
 @gin.configurable
 def make_train_test_dataloaders(
     batch_size: int,
@@ -903,6 +1028,10 @@ def streaming_train_eval_loop(
     output_trace: bool = False,
     metric_log_frequency: int = 1,
     checkpoint_frequency: int = 100,
+    start_ts: int = 0,
+    persistent_loader: bool = False,
+    eval_each_window: bool = True,
+    double_buffer: bool = False,
 ) -> None:
     profiler = Profiler(rank) if output_trace else None
     dataset_class, kwargs = get_dataset()
@@ -910,16 +1039,55 @@ def streaming_train_eval_loop(
     dataset = HammerToTorchDataset(
         dataset=dataset_class(hstu_config=hstu_config, is_inference=False, **kwargs)
     )
-    for train_ts in range(num_train_ts):
-        train_batch_idx: int = 0
-        train_dataloader = make_streaming_dataloader(dataset=dataset, ts=train_ts)
-        train_data_iterator = iter(train_dataloader)
+    # Persistent path: build ONE DataLoader + a stateful sampler whose indices
+    # are swapped per window, so workers fork once and are reused across all
+    # windows (eliminates the per-window dataloader respawn + first-batch
+    # warmup). The non-persistent path recreates a DataLoader per window.
+    window_sampler: Optional[StreamingWindowSampler] = None
+    persistent_dl: Optional[DataLoader] = None
+    if persistent_loader:
+        world_size = (
+            torch.distributed.get_world_size()
+            if torch.distributed.is_initialized()
+            else 1
+        )
+        window_sampler = StreamingWindowSampler(rank=rank, world_size=world_size)
+        persistent_dl = make_persistent_streaming_dataloader(
+            dataset=dataset, sampler=window_sampler
+        )
+
+    def _window_iter(ts: int):
+        if persistent_loader:
+            assert window_sampler is not None and persistent_dl is not None
+            window_sampler.set_window(dataset.dataset.window_indices(ts))  # pyre-ignore [16]
+            return iter(persistent_dl)
+        return iter(make_streaming_dataloader(dataset=dataset, ts=ts))
+    # Windows are [start_ts, start_ts + num_train_ts); each step trains window T
+    # then evals window T+1, so the last eval window is start_ts + num_train_ts,
+    # which must be < num_windows(). Anchors require >= history_length prior
+    # events, so the earliest windows are near-empty warm-up — use start_ts to
+    # begin at a dense window. Clamp instead of failing.
+    if hasattr(dataset.dataset, "num_windows"):
+        available = dataset.dataset.num_windows()  # pyre-ignore [16]
+        max_count = max(0, available - 1 - start_ts)
+        if num_train_ts > max_count:
+            logger.warning(
+                f"start_ts={start_ts} + num_train_ts={num_train_ts} exceeds "
+                f"available windows ({available}); clamping num_train_ts to {max_count}."
+            )
+            num_train_ts = max_count
+    def _run_train_window(train_data_iterator, label: Optional[str] = None) -> None:
+        train_batch_idx = 0
+        first_wait: Optional[float] = None
         while True:
             model.train()
+            _t_next = time.perf_counter() if (label and rank == 0) else None
             try:
                 sample = next(train_data_iterator)
             except StopIteration:
                 break
+            if _t_next is not None and first_wait is None:
+                first_wait = time.perf_counter() - _t_next
             optimizer.zero_grad()
             sample.to(device)
             (
@@ -958,18 +1126,25 @@ def streaming_train_eval_loop(
                 profiler.step()
             if num_train_batches is not None and train_batch_idx >= num_train_batches:
                 break
-        eval_ts = train_ts + 1
-        dataset.dataset.is_eval = True  # pyre-ignore [16]
+        if label and rank == 0 and first_wait is not None:
+            logger.info(
+                f"[boundary] {label} train first-batch data-wait={first_wait * 1000:.1f}ms"
+            )
+
+    def _run_eval_window(eval_data_iterator, label: Optional[str] = None) -> None:
         model.eval()
-        eval_batch_idx: int = 0
-        eval_dataloader = make_streaming_dataloader(dataset=dataset, ts=eval_ts)
-        eval_data_iterator = iter(eval_dataloader)
+        eval_batch_idx = 0
+        first_wait: Optional[float] = None
+        _t_enter = time.perf_counter() if (label and rank == 0) else None
         with torch.no_grad():
             while True:
+                _t_next = time.perf_counter() if (label and rank == 0) else None
                 try:
                     sample = next(eval_data_iterator)
                 except StopIteration:
                     break
+                if _t_next is not None and first_wait is None:
+                    first_wait = time.perf_counter() - _t_next
                 sample.to(device)
                 (
                     _,
@@ -1001,9 +1176,18 @@ def streaming_train_eval_loop(
                     break
             for k, v in metric_logger.compute(mode="eval").items():
                 print(f"{k}: {v}")
+        if label and rank == 0 and _t_enter is not None:
+            _eval_total = time.perf_counter() - _t_enter
+            _fw = (first_wait * 1000) if first_wait is not None else float("nan")
+            logger.info(
+                f"[boundary] {label} eval first-batch data-wait={_fw:.1f}ms "
+                f"total_eval={_eval_total * 1000:.1f}ms batches={eval_batch_idx}"
+            )
+
+    def _maybe_checkpoint(train_ts: int) -> None:
         if (
             train_ts % checkpoint_frequency == 0 and train_ts > 0
-        ) or train_ts == num_train_ts - 1:
+        ) or train_ts == start_ts + num_train_ts - 1:
             save_dmp_checkpoint(
                 model=model,
                 optimizer=optimizer,
@@ -1012,6 +1196,64 @@ def streaming_train_eval_loop(
                 batch_idx=train_ts,
             )
 
+    train_ts_list = list(range(start_ts, start_ts + num_train_ts))
+    if persistent_loader and double_buffer:
+        # Double-buffered: next window prepared in the background during the
+        # current window's compute. Eval (if enabled) uses its own pre-forked
+        # pool, primed up front on the main thread so no fork races a bg thread.
+        prefetcher = _PrefetchingWindowLoader(
+            dataset=dataset,
+            sampler_factory=lambda: StreamingWindowSampler(rank, world_size),
+            dl_factory=lambda s: make_persistent_streaming_dataloader(
+                dataset=dataset, sampler=s
+            ),
+        )
+        eval_sampler: Optional[StreamingWindowSampler] = None
+        eval_dl: Optional[DataLoader] = None
+        # Eval iterator is built one window ahead: the eval pool (idle while the
+        # current train window runs) prefetches the eval window's first batches
+        # concurrently with train compute, so eval starts warm (hides the
+        # ~0.5s eval first-batch stall). yambda's sample content depends only on
+        # the sampler window, not is_eval, so prefetching during train is safe.
+        eval_iter: Optional[Iterator] = None
+        if eval_each_window and len(train_ts_list) > 0:
+            eval_sampler = StreamingWindowSampler(rank, world_size)
+            eval_dl = make_persistent_streaming_dataloader(
+                dataset=dataset, sampler=eval_sampler
+            )
+            # Fork the eval pool now (main thread, before any prefetch thread)
+            # and kick off prefetch of the first eval window (train_ts_list[0]+1).
+            eval_sampler.set_window(
+                dataset.dataset.window_indices(train_ts_list[0] + 1)  # pyre-ignore [16]
+            )
+            eval_iter = iter(eval_dl)
+        n_train = len(train_ts_list)
+        for i, (train_ts, train_data_iterator) in enumerate(
+            prefetcher.stream(train_ts_list)
+        ):
+            dataset.dataset.is_eval = False  # pyre-ignore [16]
+            _run_train_window(train_data_iterator, label=f"train_ts={train_ts}")
+            if eval_each_window:
+                dataset.dataset.is_eval = True  # pyre-ignore [16]
+                assert eval_sampler is not None and eval_dl is not None
+                _run_eval_window(eval_iter, label=f"eval_ts={train_ts + 1}")
+                # Re-arm the eval pool for the next window so it prefetches
+                # during the upcoming train window.
+                if i + 1 < n_train:
+                    eval_sampler.set_window(
+                        dataset.dataset.window_indices(train_ts + 2)  # pyre-ignore [16]
+                    )
+                    eval_iter = iter(eval_dl)
+            _maybe_checkpoint(train_ts)
+    else:
+        for train_ts in train_ts_list:
+            dataset.dataset.is_eval = False  # pyre-ignore [16]
+            _run_train_window(_window_iter(train_ts))
+            if eval_each_window:
+                dataset.dataset.is_eval = True  # pyre-ignore [16]
+                _run_eval_window(_window_iter(train_ts + 1))
+            _maybe_checkpoint(train_ts)
+
     eval_ts = num_train_ts
     dataset.dataset.is_eval = True
     model.eval()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index f276780c2..f261cda97 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -188,10 +188,17 @@ def handle_fn(p: torch.profiler.profile) -> None:
                 sort_by="self_cuda_time_total"
             )
         )
-        p.export_chrome_trace(path)
-        logger.warning(f"Trace written to: {path}")
-        if keep_n_active is not None and keep_n_active > 0:
-            _trim_warmup_from_trace(path, keep_n_active)
+        # Tracing is best-effort: a write/trim failure (permissions, disk full,
+        # malformed export) must never crash the training run. Degrade to a
+        # warning so the loop continues — especially important since streaming
+        # enables output_trace by default.
+        try:
+            p.export_chrome_trace(path)
+            logger.warning(f"Trace written to: {path}")
+            if keep_n_active is not None and keep_n_active > 0:
+                _trim_warmup_from_trace(path, keep_n_active)
+        except Exception as exc:
+            logger.warning(f"Trace export/trim failed for {path}: {exc!r} (skipping)")
 
     return handle_fn
 
@@ -689,7 +696,13 @@ def run_results_dir(run_name: str = "default", subdir: str = "results") -> str:
 
 
 @gin.configurable
-def get_dataset(name: str, new_path_prefix: str = "", history_length: Optional[int] = None):
+def get_dataset(
+    name: str,
+    new_path_prefix: str = "",
+    history_length: Optional[int] = None,
+    streaming_window_seconds: int = 86400,
+    streaming_sort_within_window: bool = False,
+):
     """
     Get dataset class and configuration by name.
 
@@ -815,6 +828,10 @@ def get_dataset(name: str, new_path_prefix: str = "", history_length: Optional[i
                 "history_length": history_length if history_length is not None else 4096,
                 "scan_window": 20000,
                 "cross_specs": YAMBDA_5B_CROSS_SPECS,
+                # Temporal-streaming knobs (only used under --mode
+                # streaming-train-eval; ignored by the default train-eval path).
+                "streaming_window_seconds": streaming_window_seconds,
+                "streaming_sort_within_window": streaming_sort_within_window,
             },
         )
     if name == "sampled-streaming-100b":
diff --git a/recommendation_v4/scripts/launch_smoke_8gpu.sh b/recommendation_v4/scripts/launch_smoke_8gpu.sh
index 09a3af2ba..94886dc87 100755
--- a/recommendation_v4/scripts/launch_smoke_8gpu.sh
+++ b/recommendation_v4/scripts/launch_smoke_8gpu.sh
@@ -58,4 +58,4 @@ fi
 echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
 
 python -m generative_recommenders.dlrm_v3.train.train_ranker \
-    --dataset yambda-5b --mode train-eval 2>&1 | tee -a "$LOG"
+    --dataset yambda-5b --mode "${MODE:-streaming-train-eval}" 2>&1 | tee -a "$LOG"

From d017bcc15064c647574b0bcb8347945bcb69b393 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 3 Jun 2026 12:30:01 -0500
Subject: [PATCH 027/113] dlrmv4: disable checkpointing by default; fix recipe
 torch note
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

save_dmp_checkpoint.path now resolves from $CKPT_PATH and defaults to empty,
so checkpoints (a full DMP is ~100s of GB, and the streaming loop always saves
the final window) are off unless explicitly enabled. Also drop the stale
training-recipe sentence claiming native torch is kept — it contradicts the
dependency table, which replaces torch and keeps only the image's triton.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/docs/training_recipe.md                  | 3 ---
 .../dlrm_v3/train/gin/yambda_5b.gin                        | 7 ++++++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/docs/training_recipe.md b/recommendation_v4/docs/training_recipe.md
index 28c88ded2..cf31a9fff 100644
--- a/recommendation_v4/docs/training_recipe.md
+++ b/recommendation_v4/docs/training_recipe.md
@@ -24,9 +24,6 @@ mixed-precision training.
 rocm/primus:v26.3
 ```
 
-The image's native PyTorch is kept as-is and must not be reinstalled — it is
-the ROCm-matched build used by triton/fbgemm.
-
 ### Dependency versions
 
 Aligned with the B200 path: same torch major.minor, same torchrec commit,
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 38cdd9e3d..53066ef9a 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -158,4 +158,9 @@ Profiler.trace_dir = @run_results_dir()
 MetricsLogger.tensorboard_log_path = "/tmp/tb/yambda_5b/"
 MetricsLogger.world_size = 8
 MetricsLogger.auc_threshold = 0.80275
-save_dmp_checkpoint.path = "/apps/chcai/ckpts/yambda_5b/"
+# Checkpointing disabled by default — a full DMP checkpoint is ~100s of GB and
+# the streaming loop always saves on the final window. save_dmp_checkpoint
+# no-ops on the empty path. Set $CKPT_PATH to a directory to re-enable.
+save_dmp_checkpoint.path = @ckpt/env_path()
+ckpt/env_path.key     = "CKPT_PATH"
+ckpt/env_path.default = ""

From 7d11c1771f8a6ec8ab980e3eed18a1d099644e49 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 3 Jun 2026 17:54:07 -0500
Subject: [PATCH 028/113] dlrmv4: ROCm-only Perfetto trace render fixes at
 export time

Add in-process trace postprocessing in the profiler on_trace_ready
callback to fix two ROCm/roctracer rendering artifacts that make MI350X
traces look wrong in Perfetto (the timing is correct, only the layout):

- _normalize_profilerstep_layout: collapse the fragmented GPU-side
  ProfilerStep#N spans (roctracer splits a step across the HIP null +
  compute streams) into one full-width span per step on the busiest
  compute stream, matching the CUDA look.
- _deoverlap_gpu_slices: pull back sub-us kernel end timestamps so
  back-to-back kernels don't touch/overlap; Perfetto otherwise nests the
  later (long) kernel inside the tiny epilogue and clips it to zero width,
  hiding kernels like _hstu_attn_bwd. Leaves a ~1ns gap (exact end==start
  is just as fatal as an overlap) and leaves real nesting untouched.

Both passes are gated behind _is_rocm() (torch.version.hip) so they are
complete no-ops on CUDA/B200, which don't have these artifacts. All
best-effort: failures degrade to a warning and never crash training.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../generative_recommenders/dlrm_v3/utils.py  | 232 ++++++++++++++++++
 1 file changed, 232 insertions(+)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index f261cda97..008e41f07 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -143,6 +143,227 @@ def _keep(e: dict) -> bool:
     )
 
 
+# GPU activity categories used to detect GPU stream rows and their busy time.
+_GPU_KERNEL_CATS = frozenset({"kernel", "gpu_memcpy", "gpu_memset"})
+
+
+def _is_rocm() -> bool:
+    """True on ROCm/AMD builds (``torch.version.hip`` set), False on CUDA/B200.
+
+    The ProfilerStep-layout normalization and the sub-us kernel de-overlap are
+    workarounds for how roctracer projects annotations/kernels onto HIP streams;
+    CUDA/CUPTI traces don't have those artifacts, so these passes must be skipped
+    on NVIDIA to avoid touching otherwise-correct traces.
+    """
+    return getattr(torch.version, "hip", None) is not None
+
+
+def _normalize_profilerstep_layout(path: str) -> None:
+    """Collapse fragmented GPU-side ``ProfilerStep#N`` spans into one span/step.
+
+    ``torch.profiler`` emits ``ProfilerStep#N`` as a CPU ``user_annotation`` that
+    Kineto projects onto the GPU timeline as ``gpu_user_annotation`` spans. On
+    CUDA the blocking H2D copy shares the compute stream, so each step projects
+    onto a single GPU stream and renders as one full-width span. On ROCm a
+    blocking H2D copy lands on HIP's null stream (a different stream than the
+    non-null compute stream), so the step splits across two GPU rows and looks
+    truncated in Perfetto — a pure rendering artifact (every kernel is still
+    captured, and the underlying GPU is busy for the whole step).
+
+    This rewrites each per-step GPU ``ProfilerStep`` annotation to a single span
+    on the rank's busiest (compute) GPU stream, covering the kernel extent inside
+    that step's CPU window. Works on a raw per-rank trace (GPU streams are tids
+    under one pid) by keying the busiest stream on ``(pid, tid)``. No-op when the
+    annotation already lives on a single GPU stream (the CUDA case), so it is
+    safe to run on every platform. Mutates the file in place.
+    """
+    import json as _json
+
+    with open(path) as f:
+        d = _json.load(f)
+    events = d.get("traceEvents", [])
+
+    # Per (pid,tid) GPU busy time -> identify the busiest = compute stream.
+    stream_busy: Dict[tuple, int] = {}
+    for e in events:
+        if e.get("ph") == "X" and e.get("cat") in _GPU_KERNEL_CATS:
+            dur = e.get("dur", 0)
+            if dur > 0:
+                key = (e.get("pid"), e.get("tid"))
+                stream_busy[key] = stream_busy.get(key, 0) + dur
+    if not stream_busy:
+        return
+    busiest = max(stream_busy, key=lambda k: stream_busy[k])
+
+    # Existing GPU-side ProfilerStep spans and the streams they sit on.
+    gpu_ps_streams = set()
+    template = None
+    for e in events:
+        if e.get("cat") == "gpu_user_annotation" and str(
+            e.get("name", "")
+        ).startswith("ProfilerStep"):
+            gpu_ps_streams.add((e.get("pid"), e.get("tid")))
+            if template is None:
+                template = e
+    # No fragmentation (single stream or none) -> leave the trace untouched.
+    if len(gpu_ps_streams) <= 1:
+        return
+
+    # CPU ProfilerStep windows: step name -> [min ts, max end].
+    cpu_win: Dict[str, list] = {}
+    for e in events:
+        if (
+            e.get("cat") == "user_annotation"
+            and e.get("ph") == "X"
+            and str(e.get("name", "")).startswith("ProfilerStep")
+        ):
+            ts = e.get("ts", 0)
+            end = ts + e.get("dur", 0)
+            w = cpu_win.get(e["name"])
+            if w is None:
+                cpu_win[e["name"]] = [ts, end]
+            else:
+                w[0] = min(w[0], ts)
+                w[1] = max(w[1], end)
+
+    # GPU kernel extents (any stream) for clamping each step's span.
+    gpu_kernels = [
+        (e.get("ts", 0), e.get("ts", 0) + e.get("dur", 0))
+        for e in events
+        if e.get("ph") == "X"
+        and e.get("cat") in _GPU_KERNEL_CATS
+        and e.get("dur", 0) > 0
+    ]
+
+    new_spans = []
+    for sname, (cs, ce) in cpu_win.items():
+        ks = [(ts, end) for ts, end in gpu_kernels if end > cs and ts < ce]
+        if not ks:
+            continue
+        gmin = min(ts for ts, _ in ks)
+        gmax = max(end for _, end in ks)
+        span = dict(template) if template else {}
+        span.update(
+            {
+                "ph": "X",
+                "cat": "gpu_user_annotation",
+                "name": sname,
+                "pid": busiest[0],
+                "tid": busiest[1],
+                "ts": gmin,
+                "dur": gmax - gmin,
+                "args": {"normalized_profilerstep": True},
+            }
+        )
+        new_spans.append(span)
+
+    if not new_spans:
+        return
+
+    out = [
+        e
+        for e in events
+        if not (
+            e.get("cat") == "gpu_user_annotation"
+            and str(e.get("name", "")).startswith("ProfilerStep")
+        )
+    ]
+    dropped = len(events) - len(out)
+    out.extend(new_spans)
+    d["traceEvents"] = out
+    with open(path, "w") as f:
+        _json.dump(d, f)
+    logger.warning(
+        f"Normalized GPU ProfilerStep layout in {path}: dropped {dropped} "
+        f"fragmented span(s) across {len(gpu_ps_streams)} stream(s), wrote "
+        f"{len(new_spans)} span(s) on busiest stream pid={busiest[0]} "
+        f"tid={busiest[1]}"
+    )
+
+
+def _deoverlap_gpu_slices(path: str, max_snap_us: float = 5.0) -> None:
+    """Remove sub-microsecond kernel overlaps that break Perfetto's renderer.
+
+    Perfetto draws all ``ph=="X"`` slices on a single track (one ``(pid, tid)``)
+    as a strict nested stack ordered by start time: a slice that *opens* while a
+    previous slice on the same track is still open is treated as that slice's
+    child and is **clipped to the parent's end**. ROCm's roctracer reports
+    per-stream kernel timestamps at ns granularity, so two back-to-back kernels
+    on the same compute stream occasionally overlap by a fraction of a
+    microsecond (e.g. an 88 ns ``elementwise`` epilogue ending 0.075 us *after*
+    the next 21 ms ``_hstu_attn_bwd`` kernel begins). Perfetto then nests the
+    long kernel inside the tiny one and clips it to a sub-pixel sliver, so the
+    kernel "disappears" from the timeline even though it is fully present in the
+    JSON.
+
+    This pulls each slice's end back to just *before* the next slice's start
+    whenever they overlap by less than ``max_snap_us`` (a measurement artifact,
+    not real concurrency — kernels on one stream are serialized), leaving genuine
+    nesting (a small kernel fully contained in a larger one) untouched. The
+    adjustment is sub-microsecond and does not change any reported duration
+    meaningfully. Mutates the file in place; best-effort.
+
+    Critically, the slices are separated by a tiny ``_GAP_US`` (~1 ns) rather
+    than snapped to an *exactly equal* end==start timestamp. A coincident
+    end==start is just as fatal as an overlap in Perfetto: it nests the next
+    slice inside the previous one and clips it to zero width (this is the ~1 ns
+    gap that roctracer leaves between cleanly-rendered back-to-back kernels). So
+    we also fix exact-touch (``a_end == b.ts``) boundaries, not just overlaps.
+    """
+    import json as _json
+    from collections import defaultdict
+
+    # ~1 ns. Matches the natural inter-kernel gap roctracer leaves between
+    # back-to-back kernels that Perfetto already renders correctly. Must be
+    # strictly > 0 so end != start after the nudge.
+    _GAP_US = 0.001
+
+    with open(path) as f:
+        d = _json.load(f)
+    events = d.get("traceEvents", [])
+
+    tracks: Dict[tuple, list] = defaultdict(list)
+    for e in events:
+        if (
+            e.get("ph") == "X"
+            and e.get("cat") in _GPU_KERNEL_CATS
+            and e.get("dur", 0) > 0
+        ):
+            tracks[(e.get("pid"), e.get("tid"))].append(e)
+
+    snapped = 0
+    max_clip = 0.0
+    for sl in tracks.values():
+        # Sort by start, then longest-first so a container precedes the slices
+        # it nests; consecutive pairs are then either disjoint, properly nested,
+        # or a tiny artifact overlap.
+        sl.sort(key=lambda e: (e["ts"], -e["dur"]))
+        for i in range(len(sl) - 1):
+            a = sl[i]
+            b = sl[i + 1]
+            a_end = a["ts"] + a["dur"]
+            b_end = b["ts"] + b["dur"]
+            # Touching (a_end == b.ts) or partial overlap (a ends inside b) both
+            # break rendering; true containment (a_end >= b_end) is valid nesting
+            # and is left alone.
+            if b["ts"] <= a_end < b_end:
+                desired_end = b["ts"] - _GAP_US
+                clip = a_end - desired_end
+                if a["ts"] < desired_end and 0 < clip < max_snap_us:
+                    a["dur"] = desired_end - a["ts"]
+                    snapped += 1
+                    if clip > max_clip:
+                        max_clip = clip
+
+    if snapped:
+        with open(path, "w") as f:
+            _json.dump(d, f)
+        logger.warning(
+            f"De-overlapped GPU slices in {path}: snapped {snapped} sub-us "
+            f"overlap(s) (max {max_clip:.3f}us) so Perfetto renders every kernel"
+        )
+
+
 def _on_trace_ready_fn(
     rank: Optional[int] = None,
     trace_dir: str = "/tmp/dlrm_v3_traces",
@@ -197,6 +418,17 @@ def handle_fn(p: torch.profiler.profile) -> None:
             logger.warning(f"Trace written to: {path}")
             if keep_n_active is not None and keep_n_active > 0:
                 _trim_warmup_from_trace(path, keep_n_active)
+            # ROCm/AMD-only rendering fixes. CUDA/CUPTI (e.g. B200) traces don't
+            # exhibit the fragmented-ProfilerStep or sub-us kernel-overlap
+            # artifacts, so skip entirely on NVIDIA to avoid touching otherwise
+            # correct traces. Best-effort like trim above.
+            if _is_rocm():
+                # Normalize the GPU-side ProfilerStep layout so ROCm traces
+                # render with one full-width step span per stream like CUDA.
+                _normalize_profilerstep_layout(path)
+                # Snap roctracer's sub-us kernel overlaps so Perfetto doesn't
+                # mis-nest and hide long kernels.
+                _deoverlap_gpu_slices(path)
         except Exception as exc:
             logger.warning(f"Trace export/trim failed for {path}: {exc!r} (skipping)")
 

From e0242179a1d6494fc01d85f9afd6295212704993 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 3 Jun 2026 19:17:23 -0500
Subject: [PATCH 029/113] dlrmv4: ROCm annotation de-overlap so phase spans
 render full width

Add _deoverlap_gpu_annotations to the trace-export postprocessing, the
annotation-boundary analog of the kernel de-overlap. Kineto projects the
forward/backward phase annotations (## user_forward ##, ## item_forward ##,
## stu_* ##, ...) onto the GPU stream as a chain of end-to-end siblings.
The absolute step timestamps are ~5.4e12 us, where a float64's quantum is
~1 ns, so a sibling boundary that should be coincident lands a few ns off;
when the earlier sibling ends at/after the next one's start, Perfetto nests
and clips the next span to a sliver -- e.g. the 100+ ms ## user_forward ##
vanishes on some ranks/steps purely by rounding luck.

Since annotations form a real nesting hierarchy (user_forward contains the
stu_* spans and their kernels), this walks the per-track slice stack and
only snaps a slice back when the next slice extends beyond it (siblings,
not parent/child), guarding against trimming into a span's own descendants.
It also snaps kernel tails that straddle an annotation boundary. Gated by
_is_rocm() (no-op on B200/CUDA) and best-effort like the other passes.

Verified end-to-end on an 8-rank MI350X run: ## user_forward ## renders
40/40 (was 9/40), total clipped annotations 1352 -> ~5.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../generative_recommenders/dlrm_v3/utils.py  | 125 ++++++++++++++++++
 1 file changed, 125 insertions(+)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 008e41f07..5a1e7fb9d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -364,6 +364,128 @@ def _deoverlap_gpu_slices(path: str, max_snap_us: float = 5.0) -> None:
         )
 
 
+def _deoverlap_gpu_annotations(path: str, max_snap_us: float = 5.0) -> None:
+    """Separate touching/overlapping *sibling* GPU annotations so Perfetto draws
+    each one full width (the B200-style stacked layout).
+
+    Same root cause as :func:`_deoverlap_gpu_slices`, but at the annotation
+    boundary instead of the kernel boundary. The forward/backward phase
+    annotations Kineto projects onto the GPU stream (``## item_forward ##``,
+    ``## user_forward ##``, ``## multitask_module ##``, the ``## stu_* ##``
+    pairs, ...) are emitted as a chain of siblings laid end-to-end: each is meant
+    to end exactly where the next begins. Perfetto stores timestamps as int64 ns,
+    and the absolute step timestamps are ~5.4e12 us where a float64's quantum is
+    already ~1 ns, so a sibling boundary that should be coincident instead lands
+    a few ns off. When the earlier sibling's end falls *at or after* the next
+    sibling's start, Perfetto nests the next sibling inside it and clips it to a
+    sub-pixel sliver — so e.g. the 100+ ms ``## user_forward ##`` span vanishes on
+    some ranks/steps and renders on others purely by rounding luck.
+
+    Unlike kernels (all flat on one stream), annotations form a real nesting
+    hierarchy — ``## user_forward ##`` legitimately *contains* the ``## stu_* ##``
+    spans and their kernels — so this cannot blindly snap consecutive slices. It
+    walks the per-track slice stack (sorted by start, longest-first) and only
+    snaps a slice ``a`` back when the next slice ``b`` is **not** contained in it
+    (``b`` extends beyond ``a``'s end), i.e. they are siblings rather than
+    parent/child. Real containment is left untouched, and a snap is skipped if it
+    would clip into ``a``'s own descendants (kernels or child annotations).
+    Mutates the file in place; best-effort. Run after :func:`_deoverlap_gpu_slices`
+    so kernel boundaries are already clean.
+    """
+    import json as _json
+    from collections import defaultdict
+
+    # ~2 ns. The annotation boundaries sit at ~5.4e12 us where a float64's
+    # quantum is ~0.98 ns, so a 1 ns nudge can round back onto the neighbour's
+    # timestamp (an exact touch, which Perfetto still nests+clips). 2 ns (~2
+    # quanta) reliably separates them and is still far below any visible width.
+    _GAP_US = 0.002
+
+    with open(path) as f:
+        d = _json.load(f)
+    events = d.get("traceEvents", [])
+
+    # Stack the full per-track hierarchy over BOTH kernels and annotations so a
+    # parent annotation knows the extent of its descendants (the snap guard),
+    # but only annotation slices are ever trimmed.
+    _ANN = "gpu_user_annotation"
+    tracks: Dict[tuple, list] = defaultdict(list)
+    for e in events:
+        if (
+            e.get("ph") == "X"
+            and e.get("dur", 0) > 0
+            and (e.get("cat") in _GPU_KERNEL_CATS or e.get("cat") == _ANN)
+        ):
+            tracks[(e.get("pid"), e.get("tid"))].append(e)
+
+    snapped = 0
+    max_clip = 0.0
+    for sl in tracks.values():
+        # Longest-first on ties so a container precedes the slices it nests.
+        sl.sort(key=lambda e: (e["ts"], -e["dur"]))
+        # Each frame: [event, max_descendant_end]. The stack holds the chain of
+        # currently-open ancestors for the slice being placed.
+        stack: list = []
+        for b in sl:
+            b_ts = b["ts"]
+            b_end = b_ts + b["dur"]
+            while stack:
+                a = stack[-1][0]
+                a_end = a["ts"] + a["dur"]
+                if a_end < b_ts:
+                    # a closed strictly before b begins -> disjoint sibling, pop.
+                    frame = stack.pop()
+                    eff = frame[0]["ts"] + frame[0]["dur"]
+                    if stack:
+                        stack[-1][1] = max(stack[-1][1], eff, frame[1])
+                    continue
+                if a_end < b_end:
+                    # b starts at/inside a but extends past a's end => they are
+                    # siblings (not parent/child), and a's tail nests+clips b in
+                    # Perfetto. Snap a's end to just before b. This fires for both
+                    # annotation tails (## item_forward ## overhanging
+                    # ## user_forward ##) and kernel tails that straddle an
+                    # annotation boundary (a layer-norm kernel ending a few ns
+                    # past the start of the next phase span) -- both are sub-us
+                    # roctracer/rounding artifacts, since kernels on one stream
+                    # are serialized and phase spans are sequential.
+                    desired_end = b_ts - _GAP_US
+                    clip = a_end - desired_end
+                    # Guard: only snap when a's deepest descendant ends at or
+                    # before b's start. If a child (kernel or nested span)
+                    # actually extends *past* b.ts, trimming a wouldn't fix b's
+                    # clipping (the child would still nest b) and could drop a
+                    # real child into b's territory, so leave it. A descendant
+                    # ending exactly at the boundary is itself rounding noise and
+                    # is clipped by <=1 ns, which is fine.
+                    if (
+                        a["ts"] < desired_end
+                        and stack[-1][1] <= b_ts
+                        and 0 < clip < max_snap_us
+                    ):
+                        a["dur"] = desired_end - a["ts"]
+                        snapped += 1
+                        if clip > max_clip:
+                            max_clip = clip
+                    frame = stack.pop()
+                    eff = frame[0]["ts"] + frame[0]["dur"]
+                    if stack:
+                        stack[-1][1] = max(stack[-1][1], eff, frame[1])
+                    continue
+                # a_end >= b_end: a fully contains b -> b is a child, stop.
+                break
+            stack.append([b, b_ts])
+
+    if snapped:
+        with open(path, "w") as f:
+            _json.dump(d, f)
+        logger.warning(
+            f"De-overlapped GPU annotations in {path}: snapped {snapped} sub-us "
+            f"sibling overlap(s) (max {max_clip:.3f}us) so Perfetto renders every "
+            f"annotation full width"
+        )
+
+
 def _on_trace_ready_fn(
     rank: Optional[int] = None,
     trace_dir: str = "/tmp/dlrm_v3_traces",
@@ -429,6 +551,9 @@ def handle_fn(p: torch.profiler.profile) -> None:
                 # Snap roctracer's sub-us kernel overlaps so Perfetto doesn't
                 # mis-nest and hide long kernels.
                 _deoverlap_gpu_slices(path)
+                # Same fix at the annotation-sibling boundary so phase spans
+                # (## user_forward ##, ## stu_* ##, ...) render full width.
+                _deoverlap_gpu_annotations(path)
         except Exception as exc:
             logger.warning(f"Trace export/trim failed for {path}: {exc!r} (skipping)")
 

From f447075180abb932193f57a736199dff937ccece Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 3 Jun 2026 23:11:02 -0500
Subject: [PATCH 030/113] dlrmv4: streaming checkpoint resume + step/time
 checkpoint cadences

Make streaming-train-eval crash-resumable and add general checkpoint
cadence controls:

- Atomic checkpoint saves (.tmp dir + rename), keep_last_n pruning, and
  swap-aside .old overwrite so a save can safely replace an existing
  train_ts dir; stale .tmp/.old swept on the next save.
- Per-rank RNG snapshot/restore for bit-equal dropout replay on resume;
  auto-latest-subdir resolution + (train_ts, batch_idx_in_window) resume
  hint so a run re-enters a partial window and skips already-trained
  batches exact-once.
- Three independent in-window checkpoint cadences via a pure, testable
  decision helper: per-window batch count, monotonic global step
  (e.g. every 1000 steps), and wall-clock interval (e.g. hourly,
  rank-0-decided + broadcast to keep the save barrier in lockstep).
- gin/env bindings for all cadences + a test-only die_at_step hook.

Tests: checkpoint_cadence_test.py (cadence precedence/triggers) and an
end-to-end baseline/interrupt/resume harness (streaming_resume_test.{sh,py})
that gates on functional invariants (RNG restored, correct resumed step,
atomic save, keep_last_n) plus a loose trajectory-closeness bound.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/checkpoint.py                     | 274 ++++++++++++--
 .../dlrm_v3/train/gin/yambda_5b.gin           |  48 ++-
 .../train/tests/checkpoint_cadence_test.py    | 131 +++++++
 .../train/tests/streaming_resume_test.py      | 166 ++++++++
 .../dlrm_v3/train/train_ranker.py             |  14 +-
 .../dlrm_v3/train/utils.py                    | 358 ++++++++++++++++--
 .../generative_recommenders/dlrm_v3/utils.py  |  15 +
 .../scripts/streaming_resume_test.sh          | 242 ++++++++++++
 8 files changed, 1187 insertions(+), 61 deletions(-)
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/tests/checkpoint_cadence_test.py
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
 create mode 100755 recommendation_v4/scripts/streaming_resume_test.sh

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
index 33445bce9..aa3c9daa0 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
@@ -21,17 +21,29 @@
 """
 
 import gc
+import logging
 import os
+import random
+import shutil
 from datetime import datetime
-from typing import Any, Dict, Optional, Set
+from typing import Any, Dict, Optional, Set, Tuple
 
 import gin
+import numpy as np
 import torch
 from generative_recommenders.dlrm_v3.utils import MetricsLogger
 from torch.distributed.checkpoint.stateful import Stateful
 from torch.optim.optimizer import Optimizer
 from torchrec.distributed.types import ShardedTensor
 
+logger: logging.Logger = logging.getLogger(__name__)
+
+# Sentinel meaning "the saved window completed in full" — when the loop reads
+# this back it advances start_ts past the saved train_ts. Anything >=0 means the
+# saved checkpoint stopped mid-window after K batches; resume continues that
+# window at batch K.
+WINDOW_COMPLETE: int = -1
+
 
 class SparseState(Stateful):
     """
@@ -86,6 +98,114 @@ def load_dense_state_dict(model: torch.nn.Module, state_dict: Dict[str, Any]) ->
         own_state[name].copy_(param)
 
 
+def _rng_state(device: torch.device) -> Dict[str, Any]:
+    """Snapshot every RNG source bit-equal training depends on.
+
+    HSTU has stochastic dropout (input_dropout=0.2, linear_dropout_rate=0.1)
+    consuming the per-device CUDA RNG cycle each step. Without round-tripping
+    these, a resumed run draws different dropout masks and the resumed AUC
+    trajectory diverges from the uninterrupted run within a few steps.
+    """
+    return {
+        "cpu": torch.get_rng_state(),
+        "cuda": torch.cuda.get_rng_state(device),
+        "numpy": np.random.get_state(),
+        "random": random.getstate(),
+    }
+
+
+def _restore_rng_state(state: Dict[str, Any], device: torch.device) -> None:
+    torch.set_rng_state(state["cpu"])
+    torch.cuda.set_rng_state(state["cuda"], device)
+    np.random.set_state(state["numpy"])
+    random.setstate(state["random"])
+
+
+def _list_numeric_subdirs(base_path: str) -> list[str]:
+    """Return subdir names of `base_path` that look like an int, sorted ascending.
+
+    Filters out `*.tmp` (orphaned in-progress saves), `*.sparse/` and any other
+    non-numeric entries.
+    """
+    if not os.path.isdir(base_path):
+        return []
+    out: list[str] = []
+    for name in os.listdir(base_path):
+        if name.isdigit():
+            out.append(name)
+    return sorted(out, key=int)
+
+
+def _resolve_latest_subdir(path: str) -> str:
+    """Map a base ckpt dir → its highest-numbered numeric subdir.
+
+    Used so users can set `load_dmp_checkpoint.path = "<base>"` (or
+    `CKPT_PATH=<base>`) and automatically pick up the most recent save without
+    needing to know which step number to point at.     If `path` already names a leaf save (numeric basename) it's returned
+    unchanged. If the base dir has no numeric subdirs yet — the cold-start case
+    where ``CKPT_PATH`` is configured but nothing has been saved (e.g. the
+    interrupt phase of the resume test starts from a freshly-cleaned dir) — we
+    return ``""`` so ``load_*_checkpoint`` no-ops instead of asserting on a
+    missing ``sparse/.metadata``.
+    """
+    if not path:
+        return path
+    base = path.rstrip("/")
+    leaf = os.path.basename(base)
+    if leaf.isdigit():
+        return base  # already a leaf, caller knows what it wants
+    subs = _list_numeric_subdirs(base)
+    if not subs:
+        logger.info("No checkpoint subdirs under %s — cold start (no load).", base)
+        return ""  # nothing to load → load_*_checkpoint short-circuits
+    resolved = os.path.join(base, subs[-1])
+    logger.info("Auto-latest checkpoint: %s → %s", base, resolved)
+    return resolved
+
+
+def _prune_old_checkpoints(base_path: str, keep_last_n: int, just_saved_subdir: str) -> None:
+    """Delete numeric subdirs older than the keep_last_n most recent.
+
+    Defensive: never prune `just_saved_subdir` even if it would be evicted by
+    the keep_last_n window (shouldn't happen since we just wrote it, but
+    catches off-by-one bugs). Skipped entirely when keep_last_n<=0.
+    """
+    if keep_last_n <= 0:
+        return
+    subs = _list_numeric_subdirs(base_path)
+    if len(subs) <= keep_last_n:
+        return
+    to_prune = subs[:-keep_last_n]
+    for name in to_prune:
+        full = os.path.join(base_path, name)
+        if os.path.realpath(full) == os.path.realpath(just_saved_subdir):
+            continue
+        try:
+            shutil.rmtree(full)
+            logger.info("Pruned old checkpoint: %s", full)
+        except OSError as e:
+            logger.warning("Failed to prune %s: %s", full, e)
+
+
+def _cleanup_stale_tmps(base_path: str) -> None:
+    """Remove `*.tmp`/`*.old` subdirs left by a crashed prior save attempt.
+
+    `*.tmp` = an interrupted write; `*.old` = an interrupted atomic-overwrite
+    swap (see the promotion step in save_dmp_checkpoint). Both are non-numeric
+    so `_resolve_latest_subdir` already ignores them; this just reclaims disk.
+    """
+    if not os.path.isdir(base_path):
+        return
+    for name in os.listdir(base_path):
+        if name.endswith(".tmp") or name.endswith(".old"):
+            full = os.path.join(base_path, name)
+            try:
+                shutil.rmtree(full)
+                logger.warning("Removed stale checkpoint dir: %s", full)
+            except OSError as e:
+                logger.warning("Failed to remove stale dir %s: %s", full, e)
+
+
 @gin.configurable
 def save_dmp_checkpoint(
     model: torch.nn.Module,
@@ -94,32 +214,64 @@ def save_dmp_checkpoint(
     rank: int,
     batch_idx: int,
     path: str = "",
+    keep_last_n: int = 1,
+    train_ts: Optional[int] = None,
+    batch_idx_in_window: int = WINDOW_COMPLETE,
+    device: Optional[torch.device] = None,
 ) -> None:
     """
     Save a distributed model checkpoint including sparse and dense components.
 
-    Saves the model's sparse tensors using distributed checkpointing and dense
-    tensors, optimizer state, and metrics using standard PyTorch serialization.
+    Writes into a per-rank-coordinated atomic layout:
+        <path>/<batch_idx>.tmp/   ← directory written into during save
+        <path>/<batch_idx>/       ← atomically renamed from .tmp on success
+
+    A crash mid-save leaves the `.tmp/` orphan, which `_cleanup_stale_tmps`
+    sweeps on the next save attempt and which `_resolve_latest_subdir` ignores
+    (non-numeric basename). The previous successful `<N-K>/` remains valid.
 
     Args:
         model: The model to checkpoint.
         optimizer: The optimizer whose state should be saved.
         metric_logger: The metrics logger containing training/eval metrics.
         rank: The current process rank in distributed training.
-        batch_idx: The current batch index (used for checkpoint naming).
+        batch_idx: Subdir name (for streaming we set this == train_ts so the
+            on-disk layout monotonically increases).
         path: Base path for saving the checkpoint. If empty, no checkpoint is saved.
+        keep_last_n: Number of most-recent numeric subdirs to retain after a
+            successful save. Set 1 (default) for disk-bounded long runs;
+            <=0 disables pruning.
+        train_ts: For streaming-train-eval, the current train timestamp.
+            Stored in non_sparse.ckpt so resume knows which window to enter.
+        batch_idx_in_window: For streaming-train-eval, batches completed within
+            train_ts. WINDOW_COMPLETE (-1) means the window finished; resume
+            advances to train_ts+1. >=0 means crash happened mid-window; resume
+            re-enters train_ts at batch_idx_in_window.
+        device: CUDA device for the per-rank RNG snapshot. Required for
+            bit-equal trajectories across resume (HSTU dropout consumes the
+            per-device RNG cycle).
     """
     if path == "":
         return
-    now = datetime.now()
-    formatted_datetime = now.strftime("%Y_%m_%d_%H_%M_%S")
-    path = f"{path}/{batch_idx}"
-    if not os.path.exists(path) and rank == 0:
-        os.makedirs(path)
-    sparse_path = f"{path}/sparse/"
-    if not os.path.exists(sparse_path) and rank == 0:
-        os.makedirs(sparse_path)
-    non_sparse_ckpt = f"{path}/non_sparse.ckpt"
+    base_path = path
+    # Atomic-save layout: write to .tmp, rename to final, prune older.
+    tmp_subdir = f"{base_path}/{batch_idx}.tmp"
+    final_subdir = f"{base_path}/{batch_idx}"
+
+    if rank == 0:
+        _cleanup_stale_tmps(base_path)
+        # Always (re)write into a fresh .tmp. An existing `final_subdir` with the
+        # same batch_idx (e.g. a later in-window save for the same train_ts, or a
+        # deterministic re-run at the same step) is overwritten atomically at the
+        # promotion step below — NOT skipped here. Skipping would desync ranks:
+        # the collective barrier/checkpoint.save calls below run on *every* rank,
+        # so a rank-0-only early return deadlocks ranks 1..N on the next barrier.
+        shutil.rmtree(tmp_subdir, ignore_errors=True)
+        os.makedirs(tmp_subdir, exist_ok=True)
+        os.makedirs(f"{tmp_subdir}/sparse/", exist_ok=True)
+    torch.distributed.barrier()
+    sparse_path = f"{tmp_subdir}/sparse/"
+    non_sparse_ckpt = f"{tmp_subdir}/non_sparse.ckpt"
 
     sparse_tensor_keys = {
         k for k, v in model.state_dict().items() if isinstance(v, ShardedTensor)
@@ -148,9 +300,20 @@ def save_dmp_checkpoint(
                 "reg_metrics": regression_metric_state_dict,
                 "global_step": metric_logger.global_step,
                 "sparse_tensor_keys": sparse_tensor_keys,
+                # Streaming resume fields. Defaulted on load so old checkpoints
+                # (pre-streaming-resume) still load as a normal restart.
+                "train_ts": train_ts,
+                "batch_idx_in_window": batch_idx_in_window,
             },
             non_sparse_ckpt,
         )
+
+    # Per-rank RNG snapshot. Written even on a single rank because dropout's
+    # randomness comes from the CUDA generator which differs across devices.
+    if device is not None:
+        rng_path = f"{tmp_subdir}/rng_rank{rank}.pt"
+        torch.save(_rng_state(device), rng_path)
+
     torch.distributed.barrier()
     sparse_dict = {"sparse_dict": SparseState(model, sparse_tensor_keys)}
     torch.distributed.checkpoint.save(
@@ -158,7 +321,25 @@ def save_dmp_checkpoint(
         storage_writer=torch.distributed.checkpoint.FileSystemWriter(sparse_path),
     )
     torch.distributed.barrier()
-    print("checkpoint successfully saved")
+    # Promote .tmp → final, then prune. Done on rank 0 only since the directory
+    # operations are global filesystem state.
+    if rank == 0:
+        if os.path.exists(final_subdir):
+            # POSIX rename() refuses to replace a non-empty directory, so we
+            # can't os.replace(tmp, final) directly. Swap the old snapshot aside
+            # (instant rename), move the new one into place, then delete the old.
+            # The `.old` name is non-numeric → ignored by _resolve_latest_subdir
+            # and swept by _cleanup_stale_tmps on the next save if we crash mid-swap.
+            old_aside = f"{final_subdir}.old"
+            shutil.rmtree(old_aside, ignore_errors=True)
+            os.replace(final_subdir, old_aside)
+            os.replace(tmp_subdir, final_subdir)
+            shutil.rmtree(old_aside, ignore_errors=True)
+        else:
+            os.replace(tmp_subdir, final_subdir)
+        _prune_old_checkpoints(base_path, keep_last_n, final_subdir)
+        logger.info("checkpoint successfully saved → %s", final_subdir)
+    torch.distributed.barrier()
 
 
 @gin.configurable
@@ -190,24 +371,30 @@ def load_nonsparse_checkpoint(
     optimizer: Optional[Optimizer] = None,
     metric_logger: Optional[MetricsLogger] = None,
     path: str = "",
-) -> None:
+    rank: int = 0,
+) -> Tuple[Optional[int], int]:
     """
     Load non-sparse (dense) components from a checkpoint.
 
     Loads dense model parameters, and optionally optimizer state and metrics.
+    Also restores per-rank RNG state if a matching `rng_rank{rank}.pt` is found
+    next to `non_sparse.ckpt`.
 
-    Args:
-        model: The model to load dense parameters into.
-        device: The device to load tensors onto.
-        optimizer: Optional optimizer to restore state for.
-        metric_logger: Optional metrics logger to restore state for.
-        path: Base path of the checkpoint. If empty, no loading is performed.
+    Returns:
+        (train_ts, batch_idx_in_window) — the streaming resume hint stored at
+        save time. `(None, WINDOW_COMPLETE)` if not a streaming checkpoint or
+        no path supplied.
     """
     if path == "":
-        return
+        return None, WINDOW_COMPLETE
     non_sparse_ckpt = f"{path}/non_sparse.ckpt"
 
-    non_sparse_state_dict = torch.load(non_sparse_ckpt, map_location=device)
+    # weights_only=False: these are our own trusted checkpoints, and they hold
+    # non-tensor objects (optimizer/metric state dicts, numpy-backed RNG state)
+    # that PyTorch>=2.6's weights_only=True default refuses to unpickle.
+    non_sparse_state_dict = torch.load(
+        non_sparse_ckpt, map_location=device, weights_only=False
+    )
     load_dense_state_dict(model, non_sparse_state_dict["dense_dict"])
     print("dense checkpoint successfully loaded")
     if optimizer is not None:
@@ -226,6 +413,21 @@ def load_nonsparse_checkpoint(
         for i, m in enumerate(metric_logger.regression_metrics["eval"]):
             m.load_state_dict(regression_metric_state_dict["eval"][i])
 
+    # Per-rank RNG restore. Missing file = bit-equal trajectory not requested at
+    # save time; we silently continue (the test harness checks for both).
+    rng_path = f"{path}/rng_rank{rank}.pt"
+    if os.path.exists(rng_path):
+        # weights_only=False: RNG state is numpy/Python tuples, not tensors.
+        rng_state = torch.load(rng_path, map_location="cpu", weights_only=False)
+        _restore_rng_state(rng_state, device)
+        logger.info("RNG state restored from %s", rng_path)
+
+    train_ts = non_sparse_state_dict.get("train_ts")
+    batch_idx_in_window = non_sparse_state_dict.get(
+        "batch_idx_in_window", WINDOW_COMPLETE
+    )
+    return train_ts, batch_idx_in_window
+
 
 @gin.configurable
 def load_dmp_checkpoint(
@@ -234,25 +436,27 @@ def load_dmp_checkpoint(
     metric_logger: MetricsLogger,
     device: torch.device,
     path: str = "",
-) -> None:
+    rank: int = 0,
+) -> Tuple[Optional[int], int]:
     """
     Load a complete distributed model checkpoint (both sparse and dense components).
 
-    This is a convenience function that calls both load_sparse_checkpoint and
-    load_nonsparse_checkpoint.
+    `path` is auto-resolved: if it points at a directory containing numeric
+    subdirs (e.g. CKPT_PATH=<base>/), the highest-numbered subdir is used. If it
+    already names a leaf save (e.g. <base>/300), it's used as-is. Empty string =
+    no load.
 
-    Args:
-        model: The model to load the checkpoint into.
-        optimizer: The optimizer to restore state for.
-        metric_logger: The metrics logger to restore state for.
-        device: The device to load tensors onto.
-        path: Base path of the checkpoint. If empty, no loading is performed.
+    Returns:
+        (train_ts, batch_idx_in_window) — streaming resume hint. Callers that
+        don't need it can ignore.
     """
-    load_sparse_checkpoint(model=model, path=path)
-    load_nonsparse_checkpoint(
+    resolved = _resolve_latest_subdir(path)
+    load_sparse_checkpoint(model=model, path=resolved)
+    return load_nonsparse_checkpoint(
         model=model,
         optimizer=optimizer,
         metric_logger=metric_logger,
-        path=path,
+        path=resolved,
         device=device,
+        rank=rank,
     )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 53066ef9a..747e3876f 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -104,7 +104,12 @@ nts/env_int.default = 30
 streaming_train_eval_loop.start_ts = @sts/env_int()
 sts/env_int.key = "START_TS"
 sts/env_int.default = 150
-streaming_train_eval_loop.metric_log_frequency = 50
+# Per-step metric logging cadence. Default 50 (one compute_and_log GPU->CPU
+# sync per 50 batches). The streaming-resume test sets METRIC_LOG_FREQ=1 so
+# every step emits a parseable "Step N metrics" line for trajectory comparison.
+streaming_train_eval_loop.metric_log_frequency = @mlf/env_int()
+mlf/env_int.key     = "METRIC_LOG_FREQ"
+mlf/env_int.default = 50
 # Trace on by default: reuses the shared Profiler.* bindings below (5-step
 # window at step 52). The streaming step counter advances across train+eval
 # batches, so step 52 lands in the first (train) window's compute.
@@ -160,7 +165,46 @@ MetricsLogger.world_size = 8
 MetricsLogger.auc_threshold = 0.80275
 # Checkpointing disabled by default — a full DMP checkpoint is ~100s of GB and
 # the streaming loop always saves on the final window. save_dmp_checkpoint
-# no-ops on the empty path. Set $CKPT_PATH to a directory to re-enable.
+# no-ops on the empty path. Set $CKPT_PATH to a directory to re-enable; the
+# load path auto-resolves to the highest-numbered numeric subdir inside it.
 save_dmp_checkpoint.path = @ckpt/env_path()
 ckpt/env_path.key     = "CKPT_PATH"
 ckpt/env_path.default = ""
+load_dmp_checkpoint.path = @ckpt/env_path()
+# Retention: keep only the most-recent N numeric subdirs after each successful
+# save (atomic rename + prune-older). Override via $KEEP_LAST_N.
+save_dmp_checkpoint.keep_last_n = @kln/env_int()
+kln/env_int.key     = "KEEP_LAST_N"
+kln/env_int.default = 1
+# In-window checkpoint cadence for streaming-train-eval. 0 (default) = end-of-
+# window only; on crash mid-window the partial progress is lost. Set N>0 (e.g.
+# via $IN_WINDOW_CKPT_FREQ) for batch-granularity exact-once-on-resume — paired
+# with the auto-latest load above, the resumed run skips the K already-trained
+# batches of the partial window and continues with bit-equal trajectory.
+streaming_train_eval_loop.in_window_checkpoint_frequency = @iwcf/env_int()
+iwcf/env_int.key     = "IN_WINDOW_CKPT_FREQ"
+iwcf/env_int.default = 0
+# Global-step checkpoint cadence: save whenever the monotonic train global_step
+# crosses a multiple of N (a true "every 1000 steps" trigger that spans windows
+# and survives resume). 0 (default) = off. Override via $CKPT_STEP_FREQ.
+streaming_train_eval_loop.checkpoint_step_frequency = @csf/env_int()
+csf/env_int.key     = "CKPT_STEP_FREQ"
+csf/env_int.default = 0
+# Wall-clock checkpoint cadence in seconds: save when >= this many seconds have
+# elapsed since the last save (e.g. 3600 for hourly). Rank 0 owns the clock and
+# broadcasts the decision so all ranks save together. 0.0 (default) = off.
+# Override via $CKPT_TIME_INTERVAL_S.
+streaming_train_eval_loop.checkpoint_time_interval_s = @ctis/env_float()
+ctis/env_float.key     = "CKPT_TIME_INTERVAL_S"
+ctis/env_float.default = 0.0
+# Cap each train_ts window's batch count (mostly for the resume test driver).
+# Unset / 0 = use the full window.
+streaming_train_eval_loop.num_train_batches = @ntb/env_int()
+ntb/env_int.key     = "NUM_TRAIN_BATCHES"
+ntb/env_int.default = 0
+# Test-only failure injection: when >=0 and metric_logger.global_step['train']
+# reaches this, the process exits with code 42 right after the in-window save
+# fires. Used by `scripts/streaming_resume_test.sh` to verify resume.
+streaming_train_eval_loop.die_at_step = @das/env_int()
+das/env_int.key     = "DIE_AT_STEP"
+das/env_int.default = -1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/checkpoint_cadence_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/checkpoint_cadence_test.py
new file mode 100644
index 000000000..4a483c262
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/checkpoint_cadence_test.py
@@ -0,0 +1,131 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+"""Unit tests for `select_in_window_checkpoint_reason` — the pure decision that
+drives the streaming loop's three fine-grained checkpoint cadences:
+
+  * `in_window_checkpoint_frequency` — per-window-local batch count
+  * `checkpoint_step_frequency`      — monotonic global step ("every 1000 steps")
+  * `checkpoint_time_interval_s`     — wall-clock ("hourly")
+
+These run without a GPU / distributed init: the loop broadcasts a single
+`elapsed_since_last_save` from rank 0 and then calls this pure function, so
+exercising the function directly fully covers the trigger semantics.
+"""
+import unittest
+
+from generative_recommenders.dlrm_v3.train.utils import (
+    select_in_window_checkpoint_reason,
+)
+
+
+def _reason(
+    *,
+    batch: int = 1,
+    step: int = 1,
+    elapsed: float = 0.0,
+    in_window: int = 0,
+    step_freq: int = 0,
+    time_s: float = 0.0,
+) -> str | None:
+    return select_in_window_checkpoint_reason(
+        train_batch_idx=batch,
+        global_step=step,
+        elapsed_since_last_save=elapsed,
+        in_window_checkpoint_frequency=in_window,
+        checkpoint_step_frequency=step_freq,
+        checkpoint_time_interval_s=time_s,
+    )
+
+
+class CheckpointCadenceTest(unittest.TestCase):
+    def test_all_disabled_never_fires(self) -> None:
+        for batch in (1, 100, 1000):
+            for step in (1, 1000, 5000):
+                self.assertIsNone(_reason(batch=batch, step=step, elapsed=1e9))
+
+    def test_step_based_every_1000(self) -> None:
+        # Fires exactly on multiples of the step frequency.
+        self.assertEqual(_reason(step=1000, step_freq=1000), "global_step")
+        self.assertEqual(_reason(step=2000, step_freq=1000), "global_step")
+        # Does not fire just off a boundary.
+        self.assertIsNone(_reason(step=999, step_freq=1000))
+        self.assertIsNone(_reason(step=1001, step_freq=1000))
+
+    def test_step_zero_does_not_trigger(self) -> None:
+        # global_step==0 must not trivially satisfy `0 % N == 0`.
+        self.assertIsNone(_reason(step=0, step_freq=1000))
+
+    def test_time_based_interval(self) -> None:
+        # At/over the interval -> fires; under -> no save.
+        self.assertEqual(
+            _reason(step=3, elapsed=3600.0, time_s=3600.0), "time_interval"
+        )
+        self.assertEqual(
+            _reason(step=3, elapsed=4000.0, time_s=3600.0), "time_interval"
+        )
+        self.assertIsNone(_reason(step=3, elapsed=3599.9, time_s=3600.0))
+
+    def test_in_window_batch_cadence(self) -> None:
+        self.assertEqual(_reason(batch=5, in_window=5), "in_window_batch")
+        self.assertEqual(_reason(batch=10, in_window=5), "in_window_batch")
+        self.assertIsNone(_reason(batch=4, in_window=5))
+
+    def test_precedence_in_window_over_step_over_time(self) -> None:
+        # All three would fire this batch; precedence picks in_window first.
+        self.assertEqual(
+            _reason(
+                batch=5,
+                step=1000,
+                elapsed=9999.0,
+                in_window=5,
+                step_freq=1000,
+                time_s=3600.0,
+            ),
+            "in_window_batch",
+        )
+        # in_window not due this batch -> step wins over time.
+        self.assertEqual(
+            _reason(
+                batch=4,
+                step=1000,
+                elapsed=9999.0,
+                in_window=5,
+                step_freq=1000,
+                time_s=3600.0,
+            ),
+            "global_step",
+        )
+        # Neither batch nor step due -> time wins.
+        self.assertEqual(
+            _reason(
+                batch=4,
+                step=999,
+                elapsed=9999.0,
+                in_window=5,
+                step_freq=1000,
+                time_s=3600.0,
+            ),
+            "time_interval",
+        )
+
+    def test_step_and_time_combined_independent(self) -> None:
+        # Step frequency enabled, time disabled: only step boundaries fire.
+        self.assertEqual(_reason(step=1000, step_freq=1000, time_s=0.0), "global_step")
+        self.assertIsNone(_reason(step=1000, elapsed=1e9, step_freq=0, time_s=0.0))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
new file mode 100644
index 000000000..2774e8c03
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
@@ -0,0 +1,166 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""End-to-end failure-injection test for streaming resume.
+
+Validates the four resume features end-to-end on the yambda-5b stack:
+  1. Mid-window save (in_window_checkpoint_frequency)
+  2. Within-window exact-once skip (StreamingWindowSampler.set_window skip)
+  3. Auto-detect-latest checkpoint subdir
+  4. keep_last_n retention (default 1)
+
+Test flow (driven by `scripts/streaming_resume_test.sh`):
+  Phase 1 (baseline): Run streaming-train-eval for N=2 train_ts × K batches/window
+    with die_at_step=-1. Capture per-batch window_ne / window_auc into traj_baseline.json.
+  Phase 2 (interrupt): Same config but die_at_step=M (M mid-window-2). Expect
+    process to exit(42) after the in-window checkpoint at step M lands.
+  Phase 3 (resume): Re-launch with same CKPT_PATH (auto-latest picks the
+    in-window save). Continue to the same total step count. Capture
+    traj_resumed.json (which only contains the post-resume steps).
+
+  Correctness is proven by the FUNCTIONAL INVARIANTS checked in the shell
+  driver (resumed at exactly batch_idx_in_window, per-rank RNG restored, atomic
+  save + keep_last_n), NOT by bit-equal trajectory matching. The training stack
+  is nondeterministic across runs (non-deterministic atomic scatter-add in the
+  embedding/attention backward on ROCm): two independent *cold* runs already
+  drift ~7e-4 in window_ne over 20 steps, and early-training chaos amplifies
+  it, so resume-vs-baseline can differ by a few percent even when resume is
+  perfect. The trajectory comparison here is therefore a LOOSE closeness bound
+  (default atol below) that only flags gross divergence — wrong data slice or
+  unrestored model state — while tolerating nondeterministic drift.
+
+This module also provides a CLI entry point used by the shell driver to (a)
+parse a train.log into a step-keyed dict of metrics, and (b) compare two such
+dicts and fail loudly on mismatch.
+"""
+
+import argparse
+import json
+import re
+import sys
+from typing import Dict, Tuple
+
+# Per-step metrics from MetricsLogger.compute_and_log are emitted like:
+#   "train - Step 51 metrics: {'metric/lifetime_ne/listen_plus': tensor(1.0954, ...)
+#     'metric/window_ne/listen_plus': tensor(0.9940, ...),
+#     'metric/window_accuracy/listen_plus': tensor(0.6231, ...) ..."
+_STEP_RE = re.compile(r"train - Step (\d+) metrics:")
+_WNE_RE = re.compile(r"window_ne/listen_plus.*?tensor\(([0-9.]+)")
+_WAUC_RE = re.compile(r"window_auc/listen_plus.*?tensor\(([0-9.]+)")
+_WACC_RE = re.compile(r"window_accuracy/listen_plus.*?tensor\(([0-9.]+)")
+
+
+def parse_trajectory(log_path: str) -> Dict[int, Dict[str, float]]:
+    """Extract a {step: {window_ne, window_auc, window_accuracy}} dict from a
+    train.log. The grep is loose on the metric line itself — we accept the
+    very long truncated form MetricsLogger prints."""
+    out: Dict[int, Dict[str, float]] = {}
+    with open(log_path, "r", errors="replace") as f:
+        for line in f:
+            m = _STEP_RE.search(line)
+            if not m:
+                continue
+            step = int(m.group(1))
+            wne = _WNE_RE.search(line)
+            wauc = _WAUC_RE.search(line)
+            wacc = _WACC_RE.search(line)
+            if not (wne and wauc and wacc):
+                continue
+            # Only keep ONE entry per step — log can have duplicate per-rank
+            # prints; first one wins (they're identical).
+            if step in out:
+                continue
+            out[step] = {
+                "window_ne": float(wne.group(1)),
+                "window_auc": float(wauc.group(1)),
+                "window_accuracy": float(wacc.group(1)),
+            }
+    return out
+
+
+def compare_trajectories(
+    baseline: Dict[int, Dict[str, float]],
+    resumed: Dict[int, Dict[str, float]],
+    min_resume_step: int,
+    atol: float = 0.15,
+) -> Tuple[bool, str]:
+    """Compare baseline vs resumed trajectories for steps >= min_resume_step.
+
+    This is a LOOSE closeness bound, not a bit-equality check — see the module
+    docstring. `atol` defaults to a value that tolerates the nondeterministic
+    cross-run drift of this stack while still catching gross resume bugs.
+    Returns (ok, message). `ok=False` on any divergence outside `atol`."""
+    steps = sorted(s for s in resumed if s >= min_resume_step)
+    if not steps:
+        return False, f"No resumed steps >= {min_resume_step}"
+    mismatches = []
+    for s in steps:
+        if s not in baseline:
+            mismatches.append(f"step {s}: missing from baseline")
+            continue
+        b = baseline[s]
+        r = resumed[s]
+        for key in ("window_ne", "window_auc", "window_accuracy"):
+            if abs(b[key] - r[key]) > atol:
+                mismatches.append(
+                    f"step {s} {key}: baseline={b[key]:.6f} "
+                    f"resumed={r[key]:.6f} diff={b[key]-r[key]:+.6f}"
+                )
+    if mismatches:
+        return False, (
+            f"{len(mismatches)} mismatches across {len(steps)} resumed steps "
+            f"(atol={atol}):\n  " + "\n  ".join(mismatches[:10])
+        )
+    return True, (
+        f"{len(steps)} resumed steps match baseline within atol={atol} "
+        f"(range: step {steps[0]}..{steps[-1]})"
+    )
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser()
+    sub = ap.add_subparsers(dest="cmd", required=True)
+
+    p_parse = sub.add_parser("parse", help="Parse a train.log → traj JSON")
+    p_parse.add_argument("log")
+    p_parse.add_argument("out")
+
+    p_cmp = sub.add_parser("compare", help="Compare baseline vs resumed traj JSONs")
+    p_cmp.add_argument("baseline")
+    p_cmp.add_argument("resumed")
+    p_cmp.add_argument("--min-resume-step", type=int, required=True)
+    p_cmp.add_argument("--atol", type=float, default=0.15)
+
+    args = ap.parse_args()
+    if args.cmd == "parse":
+        traj = parse_trajectory(args.log)
+        with open(args.out, "w") as f:
+            json.dump(traj, f, indent=2)
+        print(f"Wrote {len(traj)} step entries to {args.out}", file=sys.stderr)
+        return 0
+    if args.cmd == "compare":
+        with open(args.baseline) as f:
+            baseline = {int(k): v for k, v in json.load(f).items()}
+        with open(args.resumed) as f:
+            resumed = {int(k): v for k, v in json.load(f).items()}
+        ok, msg = compare_trajectories(
+            baseline, resumed, args.min_resume_step, atol=args.atol
+        )
+        print(msg)
+        return 0 if ok else 1
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index c6a90c2b7..19490b840 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -111,8 +111,16 @@ def _main_func(
         device=device,
         rank=rank,
     )
-    load_dmp_checkpoint(
-        model=model, optimizer=optimizer, metric_logger=metrics, device=device
+    # Capture streaming resume hint (None for cold start / non-streaming
+    # checkpoints). For the streaming-train-eval mode, we forward this into
+    # streaming_train_eval_loop so it can advance past the last completed
+    # window OR re-enter the partial window and skip already-trained batches.
+    resume_train_ts, resume_batch_idx_in_window = load_dmp_checkpoint(
+        model=model,
+        optimizer=optimizer,
+        metric_logger=metrics,
+        device=device,
+        rank=rank,
     )
 
     # train loop
@@ -161,6 +169,8 @@ def _main_func(
                 device=device,
                 hstu_config=model_configs,
                 embedding_table_configs=embedding_table_configs,
+                resume_train_ts=resume_train_ts,
+                resume_batch_idx_in_window=resume_batch_idx_in_window,
             )
     except Exception as e:
         logger.info(traceback.format_exc())
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 890144e1d..6d50064b6 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -35,7 +35,7 @@
 import gin
 import torch
 import torchrec
-from generative_recommenders.dlrm_v3.checkpoint import save_dmp_checkpoint
+from generative_recommenders.dlrm_v3.checkpoint import save_dmp_checkpoint, WINDOW_COMPLETE
 from generative_recommenders.dlrm_v3.configs import (
     get_embedding_table_config,
     get_hstu_configs,
@@ -467,9 +467,27 @@ def __init__(self, rank: int, world_size: int) -> None:
         self._world_size: int = world_size
         self._indices: List[int] = []
 
-    def set_window(self, global_indices) -> None:
+    def set_window(self, global_indices, skip_samples: int = 0) -> None:
+        """Install this window's per-rank index list, optionally fast-forwarding.
+
+        ``skip_samples`` drops the first N per-rank samples from the list so the
+        next ``__iter__`` starts at sample N+1 in this rank's slice. Used on
+        resume to skip batches that were already trained: pass
+        ``skip_samples = batch_size * batches_completed`` and the dataloader
+        emits batches starting at exactly the next unseen batch.
+
+        The skip is safe because the sample order is fully deterministic given
+        (global_indices, rank, world_size): we re-derive the same per-rank list
+        as the pre-crash run, just hand back a tail slice of it.
+        """
         n = (len(global_indices) // self._world_size) * self._world_size
-        self._indices = global_indices[:n][self._rank :: self._world_size].tolist()
+        per_rank = global_indices[:n][self._rank :: self._world_size].tolist()
+        if skip_samples < 0 or skip_samples > len(per_rank):
+            raise ValueError(
+                f"skip_samples={skip_samples} out of [0, {len(per_rank)}] "
+                f"for rank={self._rank} world_size={self._world_size}"
+            )
+        self._indices = per_rank[skip_samples:]
 
     def __iter__(self):
         return iter(self._indices)
@@ -531,21 +549,29 @@ def __init__(
         self._dls = [dl_factory(s) for s in self._samplers]
         self._iters: List[Optional[object]] = [None] * n_buffers
 
-    def _prepare(self, buf: int, ts: int) -> None:
+    def _prepare(self, buf: int, ts: int, skip_samples: int = 0) -> None:
         # window_indices() is the O(N) mask; numpy releases the GIL for it, so it
         # overlaps the main thread's GPU dispatch. iter() then kicks off this
         # pool's background prefetch.
-        self._samplers[buf].set_window(self._dataset.dataset.window_indices(ts))
+        # `skip_samples` is non-zero only for the very first window after a
+        # mid-window resume; subsequent windows always start at 0.
+        self._samplers[buf].set_window(
+            self._dataset.dataset.window_indices(ts), skip_samples=skip_samples
+        )
         self._iters[buf] = iter(self._dls[buf])
 
-    def stream(self, ts_list: List[int]):
+    def stream(self, ts_list: List[int], first_skip_samples: int = 0):
+        """Stream (ts, iterator) pairs. `first_skip_samples` is applied ONLY to
+        the first ts in ``ts_list`` (the mid-window-resumed window); every
+        subsequent window starts at sample 0 of its own per-rank list."""
         n = len(ts_list)
         if n == 0:
             return
         threads: List[Optional[threading.Thread]] = [None] * self._n
         # Prime the first n_buffers windows on the main thread (forks all pools).
         for b in range(min(self._n, n)):
-            self._prepare(b, ts_list[b])
+            skip = first_skip_samples if b == 0 else 0
+            self._prepare(b, ts_list[b], skip_samples=skip)
         for i in range(n):
             buf = i % self._n
             if threads[buf] is not None:
@@ -553,6 +579,8 @@ def stream(self, ts_list: List[int]):
                 threads[buf] = None
             yield ts_list[i], self._iters[buf]
             # This pool is now free; prefetch the window n_buffers ahead.
+            # No skip on subsequent windows — only the first prepared window
+            # carries `first_skip_samples`.
             j = i + self._n
             if j < n:
                 th = threading.Thread(
@@ -1009,10 +1037,56 @@ def train_eval_loop(
                     for k, v in metric_logger.compute(mode="eval").items():
                         print(f"{k}: {v}")
                 model.train()
-            if num_train_batches is not None and train_batch_idx >= num_train_batches:
+            # `num_train_batches` cap: None or 0 = run the whole window. >0 caps
+            # batches per window (mostly the streaming-resume test driver uses
+            # this to keep test windows short).
+            if num_train_batches and train_batch_idx >= num_train_batches:
                 break
 
 
+def select_in_window_checkpoint_reason(
+    *,
+    train_batch_idx: int,
+    global_step: int,
+    elapsed_since_last_save: float,
+    in_window_checkpoint_frequency: int,
+    checkpoint_step_frequency: int,
+    checkpoint_time_interval_s: float,
+) -> Optional[str]:
+    """Decide which (if any) in-window checkpoint cadence fires this batch.
+
+    Pure / distributed-agnostic so it can be unit-tested without a real run.
+    The caller computes `elapsed_since_last_save` (broadcast from rank 0 in the
+    streaming loop) so all ranks pass the same value and reach the same verdict.
+
+    Precedence (at most one save per batch): per-window-local batch count >
+    monotonic global step > wall-clock interval. Returns the trigger reason
+    string, or None when no cadence fires. A cadence is disabled when its
+    frequency/interval is 0 / 0.0.
+
+    Counter conventions match the loop: `train_batch_idx` is already
+    post-incremented (>=1 on the first batch), and `global_step` is guarded
+    >0 so step 0 doesn't trivially satisfy `% N == 0`.
+    """
+    if (
+        in_window_checkpoint_frequency > 0
+        and train_batch_idx % in_window_checkpoint_frequency == 0
+    ):
+        return "in_window_batch"
+    if (
+        checkpoint_step_frequency > 0
+        and global_step > 0
+        and global_step % checkpoint_step_frequency == 0
+    ):
+        return "global_step"
+    if (
+        checkpoint_time_interval_s > 0
+        and elapsed_since_last_save >= checkpoint_time_interval_s
+    ):
+        return "time_interval"
+    return None
+
+
 @gin.configurable
 def streaming_train_eval_loop(
     rank: int,
@@ -1032,7 +1106,55 @@ def streaming_train_eval_loop(
     persistent_loader: bool = False,
     eval_each_window: bool = True,
     double_buffer: bool = False,
+    # --- resume / mid-window-exact-once knobs ---
+    resume_train_ts: Optional[int] = None,
+    resume_batch_idx_in_window: int = WINDOW_COMPLETE,
+    in_window_checkpoint_frequency: int = 0,
+    # --- global step / wall-clock checkpoint cadences ---
+    checkpoint_step_frequency: int = 0,
+    checkpoint_time_interval_s: float = 0.0,
+    # --- test-only failure injection knob ---
+    die_at_step: int = -1,
 ) -> None:
+    """Streaming train+eval loop with per-window (and optionally mid-window)
+    checkpoints.
+
+    Resume semantics (set by train_ranker after `load_dmp_checkpoint` returns):
+      - resume_train_ts=None: cold start; honor `start_ts` as-is.
+      - resume_train_ts=N, resume_batch_idx_in_window=WINDOW_COMPLETE(-1):
+        previous run finished window N cleanly. Start at N+1 from sample 0.
+      - resume_train_ts=N, resume_batch_idx_in_window=K (K>=0): previous run
+        crashed mid-window after K completed batches. Re-enter window N and
+        skip the first K batches of THIS rank's per-rank sample list (deterministic
+        slice since `window_indices(N)` is a pure function of the anchor_ts cache).
+
+    Checkpoint cadences (all independent; any combination may be enabled):
+      - `checkpoint_frequency`: window-granularity. End-of-window save every
+        Nth train_ts (and always on the final window). Uses WINDOW_COMPLETE.
+      - `in_window_checkpoint_frequency`: per-window-local batch count. Fires
+        every N batches *within* a window (counter resets each window).
+      - `checkpoint_step_frequency`: global-step granularity. Fires whenever
+        the monotonic `metric_logger.global_step['train']` hits a multiple of
+        N — i.e. a true "every 1000 steps" trigger that spans windows and
+        survives resume (global_step is restored from the checkpoint).
+      - `checkpoint_time_interval_s`: wall-clock granularity. Fires when at
+        least this many seconds have elapsed since the last save (e.g. 3600
+        for hourly). Rank 0 owns the clock and broadcasts the decision so all
+        ranks save together (avoids the collective barrier in
+        `save_dmp_checkpoint` deadlocking on a split decision).
+
+    All in-window triggers (`in_window_checkpoint_frequency`,
+    `checkpoint_step_frequency`, `checkpoint_time_interval_s`) route through
+    `_save_mid_window`, which stamps `batch_idx_in_window=K` so a crash leaves
+    a resumable partial-window checkpoint. End-of-window saves
+    (`checkpoint_frequency`) always use the WINDOW_COMPLETE sentinel. 0 / 0.0
+    disables a given cadence (the default for all three fine-grained ones).
+
+    `die_at_step` is a test-only hook: when `metric_logger.global_step['train']`
+    reaches this value, the process exits with code 42 right after the in-window
+    save fires. Used by the failure-injection test to crash at a deterministic
+    boundary and then resume.
+    """
     profiler = Profiler(rank) if output_trace else None
     dataset_class, kwargs = get_dataset()
     kwargs["embedding_config"] = embedding_table_configs
@@ -1045,22 +1167,73 @@ def streaming_train_eval_loop(
     # warmup). The non-persistent path recreates a DataLoader per window.
     window_sampler: Optional[StreamingWindowSampler] = None
     persistent_dl: Optional[DataLoader] = None
+    world_size = (
+        torch.distributed.get_world_size()
+        if torch.distributed.is_initialized()
+        else 1
+    )
     if persistent_loader:
-        world_size = (
-            torch.distributed.get_world_size()
-            if torch.distributed.is_initialized()
-            else 1
-        )
         window_sampler = StreamingWindowSampler(rank=rank, world_size=world_size)
         persistent_dl = make_persistent_streaming_dataloader(
             dataset=dataset, sampler=window_sampler
         )
 
-    def _window_iter(ts: int):
+    # Apply resume hint: advance start_ts past the last completed window, or
+    # re-enter the partial window with a per-rank skip on its first iter.
+    # Shrink num_train_ts by the same amount so the resumed run finishes at
+    # the same final timestamp (start_ts + num_train_ts) as a fresh run would
+    # — i.e. resumed and uninterrupted produce identical total work.
+    first_skip_samples = 0
+    if resume_train_ts is not None:
+        original_end_ts = start_ts + num_train_ts
+        if resume_batch_idx_in_window == WINDOW_COMPLETE:
+            new_start = resume_train_ts + 1
+            if rank == 0:
+                logger.info(
+                    "Resuming from completed train_ts=%d → start_ts=%d "
+                    "(num_train_ts %d → %d)",
+                    resume_train_ts, new_start,
+                    num_train_ts, max(0, original_end_ts - new_start),
+                )
+            start_ts = new_start
+        else:
+            if rank == 0:
+                logger.info(
+                    "Resuming mid-window at train_ts=%d batch_idx_in_window=%d "
+                    "(skipping batches already trained)",
+                    resume_train_ts,
+                    resume_batch_idx_in_window,
+                )
+            start_ts = resume_train_ts
+            # `batch_size` is per-rank from the persistent dataloader (set via
+            # gin `make_persistent_streaming_dataloader.batch_size`). The
+            # skip-samples-per-rank below maps "K batches done" → "K * bs
+            # samples in this rank's index list", since each batch draws bs
+            # samples from this rank's deterministic round-robin slice.
+            assert persistent_dl is not None, (
+                "Mid-window resume requires persistent_loader=True"
+            )
+            first_skip_samples = resume_batch_idx_in_window * persistent_dl.batch_size
+        num_train_ts = max(0, original_end_ts - start_ts)
+        if num_train_ts == 0 and rank == 0:
+            logger.info(
+                "Resume target already reached (end_ts=%d, start_ts=%d) — "
+                "no further training windows; skipping straight to final eval.",
+                original_end_ts, start_ts,
+            )
+
+    def _window_iter(ts: int, skip_samples: int = 0):
         if persistent_loader:
             assert window_sampler is not None and persistent_dl is not None
-            window_sampler.set_window(dataset.dataset.window_indices(ts))  # pyre-ignore [16]
+            window_sampler.set_window(
+                dataset.dataset.window_indices(ts),  # pyre-ignore [16]
+                skip_samples=skip_samples,
+            )
             return iter(persistent_dl)
+        if skip_samples != 0:
+            raise NotImplementedError(
+                "skip_samples>0 requires persistent_loader=True"
+            )
         return iter(make_streaming_dataloader(dataset=dataset, ts=ts))
     # Windows are [start_ts, start_ts + num_train_ts); each step trains window T
     # then evals window T+1, so the last eval window is start_ts + num_train_ts,
@@ -1076,8 +1249,54 @@ def _window_iter(ts: int):
                 f"available windows ({available}); clamping num_train_ts to {max_count}."
             )
             num_train_ts = max_count
-    def _run_train_window(train_data_iterator, label: Optional[str] = None) -> None:
-        train_batch_idx = 0
+    # Wall-clock anchor for time-based checkpointing. Mutable single-element
+    # list so the nested train loop can reset it after each save. Starts at
+    # loop entry so the first time-trigger fires ~interval seconds in.
+    last_ckpt_time = [time.time()]
+
+    def _broadcast_elapsed() -> float:
+        """Seconds since the last save, owned by rank 0 and broadcast to all
+        ranks. save_dmp_checkpoint runs a collective barrier, so every rank must
+        feed the same wall-clock value into the cadence decision — otherwise a
+        split verdict (rank 0 saves, rank 1 doesn't) would deadlock. Broadcasting
+        rank 0's elapsed keeps the (pure) decision identical everywhere."""
+        elapsed = time.time() - last_ckpt_time[0]
+        if torch.distributed.is_initialized() and world_size > 1:
+            t = torch.tensor([elapsed], device=device, dtype=torch.float64)
+            torch.distributed.broadcast(t, src=0)
+            elapsed = float(t.item())
+        return elapsed
+
+    def _save_mid_window(train_ts: int, batch_idx_in_window: int) -> None:
+        """In-window checkpoint helper. Snapshots the same state as the
+        end-of-window save but stamps `batch_idx_in_window=K` instead of
+        WINDOW_COMPLETE so the resume path knows to skip K batches.
+        Uses train_ts as the numeric subdir name — every save into the same
+        train_ts overwrites the previous in-window snapshot (via atomic
+        replace), so disk stays bounded to keep_last_n train_ts dirs."""
+        save_dmp_checkpoint(
+            model=model,
+            optimizer=optimizer,
+            metric_logger=metric_logger,
+            rank=rank,
+            batch_idx=train_ts,
+            train_ts=train_ts,
+            batch_idx_in_window=batch_idx_in_window,
+            device=device,
+        )
+
+    def _run_train_window(
+        train_data_iterator,
+        train_ts: int,
+        start_batch_idx: int = 0,
+        label: Optional[str] = None,
+    ) -> None:
+        # `start_batch_idx` is set when we're re-entering a window that was
+        # interrupted mid-way (in_window resume); the dataloader iterator was
+        # already advanced past those batches via the sampler skip, and we
+        # account for them in the local counter so in-window saves and the
+        # die_at_step hook fire at the right relative offsets.
+        train_batch_idx = start_batch_idx
         first_wait: Optional[float] = None
         while True:
             model.train()
@@ -1124,7 +1343,65 @@ def _run_train_window(train_data_iterator, label: Optional[str] = None) -> None:
             if output_trace:
                 assert profiler is not None
                 profiler.step()
-            if num_train_batches is not None and train_batch_idx >= num_train_batches:
+            # Fine-grained in-window checkpoint triggers. All stamp
+            # batch_idx_in_window so a crash here leaves a resumable partial
+            # checkpoint, and all fire AFTER the metric update so restored
+            # state reflects the just-completed batch. Triggers are mutually
+            # short-circuited (one save per batch max) but evaluated on the
+            # same deterministic counters across all ranks, so the collective
+            # inside save_dmp_checkpoint stays in lockstep.
+            gstep = metric_logger.global_step["train"]
+            # Wall-clock elapsed is broadcast from rank 0 so every rank feeds
+            # the same value into the (otherwise pure) cadence decision.
+            elapsed = (
+                _broadcast_elapsed() if checkpoint_time_interval_s > 0 else 0.0
+            )
+            save_reason = select_in_window_checkpoint_reason(
+                train_batch_idx=train_batch_idx,
+                global_step=gstep,
+                elapsed_since_last_save=elapsed,
+                in_window_checkpoint_frequency=in_window_checkpoint_frequency,
+                checkpoint_step_frequency=checkpoint_step_frequency,
+                checkpoint_time_interval_s=checkpoint_time_interval_s,
+            )
+            if save_reason is not None:
+                if rank == 0:
+                    logger.info(
+                        "checkpoint trigger=%s train_ts=%d batch=%d global_step=%d",
+                        save_reason,
+                        train_ts,
+                        train_batch_idx,
+                        gstep,
+                    )
+                _save_mid_window(train_ts, train_batch_idx)
+                # Reset the wall-clock anchor on ANY save so the next time
+                # trigger is measured from the most recent checkpoint.
+                last_ckpt_time[0] = time.time()
+            # Test-only: deterministic crash for the failure-injection test.
+            # Triggered AFTER the save above, so on resume we re-enter at
+            # batch_idx_in_window=train_batch_idx and emit batches [K+1, end).
+            if (
+                die_at_step >= 0
+                and metric_logger.global_step["train"] >= die_at_step
+            ):
+                if rank == 0:
+                    logger.warning(
+                        "die_at_step=%d hit at train_ts=%d batch=%d global_step=%d "
+                        "→ sys.exit(42)",
+                        die_at_step,
+                        train_ts,
+                        train_batch_idx,
+                        metric_logger.global_step["train"],
+                    )
+                # Distributed barrier so all ranks exit together rather than
+                # leaving a few ranks hanging on NCCL ops.
+                torch.distributed.barrier()
+                import sys
+                sys.exit(42)
+            # `num_train_batches` cap: None or 0 = run the whole window. >0 caps
+            # batches per window (mostly the streaming-resume test driver uses
+            # this to keep test windows short).
+            if num_train_batches and train_batch_idx >= num_train_batches:
                 break
         if label and rank == 0 and first_wait is not None:
             logger.info(
@@ -1188,14 +1465,24 @@ def _maybe_checkpoint(train_ts: int) -> None:
         if (
             train_ts % checkpoint_frequency == 0 and train_ts > 0
         ) or train_ts == start_ts + num_train_ts - 1:
+            # End-of-window save: stamp WINDOW_COMPLETE so resume advances past
+            # this train_ts. `device` enables per-rank RNG snapshot for
+            # bit-equal resume of dropout-bearing modules.
             save_dmp_checkpoint(
                 model=model,
                 optimizer=optimizer,
                 metric_logger=metric_logger,
                 rank=rank,
                 batch_idx=train_ts,
+                train_ts=train_ts,
+                batch_idx_in_window=WINDOW_COMPLETE,
+                device=device,
             )
+            last_ckpt_time[0] = time.time()
 
+    # Apply start_ts shift from resume (may have moved past the original start).
+    # num_train_ts is the requested *count*; preserve it so the loop runs for
+    # the same total number of windows post-resume as a fresh run would have.
     train_ts_list = list(range(start_ts, start_ts + num_train_ts))
     if persistent_loader and double_buffer:
         # Double-buffered: next window prepared in the background during the
@@ -1229,10 +1516,27 @@ def _maybe_checkpoint(train_ts: int) -> None:
             eval_iter = iter(eval_dl)
         n_train = len(train_ts_list)
         for i, (train_ts, train_data_iterator) in enumerate(
-            prefetcher.stream(train_ts_list)
+            # Only the FIRST window after a mid-window resume needs the skip
+            # (handed via prefetcher.stream's first_skip_samples). The skip is
+            # zero on cold start (resume_train_ts is None → first_skip_samples=0)
+            # and on completed-window resume (mid-window slice is 0 too).
+            prefetcher.stream(train_ts_list, first_skip_samples=first_skip_samples)
         ):
             dataset.dataset.is_eval = False  # pyre-ignore [16]
-            _run_train_window(train_data_iterator, label=f"train_ts={train_ts}")
+            # First iteration after a mid-window resume carries
+            # resume_batch_idx_in_window so in-window saves and the die_at_step
+            # hook keep accurate counters; otherwise count from 0.
+            start_batch = (
+                resume_batch_idx_in_window
+                if i == 0 and resume_batch_idx_in_window > 0
+                else 0
+            )
+            _run_train_window(
+                train_data_iterator,
+                train_ts=train_ts,
+                start_batch_idx=start_batch,
+                label=f"train_ts={train_ts}",
+            )
             if eval_each_window:
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
                 assert eval_sampler is not None and eval_dl is not None
@@ -1246,9 +1550,19 @@ def _maybe_checkpoint(train_ts: int) -> None:
                     eval_iter = iter(eval_dl)
             _maybe_checkpoint(train_ts)
     else:
-        for train_ts in train_ts_list:
+        for i, train_ts in enumerate(train_ts_list):
             dataset.dataset.is_eval = False  # pyre-ignore [16]
-            _run_train_window(_window_iter(train_ts))
+            skip = first_skip_samples if i == 0 else 0
+            start_batch = (
+                resume_batch_idx_in_window
+                if i == 0 and resume_batch_idx_in_window > 0
+                else 0
+            )
+            _run_train_window(
+                _window_iter(train_ts, skip_samples=skip),
+                train_ts=train_ts,
+                start_batch_idx=start_batch,
+            )
             if eval_each_window:
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
                 _run_eval_window(_window_iter(train_ts + 1))
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 5a1e7fb9d..51e35d90e 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1031,6 +1031,21 @@ def env_int(key: str = "", default: int = 0) -> int:
     return int(raw) if raw else default
 
 
+@gin.configurable
+def env_float(key: str = "", default: float = 0.0) -> float:
+    """Resolve a float from os.environ[key], falling back to `default`.
+
+    Companion to `env_int` for fractional/duration overrides (e.g. a
+    checkpoint time interval in seconds). Example gin usage:
+
+        streaming_train_eval_loop.checkpoint_time_interval_s = @env_float()
+        env_float.key     = "CKPT_TIME_INTERVAL_S"
+        env_float.default = 3600.0
+    """
+    raw = os.environ.get(key) if key else None
+    return float(raw) if raw else default
+
+
 @gin.configurable
 def run_results_dir(run_name: str = "default", subdir: str = "results") -> str:
     """Resolve ``<recommendation_v4>/<subdir>/<run_name>`` from this file's location.
diff --git a/recommendation_v4/scripts/streaming_resume_test.sh b/recommendation_v4/scripts/streaming_resume_test.sh
new file mode 100755
index 000000000..043d5b926
--- /dev/null
+++ b/recommendation_v4/scripts/streaming_resume_test.sh
@@ -0,0 +1,242 @@
+#!/bin/bash
+# End-to-end failure-injection + resume test for streaming-train-eval.
+#
+# Validates exact-once mid-window resume on the yambda-5b stack:
+#   Phase 1 (baseline):     uninterrupted run for N=2 train_ts × K batches/window
+#   Phase 2 (interrupted):  same config but die_at_step=M → exits at step M
+#                           after the in-window checkpoint lands
+#   Phase 3 (resume):       re-launch with same CKPT_PATH → auto-latest picks
+#                           the in-window save → finishes the partial window
+#                           and the rest of the requested train_ts list
+# Assertion: traj_resumed[step].window_ne / window_auc / window_accuracy match
+# traj_baseline bit-equal (np.allclose atol=1e-4) for all step > die_at_step.
+#
+# Driven entirely via env-driven gin knobs defined in yambda_5b.gin:
+#   NUM_TRAIN_TS / NUM_TRAIN_BATCHES / IN_WINDOW_CKPT_FREQ / DIE_AT_STEP /
+#   CKPT_PATH / KEEP_LAST_N / EVAL_EACH_WINDOW
+#
+# Usage:
+#   bash scripts/streaming_resume_test.sh --jobid <slurm-jobid>
+#       [--container yambda_primus]
+#       [--num-train-batches 200]
+#       [--die-at-step 350]
+#       [--keep]    # retain LOG_DIR + CKPT after run for inspection
+
+set -uo pipefail
+
+JOBID=""
+CONTAINER="yambda_primus"
+NUM_TRAIN_BATCHES=200
+DIE_AT_STEP=350
+IN_WINDOW_FREQ=50
+KEEP=0
+# Trajectory closeness bound — NOT a bit-equality check. The ROCm training stack
+# is nondeterministic across runs (non-deterministic atomic scatter-add in the
+# embedding/attention backward): two independent *cold* runs already drift
+# ~7e-4 in window_ne over 20 steps, and early-training chaos (AUC~0.5) amplifies
+# any seed difference. So resume-vs-baseline can legitimately differ by a few
+# percent. This bound just catches GROSS divergence (wrong data skip, totally
+# unrestored state) while tolerating nondeterministic drift. The HARD resume
+# correctness gates are the functional-invariant checks below (RNG restored,
+# resumed-at-correct-step, atomic/keep_last_n), not this number.
+ATOL=0.15
+CKPT_ROOT=/apps/chcai/ckpts_resume_test
+LOG_DIR=/apps/chcai/streaming_resume_test
+REPO=/home/chcai/training/recommendation_v4
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --jobid) JOBID="$2"; shift 2;;
+        --container) CONTAINER="$2"; shift 2;;
+        --num-train-batches) NUM_TRAIN_BATCHES="$2"; shift 2;;
+        --die-at-step) DIE_AT_STEP="$2"; shift 2;;
+        --in-window-freq) IN_WINDOW_FREQ="$2"; shift 2;;
+        --atol) ATOL="$2"; shift 2;;
+        --keep) KEEP=1; shift;;
+        *) echo "Unknown arg: $1"; exit 1;;
+    esac
+done
+[[ -z "$JOBID" ]] && { echo "Error: --jobid required"; exit 1; }
+
+mkdir -p "$LOG_DIR"
+
+# Single-window mid-window resume: NUM_TRAIN_TS=1, so the whole test runs inside
+# train_ts=START_TS. die_at_step must land strictly inside that window, AT a
+# multiple of IN_WINDOW_FREQ so an in-window checkpoint is saved right before
+# the crash (resume then skips exactly DIE_AT_STEP already-trained batches).
+if (( DIE_AT_STEP <= 0 || DIE_AT_STEP >= NUM_TRAIN_BATCHES )); then
+    echo "Warning: die_at_step=$DIE_AT_STEP not strictly inside window (0, $NUM_TRAIN_BATCHES)" >&2
+fi
+if (( DIE_AT_STEP % IN_WINDOW_FREQ != 0 )); then
+    echo "Warning: die_at_step=$DIE_AT_STEP not a multiple of in_window_freq=$IN_WINDOW_FREQ; no save lands exactly at crash" >&2
+fi
+
+cleanup_workers() {
+    srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
+        "pkill -9 -f generative_recommenders 2>/dev/null; sleep 2; \
+         pkill -9 -f spawn_main 2>/dev/null; sleep 3; true" 2>/dev/null || true
+}
+clean_ckpt() {
+    srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" rm -rf "$CKPT_ROOT" 2>/dev/null || true
+}
+
+# Wait for a log line to appear OR a crash sentinel. Returns 0 if target found,
+# 1 if crash sentinel found first.
+wait_for_log() {
+    local log="$1"; local target_re="$2"; local timeout_s="${3:-1500}"
+    local elapsed=0
+    while (( elapsed < timeout_s )); do
+        if grep -qE "$target_re" "$log" 2>/dev/null; then
+            return 0
+        fi
+        if grep -qE "Traceback|RuntimeError|OutOfMemoryError" "$log" 2>/dev/null; then
+            return 1
+        fi
+        sleep 5
+        elapsed=$((elapsed + 5))
+    done
+    return 2
+}
+
+# Single train window of NUM_TRAIN_BATCHES steps → last train step == NUM_TRAIN_BATCHES.
+LAST_STEP=$NUM_TRAIN_BATCHES
+
+run_phase() {
+    local name="$1"; shift
+    local log="$LOG_DIR/${name}.log"
+    # Join the per-phase env overrides into ONE word. Using `$*` (not `$@`) is
+    # essential: `$@` embedded mid-string in the double-quoted `bash -lc "..."`
+    # expands to *multiple* arguments, so bash -lc would only run up to the
+    # first override and treat the rest as positional params — launch_smoke
+    # would never execute (silent 0-byte log).
+    local env_overrides="$*"
+    : > "$log"
+    echo "[$(date)] === phase '$name' ==="
+    cleanup_workers
+    srun --jobid="$JOBID" --overlap docker exec -d "$CONTAINER" bash -lc "
+        cd $REPO &&
+        HSTU_HAMMER_KERNEL=TRITON \
+        $env_overrides \
+        RUN_NAME=resume_test_$name \
+        LOG=$log \
+        bash scripts/launch_smoke_8gpu.sh
+    "
+}
+
+# === Phase 1: baseline ===
+clean_ckpt
+run_phase baseline \
+    "NUM_TRAIN_TS=1" \
+    "EVAL_EACH_WINDOW=0" \
+    "METRIC_LOG_FREQ=1" \
+    "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
+    "DIE_AT_STEP=-1"
+wait_for_log "$LOG_DIR/baseline.log" "train - Step $LAST_STEP metrics" 1500
+rc=$?
+cleanup_workers
+[[ $rc -ne 0 ]] && { echo "FAIL: baseline didn't finish"; tail -20 "$LOG_DIR/baseline.log"; exit 1; }
+
+# === Phase 2: interrupted ===
+clean_ckpt
+run_phase interrupt \
+    "NUM_TRAIN_TS=1" \
+    "EVAL_EACH_WINDOW=0" \
+    "METRIC_LOG_FREQ=1" \
+    "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
+    "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" \
+    "KEEP_LAST_N=1" \
+    "DIE_AT_STEP=$DIE_AT_STEP" \
+    "CKPT_PATH=$CKPT_ROOT"
+wait_for_log "$LOG_DIR/interrupt.log" "die_at_step=$DIE_AT_STEP hit" 1500
+rc=$?
+cleanup_workers
+[[ $rc -ne 0 ]] && { echo "FAIL: interrupt didn't hit die_at_step"; tail -20 "$LOG_DIR/interrupt.log"; exit 1; }
+
+SAVED=$(srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" ls "$CKPT_ROOT" 2>/dev/null | tr '\n' ' ')
+echo "Saved checkpoints after interrupt: $SAVED"
+
+# === Phase 3: resume ===
+run_phase resume \
+    "NUM_TRAIN_TS=1" \
+    "EVAL_EACH_WINDOW=0" \
+    "METRIC_LOG_FREQ=1" \
+    "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
+    "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" \
+    "KEEP_LAST_N=1" \
+    "DIE_AT_STEP=-1" \
+    "CKPT_PATH=$CKPT_ROOT"
+wait_for_log "$LOG_DIR/resume.log" "train - Step $LAST_STEP metrics" 1500
+rc=$?
+[[ $rc -ne 0 ]] && { cleanup_workers; echo "FAIL: resume didn't finish"; tail -20 "$LOG_DIR/resume.log"; exit 1; }
+# The resume run performs an end-of-window checkpoint save AFTER the final
+# step's metric line. That save (hundreds of GB) writes <ts>.tmp and then
+# atomically renames it onto <ts>, logging "checkpoint successfully saved" only
+# once the rename completes. If we kill workers right after the step line we'd
+# orphan a half-written <ts>.tmp and trip the stale-dir gate below — a harness
+# race, not a resume bug. Wait for the save to finish before tearing down.
+wait_for_log "$LOG_DIR/resume.log" "checkpoint successfully saved" 1500
+save_rc=$?
+cleanup_workers
+[[ $save_rc -ne 0 ]] && { echo "FAIL: resume end-of-window checkpoint save did not complete"; tail -20 "$LOG_DIR/resume.log"; exit 1; }
+
+# === HARD resume-correctness gates (functional invariants) ===
+# These — not the trajectory closeness check below — are the authoritative
+# proof the resume path is correct, because they're deterministic and immune
+# to the GPU nondeterminism that perturbs the metric trajectory.
+
+# (1) Re-entered the partial window at exactly the saved batch_idx_in_window.
+if ! grep -qE "Resuming mid-window at train_ts=[0-9]+ batch_idx_in_window=$DIE_AT_STEP\b" "$LOG_DIR/resume.log" 2>/dev/null; then
+    echo "FAIL: resume did not re-enter mid-window at batch_idx_in_window=$DIE_AT_STEP"
+    grep -E "Resuming" "$LOG_DIR/resume.log" 2>/dev/null | head -2
+    exit 1
+fi
+# (2) Per-rank RNG state was actually restored (dropout determinism path).
+RNG_RESTORED=$(grep -c "RNG state restored from" "$LOG_DIR/resume.log" 2>/dev/null || echo 0)
+echo "RNG state restored on $RNG_RESTORED ranks"
+[[ "$RNG_RESTORED" -lt 1 ]] && { echo "FAIL: no RNG state restored on resume"; exit 1; }
+# (3) The FIRST training step after resume is exactly die_at_step+1, i.e. the
+#     skip-already-trained-batches logic emitted the next unseen batch (not a
+#     restart from step 1, and not a gap).
+FIRST_RESUMED=$(grep -oE 'train - Step [0-9]+ metrics: \{.metric' "$LOG_DIR/resume.log" 2>/dev/null \
+    | grep -oE 'Step [0-9]+' | awk '{print $2}' | sort -n | head -1)
+echo "First resumed train step: $FIRST_RESUMED (expect $((DIE_AT_STEP + 1)))"
+[[ "$FIRST_RESUMED" != "$((DIE_AT_STEP + 1))" ]] && {
+    echo "FAIL: resume did not continue at step $((DIE_AT_STEP + 1)) (got $FIRST_RESUMED)"; exit 1; }
+
+# === Final on-disk state checks (atomic save + retention) ===
+NUM_CKPT=$(srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
+    "ls $CKPT_ROOT 2>/dev/null | grep -E '^[0-9]+$' | wc -l" | tr -d ' ')
+# Both .tmp (interrupted write) and .old (interrupted atomic-overwrite swap)
+# must be absent — their presence means a save crashed without clean recovery.
+STALE_CKPT=$(srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
+    "ls $CKPT_ROOT 2>/dev/null | grep -E '\\.(tmp|old)$' | wc -l" | tr -d ' ')
+echo "Final: $NUM_CKPT numeric ckpt subdirs, $STALE_CKPT stale (.tmp/.old) dirs (expect 1, 0)"
+[[ "$NUM_CKPT" != "1" ]] && { echo "FAIL: keep_last_n=1 violated"; exit 1; }
+[[ "$STALE_CKPT" != "0" ]] && { echo "FAIL: stale .tmp/.old dirs left behind"; exit 1; }
+echo "=== Resume functional invariants: ALL PASS ==="
+
+# === Trajectory closeness (sanity bound, NOT bit-equality) ===
+# Catches gross resume bugs (wrong data slice, unrestored model) that throw the
+# metric trajectory far off. Small drift is expected & tolerated (see ATOL note
+# at top). The functional invariants above are the real correctness proof.
+python3 $REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py parse \
+    "$LOG_DIR/baseline.log" "$LOG_DIR/traj_baseline.json"
+python3 $REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py parse \
+    "$LOG_DIR/resume.log" "$LOG_DIR/traj_resumed.json"
+
+python3 $REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py compare \
+    "$LOG_DIR/traj_baseline.json" "$LOG_DIR/traj_resumed.json" \
+    --min-resume-step $((DIE_AT_STEP + 1)) --atol $ATOL
+RC=$?
+
+if [[ "$KEEP" != "1" ]]; then
+    rm -rf "$LOG_DIR"
+    clean_ckpt
+fi
+
+if [[ $RC -eq 0 ]]; then
+    echo "=== PASS: resume validated (functional invariants + trajectory within $ATOL of baseline) ==="
+else
+    echo "=== FAIL: trajectory diverged beyond $ATOL — likely a real resume bug (wrong data slice / unrestored state), not nondeterminism ==="
+fi
+exit $RC

From 4a18b954c43604aa6dbc45276eb3871898b3bbb4 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 3 Jun 2026 23:14:59 -0500
Subject: [PATCH 031/113] dlrmv4: move streaming resume test harness into
 train/tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It's a test driver, not a general script — colocate the shell harness with
its Python comparator under train/tests/ and fix the stale path references.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin     | 2 +-
 .../dlrm_v3/train/tests/streaming_resume_test.py                | 2 +-
 .../dlrm_v3/train/tests}/streaming_resume_test.sh               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename recommendation_v4/{scripts => generative_recommenders/dlrm_v3/train/tests}/streaming_resume_test.sh (99%)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 747e3876f..6a1e9d762 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -204,7 +204,7 @@ ntb/env_int.key     = "NUM_TRAIN_BATCHES"
 ntb/env_int.default = 0
 # Test-only failure injection: when >=0 and metric_logger.global_step['train']
 # reaches this, the process exits with code 42 right after the in-window save
-# fires. Used by `scripts/streaming_resume_test.sh` to verify resume.
+# fires. Used by the streaming resume test harness (train/tests/) to verify resume.
 streaming_train_eval_loop.die_at_step = @das/env_int()
 das/env_int.key     = "DIE_AT_STEP"
 das/env_int.default = -1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
index 2774e8c03..b46da936b 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
@@ -20,7 +20,7 @@
   3. Auto-detect-latest checkpoint subdir
   4. keep_last_n retention (default 1)
 
-Test flow (driven by `scripts/streaming_resume_test.sh`):
+Test flow (driven by the sibling `streaming_resume_test.sh`):
   Phase 1 (baseline): Run streaming-train-eval for N=2 train_ts × K batches/window
     with die_at_step=-1. Capture per-batch window_ne / window_auc into traj_baseline.json.
   Phase 2 (interrupt): Same config but die_at_step=M (M mid-window-2). Expect
diff --git a/recommendation_v4/scripts/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
similarity index 99%
rename from recommendation_v4/scripts/streaming_resume_test.sh
rename to recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
index 043d5b926..c3690e652 100755
--- a/recommendation_v4/scripts/streaming_resume_test.sh
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -16,7 +16,7 @@
 #   CKPT_PATH / KEEP_LAST_N / EVAL_EACH_WINDOW
 #
 # Usage:
-#   bash scripts/streaming_resume_test.sh --jobid <slurm-jobid>
+#   bash generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh --jobid <slurm-jobid>
 #       [--container yambda_primus]
 #       [--num-train-batches 200]
 #       [--die-at-step 350]

From f89da0e25eae73a23afd0246f25aa882c14058d1 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 4 Jun 2026 01:43:33 -0500
Subject: [PATCH 032/113] dlrmv4: sparse full-holdout eval cadence + eval-pool
 fork-race fix

Add eval_every_n_windows (env EVAL_EVERY_N_WINDOWS, default 1 = no change) so
the heavy full next-day eval window can run every Nth train window (and always
the final one) instead of every window, amortizing its cost on the long run.

Fix a deadlock this exposed in the double-buffer path: the persistent eval
worker pool's first iter() (its only fork) must happen on the main thread
BEFORE the prefetcher's background prep thread starts. Deferring that first
fork into the loop (as the sparse cadence naively did) forks while the bg
thread holds an allocator/GIL-released lock and hangs the run. Always pre-fork
the eval pool before the loop; in-loop re-arms only reset the persistent
workers (no fork) and target the next window that will actually eval.

Also normalize NUM_TRAIN_BATCHES/NUM_EVAL_BATCHES <=0 to None (full window /
full-holdout eval) and bind NUM_EVAL_BATCHES in gin so eval can be capped for
fast validation without affecting the full run.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 13 ++++
 .../dlrm_v3/train/utils.py                    | 64 +++++++++++++++----
 2 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 6a1e9d762..fe05db74d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -122,6 +122,13 @@ pl/env_int.default = 1
 streaming_train_eval_loop.eval_each_window = @ev/env_int()
 ev/env_int.key = "EVAL_EACH_WINDOW"
 ev/env_int.default = 1
+# Full-holdout eval cadence: run eval every Nth train window (and always on the
+# final window) instead of every window. 1 (default) = eval every window (no
+# behavior change). Set >1 (e.g. 5 via $EVAL_EVERY_N_WINDOWS) to amortize the
+# cost of consuming the full next-day eval window over several train windows.
+streaming_train_eval_loop.eval_every_n_windows = @evn/env_int()
+evn/env_int.key = "EVAL_EVERY_N_WINDOWS"
+evn/env_int.default = 1
 # Double-buffer windows: prepare the next window (index mask + first-batch
 # prefetch) in a background thread during the current window's compute, hiding
 # the per-window reset. Needs persistent_loader=1. Override via env.
@@ -202,6 +209,12 @@ ctis/env_float.default = 0.0
 streaming_train_eval_loop.num_train_batches = @ntb/env_int()
 ntb/env_int.key     = "NUM_TRAIN_BATCHES"
 ntb/env_int.default = 0
+# Cap each eval (full-holdout) window's batch count. Unset / <=0 = consume the
+# full eval window (the genuine full-holdout NE/AUC; this is what the long run
+# uses). Set >0 via $NUM_EVAL_BATCHES to subsample eval for fast validation.
+streaming_train_eval_loop.num_eval_batches = @neb/env_int()
+neb/env_int.key     = "NUM_EVAL_BATCHES"
+neb/env_int.default = 0
 # Test-only failure injection: when >=0 and metric_logger.global_step['train']
 # reaches this, the process exits with code 42 right after the in-window save
 # fires. Used by the streaming resume test harness (train/tests/) to verify resume.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 6d50064b6..e437aca60 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1105,6 +1105,7 @@ def streaming_train_eval_loop(
     start_ts: int = 0,
     persistent_loader: bool = False,
     eval_each_window: bool = True,
+    eval_every_n_windows: int = 1,
     double_buffer: bool = False,
     # --- resume / mid-window-exact-once knobs ---
     resume_train_ts: Optional[int] = None,
@@ -1156,6 +1157,14 @@ def streaming_train_eval_loop(
     boundary and then resume.
     """
     profiler = Profiler(rank) if output_trace else None
+    # Normalize the per-window caps: <=0 (the env-binding default) means "no cap
+    # = consume the full window". The eval-break check below is `is not None and
+    # eval_batch_idx >= num_eval_batches`, so a literal 0 would (wrongly) break
+    # after the first batch — map it to None instead for the full-holdout eval.
+    if num_eval_batches is not None and num_eval_batches <= 0:
+        num_eval_batches = None
+    if num_train_batches is not None and num_train_batches <= 0:
+        num_train_batches = None
     dataset_class, kwargs = get_dataset()
     kwargs["embedding_config"] = embedding_table_configs
     dataset = HammerToTorchDataset(
@@ -1484,6 +1493,21 @@ def _maybe_checkpoint(train_ts: int) -> None:
     # num_train_ts is the requested *count*; preserve it so the loop runs for
     # the same total number of windows post-resume as a fresh run would have.
     train_ts_list = list(range(start_ts, start_ts + num_train_ts))
+    n_train = len(train_ts_list)
+
+    def _should_eval(i: int) -> bool:
+        """Whether to run the full-holdout eval after training window index `i`.
+
+        `eval_every_n_windows<=1` (default) preserves the per-window cadence.
+        For K>1 we eval on windows 0, K, 2K, ... and ALWAYS on the final window
+        so the trajectory ends with an eval point. Gated by `eval_each_window`.
+        """
+        if not eval_each_window:
+            return False
+        if eval_every_n_windows <= 1:
+            return True
+        return i % eval_every_n_windows == 0 or i == n_train - 1
+
     if persistent_loader and double_buffer:
         # Double-buffered: next window prepared in the background during the
         # current window's compute. Eval (if enabled) uses its own pre-forked
@@ -1498,23 +1522,30 @@ def _maybe_checkpoint(train_ts: int) -> None:
         eval_sampler: Optional[StreamingWindowSampler] = None
         eval_dl: Optional[DataLoader] = None
         # Eval iterator is built one window ahead: the eval pool (idle while the
-        # current train window runs) prefetches the eval window's first batches
-        # concurrently with train compute, so eval starts warm (hides the
-        # ~0.5s eval first-batch stall). yambda's sample content depends only on
-        # the sampler window, not is_eval, so prefetching during train is safe.
+        # current train window runs) prefetches the next eval window's first
+        # batches concurrently with train compute, so eval starts warm. yambda's
+        # sample content depends only on the sampler window, not is_eval, so
+        # prefetching during train is safe.
         eval_iter: Optional[Iterator] = None
         if eval_each_window and len(train_ts_list) > 0:
             eval_sampler = StreamingWindowSampler(rank, world_size)
             eval_dl = make_persistent_streaming_dataloader(
                 dataset=dataset, sampler=eval_sampler
             )
-            # Fork the eval pool now (main thread, before any prefetch thread)
-            # and kick off prefetch of the first eval window (train_ts_list[0]+1).
+            # CRITICAL: fork the eval worker pool HERE, on the main thread,
+            # BEFORE prefetcher.stream() below spins up its background prep
+            # thread. The pool is persistent_workers=True, so this first iter()
+            # is the ONLY fork; every later iter() merely resets and reuses these
+            # workers (no fork), so it can never deadlock against the background
+            # thread holding an allocator/GIL-released lock. (Deferring this
+            # first fork into the loop — as a sparse-eval cadence naively might —
+            # hangs the run.) _should_eval(0) is always True when eval is enabled
+            # (0 % K == 0), so the first eval window is always train_ts_list[0]+1;
+            # arm it now so it prefetches during the i=0 train window.
             eval_sampler.set_window(
                 dataset.dataset.window_indices(train_ts_list[0] + 1)  # pyre-ignore [16]
             )
             eval_iter = iter(eval_dl)
-        n_train = len(train_ts_list)
         for i, (train_ts, train_data_iterator) in enumerate(
             # Only the FIRST window after a mid-window resume needs the skip
             # (handed via prefetcher.stream's first_skip_samples). The skip is
@@ -1537,15 +1568,22 @@ def _maybe_checkpoint(train_ts: int) -> None:
                 start_batch_idx=start_batch,
                 label=f"train_ts={train_ts}",
             )
-            if eval_each_window:
+            if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
                 assert eval_sampler is not None and eval_dl is not None
                 _run_eval_window(eval_iter, label=f"eval_ts={train_ts + 1}")
-                # Re-arm the eval pool for the next window so it prefetches
-                # during the upcoming train window.
-                if i + 1 < n_train:
+                # Re-arm the (already-forked) eval pool for the NEXT window that
+                # will eval (i+1 in dense mode, i+K in sparse mode), so it warms
+                # up during the upcoming train window(s). iter() reuses the
+                # persistent workers — no fork, safe alongside the bg thread.
+                next_eval_i = next(
+                    (j for j in range(i + 1, n_train) if _should_eval(j)), None
+                )
+                if next_eval_i is not None:
                     eval_sampler.set_window(
-                        dataset.dataset.window_indices(train_ts + 2)  # pyre-ignore [16]
+                        dataset.dataset.window_indices(  # pyre-ignore [16]
+                            train_ts_list[next_eval_i] + 1
+                        )
                     )
                     eval_iter = iter(eval_dl)
             _maybe_checkpoint(train_ts)
@@ -1563,7 +1601,7 @@ def _maybe_checkpoint(train_ts: int) -> None:
                 train_ts=train_ts,
                 start_batch_idx=start_batch,
             )
-            if eval_each_window:
+            if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
                 _run_eval_window(_window_iter(train_ts + 1))
             _maybe_checkpoint(train_ts)

From 039f7c9c1ac1308375ee00556b660bc9d309fe87 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 4 Jun 2026 01:43:43 -0500
Subject: [PATCH 033/113] dlrmv4: self-healing streaming-e2e supervisor +
 NE/AUC trajectory builder

run_streaming_e2e.sh: head-node supervisor that keeps a multi-day yambda-5b
streaming train+eval alive across (1) trainer crash/OOM, (2) silent SIGKILL,
and (3) node loss. Relaunches from the latest checkpoint each time (exact-once
resume handles continuity). Node failover salloc's a fresh exclusive node on
the partition, provisions the container on it, and resumes from shared NFS;
allocations it creates are released on success (never the user's own --jobid).
Includes disk guard + stale .tmp sweep, keep_last_n retention, an exit-sentinel
+ stall watchdog for crash detection, and a node-health watchdog. Heavily
documented inline.

build_ne_auc_trajectory.py: parse train+eval NE/AUC (+perf) from a run log and
emit combined CSV/JSON plus an NE/AUC-vs-step trajectory plot.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../scripts/build_ne_auc_trajectory.py        | 225 ++++++++++
 .../scripts/run_streaming_e2e.sh              | 416 ++++++++++++++++++
 2 files changed, 641 insertions(+)
 create mode 100644 recommendation_v4/scripts/build_ne_auc_trajectory.py
 create mode 100755 recommendation_v4/scripts/run_streaming_e2e.sh

diff --git a/recommendation_v4/scripts/build_ne_auc_trajectory.py b/recommendation_v4/scripts/build_ne_auc_trajectory.py
new file mode 100644
index 000000000..bba1bdec6
--- /dev/null
+++ b/recommendation_v4/scripts/build_ne_auc_trajectory.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Build a combined train+eval NE/AUC trajectory from a streaming-train-eval log.
+
+The streaming loop (generative_recommenders/dlrm_v3/train/utils.py) emits, via
+MetricsLogger.compute(), one line per logged step of the form:
+
+  INFO:utils:train - Step 201 metrics: {'metric/lifetime_ne/listen_plus':
+    tensor(1.0182, dtype=torch.float64), 'metric/window_ne/listen_plus':
+    tensor(0.9846, ...), ..., 'metric/window_auc/listen_plus': tensor(0.5912),
+    'metric/lifetime_auc/listen_plus': tensor(0.5480)}
+
+and the analogous `eval - Step N metrics:` lines during each (full-holdout) eval
+window, plus throughput lines:
+
+  INFO:utils:train - Step 201 perf: local_sps=97.0 global_sps=776.2
+    step_ms=10553.89 elapsed_sec=680.6 total_samples=205824
+
+This script parses all three, for a chosen task (default listen_plus), and writes:
+  * <out>/trajectory.json   — {"train": {step: {...}}, "eval": {...}, "perf": [...]}
+  * <out>/trajectory.csv    — long-form rows (mode, step, metric, value)
+  * <out>/trajectory_ne_auc.png — NE and AUC vs train step, train + eval overlaid
+                                  (skipped gracefully if matplotlib is absent)
+
+It is dependency-light (stdlib + optional matplotlib) so it runs anywhere the
+log is readable, including the head node.
+
+Usage:
+  python3 scripts/build_ne_auc_trajectory.py LOG [--out DIR] [--task listen_plus]
+"""
+
+import argparse
+import csv
+import json
+import os
+import re
+import sys
+from typing import Dict, List, Optional, Tuple
+
+# `train - Step 201 metrics: {...}` / `eval - Step 17 metrics: {...}`
+_STEP_RE = re.compile(r"(train|eval) - Step (\d+) metrics: \{(.*)\}")
+# `metric/<prefix><name>/<task>': tensor(<value>` — value may be int/float/sci.
+_METRIC_RE = re.compile(
+    r"metric/([A-Za-z0-9_]+)/([A-Za-z0-9_+]+)'?\s*:\s*tensor\(\s*([-0-9.eE+]+)"
+)
+# `train - Step 201 perf: local_sps=97.0 global_sps=776.2 step_ms=10553.89 `
+# `elapsed_sec=680.6 total_samples=205824`
+_PERF_RE = re.compile(
+    r"train - Step (\d+) perf: local_sps=([-0-9.eE+]+) global_sps=([-0-9.eE+]+) "
+    r"step_ms=([-0-9.eE+]+) elapsed_sec=([-0-9.eE+]+) total_samples=(\d+)"
+)
+
+# Metrics we surface in the trajectory (others are still captured if present).
+_KEEP = ("window_ne", "lifetime_ne", "window_auc", "lifetime_auc",
+         "window_accuracy", "lifetime_accuracy", "window_gauc", "lifetime_gauc")
+
+
+def parse_log(
+    log_path: str, task: str
+) -> Tuple[Dict[str, Dict[int, Dict[str, float]]], List[Dict[str, float]]]:
+    """Return ({'train': {step: {metric: val}}, 'eval': {...}}, perf_rows).
+
+    For a given (mode, step) the LAST occurrence wins — duplicate per-rank prints
+    are identical, and within an eval window later steps carry more aggregation.
+    """
+    out: Dict[str, Dict[int, Dict[str, float]]] = {"train": {}, "eval": {}}
+    perf: List[Dict[str, float]] = []
+    with open(log_path, "r", errors="replace") as f:
+        for line in f:
+            pm = _PERF_RE.search(line)
+            if pm:
+                perf.append({
+                    "step": int(pm.group(1)),
+                    "local_sps": float(pm.group(2)),
+                    "global_sps": float(pm.group(3)),
+                    "step_ms": float(pm.group(4)),
+                    "elapsed_sec": float(pm.group(5)),
+                    "total_samples": int(pm.group(6)),
+                })
+                continue
+            m = _STEP_RE.search(line)
+            if not m:
+                continue
+            mode, step_s, body = m.group(1), m.group(2), m.group(3)
+            step = int(step_s)
+            row: Dict[str, float] = {}
+            for name, tname, val in _METRIC_RE.findall(body):
+                if tname != task:
+                    continue
+                try:
+                    row[name] = float(val)
+                except ValueError:
+                    continue
+            if row:
+                out[mode][step] = row  # last write wins
+    return out, perf
+
+
+def write_outputs(
+    traj: Dict[str, Dict[int, Dict[str, float]]],
+    perf: List[Dict[str, float]],
+    out_dir: str,
+    task: str,
+) -> None:
+    os.makedirs(out_dir, exist_ok=True)
+
+    json_path = os.path.join(out_dir, "trajectory.json")
+    with open(json_path, "w") as f:
+        json.dump(
+            {
+                "task": task,
+                "train": {str(k): v for k, v in sorted(traj["train"].items())},
+                "eval": {str(k): v for k, v in sorted(traj["eval"].items())},
+                "perf": perf,
+            },
+            f,
+            indent=2,
+        )
+
+    csv_path = os.path.join(out_dir, "trajectory.csv")
+    with open(csv_path, "w", newline="") as f:
+        w = csv.writer(f)
+        w.writerow(["mode", "step", "metric", "value"])
+        for mode in ("train", "eval"):
+            for step in sorted(traj[mode]):
+                for metric, val in traj[mode][step].items():
+                    w.writerow([mode, step, metric, val])
+
+    n_train = len(traj["train"])
+    n_eval = len(traj["eval"])
+    print(f"Parsed {n_train} train points, {n_eval} eval points, "
+          f"{len(perf)} perf points (task={task}).", file=sys.stderr)
+    print(f"Wrote {json_path}", file=sys.stderr)
+    print(f"Wrote {csv_path}", file=sys.stderr)
+
+    _maybe_plot(traj, out_dir, task)
+
+
+def _maybe_plot(
+    traj: Dict[str, Dict[int, Dict[str, float]]], out_dir: str, task: str
+) -> None:
+    try:
+        import matplotlib
+        matplotlib.use("Agg")
+        import matplotlib.pyplot as plt
+    except Exception as e:  # noqa: BLE001
+        print(f"matplotlib unavailable ({e}); skipping plot.", file=sys.stderr)
+        return
+
+    def series(mode: str, metric: str) -> Tuple[List[int], List[float]]:
+        steps = sorted(s for s in traj[mode] if metric in traj[mode][s])
+        return steps, [traj[mode][s][metric] for s in steps]
+
+    fig, (ax_ne, ax_auc) = plt.subplots(2, 1, figsize=(11, 9), sharex=True)
+
+    for metric, style in (("window_ne", "-"), ("lifetime_ne", "--")):
+        xs, ys = series("train", metric)
+        if xs:
+            ax_ne.plot(xs, ys, style, label=f"train/{metric}", alpha=0.85)
+    for metric, marker in (("window_ne", "o"), ("lifetime_ne", "s")):
+        xs, ys = series("eval", metric)
+        if xs:
+            ax_ne.plot(xs, ys, marker, ms=4, ls="", label=f"eval/{metric}")
+    ax_ne.set_ylabel("NE (normalized entropy)")
+    ax_ne.set_title(f"yambda-5b streaming train+eval trajectory — task={task}")
+    ax_ne.grid(True, alpha=0.3)
+    ax_ne.legend(fontsize=8, ncol=2)
+
+    for metric, style in (("window_auc", "-"), ("lifetime_auc", "--")):
+        xs, ys = series("train", metric)
+        if xs:
+            ax_auc.plot(xs, ys, style, label=f"train/{metric}", alpha=0.85)
+    for metric, marker in (("window_auc", "o"), ("lifetime_auc", "s")):
+        xs, ys = series("eval", metric)
+        if xs:
+            ax_auc.plot(xs, ys, marker, ms=4, ls="", label=f"eval/{metric}")
+    ax_auc.set_ylabel("AUC")
+    ax_auc.set_xlabel("train global step")
+    ax_auc.grid(True, alpha=0.3)
+    ax_auc.legend(fontsize=8, ncol=2)
+
+    png_path = os.path.join(out_dir, "trajectory_ne_auc.png")
+    fig.tight_layout()
+    fig.savefig(png_path, dpi=120)
+    print(f"Wrote {png_path}", file=sys.stderr)
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("log", help="Path to the streaming train.log")
+    ap.add_argument("--out", default=None,
+                    help="Output dir (default: <log_dir>/<log_stem>_trajectory)")
+    ap.add_argument("--task", default="listen_plus",
+                    help="Task name to extract (default: listen_plus)")
+    args = ap.parse_args()
+
+    if not os.path.exists(args.log):
+        print(f"Log not found: {args.log}", file=sys.stderr)
+        return 2
+    out_dir = args.out
+    if out_dir is None:
+        stem = os.path.splitext(os.path.basename(args.log))[0]
+        out_dir = os.path.join(os.path.dirname(os.path.abspath(args.log)),
+                               f"{stem}_trajectory")
+
+    traj, perf = parse_log(args.log, args.task)
+    write_outputs(traj, perf, out_dir, args.task)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
new file mode 100755
index 000000000..64fb1274e
--- /dev/null
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -0,0 +1,416 @@
+#!/bin/bash
+# =============================================================================
+# run_streaming_e2e.sh — self-healing supervisor for the long-run yambda-5b
+#                        streaming train+eval (NE/AUC over the full ~5B dataset)
+# =============================================================================
+#
+# WHAT IT DOES
+#   Owns a multi-day "streaming-train-eval" run and keeps it alive unattended
+#   across the three failure modes that actually kill long runs:
+#       1. trainer process crash / OOM / nonzero exit
+#       2. silent death (the whole process group gets SIGKILLed — no exit code)
+#       3. the SLURM node itself going away (down / drained / job ended)
+#   In every case it relaunches the trainer from the latest on-disk checkpoint
+#   (failing over to a brand-new node for case 3) until the run finishes.
+#
+# WHY A RELAUNCH "JUST WORKS" (resume model)
+#   The training stack already implements exact-once resume: on startup it picks
+#   the latest numeric checkpoint subdir under $CKPT_PATH, restores model +
+#   optimizer + per-rank RNG, and (for mid-window in-window saves) skips the
+#   batches already trained in the partially-done window. So relaunching with
+#   the SAME --ckpt-path transparently continues from where it died — no manual
+#   bookkeeping here beyond pointing every attempt at the same base dir.
+#
+# WHERE IT RUNS / HOW IT DRIVES WORK
+#   This script runs on the SLURM HEAD node. The trainer runs inside a long-
+#   lived docker container ($CONTAINER) on the compute node held by a SLURM
+#   allocation ($JOBID). All control flow is `srun --jobid <id> --overlap
+#   docker exec ...` into that container. The container bind-mounts shared NFS
+#   (/home/chcai = code, /apps/chcai = checkpoints+logs), which is what makes
+#   node failover possible: any node in $PARTITION sees the same code+state.
+#
+# MAIN LOOP (state machine, up to --max-relaunch attempts)
+#   for each attempt:
+#     ensure_ready   — guarantee a healthy allocation whose container is up,
+#                      failing over to a freshly-provisioned node if not.
+#     disk_guard     — sweep crash-orphaned *.tmp/*.old saves; abort if the
+#                      ckpt volume has < --min-free-gib free.
+#     cleanup_workers— kill any stragglers from a previous attempt.
+#     launch         — detached `docker exec -d` of the trainer; a trailing
+#                      echo appends an `E2E_RUN_EXIT=<code>` sentinel to the log
+#                      when the trainer returns (clean OR crash).
+#     monitor loop (every --poll-s):
+#         * node watchdog  — if $JOBID stops being healthy mid-run, break and
+#                            let the next attempt fail over.
+#         * exit sentinel  — E2E_RUN_EXIT=0 => success (done); nonzero => relaunch.
+#         * stall watchdog — if the log stops growing AND no trainer process is
+#                            alive for --stall-s, treat as silent death=>relaunch.
+#                            (Long blocking saves keep the process alive, so they
+#                            never false-trip this.)
+#
+# NODE FAILOVER (case 3, the --allow-failover path)
+#   ensure_ready -> acquire_node: `salloc --no-shell --exclusive` a fresh node on
+#   $PARTITION, wait for RUNNING, then provision_node runs $PROVISION_SCRIPT on
+#   it (docker pull + container create + dep install; ~15 min on a cold node).
+#   Allocations WE create are tracked and `scancel`ed (container removed first)
+#   on success via release_acquired; the user's original --jobid is never
+#   cancelled. Checkpoints on shared NFS make the resume seamless.
+#
+# CHECKPOINTS / DISK
+#   The trainer saves atomically (write to <ts>.tmp, fsync, rename to <ts>) and
+#   prunes to keep_last_n newest. One checkpoint is ~560 GB; a save blocks the
+#   step it fires on for ~83 s (measured, no NFS contention). Cadence is driven
+#   by --ckpt-time-interval (time-based) and optional --in-window-freq.
+#
+# ARGS (all optional; defaults target the full production run)
+#   run shape:   --jobid --container --start-ts --num-train-ts --eval-every
+#   ckpt:        --ckpt-path --keep-last-n --ckpt-time-interval --in-window-freq
+#   logging:     --run-name --log
+#   resilience:  --max-relaunch --min-free-gib --stall-s
+#   failover:    --partition --alloc-time --allow-failover --provision-script
+#   validation:  --num-train-batches --num-eval-batches  (>0 caps batches/window
+#                for fast tests; 0 = full window / full-holdout eval)
+#   test-only:   --die-at-step  (>=0 injects a crash at that global step)
+#
+# EXIT CODES
+#   0  run completed (E2E_RUN_EXIT=0 — all windows + final eval done)
+#   1  exhausted --max-relaunch without completing
+#   3  disk guard tripped (insufficient free space)
+#   4  could not secure a healthy allocation (failover failed / disabled)
+#
+# OUTPUTS (next to --log)
+#   <log>                      trainer stdout/stderr + E2E_RUN_EXIT sentinels
+#   <log:.log>.supervisor.log  this supervisor's own timeline
+#   <log:.log>.provision.log   node-provisioning output (failover only)
+#
+# EXAMPLE
+#   nohup bash scripts/run_streaming_e2e.sh \
+#       --ckpt-path /apps/chcai/ckpts/yambda_5b_e2e \
+#       --run-name yambda_5b_e2e --log /apps/chcai/yambda_5b_e2e.log \
+#       --start-ts 150 --num-train-ts 149 --eval-every 10 \
+#       --ckpt-time-interval 7200 --keep-last-n 2 --max-relaunch 50 \
+#       > /apps/chcai/yambda_5b_e2e.supervisor.console.log 2>&1 &
+# =============================================================================
+
+set -uo pipefail
+
+JOBID=11367
+CONTAINER=yambda_primus
+REPO=/home/chcai/training/recommendation_v4
+
+# Defaults are sized from measurement: ~560 GB/checkpoint, ~83 s/save (blocking,
+# attributed to the step it fires on), ~650 ms/train step @ global batch 8192,
+# ~1465 steps (~16 min) per full ~12M-anchor window, full-holdout eval
+# ~6-7 min/window. A ~2h time-based checkpoint interval keeps save overhead ~1%
+# while bounding crash-loss to ~2h of compute; eval every N windows
+# (EVAL_EVERY_N_WINDOWS) amortizes the full-holdout eval cost.
+NUM_TRAIN_TS=149
+START_TS=150
+EVAL_EVERY=5
+CKPT_TIME_INTERVAL=7200
+KEEP_LAST_N=2
+CKPT_PATH=/apps/chcai/ckpts/yambda_5b_e2e
+RUN_NAME=yambda_5b_e2e
+LOG=/apps/chcai/yambda_5b_e2e.log
+MAX_RELAUNCH=50
+NUM_TRAIN_BATCHES=0     # 0 = full window (only capped for validation/tests)
+NUM_EVAL_BATCHES=0      # 0 = full holdout eval (only capped for validation)
+DIE_AT_STEP=-1          # >=0 = test-only failure injection
+IN_WINDOW_FREQ=0        # >0 = also save every N batches within a window
+
+# --- node failover ----------------------------------------------------------
+# If the current allocation/node goes away, acquire a FRESH node, (re)provision
+# the container on it, and resume — checkpoints + code live on shared NFS
+# (/apps/chcai, /home/chcai), so any node in the partition can continue.
+PARTITION=meta64
+ALLOC_TIME=7-00:00:00                 # SLURM --time for a failover allocation
+ALLOW_FAILOVER=1                      # 0 = never acquire a new node
+PROVISION_SCRIPT=/home/chcai/_provision_yambda_primus.sh
+
+# Disk guard: require at least this many GiB free on the ckpt volume before a
+# (re)launch. One checkpoint is ~600 GB; with keep_last_n the existing copies
+# are already counted as used, so we only need room for one new in-flight .tmp
+# plus margin (~800 GiB). The volume has ~3.7 TB free.
+MIN_FREE_GIB=800
+# Stall watchdog: if the log hasn't grown AND no trainer process is alive for
+# this many seconds with no exit sentinel, treat it as a silent death. Comfortably
+# exceeds one blocking checkpoint save (~83 s); and because a save keeps the
+# trainer process alive, an in-progress save never trips the watchdog anyway.
+STALL_S=1200
+POLL_S=30
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --jobid) JOBID="$2"; shift 2;;
+        --container) CONTAINER="$2"; shift 2;;
+        --num-train-ts) NUM_TRAIN_TS="$2"; shift 2;;
+        --start-ts) START_TS="$2"; shift 2;;
+        --eval-every) EVAL_EVERY="$2"; shift 2;;
+        --ckpt-time-interval) CKPT_TIME_INTERVAL="$2"; shift 2;;
+        --keep-last-n) KEEP_LAST_N="$2"; shift 2;;
+        --ckpt-path) CKPT_PATH="$2"; shift 2;;
+        --run-name) RUN_NAME="$2"; shift 2;;
+        --log) LOG="$2"; shift 2;;
+        --max-relaunch) MAX_RELAUNCH="$2"; shift 2;;
+        --num-train-batches) NUM_TRAIN_BATCHES="$2"; shift 2;;
+        --num-eval-batches) NUM_EVAL_BATCHES="$2"; shift 2;;
+        --die-at-step) DIE_AT_STEP="$2"; shift 2;;
+        --in-window-freq) IN_WINDOW_FREQ="$2"; shift 2;;
+        --min-free-gib) MIN_FREE_GIB="$2"; shift 2;;
+        --stall-s) STALL_S="$2"; shift 2;;
+        --partition) PARTITION="$2"; shift 2;;
+        --alloc-time) ALLOC_TIME="$2"; shift 2;;
+        --allow-failover) ALLOW_FAILOVER="$2"; shift 2;;
+        --provision-script) PROVISION_SCRIPT="$2"; shift 2;;
+        *) echo "Unknown arg: $1"; exit 1;;
+    esac
+done
+
+ORIGINAL_JOBID="$JOBID"   # never scancel the user's own hold allocation
+ACQUIRED_JOBIDS=()        # failover allocations WE created (released on success)
+
+SUP_LOG="${LOG%.log}.supervisor.log"
+
+sup() { echo "[$(date '+%F %T')] [supervisor] $*" | tee -a "$SUP_LOG"; }
+
+# Run a command inside the allocation's container, capturing its stdout.
+cexec() { srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc "$1" 2>/dev/null; }
+
+cleanup_workers() {
+    # The trainer spawns 8 rank processes + dataloader workers whose cmdlines
+    # don't all match `train_ranker`/`spawn_main`, so target them, then fall
+    # back to `pkill python` — safe because this container is dedicated to this
+    # training (only the trainer runs python here during a supervised run).
+    srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
+        "pkill -9 -f train_ranker 2>/dev/null; pkill -9 -f multiprocessing 2>/dev/null; \
+         sleep 2; pkill -9 python 2>/dev/null; sleep 3; true" 2>/dev/null || true
+}
+
+# --- node-failover helpers ---------------------------------------------------
+
+# Healthy = the job is RUNNING and its node is not down/drained/failing.
+alloc_healthy() {
+    local jid="$1"
+    [[ -z "$jid" ]] && return 1
+    local st node nstate
+    st=$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)
+    [[ "$st" != "RUNNING" ]] && return 1
+    node=$(squeue -h -j "$jid" -o '%N' 2>/dev/null | head -1)
+    [[ -z "$node" ]] && return 1
+    nstate=$(sinfo -h -n "$node" -o '%t' 2>/dev/null | head -1)
+    case "$nstate" in
+        *down*|*drain*|*fail*|*unk*|*boot*|"") return 1;;
+    esac
+    return 0
+}
+
+# Can we actually exec in the training container on this allocation?
+container_up() {
+    srun --jobid="$1" --overlap docker exec "$CONTAINER" true >/dev/null 2>&1
+}
+
+# (Re)create + dep-install the container on the given allocation's node.
+provision_node() {
+    local jid="$1" node
+    node=$(squeue -h -j "$jid" -o '%N' 2>/dev/null | head -1)
+    sup "provisioning container '$CONTAINER' on job $jid (node ${node:-?}) — cold node can take ~15 min"
+    srun --jobid="$jid" --overlap bash "$PROVISION_SCRIPT" >> "${LOG%.log}.provision.log" 2>&1
+    container_up "$jid"
+}
+
+# Acquire a fresh exclusive node on $PARTITION; sets global JOBID on success.
+acquire_node() {
+    if [[ "$ALLOW_FAILOVER" != "1" ]]; then
+        sup "failover disabled (--allow-failover 0); cannot acquire a new node"; return 1
+    fi
+    sup "requesting a fresh node on partition=$PARTITION (exclusive, time=$ALLOC_TIME)"
+    local out jid
+    out=$(salloc --no-shell --partition="$PARTITION" --nodes=1 --exclusive \
+                 --time="$ALLOC_TIME" --job-name=e2e_failover 2>&1)
+    jid=$(echo "$out" | grep -oiE "Granted job allocation [0-9]+" | grep -oE "[0-9]+" | head -1)
+    if [[ -z "$jid" ]]; then
+        sup "FATAL: salloc did not grant a node: $out"; return 1
+    fi
+    ACQUIRED_JOBIDS+=("$jid")
+    sup "granted new allocation jobid=$jid; waiting for RUNNING"
+    local waited=0
+    while (( waited < 600 )); do
+        [[ "$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)" == "RUNNING" ]] && break
+        sleep 10; waited=$((waited + 10))
+    done
+    if [[ "$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)" != "RUNNING" ]]; then
+        sup "FATAL: new allocation $jid never reached RUNNING (waited ${waited}s)"; return 1
+    fi
+    JOBID="$jid"
+    sup "new node ready: jobid=$JOBID node=$(squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)"
+    return 0
+}
+
+# Ensure $JOBID is a healthy allocation with the container up, failing over to a
+# fresh provisioned node if not. Resume is automatic: the latest checkpoint is
+# on shared NFS, reachable from whatever node we end up on.
+ensure_ready() {
+    if alloc_healthy "$JOBID"; then
+        if container_up "$JOBID"; then return 0; fi
+        sup "alloc $JOBID healthy but container '$CONTAINER' not up — (re)provisioning"
+        provision_node "$JOBID" && return 0
+        sup "provisioning on $JOBID failed; will try a fresh node"
+    else
+        sup "current allocation $JOBID unavailable (job not RUNNING or node down/drained)"
+    fi
+    acquire_node || return 1
+    provision_node "$JOBID" || { sup "provisioning new node $JOBID failed"; return 1; }
+    sup "failover complete — now running on jobid=$JOBID"
+    return 0
+}
+
+release_acquired() {
+    local jid
+    for jid in "${ACQUIRED_JOBIDS[@]:-}"; do
+        [[ -n "$jid" && "$jid" != "$ORIGINAL_JOBID" ]] || continue
+        # docker is independent of SLURM, so remove the container before freeing
+        # the node, otherwise it lingers for the next tenant.
+        srun --jobid="$jid" --overlap docker rm -f "$CONTAINER" >/dev/null 2>&1 || true
+        scancel "$jid" 2>/dev/null && sup "released failover allocation $jid (container removed)"
+    done
+}
+
+# Returns 0 (true) if a trainer process is alive in the container.
+trainer_alive() {
+    local n
+    n=$(cexec "pgrep -f generative_recommenders | wc -l" | tr -d ' ')
+    [[ "${n:-0}" -gt 0 ]]
+}
+
+disk_guard() {
+    # Sweep crash-orphaned partial saves, then check free space.
+    cexec "for d in '$CKPT_PATH'/*.tmp '$CKPT_PATH'/*.old; do [ -e \"\$d\" ] && rm -rf \"\$d\" && echo swept \"\$d\"; done; true"
+    local free_gib
+    free_gib=$(cexec "df -BG --output=avail '$CKPT_PATH' 2>/dev/null | tail -1 | tr -dc '0-9'")
+    free_gib=${free_gib:-0}
+    sup "disk guard: ${free_gib} GiB free on $CKPT_PATH (min ${MIN_FREE_GIB})"
+    if (( free_gib < MIN_FREE_GIB )); then
+        sup "FATAL: insufficient free space (${free_gib} < ${MIN_FREE_GIB} GiB). Aborting."
+        return 1
+    fi
+    return 0
+}
+
+launch() {
+    # Detached launch. The trailing echo appends a definitive exit sentinel to
+    # the log once the trainer returns (clean finish OR crash with nonzero rc).
+    srun --jobid="$JOBID" --overlap docker exec -d "$CONTAINER" bash -lc "
+        cd $REPO &&
+        HSTU_HAMMER_KERNEL=TRITON \
+        MODE=streaming-train-eval \
+        START_TS=$START_TS \
+        NUM_TRAIN_TS=$NUM_TRAIN_TS \
+        EVAL_EACH_WINDOW=1 \
+        EVAL_EVERY_N_WINDOWS=$EVAL_EVERY \
+        CKPT_PATH=$CKPT_PATH \
+        KEEP_LAST_N=$KEEP_LAST_N \
+        CKPT_TIME_INTERVAL_S=$CKPT_TIME_INTERVAL \
+        IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ \
+        NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
+        NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
+        DIE_AT_STEP=$DIE_AT_STEP \
+        METRIC_LOG_FREQ=50 \
+        RUN_NAME=$RUN_NAME \
+        LOG=$LOG \
+        bash scripts/launch_smoke_8gpu.sh;
+        echo \"E2E_RUN_EXIT=\$? \$(date '+%F %T')\" >> $LOG
+    "
+}
+
+# Returns the exit code from the most recent E2E_RUN_EXIT sentinel APPENDED
+# since `since_marker` bytes, or empty if none yet.
+last_exit_since() {
+    local since_line="$1"
+    cexec "tail -n +$since_line '$LOG' 2>/dev/null | grep -aoE 'E2E_RUN_EXIT=[0-9]+' | tail -1 | cut -d= -f2"
+}
+
+sup "=== streaming e2e supervisor start ==="
+sup "jobid=$JOBID container=$CONTAINER repo=$REPO"
+sup "start_ts=$START_TS num_train_ts=$NUM_TRAIN_TS eval_every=$EVAL_EVERY"
+sup "ckpt_path=$CKPT_PATH keep_last_n=$KEEP_LAST_N ckpt_time_interval=${CKPT_TIME_INTERVAL}s in_window_freq=$IN_WINDOW_FREQ"
+sup "log=$LOG num_train_batches=$NUM_TRAIN_BATCHES die_at_step=$DIE_AT_STEP max_relaunch=$MAX_RELAUNCH"
+
+cexec "mkdir -p '$CKPT_PATH'"
+
+attempt=0
+while (( attempt < MAX_RELAUNCH )); do
+    attempt=$((attempt + 1))
+    sup "--- attempt $attempt/$MAX_RELAUNCH ---"
+
+    # Make sure we have a live, container-ready node (failover + provision if the
+    # current allocation/node has gone away).
+    if ! ensure_ready; then
+        sup "FATAL: could not secure a healthy allocation (failover failed)."
+        exit 4
+    fi
+    if ! disk_guard; then exit 3; fi
+    cleanup_workers
+
+    # Mark current end of log so we only read sentinels produced by THIS attempt.
+    start_line=$(cexec "wc -l < '$LOG' 2>/dev/null" | tr -d ' '); start_line=${start_line:-0}
+    start_line=$((start_line + 1))
+
+    sup "launching (reading sentinels from log line $start_line)"
+    launch
+    sleep 15  # let docker exec spin up the process
+
+    # Monitor loop.
+    last_size=0
+    stall_accum=0
+    hb=0
+    while true; do
+        # Node/allocation watchdog: if the node we're on goes down/drains or the
+        # job ends, bail out of the monitor — the next attempt's ensure_ready
+        # will fail over to a fresh node and resume from the latest checkpoint.
+        hb=$((hb + 1))
+        if (( hb % 4 == 0 )) && ! alloc_healthy "$JOBID"; then
+            sup "allocation $JOBID lost mid-run (node down/job ended) — relaunching with failover."
+            break
+        fi
+
+        rc=$(last_exit_since "$start_line")
+        if [[ -n "$rc" ]]; then
+            if [[ "$rc" == "0" ]]; then
+                sup "RUN COMPLETED CLEANLY (E2E_RUN_EXIT=0) on attempt $attempt."
+                cleanup_workers
+                final_ckpts=$(cexec "ls '$CKPT_PATH' 2>/dev/null | grep -E '^[0-9]+$' | tr '\n' ' '")
+                sup "final checkpoints retained: ${final_ckpts:-<none>}"
+                release_acquired
+                sup "=== streaming e2e supervisor done (success) ==="
+                exit 0
+            fi
+            sup "trainer exited nonzero (E2E_RUN_EXIT=$rc). Will relaunch from latest checkpoint."
+            break
+        fi
+
+        # Stall watchdog: track log growth; if frozen and no trainer alive, die.
+        cur_size=$(cexec "wc -c < '$LOG' 2>/dev/null" | tr -d ' '); cur_size=${cur_size:-0}
+        if [[ "$cur_size" == "$last_size" ]]; then
+            if trainer_alive; then
+                stall_accum=0  # alive but quiet (e.g. long save / eval) — ok
+            else
+                stall_accum=$((stall_accum + POLL_S))
+                if (( stall_accum >= STALL_S )); then
+                    sup "STALL: log frozen ${stall_accum}s and no trainer alive — silent death. Relaunching."
+                    break
+                fi
+            fi
+        else
+            stall_accum=0
+            last_size=$cur_size
+        fi
+        sleep "$POLL_S"
+    done
+
+    cleanup_workers
+    sleep $(( attempt < 5 ? 20 : 60 ))  # small backoff
+done
+
+sup "FATAL: exhausted MAX_RELAUNCH=$MAX_RELAUNCH without completion."
+sup "=== streaming e2e supervisor done (failure) ==="
+exit 1

From 0be8a71d7bd542800366f2b3f2ff7468c47fe999 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 4 Jun 2026 01:57:34 -0500
Subject: [PATCH 034/113] dlrmv4: durable streaming-run metrics (append log +
 TensorBoard on NFS)

Make the full-run NE/AUC record survive relaunches and node failover:
- launch_smoke_8gpu.sh now appends to $LOG (tee -a) instead of truncating, so
  a supervised run that relaunches many times into the same log keeps its full
  metrics history. The supervisor initializes the log once at run start.
- run_streaming_e2e.sh: truncate $LOG once at start, create the per-run TB dir,
  and export TENSORBOARD_LOG_PATH=/apps/chcai/tb/$RUN_NAME/ into the launch env.
- yambda_5b.gin: MetricsLogger.tensorboard_log_path now reads $TENSORBOARD_LOG_PATH
  (via the existing env_path helper) defaulting to /apps/chcai/tb/yambda_5b/ on
  shared NFS, instead of container-local /tmp (which is wiped on failover).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin                    |  8 +++++++-
 recommendation_v4/scripts/launch_smoke_8gpu.sh         |  6 +++++-
 recommendation_v4/scripts/run_streaming_e2e.sh         | 10 +++++++++-
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index fe05db74d..febca9d77 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -167,7 +167,13 @@ Profiler.active = 5
 Profiler.trace_dir = @run_results_dir()
 
 # logger variables
-MetricsLogger.tensorboard_log_path = "/tmp/tb/yambda_5b/"
+# TensorBoard event dir. Default lives on shared NFS (not container-local /tmp,
+# which is wiped on node failover) so the NE/AUC scalars survive relaunches and
+# failover. Override per-run via $TENSORBOARD_LOG_PATH (the supervisor sets it
+# to /apps/chcai/tb/$RUN_NAME/).
+MetricsLogger.tensorboard_log_path = @tbp/env_path()
+tbp/env_path.key     = "TENSORBOARD_LOG_PATH"
+tbp/env_path.default = "/apps/chcai/tb/yambda_5b/"
 MetricsLogger.world_size = 8
 MetricsLogger.auc_threshold = 0.80275
 # Checkpointing disabled by default — a full DMP checkpoint is ~100s of GB and
diff --git a/recommendation_v4/scripts/launch_smoke_8gpu.sh b/recommendation_v4/scripts/launch_smoke_8gpu.sh
index 94886dc87..eaa0aa19b 100755
--- a/recommendation_v4/scripts/launch_smoke_8gpu.sh
+++ b/recommendation_v4/scripts/launch_smoke_8gpu.sh
@@ -8,7 +8,11 @@ REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
 cd "$REPO_ROOT"
 
 LOG=${LOG:-/apps/chcai/yambda_5b_8gpu.log}
-echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee "$LOG"
+# Append (not truncate): under the streaming-e2e supervisor a run may relaunch
+# many times into the SAME $LOG, and we want the full NE/AUC history preserved
+# across attempts. The supervisor initializes ($LOG) once at run start. For a
+# standalone invocation, set a fresh $LOG (or truncate it yourself) per run.
+echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee -a "$LOG"
 
 # polars-u64-idx (NOT stock polars) — yambda parquet's flat-explode overruns
 # 32-bit row index. Reserved node has no outbound DNS, so we install from a
diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index 64fb1274e..1d5953328 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -316,6 +316,7 @@ launch() {
         DIE_AT_STEP=$DIE_AT_STEP \
         METRIC_LOG_FREQ=50 \
         RUN_NAME=$RUN_NAME \
+        TENSORBOARD_LOG_PATH=/apps/chcai/tb/$RUN_NAME/ \
         LOG=$LOG \
         bash scripts/launch_smoke_8gpu.sh;
         echo \"E2E_RUN_EXIT=\$? \$(date '+%F %T')\" >> $LOG
@@ -335,7 +336,14 @@ sup "start_ts=$START_TS num_train_ts=$NUM_TRAIN_TS eval_every=$EVAL_EVERY"
 sup "ckpt_path=$CKPT_PATH keep_last_n=$KEEP_LAST_N ckpt_time_interval=${CKPT_TIME_INTERVAL}s in_window_freq=$IN_WINDOW_FREQ"
 sup "log=$LOG num_train_batches=$NUM_TRAIN_BATCHES die_at_step=$DIE_AT_STEP max_relaunch=$MAX_RELAUNCH"
 
-cexec "mkdir -p '$CKPT_PATH'"
+cexec "mkdir -p '$CKPT_PATH' '/apps/chcai/tb/$RUN_NAME'"
+# Initialize this run's metrics log ONCE. launch_smoke_8gpu.sh appends (tee -a),
+# so every relaunch attempt accumulates into this single file — the full-run
+# NE/AUC history survives crashes and node failover instead of being truncated
+# on each relaunch. (Starting the supervisor = starting a fresh run.)
+cexec ": > '$LOG'"
+sup "metrics log initialized (relaunch-append): $LOG"
+sup "tensorboard (NFS): /apps/chcai/tb/$RUN_NAME/"
 
 attempt=0
 while (( attempt < MAX_RELAUNCH )); do

From da8a9ebbe71c6faf6696f6c67a75b505a1136a4b Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 4 Jun 2026 03:10:37 -0500
Subject: [PATCH 035/113] dlrmv4: raise streaming-run disk guard for
 keep_last_n=1 saves

A checkpoint save writes a fresh ~560 GB .tmp before the old copy is
pruned, so peak transient usage is (keep_last_n + 1) copies (~1120 GB at
keep_last_n=1). Bump MIN_FREE_GIB 800 -> 1200 so a (re)launch never wedges
mid-save on a near-full shared NFS volume.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/scripts/run_streaming_e2e.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index 1d5953328..4a0a0c7f0 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -128,10 +128,11 @@ ALLOW_FAILOVER=1                      # 0 = never acquire a new node
 PROVISION_SCRIPT=/home/chcai/_provision_yambda_primus.sh
 
 # Disk guard: require at least this many GiB free on the ckpt volume before a
-# (re)launch. One checkpoint is ~600 GB; with keep_last_n the existing copies
-# are already counted as used, so we only need room for one new in-flight .tmp
-# plus margin (~800 GiB). The volume has ~3.7 TB free.
-MIN_FREE_GIB=800
+# (re)launch. One checkpoint is ~560 GB. A save writes a fresh .tmp BEFORE the
+# old copy is pruned, so peak transient usage is (keep_last_n + 1) copies. With
+# keep_last_n=1 that is ~1120 GB; require ~1200 GiB free at launch so the run
+# never wedges mid-save on a near-full shared NFS volume.
+MIN_FREE_GIB=1200
 # Stall watchdog: if the log hasn't grown AND no trainer process is alive for
 # this many seconds with no exit sentinel, treat it as a silent death. Comfortably
 # exceeds one blocking checkpoint save (~83 s); and because a save keeps the

From 3c896c927e8a30b29b3031c61e375032f8e2175c Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 4 Jun 2026 13:30:19 -0500
Subject: [PATCH 036/113] dlrmv4: anchor eval points to train global step in
 NE/AUC trajectory

Eval used a per-window-resetting internal step counter, so eval markers
bunched at the left of the x-axis instead of overlaying the train curve.
Parse the log sequentially, collapse each eval window to its final
full-holdout metrics, and anchor it to the train global step it ran at
(tagging eval_ts via the [boundary] marker, which can interleave around
the eval's metric lines). Eval points now overlay train on a shared axis.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../scripts/build_ne_auc_trajectory.py        | 92 ++++++++++++++++---
 1 file changed, 77 insertions(+), 15 deletions(-)

diff --git a/recommendation_v4/scripts/build_ne_auc_trajectory.py b/recommendation_v4/scripts/build_ne_auc_trajectory.py
index bba1bdec6..edc57cfbf 100644
--- a/recommendation_v4/scripts/build_ne_auc_trajectory.py
+++ b/recommendation_v4/scripts/build_ne_auc_trajectory.py
@@ -62,22 +62,63 @@
     r"train - Step (\d+) perf: local_sps=([-0-9.eE+]+) global_sps=([-0-9.eE+]+) "
     r"step_ms=([-0-9.eE+]+) elapsed_sec=([-0-9.eE+]+) total_samples=(\d+)"
 )
+# `[boundary] eval_ts=181 eval first-batch ...` — marks the start of a full-holdout
+# eval block; the eval runs at whatever the latest train global step was, so we use
+# it to anchor each eval's metrics onto the shared train-global-step x-axis.
+_EVAL_BOUNDARY_RE = re.compile(r"\[boundary\] eval_ts=(\d+) eval first-batch")
 
 # Metrics we surface in the trajectory (others are still captured if present).
 _KEEP = ("window_ne", "lifetime_ne", "window_auc", "lifetime_auc",
          "window_accuracy", "lifetime_accuracy", "window_gauc", "lifetime_gauc")
 
 
+def _parse_metrics(body: str, task: str) -> Dict[str, float]:
+    row: Dict[str, float] = {}
+    for name, tname, val in _METRIC_RE.findall(body):
+        if tname != task:
+            continue
+        try:
+            row[name] = float(val)
+        except ValueError:
+            continue
+    return row
+
+
 def parse_log(
     log_path: str, task: str
 ) -> Tuple[Dict[str, Dict[int, Dict[str, float]]], List[Dict[str, float]]]:
     """Return ({'train': {step: {metric: val}}, 'eval': {...}}, perf_rows).
 
-    For a given (mode, step) the LAST occurrence wins — duplicate per-rank prints
-    are identical, and within an eval window later steps carry more aggregation.
+    Train is keyed by train global step (last write wins — duplicate per-rank
+    prints are identical). Eval uses a per-rank-resetting internal step counter
+    that restarts every eval window, so we instead anchor each eval window onto
+    the *train global step at which it ran* (the loop trains window T then evals
+    window T+1, so the eval's anchor is the last train step before it). Each eval
+    window collapses to a single point carrying its final, most-aggregated
+    full-holdout metrics, plus `eval_window` (the eval_ts) for reference.
     """
     out: Dict[str, Dict[int, Dict[str, float]]] = {"train": {}, "eval": {}}
     perf: List[Dict[str, float]] = []
+
+    last_train_step = 0
+    cur_anchor: Optional[int] = None   # train global step this eval block runs at
+    cur_ts: Optional[int] = None       # eval window id (eval_ts)
+    cur_row: Optional[Dict[str, float]] = None  # final row of the current block
+    cur_internal: Optional[int] = None  # last eval internal step (reset detection)
+
+    def flush_eval() -> None:
+        nonlocal cur_anchor, cur_ts, cur_row, cur_internal
+        if cur_row:
+            anchor = cur_anchor if cur_anchor is not None else last_train_step
+            row = dict(cur_row)
+            if cur_ts is not None:
+                row["eval_window"] = float(cur_ts)
+            key = anchor
+            while key in out["eval"]:  # keep distinct evals from colliding
+                key += 1
+            out["eval"][key] = row
+        cur_anchor = cur_ts = cur_row = cur_internal = None
+
     with open(log_path, "r", errors="replace") as f:
         for line in f:
             pm = _PERF_RE.search(line)
@@ -91,21 +132,40 @@ def parse_log(
                     "total_samples": int(pm.group(6)),
                 })
                 continue
+            bm = _EVAL_BOUNDARY_RE.search(line)
+            if bm:
+                # The boundary line (a different logger) can interleave before OR
+                # after this eval's metric lines, so don't use it to delimit the
+                # block — just tag the current block with its eval_ts. Block
+                # boundaries come from eval-step resets / training resuming.
+                if cur_anchor is None:
+                    cur_anchor = last_train_step
+                cur_ts = int(bm.group(1))
+                continue
             m = _STEP_RE.search(line)
             if not m:
                 continue
             mode, step_s, body = m.group(1), m.group(2), m.group(3)
             step = int(step_s)
-            row: Dict[str, float] = {}
-            for name, tname, val in _METRIC_RE.findall(body):
-                if tname != task:
-                    continue
-                try:
-                    row[name] = float(val)
-                except ValueError:
-                    continue
-            if row:
-                out[mode][step] = row  # last write wins
+            row = _parse_metrics(body, task)
+            if mode == "train":
+                last_train_step = step
+                if cur_anchor is not None or cur_row is not None:
+                    flush_eval()  # an eval block ends when training resumes
+                if row:
+                    out["train"][step] = row  # last write wins
+            else:  # eval — accumulate into the current block (last = most aggregated)
+                # Fallback for logs without a boundary marker: a drop in the eval
+                # internal step counter signals a fresh eval window.
+                if (cur_internal is not None and step < cur_internal
+                        and cur_anchor is None):
+                    flush_eval()
+                if cur_anchor is None:
+                    cur_anchor = last_train_step
+                cur_internal = step
+                if row:
+                    cur_row = row
+    flush_eval()
     return out, perf
 
 
@@ -173,7 +233,8 @@ def series(mode: str, metric: str) -> Tuple[List[int], List[float]]:
     for metric, marker in (("window_ne", "o"), ("lifetime_ne", "s")):
         xs, ys = series("eval", metric)
         if xs:
-            ax_ne.plot(xs, ys, marker, ms=4, ls="", label=f"eval/{metric}")
+            ax_ne.plot(xs, ys, marker=marker, ms=5, ls="-", lw=1.0, alpha=0.9,
+                       label=f"eval/{metric}")
     ax_ne.set_ylabel("NE (normalized entropy)")
     ax_ne.set_title(f"yambda-5b streaming train+eval trajectory — task={task}")
     ax_ne.grid(True, alpha=0.3)
@@ -186,9 +247,10 @@ def series(mode: str, metric: str) -> Tuple[List[int], List[float]]:
     for metric, marker in (("window_auc", "o"), ("lifetime_auc", "s")):
         xs, ys = series("eval", metric)
         if xs:
-            ax_auc.plot(xs, ys, marker, ms=4, ls="", label=f"eval/{metric}")
+            ax_auc.plot(xs, ys, marker=marker, ms=5, ls="-", lw=1.0, alpha=0.9,
+                        label=f"eval/{metric}")
     ax_auc.set_ylabel("AUC")
-    ax_auc.set_xlabel("train global step")
+    ax_auc.set_xlabel("train global step (eval points anchored to the step they ran at)")
     ax_auc.grid(True, alpha=0.3)
     ax_auc.legend(fontsize=8, ncol=2)
 

From 04dc53a0cfaa2dac61d9eaea0f257299e7c5faf5 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 4 Jun 2026 15:14:16 -0500
Subject: [PATCH 037/113] dlrmv4: supervisor tolerates control-plane outages +
 attach mode

A SLURM controller outage made the supervisor's squeue/sinfo health check
report "node lost", trigger failover, and then FATAL-exit when salloc
couldn't reach the controller either - abandoning a run whose trainer was
in fact still alive (slurmctld outages don't kill RUNNING jobs).

Harden it:
- controller_up()/wait_for_controller(): treat an unreachable controller as
  transient; wait for recovery (up to --ctrl-wait-max) instead of failing over.
- Direct-SSH fallback (dexec via cached LAST_NODE) so trainer_alive() and the
  mid-run watchdog verify liveness even while the controller is down; only fail
  over if the trainer is genuinely gone, not on a control-plane blip.
- timeout-guard all srun/squeue/sinfo calls so a hung control plane / NFS can't
  wedge the supervisor.
- --attach mode: adopt an already-running trainer (one that outlived a killed
  supervisor) without truncating its log, sweeping its in-flight .tmp, killing
  it, or relaunching - just resume monitoring in place.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../scripts/run_streaming_e2e.sh              | 134 +++++++++++++++---
 1 file changed, 115 insertions(+), 19 deletions(-)

diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index 4a0a0c7f0..40dc5fe81 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -98,6 +98,12 @@ JOBID=11367
 CONTAINER=yambda_primus
 REPO=/home/chcai/training/recommendation_v4
 
+# Direct-SSH fallback so the supervisor can probe the node even while the SLURM
+# control plane is unreachable — a transient controller outage must NOT be
+# mistaken for node death (which would needlessly tear down a healthy run).
+SSH_OPTS="-o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=no"
+LAST_NODE=""   # last known node hostname for $JOBID (cached for direct probes)
+
 # Defaults are sized from measurement: ~560 GB/checkpoint, ~83 s/save (blocking,
 # attributed to the step it fires on), ~650 ms/train step @ global batch 8192,
 # ~1465 steps (~16 min) per full ~12M-anchor window, full-holdout eval
@@ -117,6 +123,12 @@ NUM_TRAIN_BATCHES=0     # 0 = full window (only capped for validation/tests)
 NUM_EVAL_BATCHES=0      # 0 = full holdout eval (only capped for validation)
 DIE_AT_STEP=-1          # >=0 = test-only failure injection
 IN_WINDOW_FREQ=0        # >0 = also save every N batches within a window
+ATTACH=0                # 1 = (re)attach to an already-running trainer without
+                        #     killing it or truncating its log — used to restore
+                        #     supervision over a trainer that outlived a previous
+                        #     supervisor (e.g. one a control-plane outage killed).
+CTRL_WAIT_MAX=3600      # max seconds to wait for an unreachable SLURM controller
+                        # to recover before concluding failover is needed.
 
 # --- node failover ----------------------------------------------------------
 # If the current allocation/node goes away, acquire a FRESH node, (re)provision
@@ -157,6 +169,8 @@ while [[ $# -gt 0 ]]; do
         --num-eval-batches) NUM_EVAL_BATCHES="$2"; shift 2;;
         --die-at-step) DIE_AT_STEP="$2"; shift 2;;
         --in-window-freq) IN_WINDOW_FREQ="$2"; shift 2;;
+        --attach) ATTACH="$2"; shift 2;;
+        --ctrl-wait-max) CTRL_WAIT_MAX="$2"; shift 2;;
         --min-free-gib) MIN_FREE_GIB="$2"; shift 2;;
         --stall-s) STALL_S="$2"; shift 2;;
         --partition) PARTITION="$2"; shift 2;;
@@ -174,8 +188,46 @@ SUP_LOG="${LOG%.log}.supervisor.log"
 
 sup() { echo "[$(date '+%F %T')] [supervisor] $*" | tee -a "$SUP_LOG"; }
 
-# Run a command inside the allocation's container, capturing its stdout.
-cexec() { srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc "$1" 2>/dev/null; }
+# Run a command inside the allocation's container, capturing its stdout. Wrapped
+# in `timeout` so a hung control plane / NFS can never wedge the supervisor.
+cexec() { timeout 90 srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc "$1" 2>/dev/null; }
+
+# Is the SLURM control plane reachable right now?
+controller_up() { timeout 12 sinfo -h -o '%P' >/dev/null 2>&1; }
+
+# Refresh + echo the node hostname for $JOBID (cached in LAST_NODE for direct
+# probes that must work even while the controller is down).
+refresh_node() {
+    local n; n=$(timeout 12 squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)
+    [[ -n "$n" ]] && LAST_NODE="$n"
+    echo "$LAST_NODE"
+}
+
+# Run a (simple) command in the container by SSHing the node DIRECTLY, bypassing
+# SLURM — the only way to observe the trainer during a controller outage. Needs a
+# previously-cached LAST_NODE. Keep "$1" free of embedded double quotes.
+dexec() {
+    [[ -z "$LAST_NODE" ]] && return 1
+    timeout 40 ssh $SSH_OPTS "$LAST_NODE" "docker exec $CONTAINER bash -lc '$1'" 2>/dev/null
+}
+
+# Block (with backoff) until the controller is reachable again, up to
+# CTRL_WAIT_MAX. A controller outage leaves RUNNING jobs running, so waiting it
+# out is almost always preferable to abandoning a healthy node.
+wait_for_controller() {
+    local waited=0
+    controller_up && return 0
+    while ! controller_up; do
+        if (( waited >= CTRL_WAIT_MAX )); then
+            sup "controller still unreachable after ${waited}s (max ${CTRL_WAIT_MAX}s) — proceeding."
+            return 1
+        fi
+        sup "SLURM controller unreachable; waiting for recovery (${waited}s/${CTRL_WAIT_MAX}s)…"
+        sleep 30; waited=$((waited + 30))
+    done
+    sup "SLURM controller reachable again after ${waited}s."
+    return 0
+}
 
 cleanup_workers() {
     # The trainer spawns 8 rank processes + dataloader workers whose cmdlines
@@ -207,7 +259,7 @@ alloc_healthy() {
 
 # Can we actually exec in the training container on this allocation?
 container_up() {
-    srun --jobid="$1" --overlap docker exec "$CONTAINER" true >/dev/null 2>&1
+    timeout 30 srun --jobid="$1" --overlap docker exec "$CONTAINER" true >/dev/null 2>&1
 }
 
 # (Re)create + dep-install the container on the given allocation's node.
@@ -251,7 +303,11 @@ acquire_node() {
 # fresh provisioned node if not. Resume is automatic: the latest checkpoint is
 # on shared NFS, reachable from whatever node we end up on.
 ensure_ready() {
+    # A controller outage leaves RUNNING jobs running; wait it out before deciding
+    # anything is wrong, so we never abandon a healthy node over a transient blip.
+    wait_for_controller || true
     if alloc_healthy "$JOBID"; then
+        refresh_node >/dev/null
         if container_up "$JOBID"; then return 0; fi
         sup "alloc $JOBID healthy but container '$CONTAINER' not up — (re)provisioning"
         provision_node "$JOBID" && return 0
@@ -276,10 +332,16 @@ release_acquired() {
     done
 }
 
-# Returns 0 (true) if a trainer process is alive in the container.
+# Returns 0 (true) if a trainer process is alive in the container. Uses SLURM
+# (srun) when the controller is up, else falls back to a direct SSH probe so a
+# control-plane outage can't make a live trainer look dead.
 trainer_alive() {
     local n
-    n=$(cexec "pgrep -f generative_recommenders | wc -l" | tr -d ' ')
+    if controller_up; then
+        n=$(cexec "pgrep -f generative_recommenders | wc -l" | tr -d ' ')
+    else
+        n=$(dexec "pgrep -f generative_recommenders | wc -l" | tr -d ' ')
+    fi
     [[ "${n:-0}" -gt 0 ]]
 }
 
@@ -341,9 +403,14 @@ cexec "mkdir -p '$CKPT_PATH' '/apps/chcai/tb/$RUN_NAME'"
 # Initialize this run's metrics log ONCE. launch_smoke_8gpu.sh appends (tee -a),
 # so every relaunch attempt accumulates into this single file — the full-run
 # NE/AUC history survives crashes and node failover instead of being truncated
-# on each relaunch. (Starting the supervisor = starting a fresh run.)
-cexec ": > '$LOG'"
-sup "metrics log initialized (relaunch-append): $LOG"
+# on each relaunch. (Starting the supervisor = starting a fresh run.) In ATTACH
+# mode we are adopting an already-running trainer, so we KEEP its existing log.
+if [[ "$ATTACH" == "1" ]]; then
+    sup "ATTACH mode: adopting existing run — keeping metrics log intact: $LOG"
+else
+    cexec ": > '$LOG'"
+    sup "metrics log initialized (relaunch-append): $LOG"
+fi
 sup "tensorboard (NFS): /apps/chcai/tb/$RUN_NAME/"
 
 attempt=0
@@ -357,16 +424,33 @@ while (( attempt < MAX_RELAUNCH )); do
         sup "FATAL: could not secure a healthy allocation (failover failed)."
         exit 4
     fi
-    if ! disk_guard; then exit 3; fi
-    cleanup_workers
-
-    # Mark current end of log so we only read sentinels produced by THIS attempt.
-    start_line=$(cexec "wc -l < '$LOG' 2>/dev/null" | tr -d ' '); start_line=${start_line:-0}
-    start_line=$((start_line + 1))
+    refresh_node >/dev/null   # cache LAST_NODE for direct probes during outages
+
+    # ATTACH (first attempt only): if a trainer is already running for this run,
+    # adopt it in place — DON'T disk-guard (its sweep would delete an in-flight
+    # .tmp save), DON'T cleanup_workers (would kill it), DON'T launch. Just begin
+    # monitoring. Any subsequent relaunch is a normal launch from the checkpoint.
+    adopt=0
+    if [[ "$ATTACH" == "1" ]] && trainer_alive; then
+        adopt=1; ATTACH=0
+        sup "ATTACH mode: trainer already alive on ${LAST_NODE:-node} — monitoring in place (no relaunch/kill/sweep)."
+    fi
 
-    sup "launching (reading sentinels from log line $start_line)"
-    launch
-    sleep 15  # let docker exec spin up the process
+    if (( adopt )); then
+        # Mark current end of log so we only read sentinels produced from here on.
+        start_line=$(cexec "wc -l < '$LOG' 2>/dev/null" | tr -d ' '); start_line=${start_line:-0}
+        start_line=$((start_line + 1))
+        sup "monitoring adopted run (reading sentinels from log line $start_line)"
+    else
+        if ! disk_guard; then exit 3; fi
+        cleanup_workers
+        # Mark current end of log so we only read sentinels produced by THIS attempt.
+        start_line=$(cexec "wc -l < '$LOG' 2>/dev/null" | tr -d ' '); start_line=${start_line:-0}
+        start_line=$((start_line + 1))
+        sup "launching (reading sentinels from log line $start_line)"
+        launch
+        sleep 15  # let docker exec spin up the process
+    fi
 
     # Monitor loop.
     last_size=0
@@ -378,8 +462,20 @@ while (( attempt < MAX_RELAUNCH )); do
         # will fail over to a fresh node and resume from the latest checkpoint.
         hb=$((hb + 1))
         if (( hb % 4 == 0 )) && ! alloc_healthy "$JOBID"; then
-            sup "allocation $JOBID lost mid-run (node down/job ended) — relaunching with failover."
-            break
+            if ! controller_up; then
+                # Control plane unreachable != node down. If the trainer is still
+                # alive on the node (direct SSH probe), this is a transient blip —
+                # keep monitoring rather than tearing down a healthy run.
+                if trainer_alive; then
+                    sup "control plane unreachable but trainer still alive on ${LAST_NODE:-node} — transient; continuing to monitor."
+                else
+                    sup "control plane unreachable AND trainer absent on ${LAST_NODE:-node} — relaunching with failover."
+                    break
+                fi
+            else
+                sup "allocation $JOBID lost mid-run (node down/job ended) — relaunching with failover."
+                break
+            fi
         fi
 
         rc=$(last_exit_since "$start_line")

From 45e0daf4b44e8943afb1a9d047154fa1857c348a Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 4 Jun 2026 23:31:37 -0500
Subject: [PATCH 038/113] dlrmv4: TFLOPS/MFU/HFU reporting +
 MAX_SEQ_LEN/HISTORY_LENGTH gin knobs

Adds per-step TFLOPS, MFU and HFU to the training perf line + TensorBoard
scalars, plus env-driven gin knobs for sequence-length sweeps. The two
changes ship together because TFLOPS reporting is what makes the 4k vs
2k comparison interpretable (a sps drop alone doesn't say whether the
GPU is doing more work per sample or less work overall).

TFLOPS reporting

  HSTU is structurally different from a standard transformer: the UVQK
  projection fuses Q/K/V/U into one matmul, and SiLU(U)*y elementwise
  gating replaces the FFN block. So a TorchTitan-style "count matmuls
  with factor 6, attention with factor 12" template gives the wrong
  per-sample FLOPs unless it's coded against HSTU's specific shapes.

  DlrmHSTU.get_num_flops_per_sample() implements the HSTU-specific
  dense formula (UVQK projection + Q.K^T + att.V + output projection,
  per layer, times n_layers). Multitask head adds a trivial constant.
  Embedding lookups excluded because they're memory-bound and would
  otherwise pollute MFU. This is the dense yardstick: what the FLOPs
  would be if every sample's UIH filled max_seq_len. It's the standard
  MFU denominator (matches Primus-DLRM's OneTrans accounting style).

  Yambda's per-user history is jagged, so the actual GPU work is
  significantly less than the dense estimate. main_forward stashes
  _last_jagged_flops_per_sample after computing each batch's mean(s)
  and mean(s^2), and MetricsLogger reads + .item()s it once per
  metric_log_frequency (one D->H sync per logging interval, not per
  step). When present, the perf line splits into:

    tflops_algo/gpu  mfu   - dense yardstick, MFU denominator
    tflops_real/gpu  hfu   - actual jagged work, hardware utilization
    fill                   - real / algo, padding-skipped fraction

  When the jagged stash is absent (other model types, or before the
  first main_forward), only tflops_algo/mfu print. When the model
  doesn't expose get_num_flops_per_sample at all, the perf line is
  byte-for-byte unchanged (backward compatible).

  get_gpu_peak_flops("bf16"/"fp32") consults a per-GPU peak table
  (MI355X/MI350X=2300 TF, MI300X/MI325X=1300, B200=2250, H100=990,
  A100=312 for bf16) and warns + defaults to MI350X for unknown
  device names. train_ranker pulls "bf16" when bf16_training=True
  else "fp32"; the dtype string drives only the denominator, not
  anything else.

  TensorBoard scalars added alongside the existing perf/* group:
  perf/train_tflops_algo_gpu, perf/train_mfu_pct,
  perf/train_tflops_real_gpu, perf/train_hfu_pct,
  perf/train_fill_pct.

  Validated on 8x MI350X yambda-5b at the 2k baseline:
    241.6 GFLOP/sample (dense), GPU peak 2300 TFLOPS
    steady-state ~13500 sps -> mfu 17.7-17.9%, hfu 9.7-10.2%,
    fill 55-57% (yambda users average ~1170 events vs 2046 max).
  HSTU's MFU is higher than the OneTrans baseline on the same GPU
  (Primus-DLRM hit 5.7%) because HSTU does less compute per token,
  so the FLOPs it does execute run at higher hardware utilization.

MAX_SEQ_LEN / HISTORY_LENGTH env knobs

  Adds two env-driven gin macros so sequence-length sweeps don't
  require editing yambda_5b.gin (which a running e2e job has parsed
  -- editing the file would change behavior on a supervisor restart):

    get_hstu_configs.max_seq_len = @msl/env_int()   default 2048
    get_dataset.history_length   = @hl/env_int()    default 2039

  Defaults are the current production values, so unset env is a no-op.
  Used the 4k validation run with MAX_SEQ_LEN=4096 HISTORY_LENGTH=4096
  (reuses hstu_cache_L4096/ on disk; ~8 events of trailing UIH
  truncation per sample, negligible).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 17 +++-
 .../dlrm_v3/train/train_ranker.py             | 21 ++++-
 .../generative_recommenders/dlrm_v3/utils.py  | 81 ++++++++++++++++
 .../modules/dlrm_hstu.py                      | 92 +++++++++++++++++++
 4 files changed, 206 insertions(+), 5 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index febca9d77..19c23e5b6 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -75,12 +75,21 @@ get_dataset.new_path_prefix = %DATA_PATH
 # average (not 679) — TRITON's jagged attention skips the unfilled slots,
 # so the under-fill costs sequence budget but not GPU compute.
 # Cache is keyed by L on disk under <processed_dir>/hstu_cache_L<N>/;
-# switching L reuses an existing cache or builds a new one (~5 min).
-get_dataset.history_length = 2039
+# switching L reuses an existing cache or builds a new one (~5 min). Override
+# via $HISTORY_LENGTH (default 2039 keeps the existing single-task cache hot).
+get_dataset.history_length = @hl/env_int()
+hl/env_int.key     = "HISTORY_LENGTH"
+hl/env_int.default = 2039
 
 # Model-side attention budget. Dataset truncates UIH to fit this value if
-# `history_length + contextual + candidate` would overflow.
-get_hstu_configs.max_seq_len = 2048
+# `history_length + contextual + candidate` would overflow. Override via
+# $MAX_SEQ_LEN (default 2048 preserves the production single-task shape).
+# Pair MAX_SEQ_LEN=4096 with HISTORY_LENGTH=4086 for the 4k-no-truncation
+# analog (3*1362+9=4095 ≤ 4096); pair with HISTORY_LENGTH=4096 to reuse the
+# existing hstu_cache_L4096/ cache with ~8 events of trailing truncation.
+get_hstu_configs.max_seq_len = @msl/env_int()
+msl/env_int.key     = "MAX_SEQ_LEN"
+msl/env_int.default = 2048
 
 # --- streaming (temporal-order) training -------------------------------------
 # Only consumed under `--mode streaming-train-eval`; the default train-eval
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 19490b840..59520bd16 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -81,7 +81,10 @@ def _main_func(
         train_eval_loop,
         train_loop,
     )
-    from generative_recommenders.dlrm_v3.utils import MetricsLogger
+    from generative_recommenders.dlrm_v3.utils import (
+        MetricsLogger,
+        get_gpu_peak_flops,
+    )
 
     setup(
         rank=rank,
@@ -104,12 +107,28 @@ def _main_func(
         hstu_config=model_configs,
         embedding_table_configs=embedding_table_configs,
     )
+    # TFLOPS/MFU reporting: query the model's static dense estimate +
+    # current GPU's peak FLOPS. Both default to 0 if the model doesn't
+    # expose get_num_flops_per_sample, in which case MetricsLogger silently
+    # drops the tflops fields from the perf line.
+    inner_model = model.module if hasattr(model, "module") else model
+    num_flops_per_sample = (
+        float(inner_model.get_num_flops_per_sample())
+        if hasattr(inner_model, "get_num_flops_per_sample")
+        else 0.0
+    )
+    gpu_peak_flops = get_gpu_peak_flops(
+        "bf16" if getattr(model_configs, "bf16_training", True) else "fp32"
+    )
     metrics = MetricsLogger(
         multitask_configs=model_configs.multitask_configs,
         batch_size=train_dataloader.batch_size,
         window_size=2500,
         device=device,
         rank=rank,
+        num_flops_per_sample=num_flops_per_sample,
+        gpu_peak_flops=gpu_peak_flops,
+        model=model,
     )
     # Capture streaming resume hint (None for cold start / non-streaming
     # checkpoints). For the streaming-train-eval mode, we forward this into
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 51e35d90e..be94af667 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -693,7 +693,22 @@ def __init__(
         tensorboard_log_path: str = "",
         world_size: int = 1,
         auc_threshold: Optional[float] = None,
+        num_flops_per_sample: float = 0.0,
+        gpu_peak_flops: float = 0.0,
+        model: Optional[torch.nn.Module] = None,
     ) -> None:
+        # tflops/mfu reporting state (optional — when both num_flops_per_sample
+        # and gpu_peak_flops are set, the train perf line gains tflops_algo/gpu,
+        # mfu, tflops_real/gpu, hfu, fill. The jagged ("real") numbers come
+        # from `model._last_jagged_flops_per_sample` stashed by DlrmHSTU.main_forward.
+        self._num_flops_per_sample: float = max(0.0, float(num_flops_per_sample))
+        self._gpu_peak_flops: float = max(0.0, float(gpu_peak_flops))
+        self._model_ref: Optional[torch.nn.Module] = model
+        if rank == 0 and self._num_flops_per_sample > 0 and self._gpu_peak_flops > 0:
+            logger.info(
+                f"FLOPS reporting enabled: {self._num_flops_per_sample / 1e9:.1f} "
+                f"GFLOP/sample (dense fwd+bwd), GPU peak {self._gpu_peak_flops / 1e12:.0f} TFLOPS"
+            )
         self.multitask_configs: List[TaskConfig] = multitask_configs
         all_classification_tasks: List[str] = [
             task.task_name
@@ -940,10 +955,44 @@ def compute_and_log(
             self.tb_logger.add_scalar(
                 "perf/train_elapsed_sec", elapsed, global_step=step
             )
+            # TFLOPS / MFU reporting (algo = dense yardstick, real = jagged).
+            #   tflops_algo/gpu, mfu  — uses max_seq_len^2 attention work (the
+            #     MFU yardstick: the FLOPs the workload would do if every
+            #     user's UIH filled the padded seq length).
+            #   tflops_real/gpu, hfu — uses this batch's mean(s_i^2) (actual
+            #     GPU work; hardware utilization).
+            #   fill                  — real / algo as a percent; how much of
+            #     the algo budget the model actually executed this batch.
+            # The jagged stash is read from the inner model; the model ref may
+            # be a DMP wrapper, so unwrap via .module if present.
+            tflops_str = ""
+            if self._num_flops_per_sample > 0 and self._gpu_peak_flops > 0:
+                local_flops = self._num_flops_per_sample * local_sps
+                tflops_algo = local_flops / 1e12
+                mfu = 100.0 * local_flops / self._gpu_peak_flops
+                self.tb_logger.add_scalar("perf/train_tflops_algo_gpu", tflops_algo, global_step=step)
+                self.tb_logger.add_scalar("perf/train_mfu_pct", mfu, global_step=step)
+                tflops_str = f" tflops_algo/gpu={tflops_algo:.1f} mfu={mfu:.1f}%"
+                jagged_t = None
+                m = self._model_ref
+                if m is not None:
+                    inner = m.module if hasattr(m, "module") else m
+                    jagged_t = getattr(inner, "_last_jagged_flops_per_sample", None)
+                if jagged_t is not None:
+                    jagged = float(jagged_t.item())
+                    if 0 < jagged < self._num_flops_per_sample:
+                        tflops_real = jagged * local_sps / 1e12
+                        hfu = 100.0 * jagged * local_sps / self._gpu_peak_flops
+                        fill = 100.0 * jagged / self._num_flops_per_sample
+                        self.tb_logger.add_scalar("perf/train_tflops_real_gpu", tflops_real, global_step=step)
+                        self.tb_logger.add_scalar("perf/train_hfu_pct", hfu, global_step=step)
+                        self.tb_logger.add_scalar("perf/train_fill_pct", fill, global_step=step)
+                        tflops_str += f" tflops_real/gpu={tflops_real:.1f} hfu={hfu:.1f}% fill={fill:.1f}%"
             logger.info(
                 f"train - Step {step} perf: local_sps={local_sps:.1f} "
                 f"global_sps={global_sps:.1f} step_ms={step_ms:.2f} "
                 f"elapsed_sec={elapsed:.1f} total_samples={self._perf_total_samples}"
+                + tflops_str
             )
             self._perf_t_window = now
             self._perf_steps_in_window = 0
@@ -1046,6 +1095,38 @@ def env_float(key: str = "", default: float = 0.0) -> float:
     return float(raw) if raw else default
 
 
+_GPU_PEAK_FLOPS_TABLE: Dict[str, Dict[str, float]] = {
+    # Per-GPU peak TFLOPS by dtype. Values from vendor datasheets / Primus-DLRM
+    # peak_table. Used as the denominator in MFU/HFU. Keyed by case-insensitive
+    # substring of torch.cuda.get_device_name(0).
+    "MI355X": {"bf16": 2300e12, "fp32": 575e12},
+    "MI350X": {"bf16": 2300e12, "fp32": 575e12},
+    "MI300X": {"bf16": 1300e12, "fp32": 653e12},
+    "MI325X": {"bf16": 1300e12, "fp32": 653e12},
+    "B200":   {"bf16": 2250e12, "fp32": 1125e12},
+    "H100":   {"bf16": 990e12,  "fp32": 67e12},
+    "A100":   {"bf16": 312e12,  "fp32": 19.5e12},
+}
+
+
+def get_gpu_peak_flops(dtype: str = "bf16") -> float:
+    """Peak FLOPS for the current GPU at the given dtype.
+
+    Falls back to MI350X's number with a warning when the device name doesn't
+    match any table entry — better to over-report MFU than to silently skip.
+    """
+    if not torch.cuda.is_available():
+        return 0.0
+    name = torch.cuda.get_device_name(0)
+    for gpu_key, peaks in _GPU_PEAK_FLOPS_TABLE.items():
+        if gpu_key in name:
+            return peaks.get(dtype, peaks["bf16"])
+    logger.warning(
+        f"Unknown GPU for peak FLOPS: {name}; defaulting to MI350X bf16 (2300 TF)"
+    )
+    return _GPU_PEAK_FLOPS_TABLE["MI350X"]["bf16"]
+
+
 @gin.configurable
 def run_results_dir(run_name: str = "default", subdir: str = "results") -> str:
     """Resolve ``<recommendation_v4>/<subdir>/<run_name>`` from this file's location.
diff --git a/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py b/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
index 3a35df3ba..f11bb226a 100644
--- a/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
+++ b/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py
@@ -148,6 +148,11 @@ def __init__(  # noqa C901
         self._pipeline_mode: bool = False
         self._hstu_configs = hstu_configs
         self._bf16_training: bool = bf16_training
+        # Last batch's jagged FLOPs/sample (0-d tensor on GPU). Populated by
+        # main_forward; MetricsLogger reads + .item()s on each compute_and_log
+        # to compute tflops_real/gpu and hfu (vs dense yardstick from
+        # get_num_flops_per_sample()).
+        self._last_jagged_flops_per_sample: Optional[torch.Tensor] = None
         set_static_max_seq_lens([self._hstu_configs.max_seq_len])
 
         if not is_dense:
@@ -284,6 +289,81 @@ def __init__(  # noqa C901
             LayerNorm(hstu_configs.hstu_transducer_embedding_dim),
         ).apply(init_mlp_weights_optional_bias)
 
+    # -- FLOPs estimation -----------------------------------------------------
+    # Convention matches TorchTitan / Primus-DLRM: matmul = 6 × M × N × K
+    # (×3 fwd+bwd, ×2 FMA), attention = 2 matmuls (Q·K^T + att·V).
+    # Embedding lookups excluded — they're memory-bound, not compute.
+    #
+    # HSTU vs OneTrans: HSTU collapses attention + FFN into a single UVQK
+    # projection plus SiLU(U) ⊙ y elementwise gating. There is NO separate
+    # FFN block (which dominates FLOPs in a standard transformer), so HSTU
+    # is intentionally compute-leaner per layer for the same N.
+    def _hstu_layer_flops(
+        self, n_tokens_linear: float, n_tokens_attn_sq: float
+    ) -> float:
+        """Per-layer FLOPs given linear-op token count and attention-token²
+        count. Dense estimate uses ``N`` and ``N²``; jagged estimate
+        substitutes ``mean(s_i)`` and ``mean(s_i²)``."""
+        cfg = self._hstu_configs
+        D = cfg.hstu_embedding_table_dim
+        H = cfg.hstu_num_heads
+        hd = cfg.hstu_attn_linear_dim  # V/U head dim
+        qd = cfg.hstu_attn_qk_dim       # Q/K head dim
+        uvqk = 6 * n_tokens_linear * D * (2 * hd + 2 * qd) * H
+        attn = 6 * n_tokens_attn_sq * H * (qd + hd)  # Q·K^T + att·V
+        out = 6 * n_tokens_linear * (3 * H * hd) * D
+        return uvqk + attn + out
+
+    def get_num_flops_per_sample(self) -> float:
+        """Dense-equivalent fwd+bwd FLOPs per sample at ``max_seq_len``.
+
+        Used as the MFU yardstick (peak utilization the workload could
+        theoretically reach if every sample's sequence were the full padded
+        length). The actual ``tflops_real``/``hfu`` reported per step uses
+        the jagged estimate stashed by ``main_forward``.
+        """
+        cfg = self._hstu_configs
+        N = float(cfg.max_seq_len)
+        n_layers = cfg.hstu_attn_num_layers
+        flops = n_layers * self._hstu_layer_flops(
+            n_tokens_linear=N, n_tokens_attn_sq=N * N
+        )
+        # Multitask head (Linear(D, n_tasks)) — negligible but cheap to add.
+        n_tasks = len(self._multitask_configs)
+        if n_tasks > 0:
+            flops += 6 * n_tasks * cfg.hstu_embedding_table_dim
+        return float(flops)
+
+    def _compute_jagged_flops_per_sample(
+        self,
+        uih_seq_lengths: torch.Tensor,
+        num_candidates: torch.Tensor,
+    ) -> torch.Tensor:
+        """Jagged fwd+bwd FLOPs per sample for THIS batch's actual lengths.
+
+        Per-sample merged sequence length s_i = uih_seq_lengths[i] +
+        num_candidates[i]. Returns a 0-d tensor on the batch's device;
+        caller should ``.item()`` it (one D→H sync per logging interval).
+        """
+        s = (uih_seq_lengths + num_candidates).float()
+        mean_s = s.mean()
+        mean_s_sq = (s * s).mean()
+        cfg = self._hstu_configs
+        n_layers = cfg.hstu_attn_num_layers
+        flops = n_layers * (
+            6 * mean_s * cfg.hstu_embedding_table_dim
+              * (2 * cfg.hstu_attn_linear_dim + 2 * cfg.hstu_attn_qk_dim)
+              * cfg.hstu_num_heads
+            + 6 * mean_s_sq * cfg.hstu_num_heads
+              * (cfg.hstu_attn_qk_dim + cfg.hstu_attn_linear_dim)
+            + 6 * mean_s * (3 * cfg.hstu_num_heads * cfg.hstu_attn_linear_dim)
+              * cfg.hstu_embedding_table_dim
+        )
+        n_tasks = len(self._multitask_configs)
+        if n_tasks > 0:
+            flops = flops + 6 * n_tasks * cfg.hstu_embedding_table_dim
+        return flops
+
     def _construct_payload(
         self,
         payload_features: Dict[str, torch.Tensor],
@@ -542,6 +622,18 @@ def main_forward(
         Optional[torch.Tensor],
         Optional[torch.Tensor],
     ]:
+        # Stash this batch's jagged FLOPs/sample for MetricsLogger to read.
+        # No D->H sync: the .item() happens once per metric_log_frequency in
+        # the trainer, not on every step. Eval-mode batches also produce a
+        # stash but the trainer only consumes it on train batches.
+        if not torch.jit.is_scripting():
+            self._last_jagged_flops_per_sample = (
+                self._compute_jagged_flops_per_sample(
+                    uih_seq_lengths=uih_seq_lengths,
+                    num_candidates=num_candidates,
+                )
+            )
+
         # merge uih and candidates embeddings
         for (
             uih_feature_name,

From e794c0a1703850937a040f5eca6c18c2f2cf61df Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 8 Jun 2026 20:52:18 -0500
Subject: [PATCH 039/113] dlrmv4: configurable lifetime-AUC backend +
 fixed-holdout streaming eval

Add a dual-set streaming eval (fresh per-pass "window_*" + cumulative
"lifetime_*") for NE/Accuracy/GAUC/AUC, with a gin-selectable lifetime-AUC
backend for both train and eval: "binned" (BinnedCumulativeAUC, exact
cumulative AUC from an O(bins) histogram, default) or "capped"
(LifetimeAUCMetricComputation, trailing per-rank buffer). Backend, bins, and
window are MetricsLogger gin bindings (env-overridable).

Persist per-rank cumulative metric state in metricbuf_rank{rank}.pt for both
backends across train/eval/eval_cum; keep per-rank state out of the shared
rank-0 blob (strip capped buffers, zero binned histograms) so a resume never
inherits rank-0's counts. Eval set is a stable user-hash holdout over a fixed
window range, validated against a checkpoint split contract on resume.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/checkpoint.py                     | 227 ++++++++-
 .../dlrm_v3/datasets/synthetic_streaming.py   |   5 +-
 .../dlrm_v3/datasets/yambda.py                | 109 ++++-
 .../dlrm_v3/tests/test_lifetime_auc_resume.py | 146 ++++++
 .../dlrm_v3/train/gin/yambda_5b.gin           |  54 ++-
 .../dlrm_v3/train/train_ranker.py             |  24 +-
 .../dlrm_v3/train/utils.py                    | 278 ++++++++---
 .../generative_recommenders/dlrm_v3/utils.py  | 430 ++++++++++++++----
 .../scripts/run_streaming_e2e.sh              |  14 +-
 9 files changed, 1106 insertions(+), 181 deletions(-)
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/tests/test_lifetime_auc_resume.py

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
index aa3c9daa0..0ef223b23 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
@@ -31,7 +31,11 @@
 import gin
 import numpy as np
 import torch
-from generative_recommenders.dlrm_v3.utils import MetricsLogger
+from generative_recommenders.dlrm_v3.utils import (
+    BinnedCumulativeAUC,
+    LifetimeAUCMetricComputation,
+    MetricsLogger,
+)
 from torch.distributed.checkpoint.stateful import Stateful
 from torch.optim.optimizer import Optimizer
 from torchrec.distributed.types import ShardedTensor
@@ -44,6 +48,93 @@
 # window at batch K.
 WINDOW_COMPLETE: int = -1
 
+# Filename (per-rank) holding the lifetime-AUC trailing buffers, mirroring the
+# rng_rank{rank}.pt pattern. The buffers are per-rank-local, so a single
+# rank-0 copy in non_sparse.ckpt would (wrongly) restore 1/world_size of the
+# true history to every rank — hence a dedicated per-rank artifact.
+METRICBUF_FILE_FMT: str = "metricbuf_rank{rank}.pt"
+
+
+def _metric_blob_state_dict(m: torch.nn.Module) -> Dict[str, Any]:
+    """State dict for the shared (rank-0) non_sparse.ckpt metric blob.
+
+    Both lifetime-AUC backends carry per-rank-local state that is persisted
+    authoritatively per-rank in ``metricbuf_rank{rank}.pt``; we must keep it out
+    of the shared blob so a rank's load doesn't inherit rank-0's counts:
+
+    - ``LifetimeAUCMetricComputation``: drop the explicitly-serialized trailing
+      buffer keys (the rest of the blob keys are the parent's persistent state).
+    - ``BinnedCumulativeAUC``: zero the histogram buffers (they are persistent so
+      the keys must remain for a strict load, but the values are neutralized).
+
+    All other metrics serialize normally. In both cases the per-rank file is
+    loaded afterward and is authoritative.
+    """
+    sd = m.state_dict()
+    if isinstance(m, LifetimeAUCMetricComputation):
+        prefix = LifetimeAUCMetricComputation._LIFETIME_KEY_PREFIX
+        sd = {k: v for k, v in sd.items() if not k.startswith(prefix)}
+    elif isinstance(m, BinnedCumulativeAUC):
+        sd = {
+            k: (torch.zeros_like(v) if torch.is_tensor(v) else v)
+            for k, v in sd.items()
+        }
+    return sd
+
+
+def _collect_perrank_metric_state(
+    metric_logger: "MetricsLogger",
+) -> Dict[str, Dict[str, Any]]:
+    """Map "<collection>|<mode>|<idx>" -> state_dict for every metric whose
+    cumulative state is per-rank-local and must be restored per-rank:
+
+    - lifetime-AUC instances (`LifetimeAUCMetricComputation` trailing buffer, or
+      `BinnedCumulativeAUC` histograms) in class_metrics train/eval. Covers the
+      train lifetime AUC and, in legacy single-set eval, the eval lifetime AUC,
+      under either configured backend.
+    - the ENTIRE cumulative eval set (`eval_cum`, both class + regression) used
+      by the streaming dual-set eval: the lifetime-AUC backend state plus the
+      persistent cumulative scalar sums of NE/Accuracy/GAUC/MSE/MAE.
+
+    Selected by structure/isinstance (not a hard index) since metric positions
+    depend on the configured tasks/mode.
+    """
+    out: Dict[str, Dict[str, Any]] = {}
+    for mode in ("train", "eval"):
+        for idx, m in enumerate(metric_logger.class_metrics.get(mode, [])):
+            if isinstance(m, (LifetimeAUCMetricComputation, BinnedCumulativeAUC)):
+                out[f"class_metrics|{mode}|{idx}"] = m.state_dict()
+    for coll in ("class_metrics", "regression_metrics"):
+        for idx, m in enumerate(getattr(metric_logger, coll).get("eval_cum", [])):
+            out[f"{coll}|eval_cum|{idx}"] = m.state_dict()
+    return out
+
+
+def _restore_perrank_metric_state(
+    metric_logger: "MetricsLogger", state: Dict[str, Dict[str, Any]]
+) -> None:
+    for key, sd in state.items():
+        coll, mode, idx_str = key.split("|")
+        getattr(metric_logger, coll)[mode][int(idx_str)].load_state_dict(sd)
+
+
+def _perrank_sample_counts(metric_logger: "MetricsLogger") -> Dict[str, int]:
+    out: Dict[str, int] = {}
+
+    def _count(m: torch.nn.Module) -> Optional[int]:
+        if isinstance(m, LifetimeAUCMetricComputation):
+            return m.lifetime_sample_count()
+        if isinstance(m, BinnedCumulativeAUC):
+            return m.cumulative_sample_count()
+        return None
+
+    for mode in ("train", "eval", "eval_cum"):
+        for idx, m in enumerate(metric_logger.class_metrics.get(mode, [])):
+            n = _count(m)
+            if n is not None:
+                out[f"class|{mode}|{idx}"] = n
+    return out
+
 
 class SparseState(Stateful):
     """
@@ -218,6 +309,7 @@ def save_dmp_checkpoint(
     train_ts: Optional[int] = None,
     batch_idx_in_window: int = WINDOW_COMPLETE,
     device: Optional[torch.device] = None,
+    split_contract: Optional[Dict[str, Any]] = None,
 ) -> None:
     """
     Save a distributed model checkpoint including sparse and dense components.
@@ -283,8 +375,14 @@ def save_dmp_checkpoint(
             if not isinstance(v, ShardedTensor)
         }
         class_metric_state_dict = {
-            "train": [m.state_dict() for m in metric_logger.class_metrics["train"]],
-            "eval": [m.state_dict() for m in metric_logger.class_metrics["eval"]],
+            "train": [
+                _metric_blob_state_dict(m)
+                for m in metric_logger.class_metrics["train"]
+            ],
+            "eval": [
+                _metric_blob_state_dict(m)
+                for m in metric_logger.class_metrics["eval"]
+            ],
         }
         regression_metric_state_dict = {
             "train": [
@@ -304,6 +402,13 @@ def save_dmp_checkpoint(
                 # (pre-streaming-resume) still load as a normal restart.
                 "train_ts": train_ts,
                 "batch_idx_in_window": batch_idx_in_window,
+                # Immutable train:eval split + resume-determinism contract
+                # (train_split_percentage, split_salt, eval holdout window,
+                # batch_size, world_size). Validated on resume so a relaunch
+                # cannot silently change the split (which would desync the skip
+                # offset and/or train on held-out eval users). None for
+                # non-holdout / legacy runs.
+                "split_contract": split_contract,
             },
             non_sparse_ckpt,
         )
@@ -314,6 +419,23 @@ def save_dmp_checkpoint(
         rng_path = f"{tmp_subdir}/rng_rank{rank}.pt"
         torch.save(_rng_state(device), rng_path)
 
+    # Per-rank cumulative metric state (lifetime-AUC buffers + cumulative-eval
+    # histograms/scalar sums). Written by EVERY rank (outside the rank-0 block)
+    # because this state is per-rank-local; restoring rank-0's copy to all ranks
+    # would lose (world_size-1)/world_size of the history.
+    if metric_logger is not None:
+        perrank_state = _collect_perrank_metric_state(metric_logger)
+        if perrank_state:
+            torch.save(
+                perrank_state,
+                f"{tmp_subdir}/{METRICBUF_FILE_FMT.format(rank=rank)}",
+            )
+            logger.info(
+                "checkpoint save: cumulative metric state rank=%d samples=%s",
+                rank,
+                _perrank_sample_counts(metric_logger),
+            )
+
     torch.distributed.barrier()
     sparse_dict = {"sparse_dict": SparseState(model, sparse_tensor_keys)}
     torch.distributed.checkpoint.save(
@@ -372,7 +494,7 @@ def load_nonsparse_checkpoint(
     metric_logger: Optional[MetricsLogger] = None,
     path: str = "",
     rank: int = 0,
-) -> Tuple[Optional[int], int]:
+) -> Tuple[Optional[int], int, Optional[Dict[str, Any]]]:
     """
     Load non-sparse (dense) components from a checkpoint.
 
@@ -381,12 +503,13 @@ def load_nonsparse_checkpoint(
     next to `non_sparse.ckpt`.
 
     Returns:
-        (train_ts, batch_idx_in_window) — the streaming resume hint stored at
-        save time. `(None, WINDOW_COMPLETE)` if not a streaming checkpoint or
-        no path supplied.
+        (train_ts, batch_idx_in_window, split_contract) — the streaming resume
+        hint and the saved train:eval split contract (None for legacy / non-
+        holdout checkpoints). `(None, WINDOW_COMPLETE, None)` if not a streaming
+        checkpoint or no path supplied.
     """
     if path == "":
-        return None, WINDOW_COMPLETE
+        return None, WINDOW_COMPLETE, None
     non_sparse_ckpt = f"{path}/non_sparse.ckpt"
 
     # weights_only=False: these are our own trusted checkpoints, and they hold
@@ -404,14 +527,69 @@ def load_nonsparse_checkpoint(
         metric_logger.global_step = non_sparse_state_dict["global_step"]
         class_metric_state_dict = non_sparse_state_dict["class_metrics"]
         regression_metric_state_dict = non_sparse_state_dict["reg_metrics"]
-        for i, m in enumerate(metric_logger.class_metrics["train"]):
-            m.load_state_dict(class_metric_state_dict["train"][i])
-        for i, m in enumerate(metric_logger.class_metrics["eval"]):
-            m.load_state_dict(class_metric_state_dict["eval"][i])
-        for i, m in enumerate(metric_logger.regression_metrics["train"]):
-            m.load_state_dict(regression_metric_state_dict["train"][i])
-        for i, m in enumerate(metric_logger.regression_metrics["eval"]):
-            m.load_state_dict(regression_metric_state_dict["eval"][i])
+        # Length-safe positional restore: if a checkpoint was written with a
+        # different metric set (e.g. tasks added/removed since), restore the
+        # overlap instead of crashing with an IndexError at run end.
+        def _restore_metric_list(
+            live: list, saved: Optional[list], label: str
+        ) -> None:
+            saved = saved or []
+            if len(live) != len(saved):
+                logger.warning(
+                    "metric count mismatch for %s: live=%d saved=%d; "
+                    "restoring overlapping %d",
+                    label,
+                    len(live),
+                    len(saved),
+                    min(len(live), len(saved)),
+                )
+            for i in range(min(len(live), len(saved))):
+                live[i].load_state_dict(saved[i])
+
+        _restore_metric_list(
+            metric_logger.class_metrics["train"],
+            class_metric_state_dict.get("train"),
+            "class/train",
+        )
+        _restore_metric_list(
+            metric_logger.class_metrics["eval"],
+            class_metric_state_dict.get("eval"),
+            "class/eval",
+        )
+        _restore_metric_list(
+            metric_logger.regression_metrics["train"],
+            regression_metric_state_dict.get("train"),
+            "reg/train",
+        )
+        _restore_metric_list(
+            metric_logger.regression_metrics["eval"],
+            regression_metric_state_dict.get("eval"),
+            "reg/eval",
+        )
+
+        # Per-rank cumulative metric state restore. This runs AFTER the generic
+        # load above so it is authoritative: the shared blob carries no lifetime
+        # buffers (stripped at save) nor any eval_cum state, and each rank
+        # restores its OWN cumulative state here. Missing file = legacy/pre-fix
+        # checkpoint; cumulative metrics self-heal (lifetime AUC refills; the
+        # binned-AUC histograms / scalar sums restart from zero).
+        mb_path = f"{path}/{METRICBUF_FILE_FMT.format(rank=rank)}"
+        if os.path.exists(mb_path):
+            perrank_state = torch.load(
+                mb_path, map_location=device, weights_only=False
+            )
+            _restore_perrank_metric_state(metric_logger, perrank_state)
+            logger.info(
+                "checkpoint load: cumulative metric state rank=%d samples=%s",
+                rank,
+                _perrank_sample_counts(metric_logger),
+            )
+        else:
+            logger.info(
+                "checkpoint load: no per-rank cumulative metric state at %s "
+                "(legacy/pre-fix checkpoint); cumulative metrics will refill",
+                mb_path,
+            )
 
     # Per-rank RNG restore. Missing file = bit-equal trajectory not requested at
     # save time; we silently continue (the test harness checks for both).
@@ -426,7 +604,8 @@ def load_nonsparse_checkpoint(
     batch_idx_in_window = non_sparse_state_dict.get(
         "batch_idx_in_window", WINDOW_COMPLETE
     )
-    return train_ts, batch_idx_in_window
+    split_contract = non_sparse_state_dict.get("split_contract")
+    return train_ts, batch_idx_in_window, split_contract
 
 
 @gin.configurable
@@ -437,7 +616,7 @@ def load_dmp_checkpoint(
     device: torch.device,
     path: str = "",
     rank: int = 0,
-) -> Tuple[Optional[int], int]:
+) -> Tuple[Optional[int], int, Optional[Dict[str, Any]], bool]:
     """
     Load a complete distributed model checkpoint (both sparse and dense components).
 
@@ -447,12 +626,17 @@ def load_dmp_checkpoint(
     no load.
 
     Returns:
-        (train_ts, batch_idx_in_window) — streaming resume hint. Callers that
-        don't need it can ignore.
+        (train_ts, batch_idx_in_window, split_contract, cold_start) — streaming
+        resume hint plus the saved split contract, and `cold_start` which is True
+        iff there was nothing to load (no checkpoint resolved). `cold_start`
+        distinguishes a genuine fresh run (no weights loaded) from a resume that
+        merely lacks a split contract (e.g. a legacy/non-streaming checkpoint),
+        which the caller's split-contract guard must still reject.
     """
     resolved = _resolve_latest_subdir(path)
+    cold_start = resolved == ""
     load_sparse_checkpoint(model=model, path=resolved)
-    return load_nonsparse_checkpoint(
+    train_ts, batch_idx_in_window, split_contract = load_nonsparse_checkpoint(
         model=model,
         optimizer=optimizer,
         metric_logger=metric_logger,
@@ -460,3 +644,4 @@ def load_dmp_checkpoint(
         device=device,
         rank=rank,
     )
+    return train_ts, batch_idx_in_window, split_contract, cold_start
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py
index 437e5ae8e..6e38fe334 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/synthetic_streaming.py
@@ -269,12 +269,15 @@ def get_timestamp_uih(
     ) -> List[int]:
         return [1] * size
 
-    def set_ts(self, ts: int) -> None:
+    def set_ts(self, ts: int, train_only: bool = False) -> None:
         """
         Set the current timestamp and load associated request data.
 
         Args:
             ts: Timestamp index to set.
+            train_only: Accepted for API parity with the yambda dataset (which
+                supports a user-level train:eval holdout). This synthetic
+                dataset has no holdout, so the flag is ignored.
         """
         logger.warning(f"Streaming dataset ts set to {ts}")
         if ts == self.ts:
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
index bed1aafb8..ad24513f4 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
@@ -57,6 +57,28 @@ def _load_npy_readonly(path: Union[str, Path]) -> np.ndarray:
     arr.flags.writeable = False
     return arr
 
+
+def _uid_unit_hash(uids: np.ndarray, salt: int) -> np.ndarray:
+    """Deterministic uniform-in-[0,1) hash of user ids (splitmix64 finalizer).
+
+    Pure function of (uid, salt): the same uid always maps to the same value,
+    so the train/eval user split is identical across processes, ranks, and
+    crash/resume — the property the no-leakage holdout relies on. Vectorized
+    uint64 arithmetic wraps mod 2**64 (defined for unsigned), so we silence the
+    benign overflow warnings.
+    """
+    GOLDEN = np.uint64(0x9E3779B97F4A7C15)
+    M1 = np.uint64(0xBF58476D1CE4E5B9)
+    M2 = np.uint64(0x94D049BB133111EB)
+    s30, s27, s31 = np.uint64(30), np.uint64(27), np.uint64(31)
+    with np.errstate(over="ignore"):
+        z = uids.astype(np.uint64) + GOLDEN + np.uint64(salt & 0xFFFFFFFFFFFFFFFF)
+        z = (z ^ (z >> s30)) * M1
+        z = (z ^ (z >> s27)) * M2
+        z = z ^ (z >> s31)
+    # Top 53 bits -> uniform [0, 1) double (same trick numpy uses for randoms).
+    return (z >> np.uint64(11)).astype(np.float64) * (1.0 / 9007199254740992.0)
+
 # Yambda event-type encoding written by preprocess_public_data.py.
 LISTEN_TYPE = 0
 LIKE_TYPE = 1
@@ -187,6 +209,8 @@ def __init__(
         is_inference: bool = False,
         streaming_window_seconds: int = 86400,
         streaming_sort_within_window: bool = False,
+        train_split_percentage: float = 1.0,
+        split_salt: int = 0,
         *args,
         **kwargs,
     ) -> None:
@@ -201,6 +225,18 @@ def __init__(
         # is byte-for-byte unaffected.
         self._streaming_window_seconds: int = streaming_window_seconds
         self._streaming_sort_within_window: bool = streaming_sort_within_window
+        # User-level train:eval split. `train_split_percentage >= 1.0` means no
+        # holdout (legacy behavior: every anchor is trainable). Otherwise the
+        # top `1 - train_split_percentage` fraction of users (by a deterministic
+        # hash of `uid + split_salt`) are held out: NEVER trained, used only to
+        # build the fixed eval set. The split is a pure function of (uid, salt),
+        # so it is identical across crash/resume (no leakage on failover).
+        self._train_split_percentage: float = train_split_percentage
+        self._split_salt: int = split_salt
+        # Cache only the (small) fixed eval-holdout index list; the per-window
+        # train filter is computed on the fly to avoid a full-length mask.
+        self._eval_holdout_cache: Optional[np.ndarray] = None
+        self._eval_holdout_cache_key: Optional[Tuple[int, int]] = None
         self._active: Optional[np.ndarray] = None
         self.is_eval: bool = False
         self._anchor_ts: Optional[np.ndarray] = None
@@ -509,16 +545,79 @@ def window_indices(
         logger.warning(f"window_indices({ts}): [{lo}, {hi}) -> {idx.size:,} anchors")
         return idx.astype(np.int64)
 
-    def set_ts(self, ts: int) -> None:
+    def _eval_anchor_mask(self, anchor_idx: np.ndarray) -> np.ndarray:
+        """Bool mask (aligned to ``anchor_idx``) marking held-out eval users.
+
+        Computed on the fly for just this slice of anchors (a window is ~tens of
+        millions, not the full ~3B ``_positions``), so we never materialize a
+        full-length mask. ``uid``-hash >= ``train_split_percentage`` -> eval.
+        """
+        uids = self.store.flat_uid[self._positions[anchor_idx]]
+        return _uid_unit_hash(uids, self._split_salt) >= self._train_split_percentage
+
+    def train_window_indices(self, ts: int) -> np.ndarray:
+        """Global anchor indices for TRAIN in window ``ts``: ``window_indices``
+        with held-out eval users removed. Identical across resume because both
+        ``window_indices`` and the uid hash are pure functions, so the per-rank
+        round-robin slice (and the mid-window skip offset) stay consistent."""
+        idx = self.window_indices(ts)
+        if self._train_split_percentage >= 1.0:
+            return idx
+        kept = idx[~self._eval_anchor_mask(idx)]
+        logger.warning(
+            f"train_window_indices({ts}): {idx.size:,} -> {kept.size:,} anchors "
+            f"(holdout tsp={self._train_split_percentage}, salt={self._split_salt})"
+        )
+        return kept
+
+    def eval_holdout_indices(self, start_ts: int, num_windows: int = 1) -> np.ndarray:
+        """Fixed eval set: held-out users' anchors over windows
+        ``[start_ts, start_ts + num_windows)``. Computed once and cached, so the
+        SAME anchors are evaluated at every eval step (stable, comparable curve).
+        With no holdout (tsp>=1.0) this falls back to the full window(s)."""
+        key = (int(start_ts), int(num_windows))
+        if self._eval_holdout_cache is not None and self._eval_holdout_cache_key == key:
+            return self._eval_holdout_cache
+        parts: List[np.ndarray] = []
+        for ts in range(start_ts, start_ts + max(1, num_windows)):
+            idx = self.window_indices(ts)
+            if self._train_split_percentage < 1.0:
+                idx = idx[self._eval_anchor_mask(idx)]
+            parts.append(idx)
+        holdout = (
+            np.concatenate(parts).astype(np.int64)
+            if parts
+            else np.empty(0, dtype=np.int64)
+        )
+        logger.warning(
+            f"eval_holdout_indices(start_ts={start_ts}, num_windows={num_windows}): "
+            f"{holdout.size:,} held-out anchors (tsp={self._train_split_percentage})"
+        )
+        self._eval_holdout_cache = holdout
+        self._eval_holdout_cache_key = key
+        return holdout
+
+    def set_ts(self, ts: int, train_only: bool = False) -> None:
         """Restrict the active sample set to anchors in window ``ts`` (used by
         the per-window-DataLoader path, where ``iloc``/``get_item_count`` index
         through ``_active``).
 
-        Forward-only temporal slicing for streaming train/eval. History for any
-        anchor is still gathered causally (``scan_start:flat_pos``) and may span
-        earlier windows, so there is no feature leakage from future events.
+        ``train_only=True`` removes held-out eval users so the non-persistent
+        TRAIN loader never sees them (closes the leakage path). Forward-only
+        temporal slicing for streaming train/eval. History for any anchor is
+        still gathered causally (``scan_start:flat_pos``) and may span earlier
+        windows, so there is no feature leakage from future events.
         """
-        self._active = self.window_indices(ts)
+        self._active = (
+            self.train_window_indices(ts) if train_only else self.window_indices(ts)
+        )
+
+    def set_active_indices(self, indices: np.ndarray) -> None:
+        """Restrict the active sample set to an explicit array of global anchor
+        indices (into ``_positions``). Used by the non-persistent eval path to
+        iterate the fixed user-holdout set (which spans a window range, not a
+        single ``ts``)."""
+        self._active = np.asarray(indices, dtype=np.int64)
 
     def load_query_samples(self, sample_list) -> None:
         max_num_candidates = (
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/tests/test_lifetime_auc_resume.py b/recommendation_v4/generative_recommenders/dlrm_v3/tests/test_lifetime_auc_resume.py
new file mode 100644
index 000000000..3d5bbd1ee
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/tests/test_lifetime_auc_resume.py
@@ -0,0 +1,146 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-strict
+
+"""
+Round-trip correctness test for ``LifetimeAUCMetricComputation`` checkpoint
+serialization.
+
+Background: torchrec's ``AUCMetricComputation`` registers its
+PREDICTIONS/LABELS/WEIGHTS buffers with ``persistent=False``, so the default
+``state_dict()`` returns them empty and a separate ``_num_samples`` counter is
+dropped too. Without the overrides on ``LifetimeAUCMetricComputation`` every
+checkpoint resume would silently restart the lifetime AUC from an empty buffer.
+
+These tests assert:
+  1. update -> compute == A; state_dict -> load_state_dict on a fresh metric ->
+     compute == A (buffers survive the round trip).
+  2. ``_num_samples`` round-trips exactly (required so the next update() does
+     not take the init-sentinel branch and desync windowed eviction).
+  3. The shared-blob path (buffers stripped) leaves a fresh metric empty, so the
+     per-rank artifact is the sole authority for the trailing buffer.
+
+Runs in <1s on CPU. Skipped automatically if torchrec is unavailable.
+"""
+
+import unittest
+
+import torch
+
+try:
+    from generative_recommenders.dlrm_v3.utils import LifetimeAUCMetricComputation
+
+    _HAVE_DEPS = True
+except Exception:  # pragma: no cover - import guard for envs without torchrec
+    _HAVE_DEPS = False
+
+
+def _make_metric(n_tasks: int = 1, window: int = 10_000_000):
+    return LifetimeAUCMetricComputation(
+        my_rank=0,
+        batch_size=128,
+        n_tasks=n_tasks,
+        window_size=window,
+    )
+
+
+def _feed(metric, preds, labels, weights) -> None:
+    metric.update(
+        predictions=preds,
+        labels=labels,
+        weights=weights,
+    )
+
+
+@unittest.skipUnless(_HAVE_DEPS, "torchrec / generative_recommenders not importable")
+class LifetimeAUCResumeTest(unittest.TestCase):
+    def setUp(self) -> None:
+        torch.manual_seed(0)
+        self.n_tasks = 1
+        self.n = 4096
+        self.preds = torch.rand(self.n_tasks, self.n)
+        self.labels = (torch.rand(self.n_tasks, self.n) > 0.5).float()
+        self.weights = torch.ones(self.n_tasks, self.n)
+
+    def _compute_value(self, metric) -> float:
+        reports = metric._compute()
+        return float(reports[0].value.flatten()[0].item())
+
+    def test_state_dict_round_trip_preserves_auc(self) -> None:
+        m = _make_metric(self.n_tasks)
+        _feed(m, self.preds, self.labels, self.weights)
+        auc_a = self._compute_value(m)
+        n_a = m.lifetime_sample_count()
+        self.assertEqual(n_a, self.n)
+
+        sd = m.state_dict()
+
+        fresh = _make_metric(self.n_tasks)
+        fresh.load_state_dict(sd)
+        auc_b = self._compute_value(fresh)
+
+        self.assertEqual(fresh.lifetime_sample_count(), self.n)
+        self.assertAlmostEqual(auc_a, auc_b, places=6)
+
+    def test_num_samples_round_trips(self) -> None:
+        m = _make_metric(self.n_tasks)
+        _feed(m, self.preds, self.labels, self.weights)
+        sd = m.state_dict()
+        fresh = _make_metric(self.n_tasks)
+        fresh.load_state_dict(sd)
+        self.assertEqual(fresh._num_samples, m._num_samples)
+
+    def test_continued_update_after_resume_matches_uninterrupted(self) -> None:
+        # Splitting a stream and resuming in the middle must equal feeding it all
+        # at once (this is what fails when _num_samples is not restored).
+        half = self.n // 2
+        p1, p2 = self.preds[:, :half], self.preds[:, half:]
+        l1, l2 = self.labels[:, :half], self.labels[:, half:]
+        w1, w2 = self.weights[:, :half], self.weights[:, half:]
+
+        ref = _make_metric(self.n_tasks)
+        _feed(ref, p1, l1, w1)
+        _feed(ref, p2, l2, w2)
+        auc_ref = self._compute_value(ref)
+
+        part = _make_metric(self.n_tasks)
+        _feed(part, p1, l1, w1)
+        resumed = _make_metric(self.n_tasks)
+        resumed.load_state_dict(part.state_dict())
+        _feed(resumed, p2, l2, w2)
+        auc_resumed = self._compute_value(resumed)
+
+        self.assertAlmostEqual(auc_ref, auc_resumed, places=6)
+
+    def test_blob_state_dict_strips_buffers(self) -> None:
+        from generative_recommenders.dlrm_v3.checkpoint import (
+            _metric_blob_state_dict,
+        )
+
+        m = _make_metric(self.n_tasks)
+        _feed(m, self.preds, self.labels, self.weights)
+        blob = _metric_blob_state_dict(m)
+        prefix = LifetimeAUCMetricComputation._LIFETIME_KEY_PREFIX
+        self.assertFalse(any(k.startswith(prefix) for k in blob.keys()))
+
+        # A fresh metric loaded from the stripped blob must NOT have history —
+        # the per-rank artifact is the only source of the trailing buffer.
+        fresh = _make_metric(self.n_tasks)
+        fresh.load_state_dict(blob)
+        self.assertEqual(fresh.lifetime_sample_count(), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 19c23e5b6..123027fdd 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -45,11 +45,21 @@ DATA_PATH = @data/env_path()
 data/env_path.key = "DLRM_DATA_PATH"
 data/env_path.default = "/apps/chcai/dlrm_data"
 
+# Shared train:eval split: fraction of USERS used for training; the remaining
+# (1 - this) fraction are held out as a fixed eval set and NEVER trained.
+# Bound to BOTH the static train-eval path (make_train_test_dataloaders, a
+# positional split) and the streaming path (get_dataset, an explicit by-user
+# hash split), so one value configures the holdout in either mode.
+# 1.0 = no holdout (legacy streaming behavior). Override via $TRAIN_SPLIT_PERCENTAGE.
+TRAIN_SPLIT_PERCENTAGE = @tsp/env_float()
+tsp/env_float.key     = "TRAIN_SPLIT_PERCENTAGE"
+tsp/env_float.default = 0.90
+
 # dataloader configs
 make_train_test_dataloaders.batch_size = %batch_size
 make_train_test_dataloaders.eval_batch_size = 1024
 make_train_test_dataloaders.dataset_type = %dataset
-make_train_test_dataloaders.train_split_percentage = 0.90
+make_train_test_dataloaders.train_split_percentage = %TRAIN_SPLIT_PERCENTAGE
 make_train_test_dataloaders.new_path_prefix = %DATA_PATH
 make_train_test_dataloaders.num_workers = %num_workers
 make_train_test_dataloaders.prefetch_factor = %prefetch_factor
@@ -99,6 +109,15 @@ msl/env_int.default = 2048
 # to the dataset's available window count at runtime; override via $NUM_TRAIN_TS.
 get_dataset.streaming_window_seconds = 86400
 get_dataset.streaming_sort_within_window = False
+# User-level train:eval holdout for the streaming path. With tsp<1.0, the top
+# (1 - tsp) fraction of users (by a deterministic hash of uid+split_salt) are
+# held out as a FIXED eval set and never trained -> no temporal/user leakage,
+# stable comparable eval curve, bounded eval time. split_salt lets you draw a
+# different holdout without changing the ratio. Override salt via $SPLIT_SALT.
+get_dataset.train_split_percentage = %TRAIN_SPLIT_PERCENTAGE
+get_dataset.split_salt = @ssalt/env_int()
+ssalt/env_int.key     = "SPLIT_SALT"
+ssalt/env_int.default = 0
 make_streaming_dataloader.batch_size = %batch_size
 make_streaming_dataloader.num_workers = %num_workers
 make_streaming_dataloader.prefetch_factor = %prefetch_factor
@@ -144,6 +163,16 @@ evn/env_int.default = 1
 streaming_train_eval_loop.double_buffer = @db/env_int()
 db/env_int.key = "DOUBLE_BUFFER"
 db/env_int.default = 1
+# Fixed eval-holdout window range (held-out users' anchors over these windows
+# form the eval set evaluated at EVERY eval step). EVAL_HOLDOUT_TS<0 (default)
+# resolves at runtime to start_ts+num_train_ts (the window just past training),
+# which is stable across resume. EVAL_HOLDOUT_NUM_WINDOWS widens the eval span.
+streaming_train_eval_loop.eval_holdout_ts = @eht/env_int()
+eht/env_int.key     = "EVAL_HOLDOUT_TS"
+eht/env_int.default = -1
+streaming_train_eval_loop.eval_holdout_num_windows = @ehnw/env_int()
+ehnw/env_int.key     = "EVAL_HOLDOUT_NUM_WINDOWS"
+ehnw/env_int.default = 1
 # num_train_batches / num_eval_batches unset => consume each full window.
 # Set them (e.g. via gin) to cap per-window steps for short experiments.
 
@@ -185,6 +214,29 @@ tbp/env_path.key     = "TENSORBOARD_LOG_PATH"
 tbp/env_path.default = "/apps/chcai/tb/yambda_5b/"
 MetricsLogger.world_size = 8
 MetricsLogger.auc_threshold = 0.80275
+# Lifetime-AUC backend, selectable independently for the train cumulative AUC and
+# the eval cumulative ("lifetime_*") AUC. Both default to "binned":
+#   "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an O(bins) score
+#     histogram (additive all-reduce, memory independent of #samples/#windows).
+#   "capped" = LifetimeAUCMetricComputation: AUC over a trailing buffer of
+#     `lifetime_auc_window` samples/rank (legacy; per-rank buffer all-gathered).
+# Override per-run via $TRAIN_LIFETIME_AUC_MODE / $EVAL_LIFETIME_AUC_MODE.
+MetricsLogger.train_lifetime_auc_mode = @tlam/env_str()
+tlam/env_str.key     = "TRAIN_LIFETIME_AUC_MODE"
+tlam/env_str.default = "binned"
+MetricsLogger.eval_lifetime_auc_mode = @elam/env_str()
+elam/env_str.key     = "EVAL_LIFETIME_AUC_MODE"
+elam/env_str.default = "binned"
+# Score-histogram resolution for the "binned" backend. Higher = finer AUC
+# resolution at O(bins) memory. Override via $CUMULATIVE_AUC_BINS.
+MetricsLogger.cumulative_auc_bins = @cab/env_int()
+cab/env_int.key     = "CUMULATIVE_AUC_BINS"
+cab/env_int.default = 100000
+# Trailing-buffer size (samples/rank) for the "capped" backend. Override via
+# $LIFETIME_AUC_WINDOW. Ignored when the backend is "binned".
+MetricsLogger.lifetime_auc_window = @law/env_int()
+law/env_int.key     = "LIFETIME_AUC_WINDOW"
+law/env_int.default = 10000000
 # Checkpointing disabled by default — a full DMP checkpoint is ~100s of GB and
 # the streaming loop always saves on the final window. save_dmp_checkpoint
 # no-ops on the empty path. Set $CKPT_PATH to a directory to re-enable; the
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 59520bd16..6ae88eba2 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -120,6 +120,9 @@ def _main_func(
     gpu_peak_flops = get_gpu_peak_flops(
         "bf16" if getattr(model_configs, "bf16_training", True) else "fp32"
     )
+    # Streaming fixed-holdout eval uses the dual fresh/cumulative metric sets:
+    # window_* = fresh per-pass full-holdout, lifetime_* = cumulative across
+    # passes (AUC via O(bins) histogram). Other modes keep the legacy single set.
     metrics = MetricsLogger(
         multitask_configs=model_configs.multitask_configs,
         batch_size=train_dataloader.batch_size,
@@ -129,17 +132,24 @@ def _main_func(
         num_flops_per_sample=num_flops_per_sample,
         gpu_peak_flops=gpu_peak_flops,
         model=model,
+        eval_cumulative=(mode == "streaming-train-eval"),
+        # Lifetime-AUC backend + bins/window come from gin (see yambda_5b.gin:
+        # MetricsLogger.{train,eval}_lifetime_auc_mode / cumulative_auc_bins /
+        # lifetime_auc_window), env-overridable. eval_cumulative stays explicit
+        # because it is runtime-mode dependent, not a config knob.
     )
     # Capture streaming resume hint (None for cold start / non-streaming
     # checkpoints). For the streaming-train-eval mode, we forward this into
     # streaming_train_eval_loop so it can advance past the last completed
     # window OR re-enter the partial window and skip already-trained batches.
-    resume_train_ts, resume_batch_idx_in_window = load_dmp_checkpoint(
-        model=model,
-        optimizer=optimizer,
-        metric_logger=metrics,
-        device=device,
-        rank=rank,
+    resume_train_ts, resume_batch_idx_in_window, resume_split_contract, resume_cold_start = (
+        load_dmp_checkpoint(
+            model=model,
+            optimizer=optimizer,
+            metric_logger=metrics,
+            device=device,
+            rank=rank,
+        )
     )
 
     # train loop
@@ -190,6 +200,8 @@ def _main_func(
                 embedding_table_configs=embedding_table_configs,
                 resume_train_ts=resume_train_ts,
                 resume_batch_idx_in_window=resume_batch_idx_in_window,
+                resume_split_contract=resume_split_contract,
+                resume_cold_start=resume_cold_start,
             )
     except Exception as e:
         logger.info(traceback.format_exc())
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index e437aca60..936ae229e 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -418,12 +418,23 @@ def make_optimizer_and_shard(
 @gin.configurable
 def make_streaming_dataloader(
     dataset: HammerToTorchDataset,
-    ts: int,
-    batch_size: int,
-    num_workers: int,
-    prefetch_factor: int,
+    ts: Optional[int] = None,
+    batch_size: int = 0,
+    num_workers: int = 0,
+    prefetch_factor: int = 0,
+    train_only: bool = False,
+    indices: Optional["np.ndarray"] = None,
 ) -> DataLoader:
-    dataset.dataset.set_ts(ts)  # pyre-ignore [16]
+    # `indices` (explicit anchor index array) is used by the eval path to
+    # iterate the FIXED user-holdout set, which spans a window range rather than
+    # a single ts. Otherwise restrict to window `ts`; train_only=True drops
+    # held-out eval users so the non-persistent TRAIN loader never trains on
+    # them (no-leakage guarantee).
+    if indices is not None:
+        dataset.dataset.set_active_indices(indices)  # pyre-ignore [16]
+    else:
+        assert ts is not None, "make_streaming_dataloader needs ts or indices"
+        dataset.dataset.set_ts(ts, train_only=train_only)  # pyre-ignore [16]
     total_items = dataset.dataset.get_item_count()
     subset = torch.utils.data.Subset(dataset, range(total_items))
     # shuffle=False keeps temporal order within the window: a non-shuffling
@@ -550,13 +561,14 @@ def __init__(
         self._iters: List[Optional[object]] = [None] * n_buffers
 
     def _prepare(self, buf: int, ts: int, skip_samples: int = 0) -> None:
-        # window_indices() is the O(N) mask; numpy releases the GIL for it, so it
-        # overlaps the main thread's GPU dispatch. iter() then kicks off this
-        # pool's background prefetch.
-        # `skip_samples` is non-zero only for the very first window after a
-        # mid-window resume; subsequent windows always start at 0.
+        # train_window_indices() is the O(N) mask (+ uid-hash filter for the
+        # holdout); numpy releases the GIL for it, so it overlaps the main
+        # thread's GPU dispatch. iter() then kicks off this pool's background
+        # prefetch. This is a TRAIN-only loader, so held-out eval users are
+        # excluded here. `skip_samples` is non-zero only for the very first
+        # window after a mid-window resume; subsequent windows always start at 0.
         self._samplers[buf].set_window(
-            self._dataset.dataset.window_indices(ts), skip_samples=skip_samples
+            self._dataset.dataset.train_window_indices(ts), skip_samples=skip_samples
         )
         self._iters[buf] = iter(self._dls[buf])
 
@@ -1087,6 +1099,58 @@ def select_in_window_checkpoint_reason(
     return None
 
 
+def _validate_split_contract(
+    saved: Optional[Dict[str, Any]],
+    live: Dict[str, Any],
+    rank: int,
+) -> None:
+    """Guarantee the train:eval split (and the inputs the resume skip-offset
+    depends on) are unchanged across a crash/resume.
+
+    `saved` is the contract recovered from the checkpoint (None on cold start or
+    legacy pre-holdout checkpoints). Any mismatch is fatal: continuing would
+    either desync the mid-window skip (duplicate/skip batches) or reassign users
+    so that previously held-out eval users get trained (leakage). Set
+    ALLOW_SPLIT_MISMATCH=1 to override (e.g. intentionally resuming a legacy
+    checkpoint into a holdout run, accepting the risk).
+    """
+    allow = os.environ.get("ALLOW_SPLIT_MISMATCH", "0") == "1"
+    if saved is None:
+        # Legacy / cold-start checkpoint with no recorded contract. Only a
+        # problem if this run actually holds users out (tsp < 1.0): we cannot
+        # prove the earlier run used the same split.
+        if live.get("train_split_percentage", 1.0) < 1.0 and not allow:
+            raise RuntimeError(
+                "Resuming a checkpoint with NO saved split contract into a "
+                f"user-holdout run (train_split_percentage="
+                f"{live['train_split_percentage']}). The earlier run's split "
+                "cannot be verified, so held-out eval users may have been "
+                "trained. Set ALLOW_SPLIT_MISMATCH=1 to override."
+            )
+        return
+    mismatches = {
+        k: (saved.get(k), live.get(k))
+        for k in live
+        if saved.get(k) != live.get(k)
+    }
+    if mismatches:
+        msg = (
+            "Split/resume contract mismatch between checkpoint and current run: "
+            + ", ".join(
+                f"{k}: checkpoint={s!r} current={c!r}" for k, (s, c) in mismatches.items()
+            )
+            + ". Resuming would desync the skip offset and/or leak held-out "
+            "users into training."
+        )
+        if allow:
+            if rank == 0:
+                logger.warning("%s ALLOW_SPLIT_MISMATCH=1 set — continuing anyway.", msg)
+        else:
+            raise RuntimeError(msg + " Set ALLOW_SPLIT_MISMATCH=1 to override.")
+    elif rank == 0:
+        logger.info("Split/resume contract verified against checkpoint: %s", live)
+
+
 @gin.configurable
 def streaming_train_eval_loop(
     rank: int,
@@ -1107,9 +1171,22 @@ def streaming_train_eval_loop(
     eval_each_window: bool = True,
     eval_every_n_windows: int = 1,
     double_buffer: bool = False,
+    # --- fixed user-holdout eval set ---
+    # Window range the fixed eval set is drawn from. None -> default to
+    # original_end_ts (start_ts + num_train_ts), the window just past training.
+    eval_holdout_ts: Optional[int] = None,
+    eval_holdout_num_windows: int = 1,
     # --- resume / mid-window-exact-once knobs ---
     resume_train_ts: Optional[int] = None,
     resume_batch_idx_in_window: int = WINDOW_COMPLETE,
+    # Split contract recovered from the checkpoint (None on cold start or
+    # legacy checkpoints). Validated below against the live split so a resumed
+    # run cannot silently train a different user-split (would leak).
+    resume_split_contract: Optional[Dict[str, Any]] = None,
+    # True iff no checkpoint was loaded (genuine fresh run). Distinguishes a
+    # cold start (safe to establish a new split) from a resume that merely lacks
+    # a contract (legacy/non-streaming checkpoint), which the guard must reject.
+    resume_cold_start: bool = False,
     in_window_checkpoint_frequency: int = 0,
     # --- global step / wall-clock checkpoint cadences ---
     checkpoint_step_frequency: int = 0,
@@ -1187,6 +1264,47 @@ def streaming_train_eval_loop(
             dataset=dataset, sampler=window_sampler
         )
 
+    # The fixed user-holdout eval is yambda-specific (needs window_indices +
+    # the split API). Other streaming datasets (synthetic) keep the legacy
+    # per-window eval. Detect support once.
+    supports_holdout = hasattr(dataset.dataset, "eval_holdout_indices")
+
+    # Fixed eval-holdout window range. Captured from the REQUESTED (start_ts,
+    # num_train_ts) BEFORE the resume block mutates them, so it is identical on
+    # cold start and on every resume (the supervisor relaunches with the same
+    # START_TS / NUM_TRAIN_TS). Defaults to the window just past training.
+    requested_end_ts = start_ts + num_train_ts
+    # None (Python default) or <0 (the env-binding default) both mean "use the
+    # window just past training", which is stable across resume.
+    eval_holdout_ts_resolved = (
+        eval_holdout_ts
+        if (eval_holdout_ts is not None and eval_holdout_ts >= 0)
+        else requested_end_ts
+    )
+
+    # The split is an immutable run contract: a silent change across resume
+    # would both desync the mid-window skip offset AND turn held-out eval users
+    # into trained users (leakage). Build the live contract and validate the
+    # one recovered from the checkpoint against it; abort on any mismatch unless
+    # ALLOW_SPLIT_MISMATCH=1 is set (e.g. deliberately resuming a legacy run).
+    live_split_contract: Optional[Dict[str, Any]] = None
+    if supports_holdout:
+        live_split_contract = {
+            "train_split_percentage": dataset.dataset._train_split_percentage,  # pyre-ignore[16]
+            "split_salt": dataset.dataset._split_salt,  # pyre-ignore[16]
+            "eval_holdout_ts": eval_holdout_ts_resolved,
+            "eval_holdout_num_windows": eval_holdout_num_windows,
+            "batch_size": persistent_dl.batch_size if persistent_dl is not None else None,
+            "world_size": world_size,
+        }
+        # Only validate on an actual resume. On a genuine cold start there is no
+        # prior split to verify and establishing this run's split is always safe;
+        # validating there would wrongly reject every fresh holdout run. A resume
+        # that lacks a contract (legacy/non-streaming checkpoint) is NOT a cold
+        # start and is still validated (and rejected) below.
+        if not resume_cold_start:
+            _validate_split_contract(resume_split_contract, live_split_contract, rank)
+
     # Apply resume hint: advance start_ts past the last completed window, or
     # re-enter the partial window with a per-rank skip on its first iter.
     # Shrink num_train_ts by the same amount so the resumed run finishes at
@@ -1232,10 +1350,13 @@ def streaming_train_eval_loop(
             )
 
     def _window_iter(ts: int, skip_samples: int = 0):
+        # TRAIN-only iterator: both branches exclude held-out eval users via
+        # train_window_indices / set_ts(train_only=True). (Eval uses the fixed
+        # holdout set, never this helper.)
         if persistent_loader:
             assert window_sampler is not None and persistent_dl is not None
             window_sampler.set_window(
-                dataset.dataset.window_indices(ts),  # pyre-ignore [16]
+                dataset.dataset.train_window_indices(ts),  # pyre-ignore [16]
                 skip_samples=skip_samples,
             )
             return iter(persistent_dl)
@@ -1243,7 +1364,9 @@ def _window_iter(ts: int, skip_samples: int = 0):
             raise NotImplementedError(
                 "skip_samples>0 requires persistent_loader=True"
             )
-        return iter(make_streaming_dataloader(dataset=dataset, ts=ts))
+        return iter(
+            make_streaming_dataloader(dataset=dataset, ts=ts, train_only=True)
+        )
     # Windows are [start_ts, start_ts + num_train_ts); each step trains window T
     # then evals window T+1, so the last eval window is start_ts + num_train_ts,
     # which must be < num_windows(). Anchors require >= history_length prior
@@ -1292,6 +1415,7 @@ def _save_mid_window(train_ts: int, batch_idx_in_window: int) -> None:
             train_ts=train_ts,
             batch_idx_in_window=batch_idx_in_window,
             device=device,
+            split_contract=live_split_contract,
         )
 
     def _run_train_window(
@@ -1418,7 +1542,20 @@ def _run_train_window(
             )
 
     def _run_eval_window(eval_data_iterator, label: Optional[str] = None) -> None:
+        # DO NOT add a checkpoint trigger anywhere inside this function. The eval
+        # data iterator's position is not serializable, so a checkpoint taken
+        # mid-eval could not be resumed deterministically. `_maybe_checkpoint`
+        # only fires after a completed eval window or mid-train-window, so any
+        # restored state always sits on a completed-eval boundary -- which is
+        # also why the eval reset below is safe across resume.
         model.eval()
+        # Reset eval metrics so each pass reports a clean number over the FIXED
+        # holdout set. Without this, lifetime/window eval metrics would keep
+        # accumulating across eval steps (the old behavior, made worse now that
+        # every step sees the identical set), making the eval-AUC trajectory
+        # uninterpretable. With the reset, each eval point == AUC over the whole
+        # fixed holdout at that train step -> directly comparable across steps.
+        metric_logger.reset(mode="eval")
         eval_batch_idx = 0
         first_wait: Optional[float] = None
         _t_enter = time.perf_counter() if (label and rank == 0) else None
@@ -1486,6 +1623,7 @@ def _maybe_checkpoint(train_ts: int) -> None:
                 train_ts=train_ts,
                 batch_idx_in_window=WINDOW_COMPLETE,
                 device=device,
+                split_contract=live_split_contract,
             )
             last_ckpt_time[0] = time.time()
 
@@ -1508,6 +1646,26 @@ def _should_eval(i: int) -> bool:
             return True
         return i % eval_every_n_windows == 0 or i == n_train - 1
 
+    # Fixed eval set: held-out users' anchors over the resolved holdout window
+    # range, computed ONCE and reused at every eval step. Same anchors every
+    # step -> stable, comparable eval-AUC curve, and bounded eval time
+    # (~(1 - train_split_percentage) of a window). Cached inside the dataset so
+    # re-deriving it (e.g. on resume) returns the identical set. None for
+    # datasets without holdout support (synthetic) -> legacy per-window eval.
+    eval_global_indices: Optional["np.ndarray"] = None
+    if supports_holdout:
+        eval_global_indices = dataset.dataset.eval_holdout_indices(  # pyre-ignore [16]
+            eval_holdout_ts_resolved, eval_holdout_num_windows
+        )
+        if rank == 0:
+            logger.info(
+                "Fixed eval holdout: ts=[%d, %d) -> %d anchors (train_split_percentage=%s)",
+                eval_holdout_ts_resolved,
+                eval_holdout_ts_resolved + eval_holdout_num_windows,
+                len(eval_global_indices),
+                dataset.dataset._train_split_percentage,  # pyre-ignore[16]
+            )
+
     if persistent_loader and double_buffer:
         # Double-buffered: next window prepared in the background during the
         # current window's compute. Eval (if enabled) uses its own pre-forked
@@ -1540,11 +1698,10 @@ def _should_eval(i: int) -> bool:
             # thread holding an allocator/GIL-released lock. (Deferring this
             # first fork into the loop — as a sparse-eval cadence naively might —
             # hangs the run.) _should_eval(0) is always True when eval is enabled
-            # (0 % K == 0), so the first eval window is always train_ts_list[0]+1;
-            # arm it now so it prefetches during the i=0 train window.
-            eval_sampler.set_window(
-                dataset.dataset.window_indices(train_ts_list[0] + 1)  # pyre-ignore [16]
-            )
+            # (0 % K == 0). The eval set is the FIXED holdout (same every step),
+            # so we install it on the sampler ONCE here; later evals just call
+            # iter() again to replay the identical set (no set_window churn).
+            eval_sampler.set_window(eval_global_indices)
             eval_iter = iter(eval_dl)
         for i, (train_ts, train_data_iterator) in enumerate(
             # Only the FIRST window after a mid-window resume needs the skip
@@ -1571,20 +1728,15 @@ def _should_eval(i: int) -> bool:
             if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
                 assert eval_sampler is not None and eval_dl is not None
-                _run_eval_window(eval_iter, label=f"eval_ts={train_ts + 1}")
-                # Re-arm the (already-forked) eval pool for the NEXT window that
-                # will eval (i+1 in dense mode, i+K in sparse mode), so it warms
-                # up during the upcoming train window(s). iter() reuses the
+                _run_eval_window(eval_iter, label=f"eval_holdout@train_ts={train_ts}")
+                # Re-arm the (already-forked) eval pool for the NEXT eval. The
+                # holdout set is fixed, so the sampler window is unchanged; we
+                # only need a fresh iter() to replay it. iter() reuses the
                 # persistent workers — no fork, safe alongside the bg thread.
                 next_eval_i = next(
                     (j for j in range(i + 1, n_train) if _should_eval(j)), None
                 )
                 if next_eval_i is not None:
-                    eval_sampler.set_window(
-                        dataset.dataset.window_indices(  # pyre-ignore [16]
-                            train_ts_list[next_eval_i] + 1
-                        )
-                    )
                     eval_iter = iter(eval_dl)
             _maybe_checkpoint(train_ts)
     else:
@@ -1603,49 +1755,37 @@ def _should_eval(i: int) -> bool:
             )
             if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
-                _run_eval_window(_window_iter(train_ts + 1))
+                if eval_global_indices is not None:
+                    _run_eval_window(
+                        iter(
+                            make_streaming_dataloader(
+                                dataset=dataset, indices=eval_global_indices
+                            )
+                        ),
+                        label=f"eval_holdout@train_ts={train_ts}",
+                    )
+                else:
+                    # Legacy per-window eval (datasets without user holdout).
+                    _run_eval_window(
+                        iter(make_streaming_dataloader(dataset=dataset, ts=train_ts + 1))
+                    )
             _maybe_checkpoint(train_ts)
 
-    eval_ts = num_train_ts
-    dataset.dataset.is_eval = True
-    model.eval()
-    eval_batch_idx: int = 0
-    eval_dataloader = make_streaming_dataloader(dataset=dataset, ts=eval_ts)
-    eval_data_iterator = iter(eval_dataloader)
-    with torch.no_grad():
-        while True:
-            try:
-                sample = next(eval_data_iterator)
-            except StopIteration:
-                break
-            sample.to(device)
-            (
-                _,
-                _,
-                _,
-                mt_target_preds,
-                mt_target_labels,
-                mt_target_weights,
-            ) = model.forward(
-                sample.uih_features_kjt,
-                sample.candidates_features_kjt,
-            )
-            metric_logger.update(
-                mode="eval",
-                predictions=mt_target_preds,
-                labels=mt_target_labels,
-                weights=mt_target_weights,
-                num_candidates=sample.candidates_features_kjt.lengths().view(
-                    len(sample.candidates_features_kjt.keys()), -1
-                )[0],
-            )
-            eval_batch_idx += 1
-            if output_trace:
-                assert profiler is not None
-                profiler.step()
-            if eval_batch_idx % metric_log_frequency == 0:
-                metric_logger.compute_and_log(mode="eval")
-            if num_eval_batches is not None and eval_batch_idx >= num_eval_batches:
-                break
+    # Final eval over the SAME fixed user-holdout set (consistent with the
+    # per-window evals above). Reuses _run_eval_window so metrics are reset and
+    # reported the same way. Falls back to the legacy final-window eval for
+    # datasets without user holdout.
+    dataset.dataset.is_eval = True  # pyre-ignore [16]
+    if eval_global_indices is not None:
+        _run_eval_window(
+            iter(make_streaming_dataloader(dataset=dataset, indices=eval_global_indices)),
+            label="eval_holdout@final",
+        )
+    else:
+        _run_eval_window(
+            iter(make_streaming_dataloader(dataset=dataset, ts=num_train_ts)),
+            label="eval@final",
+        )
+    if rank == 0:
         for k, v in metric_logger.compute(mode="eval").items():
             print(f"{k}: {v}")
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index be94af667..a9c060324 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -22,7 +22,7 @@
 import os
 import time
 from pathlib import Path
-from typing import Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 import gin
 import tensorboard  # @manual=//tensorboard:lib  # noqa: F401 - required implicit dep when using torch.utils.tensorboard
@@ -57,7 +57,31 @@
 
 
 class LifetimeAUCMetricComputation(AUCMetricComputation):
-    """AUC over all predictions seen so far (uncapped buffer); emits with the LIFETIME prefix."""
+    """AUC over a 10M-sample (~5.5 eval-window) trailing buffer; emits with the
+    LIFETIME prefix.
+
+    NOTE: despite the name, this is NOT an uncapped since-step-0 AUC. The parent
+    ``AUCMetricComputation`` evicts the prediction/label/weight buffers down to
+    ``window_size`` in ``update()``; we instantiate it with
+    ``window_size=10_000_000``, so "lifetime" is a ~10M-sample trailing window.
+    Raise ``window_size`` (accepting unbounded buffer growth) if true cumulative
+    AUC is ever required.
+
+    Checkpoint correctness: torchrec registers the PREDICTIONS/LABELS/WEIGHTS
+    buffers with ``persistent=False`` (so the default ``state_dict()`` drops
+    them) and tracks a separate ``self._num_samples`` counter. Without the
+    overrides below, every checkpoint resume would silently restart this metric
+    from an empty buffer. We therefore serialize the buffers AND ``_num_samples``
+    explicitly; restoring ``_num_samples`` is mandatory, since leaving it at 0
+    makes the next ``update()`` take the init-sentinel branch and desync the
+    windowed eviction. These buffers are per-rank-local (cross-rank gather only
+    happens transiently at compute time), so the checkpoint layer MUST persist
+    and restore them per-rank — see ``checkpoint.py``.
+    """
+
+    # Prefix used for the explicitly-serialized non-persistent buffers so the
+    # keys can't collide with any persistent state the parent might register.
+    _LIFETIME_KEY_PREFIX: str = "_lifetime_"
 
     def _compute(self) -> List[MetricComputationReport]:
         from typing import cast as _cast
@@ -76,6 +100,173 @@ def _compute(self) -> List[MetricComputationReport]:
             )
         ]
 
+    def lifetime_sample_count(self) -> int:
+        """Current number of buffered samples (greppable for sanity logs)."""
+        return int(getattr(self, "_num_samples", 0))
+
+    def state_dict(
+        self,
+        destination: Optional[Dict[str, Any]] = None,
+        prefix: str = "",
+        keep_vars: bool = False,
+    ) -> Dict[str, Any]:
+        from torchrec.metrics.auc import LABELS, PREDICTIONS, WEIGHTS
+
+        destination = super().state_dict(
+            destination=destination, prefix=prefix, keep_vars=keep_vars
+        )
+        # The parent registers these buffers persistent=False, so they are absent
+        # from `destination`. Concatenate each buffer list to one (n_tasks, N)
+        # tensor and serialize it alongside the sample counter.
+        for attr in (PREDICTIONS, LABELS, WEIGHTS):
+            buf = getattr(self, attr)
+            if isinstance(buf, (list, tuple)) and len(buf) > 0:
+                flat = torch.cat([t for t in buf], dim=-1)
+            elif isinstance(buf, torch.Tensor):
+                flat = buf
+            else:
+                flat = torch.empty(0)
+            destination[prefix + self._LIFETIME_KEY_PREFIX + attr] = (
+                flat.detach().cpu().clone()
+            )
+        destination[prefix + self._LIFETIME_KEY_PREFIX + "num_samples"] = (
+            torch.tensor(int(getattr(self, "_num_samples", 0)), dtype=torch.long)
+        )
+        return destination
+
+    def load_state_dict(
+        self,
+        state_dict: Dict[str, Any],
+        strict: bool = True,
+    ) -> Any:
+        from torchrec.metrics.auc import LABELS, PREDICTIONS, WEIGHTS
+
+        # Copy so we can strip our custom keys before delegating to the parent
+        # (whose strict load would otherwise reject them as unexpected).
+        remaining = dict(state_dict)
+        saved_bufs: Dict[str, torch.Tensor] = {}
+        for attr in (PREDICTIONS, LABELS, WEIGHTS):
+            key = self._LIFETIME_KEY_PREFIX + attr
+            if key in remaining:
+                saved_bufs[attr] = remaining.pop(key)
+        num_key = self._LIFETIME_KEY_PREFIX + "num_samples"
+        saved_num = remaining.pop(num_key, None)
+
+        result = super().load_state_dict(remaining, strict=strict)
+
+        if saved_bufs:
+            # Device of the live (init-sentinel) buffers; keep restored buffers
+            # co-located so subsequent update()/compute() stay on-device.
+            existing = getattr(self, PREDICTIONS)
+            dev = (
+                existing[0].device
+                if isinstance(existing, (list, tuple)) and len(existing) > 0
+                else torch.device("cpu")
+            )
+            for attr, val in saved_bufs.items():
+                setattr(self, attr, [val.to(dev)])
+            if saved_num is not None:
+                self._num_samples = int(saved_num.item())
+        return result
+
+
+# Sentinel "window size" used for the FRESH eval metrics so torchrec's windowed
+# eviction never fires within a single eval pass (the per-pass reset bounds the
+# buffer to exactly one full holdout pass). 1<<60 is far above any realistic
+# per-rank sample count and avoids sys.maxsize overflow inside torchrec math.
+UNBOUNDED_WINDOW: int = 1 << 60
+
+
+class BinnedCumulativeAUC(RecMetricComputation):
+    """Cumulative AUC via a fixed-resolution score histogram (LIFETIME prefix).
+
+    Global AUC is a rank statistic, so it has no fixed-size additive sufficient
+    statistic the way NE/Accuracy do - exact cumulative AUC otherwise needs every
+    (score, label) pair retained and sorted (the buffer-based ``AUCMetricComputation``
+    / ``LifetimeAUCMetricComputation``). Instead we keep two weighted histograms of
+    positive/negative mass per score bin. This gives an AUC exact up to bin width
+    with O(num_bins) memory that does NOT grow with sample count, and - because
+    histograms are additive - cross-rank sync is a cheap all-reduce (dist_reduce_fx
+    "sum") rather than all-gathering millions of predictions. The state is truly
+    cumulative across all eval passes (never evicted, never reset on eval).
+
+    Predictions MUST be probabilities in [0, 1] (the same tensor feeds NE, which
+    requires probabilities; the model applies sigmoid in multitask_module). Values
+    are clamped into [0, 1] defensively.
+    """
+
+    def __init__(self, *args, num_bins: int = 100_000, **kwargs) -> None:
+        # window_size is irrelevant here (no windowed state); pass through.
+        super().__init__(*args, **kwargs)
+        self._num_bins: int = int(num_bins)
+        self._add_state(
+            "pos_hist",
+            torch.zeros((self._n_tasks, self._num_bins), dtype=torch.float64),
+            add_window_state=False,
+            dist_reduce_fx="sum",
+            persistent=True,
+        )
+        self._add_state(
+            "neg_hist",
+            torch.zeros((self._n_tasks, self._num_bins), dtype=torch.float64),
+            add_window_state=False,
+            dist_reduce_fx="sum",
+            persistent=True,
+        )
+
+    def cumulative_sample_count(self) -> int:
+        """Total weighted samples in the histograms (greppable for sanity logs)."""
+        return int((self.pos_hist.sum() + self.neg_hist.sum()).item())
+
+    def update(
+        self,
+        *,
+        predictions: Optional[torch.Tensor],
+        labels: torch.Tensor,
+        weights: Optional[torch.Tensor],
+        **kwargs: Dict[str, Any],
+    ) -> None:
+        if predictions is None or weights is None:
+            raise ValueError(
+                "BinnedCumulativeAUC.update requires predictions and weights"
+            )
+        preds = predictions.float().clamp_(0.0, 1.0)  # (n_tasks, n_examples)
+        labels = labels.float()
+        weights = weights.float()
+        # Bin index per example; the top edge (p==1.0) folds into the last bin.
+        idx = (preds * self._num_bins).long().clamp_(0, self._num_bins - 1)
+        pos_w = (weights * labels).to(self.pos_hist.dtype)
+        neg_w = (weights * (1.0 - labels)).to(self.neg_hist.dtype)
+        self.pos_hist.scatter_add_(1, idx, pos_w)
+        self.neg_hist.scatter_add_(1, idx, neg_w)
+
+    def _compute(self) -> List[MetricComputationReport]:
+        # By compute() time torchmetrics has all-reduced (summed) the histograms
+        # across ranks, so these are the global per-bin masses.
+        pos = self.pos_hist  # (n_tasks, num_bins)
+        neg = self.neg_hist
+        total_pos = pos.sum(dim=1)
+        total_neg = neg.sum(dim=1)
+        # Lower bin index == lower score. A positive in bin b outranks every
+        # negative in bins < b (exclusive prefix sum), and ties in bin b score
+        # 0.5. AUC = sum_b pos_b * (neg_below_b + 0.5*neg_b) / (P * N).
+        neg_below = torch.cumsum(neg, dim=1) - neg
+        numerator = (pos * (neg_below + 0.5 * neg)).sum(dim=1)
+        denom = total_pos * total_neg
+        auc = torch.where(
+            denom > 0,
+            numerator / denom,
+            torch.full_like(numerator, 0.5),
+        ).to(torch.float32)
+        return [
+            MetricComputationReport(
+                name=MetricName.AUC,
+                metric_prefix=MetricPrefix.LIFETIME,
+                value=auc,
+            )
+        ]
+
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("utils")
 
@@ -696,6 +887,11 @@ def __init__(
         num_flops_per_sample: float = 0.0,
         gpu_peak_flops: float = 0.0,
         model: Optional[torch.nn.Module] = None,
+        eval_cumulative: bool = False,
+        cumulative_auc_bins: int = 100_000,
+        train_lifetime_auc_mode: str = "binned",
+        eval_lifetime_auc_mode: str = "binned",
+        lifetime_auc_window: int = 10_000_000,
     ) -> None:
         # tflops/mfu reporting state (optional — when both num_flops_per_sample
         # and gpu_peak_flops are set, the train perf line gains tflops_algo/gpu,
@@ -725,75 +921,95 @@ def __init__(
         ]
         self.task_names: List[str] = all_classification_tasks + all_regression_tasks
 
-        self.class_metrics: Dict[str, List[RecMetricComputation]] = {
-            "train": [],
-            "eval": [],
-        }
+        # Eval metric semantics:
+        #   eval_cumulative=False (default, legacy / static / non-streaming eval):
+        #     a single eval set with the configured window_size, including a
+        #     lifetime AUC. Unchanged behavior.
+        #   eval_cumulative=True (streaming fixed-holdout eval): a FRESH eval set
+        #     (window_size=UNBOUNDED, reset each pass -> per-pass full-holdout
+        #     "window_*") PLUS a CUMULATIVE set ("eval_cum", never reset ->
+        #     "lifetime_*"). NE/Accuracy/GAUC are cumulative for free via their
+        #     persistent scalar sums; AUC cumulative uses the selected backend.
+        #
+        # Lifetime-AUC backend is configurable independently for train and eval:
+        #   "binned" (default): BinnedCumulativeAUC - exact-cumulative AUC via an
+        #     O(num_bins) score histogram (additive all-reduce, no unbounded
+        #     buffer, memory independent of #samples/#windows).
+        #   "capped": LifetimeAUCMetricComputation - AUC over a trailing buffer of
+        #     `lifetime_auc_window` samples/rank (the legacy approach; per-rank
+        #     buffer all-gathered at compute).
+        self._eval_cumulative: bool = eval_cumulative
+        self._cumulative_auc_bins: int = int(cumulative_auc_bins)
+        self._train_lifetime_auc_mode: str = str(train_lifetime_auc_mode)
+        self._eval_lifetime_auc_mode: str = str(eval_lifetime_auc_mode)
+        self._lifetime_auc_window: int = int(lifetime_auc_window)
+        n_cls = len(all_classification_tasks)
+        n_reg = len(all_regression_tasks)
+
+        def _make_lifetime_auc(mode: str) -> RecMetricComputation:
+            if mode == "binned":
+                # window_size=0: no torchrec windowed state; histograms only.
+                return BinnedCumulativeAUC(
+                    my_rank=rank, batch_size=batch_size, n_tasks=n_cls,
+                    window_size=0, num_bins=self._cumulative_auc_bins,
+                ).to(device)
+            if mode == "capped":
+                return LifetimeAUCMetricComputation(
+                    my_rank=rank, batch_size=batch_size, n_tasks=n_cls,
+                    window_size=self._lifetime_auc_window,
+                ).to(device)
+            raise ValueError(
+                f"lifetime_auc_mode must be 'binned' or 'capped', got {mode!r}"
+            )
+
+        def _make_class(ws: int, lifetime_mode: Optional[str]) -> List[RecMetricComputation]:
+            mets: List[RecMetricComputation] = [
+                NEMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=ws).to(device),
+                AccuracyMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=ws).to(device),
+                GAUCMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=ws).to(device),
+                AUCMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=ws).to(device),
+            ]
+            if lifetime_mode is not None:
+                mets.append(_make_lifetime_auc(lifetime_mode))
+            return mets
+
+        def _make_class_cumulative() -> List[RecMetricComputation]:
+            # NE/Accuracy/GAUC: cumulative via persistent lifetime sums (window
+            # value ignored at compute). AUC: selected lifetime backend.
+            return [
+                NEMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=window_size).to(device),
+                AccuracyMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=window_size).to(device),
+                GAUCMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_cls, window_size=window_size).to(device),
+                _make_lifetime_auc(self._eval_lifetime_auc_mode),
+            ]
+
+        def _make_reg(ws: int) -> List[RecMetricComputation]:
+            return [
+                MSEMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_reg, window_size=ws).to(device),
+                MAEMetricComputation(my_rank=rank, batch_size=batch_size, n_tasks=n_reg, window_size=ws).to(device),
+            ]
+
+        self.class_metrics: Dict[str, List[RecMetricComputation]] = {"train": [], "eval": []}
+        self.regression_metrics: Dict[str, List[RecMetricComputation]] = {"train": [], "eval": []}
+        if eval_cumulative:
+            self.class_metrics["eval_cum"] = []
+            self.regression_metrics["eval_cum"] = []
+
         if all_classification_tasks:
-            for mode in ["train", "eval"]:
-                self.class_metrics[mode].append(
-                    NEMetricComputation(
-                        my_rank=rank,
-                        batch_size=batch_size,
-                        n_tasks=len(all_classification_tasks),
-                        window_size=window_size,
-                    ).to(device)
-                )
-                self.class_metrics[mode].append(
-                    AccuracyMetricComputation(
-                        my_rank=rank,
-                        batch_size=batch_size,
-                        n_tasks=len(all_classification_tasks),
-                        window_size=window_size,
-                    ).to(device)
-                )
-                self.class_metrics[mode].append(
-                    GAUCMetricComputation(
-                        my_rank=rank,
-                        batch_size=batch_size,
-                        n_tasks=len(all_classification_tasks),
-                        window_size=window_size,
-                    ).to(device)
-                )
-                self.class_metrics[mode].append(
-                    AUCMetricComputation(
-                        my_rank=rank,
-                        batch_size=batch_size,
-                        n_tasks=len(all_classification_tasks),
-                        window_size=window_size,
-                    ).to(device)
-                )
-                self.class_metrics[mode].append(
-                    LifetimeAUCMetricComputation(
-                        my_rank=rank,
-                        batch_size=batch_size,
-                        n_tasks=len(all_classification_tasks),
-                        window_size=10_000_000,
-                    ).to(device)
-                )
+            self.class_metrics["train"] = _make_class(window_size, lifetime_mode=self._train_lifetime_auc_mode)
+            if eval_cumulative:
+                self.class_metrics["eval"] = _make_class(UNBOUNDED_WINDOW, lifetime_mode=None)
+                self.class_metrics["eval_cum"] = _make_class_cumulative()
+            else:
+                self.class_metrics["eval"] = _make_class(window_size, lifetime_mode=self._eval_lifetime_auc_mode)
 
-        self.regression_metrics: Dict[str, List[RecMetricComputation]] = {
-            "train": [],
-            "eval": [],
-        }
         if all_regression_tasks:
-            for mode in ["train", "eval"]:
-                self.regression_metrics[mode].append(
-                    MSEMetricComputation(
-                        my_rank=rank,
-                        batch_size=batch_size,
-                        n_tasks=len(all_regression_tasks),
-                        window_size=window_size,
-                    ).to(device)
-                )
-                self.regression_metrics[mode].append(
-                    MAEMetricComputation(
-                        my_rank=rank,
-                        batch_size=batch_size,
-                        n_tasks=len(all_regression_tasks),
-                        window_size=window_size,
-                    ).to(device)
-                )
+            self.regression_metrics["train"] = _make_reg(window_size)
+            if eval_cumulative:
+                self.regression_metrics["eval"] = _make_reg(UNBOUNDED_WINDOW)
+                self.regression_metrics["eval_cum"] = _make_reg(window_size)
+            else:
+                self.regression_metrics["eval"] = _make_reg(window_size)
 
         self.global_step: Dict[str, int] = {"train": 0, "eval": 0}
         self.tb_logger: Optional[SummaryWriter] = None
@@ -822,10 +1038,15 @@ def all_metrics(self) -> Dict[str, List[RecMetricComputation]]:
         Returns:
             Dictionary mapping mode ('train'/'eval') to list of metric computations.
         """
-        return {
+        out = {
             "train": self.class_metrics["train"] + self.regression_metrics["train"],
             "eval": self.class_metrics["eval"] + self.regression_metrics["eval"],
         }
+        if "eval_cum" in self.class_metrics or "eval_cum" in self.regression_metrics:
+            out["eval_cum"] = self.class_metrics.get(
+                "eval_cum", []
+            ) + self.regression_metrics.get("eval_cum", [])
+        return out
 
     def update(
         self,
@@ -845,7 +1066,12 @@ def update(
             num_candidates: Number of candidates per sample (for GAUC).
             mode: Either 'train' or 'eval'.
         """
-        for metric in self.all_metrics[mode]:
+        # On eval, update BOTH the fresh set and the never-reset cumulative set
+        # (if enabled) from the same batch.
+        update_targets = list(self.all_metrics[mode])
+        if mode == "eval" and "eval_cum" in self.all_metrics:
+            update_targets = update_targets + self.all_metrics["eval_cum"]
+        for metric in update_targets:
             if isinstance(metric, GAUCMetricComputation):
                 metric.update(
                     predictions=predictions,
@@ -880,13 +1106,41 @@ def compute(self, mode: str = "train") -> Dict[str, float]:
         """
         all_computed_metrics = {}
 
-        for metric in self.all_metrics[mode]:
-            computed_metrics = metric.compute()
-            for computed in computed_metrics:
-                all_values = computed.value.cpu()
-                for i, task_name in enumerate(self.task_names):
-                    key = f"metric/{str(computed.metric_prefix) + str(computed.name)}/{task_name}"
-                    all_computed_metrics[key] = all_values[i]
+        if mode == "eval" and "eval_cum" in self.all_metrics:
+            # Dual-set eval: `window_*` (fresh per-pass) from the reset-each-pass
+            # set; `lifetime_*` (cumulative across passes) from the never-reset
+            # set. Filter each set to the matching prefix, and drop GAUC's
+            # auxiliary `*_num_samples` reports. Key names are unchanged
+            # (`window_auc`, `lifetime_ne`, ...) so dashboards keep working.
+            def _emit(
+                metrics: List[RecMetricComputation], keep_prefix: str
+            ) -> None:
+                for metric in metrics:
+                    for computed in metric.compute():
+                        pfx = str(computed.metric_prefix)
+                        name = str(computed.name)
+                        if pfx != keep_prefix or name.endswith("num_samples"):
+                            continue
+                        all_values = computed.value.cpu()
+                        for i, task_name in enumerate(self.task_names):
+                            if i >= len(all_values):
+                                break
+                            all_computed_metrics[f"metric/{pfx}{name}/{task_name}"] = (
+                                all_values[i]
+                            )
+
+            _emit(self.all_metrics["eval"], "window_")
+            _emit(self.all_metrics["eval_cum"], "lifetime_")
+        else:
+            for metric in self.all_metrics[mode]:
+                computed_metrics = metric.compute()
+                for computed in computed_metrics:
+                    all_values = computed.value.cpu()
+                    for i, task_name in enumerate(self.task_names):
+                        if i >= len(all_values):
+                            break
+                        key = f"metric/{str(computed.metric_prefix) + str(computed.name)}/{task_name}"
+                        all_computed_metrics[key] = all_values[i]
 
         logger.info(
             f"{mode} - Step {self.global_step[mode]} metrics: {all_computed_metrics}"
@@ -1066,6 +1320,21 @@ def env_path(key: str = "", default: str = "") -> str:
     return os.environ.get(key, default) if key else default
 
 
+@gin.configurable
+def env_str(key: str = "", default: str = "") -> str:
+    """Resolve a string from os.environ[key], falling back to `default`.
+
+    Companion to `env_int`/`env_float` for categorical/string overrides (e.g. a
+    metric backend selector). Example gin usage:
+
+        MetricsLogger.train_lifetime_auc_mode = @tlam/env_str()
+        tlam/env_str.key     = "TRAIN_LIFETIME_AUC_MODE"
+        tlam/env_str.default = "binned"
+    """
+    raw = os.environ.get(key) if key else None
+    return raw if raw else default
+
+
 @gin.configurable
 def env_int(key: str = "", default: int = 0) -> int:
     """Resolve an int from os.environ[key], falling back to `default`.
@@ -1155,6 +1424,8 @@ def get_dataset(
     history_length: Optional[int] = None,
     streaming_window_seconds: int = 86400,
     streaming_sort_within_window: bool = False,
+    train_split_percentage: float = 1.0,
+    split_salt: int = 0,
 ):
     """
     Get dataset class and configuration by name.
@@ -1285,6 +1556,11 @@ def get_dataset(
                 # streaming-train-eval; ignored by the default train-eval path).
                 "streaming_window_seconds": streaming_window_seconds,
                 "streaming_sort_within_window": streaming_sort_within_window,
+                # User-level train:eval holdout for the streaming path. 1.0 =
+                # no holdout (legacy). <1.0 holds out (1 - tsp) of users as a
+                # fixed eval set; those users are never trained.
+                "train_split_percentage": train_split_percentage,
+                "split_salt": split_salt,
             },
         )
     if name == "sampled-streaming-100b":
diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index 40dc5fe81..70ec8fc8f 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -114,7 +114,7 @@ NUM_TRAIN_TS=149
 START_TS=150
 EVAL_EVERY=5
 CKPT_TIME_INTERVAL=7200
-KEEP_LAST_N=2
+KEEP_LAST_N=1
 CKPT_PATH=/apps/chcai/ckpts/yambda_5b_e2e
 RUN_NAME=yambda_5b_e2e
 LOG=/apps/chcai/yambda_5b_e2e.log
@@ -122,6 +122,14 @@ MAX_RELAUNCH=50
 NUM_TRAIN_BATCHES=0     # 0 = full window (only capped for validation/tests)
 NUM_EVAL_BATCHES=0      # 0 = full holdout eval (only capped for validation)
 DIE_AT_STEP=-1          # >=0 = test-only failure injection
+# Train:eval split (fraction of USERS trained; 1 - this held out as a FIXED,
+# never-trained eval set). Passed on EVERY relaunch so the split stays an
+# immutable run contract — a changed split would abort on resume (validated in
+# the loop) to prevent skip-offset desync and held-out users leaking into train.
+TRAIN_SPLIT_PERCENTAGE=0.90
+SPLIT_SALT=0
+EVAL_HOLDOUT_TS=-1          # <0 = window just past training (start_ts+num_train_ts)
+EVAL_HOLDOUT_NUM_WINDOWS=1
 IN_WINDOW_FREQ=0        # >0 = also save every N batches within a window
 ATTACH=0                # 1 = (re)attach to an already-running trainer without
                         #     killing it or truncating its log — used to restore
@@ -377,6 +385,10 @@ launch() {
         NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
         NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
         DIE_AT_STEP=$DIE_AT_STEP \
+        TRAIN_SPLIT_PERCENTAGE=$TRAIN_SPLIT_PERCENTAGE \
+        SPLIT_SALT=$SPLIT_SALT \
+        EVAL_HOLDOUT_TS=$EVAL_HOLDOUT_TS \
+        EVAL_HOLDOUT_NUM_WINDOWS=$EVAL_HOLDOUT_NUM_WINDOWS \
         METRIC_LOG_FREQ=50 \
         RUN_NAME=$RUN_NAME \
         TENSORBOARD_LOG_PATH=/apps/chcai/tb/$RUN_NAME/ \

From efc512621e2afecb225d518394b8b8ac3648ad3e Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 8 Jun 2026 22:40:58 -0500
Subject: [PATCH 040/113] dlrmv4: default yambda-5b to the 4k-no-truncation seq
 shape
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Set HISTORY_LENGTH=4086 / MAX_SEQ_LEN=4096 as the gin defaults (3*1362+9=4095
≤ 4096, the no-overfill 4k analog of the prior 2039/2048 shape). Override via
$HISTORY_LENGTH/$MAX_SEQ_LEN; use 2039/2048 to reuse the 2k single-task cache.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 23 ++++++++++---------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 123027fdd..e7eed30cb 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -77,29 +77,30 @@ get_dataset.new_path_prefix = %DATA_PATH
 # across 3 behaviour pools (listen+ / like / skip) at L//3 events each.
 # Per-sample sequence the model sees =
 #     3 × (L // 3) + 8 contextual + 1 candidate
-# Choosing 2039 makes 3 × 679 + 9 = 2046, the largest value that fits
-# get_hstu_configs.max_seq_len = 2048 with no dataset-side truncation.
+# Choosing 4086 makes 3 × 1362 + 9 = 4095, the largest value that fits
+# get_hstu_configs.max_seq_len = 4096 with no dataset-side truncation (the
+# 4k analog of the previous 2039/2048 shape, where 3×679+9=2046 ≤ 2048).
 # Larger L overflows the budget; the dataset truncates UIH events to fit.
 # Note: like events are only 1.9% of the yambda corpus and max user lifetime
 # is ~28k events, so the like pool fills to ~105 events per anchor on
-# average (not 679) — TRITON's jagged attention skips the unfilled slots,
+# average (not 1362) — TRITON's jagged attention skips the unfilled slots,
 # so the under-fill costs sequence budget but not GPU compute.
 # Cache is keyed by L on disk under <processed_dir>/hstu_cache_L<N>/;
-# switching L reuses an existing cache or builds a new one (~5 min). Override
-# via $HISTORY_LENGTH (default 2039 keeps the existing single-task cache hot).
+# switching L reuses an existing cache or builds a new one. Override via
+# $HISTORY_LENGTH (default 4086 = the 4k-no-truncation shape; use 2039 with
+# MAX_SEQ_LEN=2048 to reuse the previous 2k single-task cache).
 get_dataset.history_length = @hl/env_int()
 hl/env_int.key     = "HISTORY_LENGTH"
-hl/env_int.default = 2039
+hl/env_int.default = 4086
 
 # Model-side attention budget. Dataset truncates UIH to fit this value if
 # `history_length + contextual + candidate` would overflow. Override via
-# $MAX_SEQ_LEN (default 2048 preserves the production single-task shape).
-# Pair MAX_SEQ_LEN=4096 with HISTORY_LENGTH=4086 for the 4k-no-truncation
-# analog (3*1362+9=4095 ≤ 4096); pair with HISTORY_LENGTH=4096 to reuse the
-# existing hstu_cache_L4096/ cache with ~8 events of trailing truncation.
+# $MAX_SEQ_LEN (default 4096, the 4k-no-truncation shape paired with
+# HISTORY_LENGTH=4086: 3*1362+9=4095 ≤ 4096). Set MAX_SEQ_LEN=2048 with
+# HISTORY_LENGTH=2039 for the previous 2k production single-task shape.
 get_hstu_configs.max_seq_len = @msl/env_int()
 msl/env_int.key     = "MAX_SEQ_LEN"
-msl/env_int.default = 2048
+msl/env_int.default = 4096
 
 # --- streaming (temporal-order) training -------------------------------------
 # Only consumed under `--mode streaming-train-eval`; the default train-eval

From 03362da5a8fec3fbab5b6dfd37232ff69965a010 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 8 Jun 2026 23:04:57 -0500
Subject: [PATCH 041/113] dlrmv4: reservation-aware sbatch failover in
 streaming supervisor

Replace the salloc-based node failover with an sbatch hold job (--wrap "sleep
infinity", bounded by --time), since interactive salloc on meta64 is capped at
240 min and can't hold a multi-day run. Add --reservation so a replacement node
is re-acquired from the same SLURM reservation, plus --acquire-wait-max to
tolerate brief queueing. Provisioning still runs via srun --overlap afterward.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../scripts/run_streaming_e2e.sh              | 66 +++++++++++++------
 1 file changed, 47 insertions(+), 19 deletions(-)

diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index 70ec8fc8f..59a48b703 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -49,12 +49,16 @@
 #                            never false-trip this.)
 #
 # NODE FAILOVER (case 3, the --allow-failover path)
-#   ensure_ready -> acquire_node: `salloc --no-shell --exclusive` a fresh node on
-#   $PARTITION, wait for RUNNING, then provision_node runs $PROVISION_SCRIPT on
-#   it (docker pull + container create + dep install; ~15 min on a cold node).
-#   Allocations WE create are tracked and `scancel`ed (container removed first)
-#   on success via release_acquired; the user's original --jobid is never
-#   cancelled. Checkpoints on shared NFS make the resume seamless.
+#   ensure_ready -> acquire_node: submit an `sbatch` hold job (`--wrap "sleep
+#   infinity"`, bounded by --time=$ALLOC_TIME) for a fresh exclusive node on
+#   $PARTITION, optionally from --reservation $RESERVATION; wait for RUNNING,
+#   then provision_node runs $PROVISION_SCRIPT on it via `srun --jobid --overlap`
+#   (docker pull + container create + dep install; ~15 min on a cold node).
+#   sbatch (not salloc) because interactive salloc on some partitions (e.g.
+#   meta64) is capped at 240 min, which a multi-day hold would exceed. Jobs WE
+#   create are tracked and `scancel`ed (container removed first) on success via
+#   release_acquired; the user's original --jobid is never cancelled.
+#   Checkpoints on shared NFS make the resume seamless.
 #
 # CHECKPOINTS / DISK
 #   The trainer saves atomically (write to <ts>.tmp, fsync, rename to <ts>) and
@@ -67,7 +71,8 @@
 #   ckpt:        --ckpt-path --keep-last-n --ckpt-time-interval --in-window-freq
 #   logging:     --run-name --log
 #   resilience:  --max-relaunch --min-free-gib --stall-s
-#   failover:    --partition --alloc-time --allow-failover --provision-script
+#   failover:    --partition --reservation --alloc-time --allow-failover
+#                --provision-script --acquire-wait-max
 #   validation:  --num-train-batches --num-eval-batches  (>0 caps batches/window
 #                for fast tests; 0 = full window / full-holdout eval)
 #   test-only:   --die-at-step  (>=0 injects a crash at that global step)
@@ -85,11 +90,15 @@
 #
 # EXAMPLE
 #   nohup bash scripts/run_streaming_e2e.sh \
+#       --jobid 12074 \
 #       --ckpt-path /apps/chcai/ckpts/yambda_5b_e2e \
 #       --run-name yambda_5b_e2e --log /apps/chcai/yambda_5b_e2e.log \
 #       --start-ts 150 --num-train-ts 149 --eval-every 10 \
-#       --ckpt-time-interval 7200 --keep-last-n 2 --max-relaunch 50 \
+#       --ckpt-time-interval 3600 --keep-last-n 1 --max-relaunch 100 \
+#       --reservation NAN_issue_debug \
 #       > /apps/chcai/yambda_5b_e2e.supervisor.console.log 2>&1 &
+#   (--reservation makes node-death failover re-acquire from that reservation;
+#    omit it to fall back to the open $PARTITION pool.)
 # =============================================================================
 
 set -uo pipefail
@@ -143,9 +152,15 @@ CTRL_WAIT_MAX=3600      # max seconds to wait for an unreachable SLURM controlle
 # the container on it, and resume — checkpoints + code live on shared NFS
 # (/apps/chcai, /home/chcai), so any node in the partition can continue.
 PARTITION=meta64
-ALLOC_TIME=7-00:00:00                 # SLURM --time for a failover allocation
+RESERVATION=""                        # if set, failover acquires from this SLURM
+                                      # reservation (e.g. NAN_issue_debug) so a
+                                      # replacement node comes from the same pool.
+ALLOC_TIME=7-00:00:00                 # SLURM --time for a failover hold job
 ALLOW_FAILOVER=1                      # 0 = never acquire a new node
 PROVISION_SCRIPT=/home/chcai/_provision_yambda_primus.sh
+ACQUIRE_WAIT_MAX=1800                 # max seconds to wait for a failover sbatch
+                                      # hold job to reach RUNNING (tolerates brief
+                                      # queueing before the node is granted).
 
 # Disk guard: require at least this many GiB free on the ckpt volume before a
 # (re)launch. One checkpoint is ~560 GB. A save writes a fresh .tmp BEFORE the
@@ -182,9 +197,11 @@ while [[ $# -gt 0 ]]; do
         --min-free-gib) MIN_FREE_GIB="$2"; shift 2;;
         --stall-s) STALL_S="$2"; shift 2;;
         --partition) PARTITION="$2"; shift 2;;
+        --reservation) RESERVATION="$2"; shift 2;;
         --alloc-time) ALLOC_TIME="$2"; shift 2;;
         --allow-failover) ALLOW_FAILOVER="$2"; shift 2;;
         --provision-script) PROVISION_SCRIPT="$2"; shift 2;;
+        --acquire-wait-max) ACQUIRE_WAIT_MAX="$2"; shift 2;;
         *) echo "Unknown arg: $1"; exit 1;;
     esac
 done
@@ -280,27 +297,37 @@ provision_node() {
 }
 
 # Acquire a fresh exclusive node on $PARTITION; sets global JOBID on success.
+# Uses `sbatch` (not `salloc`): interactive salloc on some partitions (meta64)
+# is capped at 240 min, which an $ALLOC_TIME multi-day hold exceeds. The batch
+# job merely pins the node (`sleep infinity`, bounded by --time); the container
+# is provisioned afterward by provision_node via `srun --jobid --overlap`.
+# Honors --reservation so failover re-acquires from the SAME reservation pool.
 acquire_node() {
     if [[ "$ALLOW_FAILOVER" != "1" ]]; then
         sup "failover disabled (--allow-failover 0); cannot acquire a new node"; return 1
     fi
-    sup "requesting a fresh node on partition=$PARTITION (exclusive, time=$ALLOC_TIME)"
-    local out jid
-    out=$(salloc --no-shell --partition="$PARTITION" --nodes=1 --exclusive \
-                 --time="$ALLOC_TIME" --job-name=e2e_failover 2>&1)
-    jid=$(echo "$out" | grep -oiE "Granted job allocation [0-9]+" | grep -oE "[0-9]+" | head -1)
-    if [[ -z "$jid" ]]; then
-        sup "FATAL: salloc did not grant a node: $out"; return 1
+    local resv_arg=""
+    [[ -n "$RESERVATION" ]] && resv_arg="--reservation=$RESERVATION"
+    sup "requesting a fresh node via sbatch (partition=$PARTITION${RESERVATION:+ reservation=$RESERVATION}, exclusive, time=$ALLOC_TIME)"
+    local jid
+    jid=$(sbatch --parsable --partition="$PARTITION" $resv_arg --nodes=1 --exclusive \
+                 --time="$ALLOC_TIME" --job-name=e2e_failover \
+                 --output="${LOG%.log}.failover_hold.%j.log" \
+                 --wrap="echo \"[failover-hold] node=\$(hostname) jobid=\$SLURM_JOB_ID start=\$(date -Is)\"; sleep infinity" 2>&1)
+    # --parsable => "<jobid>" or "<jobid>;<cluster>"; strip whitespace + cluster.
+    jid=$(echo "$jid" | tr -d ' ' | cut -d';' -f1)
+    if ! [[ "$jid" =~ ^[0-9]+$ ]]; then
+        sup "FATAL: sbatch did not return a jobid: $jid"; return 1
     fi
     ACQUIRED_JOBIDS+=("$jid")
-    sup "granted new allocation jobid=$jid; waiting for RUNNING"
+    sup "submitted failover hold job jobid=$jid; waiting for RUNNING (max ${ACQUIRE_WAIT_MAX}s)"
     local waited=0
-    while (( waited < 600 )); do
+    while (( waited < ACQUIRE_WAIT_MAX )); do
         [[ "$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)" == "RUNNING" ]] && break
         sleep 10; waited=$((waited + 10))
     done
     if [[ "$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)" != "RUNNING" ]]; then
-        sup "FATAL: new allocation $jid never reached RUNNING (waited ${waited}s)"; return 1
+        sup "FATAL: failover hold job $jid never reached RUNNING (waited ${waited}s)"; return 1
     fi
     JOBID="$jid"
     sup "new node ready: jobid=$JOBID node=$(squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)"
@@ -410,6 +437,7 @@ sup "jobid=$JOBID container=$CONTAINER repo=$REPO"
 sup "start_ts=$START_TS num_train_ts=$NUM_TRAIN_TS eval_every=$EVAL_EVERY"
 sup "ckpt_path=$CKPT_PATH keep_last_n=$KEEP_LAST_N ckpt_time_interval=${CKPT_TIME_INTERVAL}s in_window_freq=$IN_WINDOW_FREQ"
 sup "log=$LOG num_train_batches=$NUM_TRAIN_BATCHES die_at_step=$DIE_AT_STEP max_relaunch=$MAX_RELAUNCH"
+sup "failover: allow=$ALLOW_FAILOVER partition=$PARTITION reservation=${RESERVATION:-<none>} alloc_time=$ALLOC_TIME"
 
 cexec "mkdir -p '$CKPT_PATH' '/apps/chcai/tb/$RUN_NAME'"
 # Initialize this run's metrics log ONCE. launch_smoke_8gpu.sh appends (tee -a),

From 888f701926b84ddaba2877276dc82c93e8c2c671 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 8 Jun 2026 23:11:18 -0500
Subject: [PATCH 042/113] dlrmv4: two-tier reservation-then-open-pool failover
 in streaming supervisor

acquire_node now prefers the configured --reservation (tier 1, short
RESV_WAIT_MAX wait since a free reservation node starts ~immediately),
then falls back to the open partition pool (tier 2, ACQUIRE_WAIT_MAX).
The pending reservation hold job is scancel'd before the fallback resubmit
so we never end up holding two nodes. Factors the sbatch submit and
RUNNING-wait into _submit_hold_job/_wait_running helpers; adds the
--resv-wait-max knob.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../scripts/run_streaming_e2e.sh              | 106 +++++++++++++-----
 1 file changed, 76 insertions(+), 30 deletions(-)

diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index 59a48b703..e8febc62c 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -158,9 +158,15 @@ RESERVATION=""                        # if set, failover acquires from this SLUR
 ALLOC_TIME=7-00:00:00                 # SLURM --time for a failover hold job
 ALLOW_FAILOVER=1                      # 0 = never acquire a new node
 PROVISION_SCRIPT=/home/chcai/_provision_yambda_primus.sh
-ACQUIRE_WAIT_MAX=1800                 # max seconds to wait for a failover sbatch
-                                      # hold job to reach RUNNING (tolerates brief
-                                      # queueing before the node is granted).
+ACQUIRE_WAIT_MAX=1800                 # max seconds to wait for the OPEN-POOL
+                                      # (tier-2) failover hold job to reach
+                                      # RUNNING (tolerates brief queueing).
+RESV_WAIT_MAX=300                     # max seconds to wait for a RESERVATION
+                                      # (tier-1) node before giving up on it and
+                                      # falling back to the open $PARTITION pool.
+                                      # Short, since a free reservation node
+                                      # starts ~immediately; a longer wait just
+                                      # means the reservation is currently full.
 
 # Disk guard: require at least this many GiB free on the ckpt volume before a
 # (re)launch. One checkpoint is ~560 GB. A save writes a fresh .tmp BEFORE the
@@ -202,6 +208,7 @@ while [[ $# -gt 0 ]]; do
         --allow-failover) ALLOW_FAILOVER="$2"; shift 2;;
         --provision-script) PROVISION_SCRIPT="$2"; shift 2;;
         --acquire-wait-max) ACQUIRE_WAIT_MAX="$2"; shift 2;;
+        --resv-wait-max) RESV_WAIT_MAX="$2"; shift 2;;
         *) echo "Unknown arg: $1"; exit 1;;
     esac
 done
@@ -296,42 +303,81 @@ provision_node() {
     container_up "$jid"
 }
 
-# Acquire a fresh exclusive node on $PARTITION; sets global JOBID on success.
-# Uses `sbatch` (not `salloc`): interactive salloc on some partitions (meta64)
-# is capped at 240 min, which an $ALLOC_TIME multi-day hold exceeds. The batch
-# job merely pins the node (`sleep infinity`, bounded by --time); the container
-# is provisioned afterward by provision_node via `srun --jobid --overlap`.
-# Honors --reservation so failover re-acquires from the SAME reservation pool.
+# Submit an sbatch hold job that merely pins one exclusive node (`sleep
+# infinity`, bounded by --time=$ALLOC_TIME); echoes the jobid. $1 = extra sbatch
+# args (e.g. "--reservation=NAN_issue_debug" or ""). sbatch (not salloc) because
+# interactive salloc on some partitions (meta64) is capped at 240 min, which an
+# $ALLOC_TIME multi-day hold exceeds. The container is provisioned afterward by
+# provision_node via `srun --jobid --overlap`.
+_submit_hold_job() {
+    local extra="$1" out
+    out=$(sbatch --parsable --partition="$PARTITION" $extra --nodes=1 --exclusive \
+                 --time="$ALLOC_TIME" --job-name=e2e_failover \
+                 --output="${LOG%.log}.failover_hold.%j.log" \
+                 --wrap="echo \"[failover-hold] node=\$(hostname) jobid=\$SLURM_JOB_ID start=\$(date -Is)\"; sleep infinity" 2>&1)
+    # --parsable => "<jobid>" or "<jobid>;<cluster>"; strip whitespace + cluster.
+    echo "$out" | tr -d ' ' | cut -d';' -f1
+}
+
+# Wait up to $2 seconds for job $1 to reach RUNNING. Returns 0 if RUNNING.
+_wait_running() {
+    local jid="$1" max="$2" waited=0 st
+    while (( waited < max )); do
+        st=$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)
+        [[ "$st" == "RUNNING" ]] && return 0
+        sleep 10; waited=$((waited + 10))
+    done
+    return 1
+}
+
+# Acquire a fresh exclusive node and set global JOBID on success. Two-tier:
+#   tier 1 (preferred): the SLURM --reservation $RESERVATION, if configured.
+#     Waited on for only RESV_WAIT_MAX — a free reservation node starts almost
+#     immediately, so a longer wait means the reservation is currently full.
+#   tier 2 (fallback): the open $PARTITION pool (no reservation), waited on for
+#     ACQUIRE_WAIT_MAX. Used when no reservation is set, or the reservation had
+#     no node free within RESV_WAIT_MAX (the pending reservation job is
+#     cancelled before we resubmit so we never end up holding two nodes).
 acquire_node() {
     if [[ "$ALLOW_FAILOVER" != "1" ]]; then
         sup "failover disabled (--allow-failover 0); cannot acquire a new node"; return 1
     fi
-    local resv_arg=""
-    [[ -n "$RESERVATION" ]] && resv_arg="--reservation=$RESERVATION"
-    sup "requesting a fresh node via sbatch (partition=$PARTITION${RESERVATION:+ reservation=$RESERVATION}, exclusive, time=$ALLOC_TIME)"
     local jid
-    jid=$(sbatch --parsable --partition="$PARTITION" $resv_arg --nodes=1 --exclusive \
-                 --time="$ALLOC_TIME" --job-name=e2e_failover \
-                 --output="${LOG%.log}.failover_hold.%j.log" \
-                 --wrap="echo \"[failover-hold] node=\$(hostname) jobid=\$SLURM_JOB_ID start=\$(date -Is)\"; sleep infinity" 2>&1)
-    # --parsable => "<jobid>" or "<jobid>;<cluster>"; strip whitespace + cluster.
-    jid=$(echo "$jid" | tr -d ' ' | cut -d';' -f1)
+
+    # --- tier 1: reservation (preferred) -------------------------------------
+    if [[ -n "$RESERVATION" ]]; then
+        sup "failover tier-1: requesting a node from reservation=$RESERVATION (exclusive, time=$ALLOC_TIME)"
+        jid=$(_submit_hold_job "--reservation=$RESERVATION")
+        if [[ "$jid" =~ ^[0-9]+$ ]]; then
+            ACQUIRED_JOBIDS+=("$jid")   # track for cleanup even if it never starts
+            sup "reservation hold job jobid=$jid submitted; waiting up to ${RESV_WAIT_MAX}s for RUNNING"
+            if _wait_running "$jid" "$RESV_WAIT_MAX"; then
+                JOBID="$jid"
+                sup "new node ready (reservation $RESERVATION): jobid=$JOBID node=$(squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)"
+                return 0
+            fi
+            sup "reservation $RESERVATION has no free node within ${RESV_WAIT_MAX}s — cancelling pending $jid and falling back to open pool"
+            scancel "$jid" 2>/dev/null || true
+        else
+            sup "reservation sbatch did not return a jobid ($jid) — falling back to open pool"
+        fi
+    fi
+
+    # --- tier 2: open partition pool (fallback) ------------------------------
+    sup "failover tier-2: requesting a node from open partition=$PARTITION (exclusive, time=$ALLOC_TIME)"
+    jid=$(_submit_hold_job "")
     if ! [[ "$jid" =~ ^[0-9]+$ ]]; then
-        sup "FATAL: sbatch did not return a jobid: $jid"; return 1
+        sup "FATAL: open-pool sbatch did not return a jobid: $jid"; return 1
     fi
     ACQUIRED_JOBIDS+=("$jid")
-    sup "submitted failover hold job jobid=$jid; waiting for RUNNING (max ${ACQUIRE_WAIT_MAX}s)"
-    local waited=0
-    while (( waited < ACQUIRE_WAIT_MAX )); do
-        [[ "$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)" == "RUNNING" ]] && break
-        sleep 10; waited=$((waited + 10))
-    done
-    if [[ "$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)" != "RUNNING" ]]; then
-        sup "FATAL: failover hold job $jid never reached RUNNING (waited ${waited}s)"; return 1
+    sup "open-pool hold job jobid=$jid submitted; waiting up to ${ACQUIRE_WAIT_MAX}s for RUNNING"
+    if _wait_running "$jid" "$ACQUIRE_WAIT_MAX"; then
+        JOBID="$jid"
+        sup "new node ready (open $PARTITION): jobid=$JOBID node=$(squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)"
+        return 0
     fi
-    JOBID="$jid"
-    sup "new node ready: jobid=$JOBID node=$(squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)"
-    return 0
+    sup "FATAL: open-pool hold job $jid never reached RUNNING (waited ${ACQUIRE_WAIT_MAX}s)"
+    return 1
 }
 
 # Ensure $JOBID is a healthy allocation with the container up, failing over to a

From 7c36891b3ea2d0e1fac625a20e69f5e28f349e25 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Tue, 9 Jun 2026 12:14:47 -0500
Subject: [PATCH 043/113] dlrmv4: cap failover at <=1 reservation node + fix
 trainer-alive self-match

trainer_alive: `pgrep -f generative_recommenders` always matched the probe
shell's own cmdline, so it could never report the trainer dead -- defeating
the stall watchdog and making ATTACH mode falsely "adopt" a nonexistent
trainer. Use the `set -f; pgrep -f [g]enerative_recommenders` self-match guard.

Reservation cap: every node we acquire is an sbatch --job-name=e2e_failover
hold, so reap_failover_holds() reaps strays by name at startup (catching holds
leaked by a prior supervisor that died mid-failover) and before every acquire
(no stacking). wait_for_original_recover() waits --orig-recover-wait (def 600s)
for SLURM to requeue the lost ORIGINAL job and reuses it instead of grabbing a
SECOND reservation node. Together these keep us at <=1 reservation node.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../scripts/run_streaming_e2e.sh              | 88 ++++++++++++++++++-
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index e8febc62c..5009ecbce 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -72,7 +72,11 @@
 #   logging:     --run-name --log
 #   resilience:  --max-relaunch --min-free-gib --stall-s
 #   failover:    --partition --reservation --alloc-time --allow-failover
-#                --provision-script --acquire-wait-max
+#                --provision-script --acquire-wait-max --resv-wait-max
+#                --orig-recover-wait
+#                (failover holds <=1 reservation node: stray/leaked e2e_failover
+#                 holds are reaped, and a lost ORIGINAL job is waited on for SLURM
+#                 requeue and reused before a SEPARATE node is acquired.)
 #   validation:  --num-train-batches --num-eval-batches  (>0 caps batches/window
 #                for fast tests; 0 = full window / full-holdout eval)
 #   test-only:   --die-at-step  (>=0 injects a crash at that global step)
@@ -167,6 +171,13 @@ RESV_WAIT_MAX=300                     # max seconds to wait for a RESERVATION
                                       # Short, since a free reservation node
                                       # starts ~immediately; a longer wait just
                                       # means the reservation is currently full.
+ORIG_RECOVER_WAIT=600                 # when the user's ORIGINAL reservation job
+                                      # is lost, wait this long for SLURM to
+                                      # auto-requeue it back to RUNNING before
+                                      # acquiring a SEPARATE node. Reusing the
+                                      # requeued original keeps us at <=1
+                                      # reservation node and skips a redundant
+                                      # acquire (observed requeue latency ~2 min).
 
 # Disk guard: require at least this many GiB free on the ckpt volume before a
 # (re)launch. One checkpoint is ~560 GB. A save writes a fresh .tmp BEFORE the
@@ -209,6 +220,7 @@ while [[ $# -gt 0 ]]; do
         --provision-script) PROVISION_SCRIPT="$2"; shift 2;;
         --acquire-wait-max) ACQUIRE_WAIT_MAX="$2"; shift 2;;
         --resv-wait-max) RESV_WAIT_MAX="$2"; shift 2;;
+        --orig-recover-wait) ORIG_RECOVER_WAIT="$2"; shift 2;;
         *) echo "Unknown arg: $1"; exit 1;;
     esac
 done
@@ -342,6 +354,10 @@ acquire_node() {
     if [[ "$ALLOW_FAILOVER" != "1" ]]; then
         sup "failover disabled (--allow-failover 0); cannot acquire a new node"; return 1
     fi
+    # Release any prior/leaked failover hold BEFORE grabbing a new one, so we
+    # never transiently pin two reservation nodes (e.g. a dead tier-1 hold + the
+    # replacement we are about to submit).
+    reap_failover_holds ""
     local jid
 
     # --- tier 1: reservation (preferred) -------------------------------------
@@ -395,6 +411,17 @@ ensure_ready() {
         sup "provisioning on $JOBID failed; will try a fresh node"
     else
         sup "current allocation $JOBID unavailable (job not RUNNING or node down/drained)"
+        # Prefer the SLURM-requeued original over acquiring a SEPARATE node, so we
+        # stay at <=1 reservation node. (No-op once we've already failed over off
+        # the original.)
+        if wait_for_original_recover; then
+            JOBID="$ORIGINAL_JOBID"
+            refresh_node >/dev/null
+            if container_up "$JOBID"; then sup "reusing recovered original jobid=$JOBID"; return 0; fi
+            sup "recovered original $JOBID up but container '$CONTAINER' not present — (re)provisioning"
+            provision_node "$JOBID" && return 0
+            sup "provisioning recovered original $JOBID failed; will acquire a fresh node"
+        fi
     fi
     acquire_node || return 1
     provision_node "$JOBID" || { sup "provisioning new node $JOBID failed"; return 1; }
@@ -413,15 +440,65 @@ release_acquired() {
     done
 }
 
+# Enforce "at most ONE reservation node held by this run at a time" and reap
+# orphans. Every node WE acquire is an `sbatch --job-name=e2e_failover` hold, so
+# all our holds are discoverable by name even across a supervisor restart — which
+# is how a previous supervisor that died mid-failover (e.g. on a provisioning
+# error) can leave a hold pinning a second reservation node. Cancels every
+# e2e_failover hold owned by us EXCEPT $1 (the one to keep) and the user's
+# ORIGINAL_JOBID (never ours to cancel). Containers are removed before the node
+# is freed so they don't linger for the next tenant.
+reap_failover_holds() {
+    local keep="${1:-}" me jid
+    me=$(id -un 2>/dev/null)
+    [[ -z "$me" ]] && return 0
+    while read -r jid; do
+        [[ -z "$jid" ]] && continue
+        [[ "$jid" == "$keep" || "$jid" == "$ORIGINAL_JOBID" ]] && continue
+        sup "reaping stray failover hold $jid (enforcing <=1 reservation node held by this run)"
+        srun --jobid="$jid" --overlap docker rm -f "$CONTAINER" >/dev/null 2>&1 || true
+        scancel "$jid" 2>/dev/null || true
+    done < <(squeue -h -u "$me" -n e2e_failover -o '%i' 2>/dev/null)
+}
+
+# When the user's ORIGINAL reservation job is lost, SLURM typically auto-requeues
+# it back onto a (fresh) reservation node within a couple of minutes. Waiting for
+# that and REUSING it — rather than immediately acquiring a SEPARATE node — is
+# what keeps us at <=1 reservation node (the alternative is the original requeue
+# AND a failover hold both pinning reservation nodes) and skips a redundant
+# acquire+provision. Only meaningful while we are still on the original job.
+wait_for_original_recover() {
+    [[ "$JOBID" != "$ORIGINAL_JOBID" ]] && return 1
+    local waited=0
+    while (( waited < ORIG_RECOVER_WAIT )); do
+        if alloc_healthy "$ORIGINAL_JOBID"; then
+            sup "original job $ORIGINAL_JOBID is RUNNING again (SLURM requeue) after ${waited}s — reusing it (no second node)"
+            return 0
+        fi
+        sup "waiting for original job $ORIGINAL_JOBID to requeue before acquiring a separate node (${waited}s/${ORIG_RECOVER_WAIT}s)…"
+        sleep 15; waited=$((waited + 15))
+    done
+    sup "original job $ORIGINAL_JOBID did not recover within ${ORIG_RECOVER_WAIT}s — acquiring a fresh node"
+    return 1
+}
+
 # Returns 0 (true) if a trainer process is alive in the container. Uses SLURM
 # (srun) when the controller is up, else falls back to a direct SSH probe so a
 # control-plane outage can't make a live trainer look dead.
 trainer_alive() {
     local n
+    # `set -f; pgrep -f [g]enerative...` is the classic self-match guard: the
+    # probe shell's OWN cmdline contains the pattern, so a naive `pgrep -f
+    # generative_recommenders` ALWAYS matches itself and returns >=1 even when
+    # the trainer is dead — which would defeat the stall watchdog and make
+    # ATTACH mode falsely "adopt" a nonexistent trainer. The [g] char-class
+    # matches "generative" in real trainer cmdlines but NOT the literal
+    # "[g]enerative" in the probe's cmdline; `set -f` keeps the bracket from
+    # being glob-expanded (works under both bash -lc wrappers, no quotes).
     if controller_up; then
-        n=$(cexec "pgrep -f generative_recommenders | wc -l" | tr -d ' ')
+        n=$(cexec "set -f; pgrep -f [g]enerative_recommenders | wc -l" | tr -d ' ')
     else
-        n=$(dexec "pgrep -f generative_recommenders | wc -l" | tr -d ' ')
+        n=$(dexec "set -f; pgrep -f [g]enerative_recommenders | wc -l" | tr -d ' ')
     fi
     [[ "${n:-0}" -gt 0 ]]
 }
@@ -485,6 +562,11 @@ sup "ckpt_path=$CKPT_PATH keep_last_n=$KEEP_LAST_N ckpt_time_interval=${CKPT_TIM
 sup "log=$LOG num_train_batches=$NUM_TRAIN_BATCHES die_at_step=$DIE_AT_STEP max_relaunch=$MAX_RELAUNCH"
 sup "failover: allow=$ALLOW_FAILOVER partition=$PARTITION reservation=${RESERVATION:-<none>} alloc_time=$ALLOC_TIME"
 
+# Reap any failover hold(s) leaked by a PREVIOUS supervisor that died mid-failover
+# (e.g. exited on a provisioning error before release_acquired could run). Without
+# this, such an orphan keeps pinning a second reservation node indefinitely.
+reap_failover_holds ""
+
 cexec "mkdir -p '$CKPT_PATH' '/apps/chcai/tb/$RUN_NAME'"
 # Initialize this run's metrics log ONCE. launch_smoke_8gpu.sh appends (tee -a),
 # so every relaunch attempt accumulates into this single file — the full-run

From 7c9188f47de9b0f63d5d298029fa3626c46729e7 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Tue, 9 Jun 2026 13:06:59 -0500
Subject: [PATCH 044/113] dlrmv4: anchor sparse-eval cadence to absolute window
 ts (resume-invariant)

_should_eval keyed the every-N-windows cadence off the per-call loop index
`i`, so a mid-run resume (which rebases start_ts and restarts train_ts_list at
the resume window) re-anchored the eval grid -- e.g. evals shifted from
150,160,170,... to 165,175,185,... after resuming at window 165. Capture the
original start_ts as eval_anchor_ts BEFORE the resume block mutates start_ts,
and gate eval on (train_ts_list[i] - eval_anchor_ts) % K == 0, so the eval grid
is identical on cold start and every resume. Final-window eval preserved; the
eval-pool fork is unconditional so dropping the i==0 eval on resume is safe.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/utils.py                     | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 936ae229e..8b06152a1 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1274,6 +1274,13 @@ def streaming_train_eval_loop(
     # cold start and on every resume (the supervisor relaunches with the same
     # START_TS / NUM_TRAIN_TS). Defaults to the window just past training.
     requested_end_ts = start_ts + num_train_ts
+    # Eval-cadence anchor: the ORIGINAL requested start_ts, captured BEFORE the
+    # resume block rebases start_ts. `_should_eval` keys the every-N-windows
+    # cadence off the absolute window ts relative to THIS anchor, so the eval
+    # grid (e.g. 150,160,170,...) is identical on cold start and on every resume.
+    # (Keying off the per-call loop index instead would re-anchor the grid to
+    # whatever window a mid-run resume happens to restart from.)
+    eval_anchor_ts = start_ts
     # None (Python default) or <0 (the env-binding default) both mean "use the
     # window just past training", which is stable across resume.
     eval_holdout_ts_resolved = (
@@ -1637,14 +1644,19 @@ def _should_eval(i: int) -> bool:
         """Whether to run the full-holdout eval after training window index `i`.
 
         `eval_every_n_windows<=1` (default) preserves the per-window cadence.
-        For K>1 we eval on windows 0, K, 2K, ... and ALWAYS on the final window
-        so the trajectory ends with an eval point. Gated by `eval_each_window`.
+        For K>1 we eval when the ABSOLUTE window ts is on the grid anchored at
+        `eval_anchor_ts` (the original start_ts), i.e. ts in {anchor, anchor+K,
+        anchor+2K, ...}, and ALWAYS on the final window so the trajectory ends
+        with an eval point. Anchoring to the absolute ts (not the per-call loop
+        index `i`) keeps the eval grid (e.g. 150,160,170,...) stable across a
+        mid-run resume, which rebases start_ts/`train_ts_list` to the resume
+        window. Gated by `eval_each_window`.
         """
         if not eval_each_window:
             return False
         if eval_every_n_windows <= 1:
             return True
-        return i % eval_every_n_windows == 0 or i == n_train - 1
+        return (train_ts_list[i] - eval_anchor_ts) % eval_every_n_windows == 0 or i == n_train - 1
 
     # Fixed eval set: held-out users' anchors over the resolved holdout window
     # range, computed ONCE and reused at every eval step. Same anchors every

From dbb9e1d3b60de7212ee04a4c0a3ef7d7dfcfc7dd Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 10 Jun 2026 03:48:40 -0500
Subject: [PATCH 045/113] dlrmv4: multi-node (N>=1) training over RoCE RDMA via
 consolidated launch_slurm.sh

Consolidate the SLURM orchestration, container/RDMA provisioning, and
in-container trainer launch into a single self-dispatching scripts/launch_slurm.sh
(phases: orchestrate -> provision -> worker) supporting N>=1 nodes; N=1 keeps the
legacy single-node path byte-for-byte.

Multi-node runs real RDMA over the 8 Broadcom bnxt_re RoCE HCAs. The key fix is
an LD_PRELOAD/LD_LIBRARY_PATH overlay of the host's matched rdma-core (v61/v59):
the container's stock v34 provider faults RCCL's deep create_qp (256 WRs) against
the host kernel uapi -> "ibv_create_qp ... Bad address". The unversioned
libibverbs.so symlink in the overlay is required so torch maps only the host lib.
TCP bootstrap is pinned to the routable fenic0; RDMA data goes over bnxt_re
(GID idx 3, TC 104).

Python: derive the global rank (node_rank*gpus_per_node+local_rank), forward
master_addr, pass local_world_size to the TorchRec planner and live world_size to
metrics. Single-node behavior unchanged.

Docs: add docs/multi_node_config.md (enablement details, lessons, and the
cluster-specific knobs to change per fabric); README + perf_opt updated for the
launch_slurm.sh rename.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/README.MD                   |  26 +-
 recommendation_v4/docs/multi_node_config.md   | 230 +++++++++
 recommendation_v4/docs/perf_opt.md            |   2 +-
 .../dlrm_v3/train/train_ranker.py             |  62 ++-
 .../dlrm_v3/train/utils.py                    |  16 +-
 recommendation_v4/scripts/launch_slurm.sh     | 475 ++++++++++++++++++
 .../scripts/launch_smoke_8gpu.sh              |  65 ---
 .../scripts/run_streaming_e2e.sh              |   4 +-
 8 files changed, 797 insertions(+), 83 deletions(-)
 create mode 100644 recommendation_v4/docs/multi_node_config.md
 create mode 100755 recommendation_v4/scripts/launch_slurm.sh
 delete mode 100755 recommendation_v4/scripts/launch_smoke_8gpu.sh

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index 5c78f6627..dc1ced73b 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -4,19 +4,37 @@ This is a fork of [meta-recsys/generative-recommenders](https://github.com/meta-
 
 For the original repository and the underlying ICML'24 paper (*Actions Speak Louder than Words*), see the upstream README at the link above. This README focuses on what this fork adds: the Yambda data pipeline, the per-pool gather strategy, and how the data feeds into the HSTU `modules/` (dlrm_v3) path.
 
-## 1. Quick start (8-GPU Yambda)
+## 1. Quick start (Yambda, N×8-GPU)
+
+`scripts/launch_slurm.sh` is the single entry point for **N ≥ 1 nodes**. It
+auto-detects its context: run inside the container it takes the single-node
+worker path; submitted via `sbatch` it orchestrates the multi-node run
+(provision + per-node launch). N=1 is byte-for-byte the legacy single-node path.
+
+**Single node (8-GPU), inside the container:**
 
 ```bash
 docker exec yambda_8gpu bash -c \
-  'cd /workspace/recommendation_v4 && bash scripts/launch_smoke_8gpu.sh'
+  'cd /workspace/recommendation_v4 && bash scripts/launch_slurm.sh'
 ```
 
+**Multi-node (N×8-GPU) via SLURM:**
+
+```bash
+sbatch --nodes=2 --partition=meta64 scripts/launch_slurm.sh
+```
+
+Multi-node uses real RDMA (RoCEv2). The fabric/NCCL setup and every
+cluster-specific knob (interfaces, HCAs, GID/TC, RDMA overlay) are documented in
+[docs/multi_node_config.md](docs/multi_node_config.md) — read it before running on
+a different cluster.
+
 Override the data path or run name without editing the gin:
 
 ```bash
 DLRM_DATA_PATH=/apps/chcai/dlrm_data \
 RUN_NAME=my_experiment \
-bash scripts/launch_smoke_8gpu.sh
+bash scripts/launch_slurm.sh
 ```
 
 Data path resolves at runtime via `env_path` gin macros (see [yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)). Traces and any per-run outputs land in `results/<RUN_NAME>/`.
@@ -133,7 +151,7 @@ This means the model sees on average ~1,402 UIH events per sample, not the theor
 
 ## 5. Streaming (temporal-order) training
 
-`scripts/launch_smoke_8gpu.sh` defaults to `--mode streaming-train-eval`, which
+`scripts/launch_slurm.sh` defaults to `--mode streaming-train-eval`, which
 trains Yambda in strict wall-clock order instead of shuffling the whole corpus.
 The timeline is sliced into fixed-duration **windows** (default 1 day,
 `get_dataset.streaming_window_seconds = 86400`), and the loop walks them forward:
diff --git a/recommendation_v4/docs/multi_node_config.md b/recommendation_v4/docs/multi_node_config.md
new file mode 100644
index 000000000..52fdbbb69
--- /dev/null
+++ b/recommendation_v4/docs/multi_node_config.md
@@ -0,0 +1,230 @@
+# Multi-Node Training Enablement (yambda-5b, MI350X / Broadcom bnxt_re RoCE)
+
+How N-node (N×8-GPU) distributed training was brought up for the yambda-5b HSTU
+ranker on the `meta64` cv350 cluster, the hard problems solved, and **exactly
+which settings are cluster/fabric-specific** so this can be reused or re-tuned
+when the underlying network changes.
+
+Companion to [`perf_opt.md`](./perf_opt.md) and [`training_recipe.md`](./training_recipe.md).
+The single entry point is [`scripts/launch_slurm.sh`](../scripts/launch_slurm.sh);
+the Python side is `generative_recommenders/dlrm_v3/train/{train_ranker,utils}.py`.
+
+---
+
+## TL;DR
+
+- Multi-node works over **real RDMA** (RoCEv2 on 8× Broadcom bnxt_re HCAs).
+  2-node = `world_size=16`, clean `rc=0`, ~7.7–8.0k `global_sps` (≈1.28× of
+  1-node 6.2k; weak scaling, per-rank batch fixed).
+- The one non-obvious blocker was a **userspace RDMA provider ABI mismatch**
+  inside the container, fixed with an `LD_PRELOAD`/`LD_LIBRARY_PATH` **overlay**
+  of the host's matched `rdma-core` (no container lib surgery).
+- Everything is one script with three auto-detected phases
+  (`orchestrate` → `provision` → `worker`) plus small Python changes for global
+  ranks. All cluster-specific knobs are env-overridable and tagged
+  `[CLUSTER-SPECIFIC]` in the script.
+
+---
+
+## Architecture: one script, three phases
+
+`launch_slurm.sh` self-dispatches by context (`LAUNCH_SLURM_PHASE`, else
+auto-detected via `/.dockerenv`):
+
+| Phase | Runs on | Does |
+|---|---|---|
+| `orchestrate` | SLURM batch host | Resolve rendezvous (`MASTER_ADDR/PORT`), ensure container on every node (calls `provision`), then `docker exec` the `worker` phase on every node (one srun task per node). |
+| `provision` | each compute node (host) | Ensure the `yambda_primus` container is up (baked image if present, else base image + pip), stage the host RDMA overlay on NFS. |
+| `worker` | inside the container | Derive topology, set NCCL/RDMA env, apply the RDMA overlay, spawn this node's 8 GPU ranks via `train_ranker`. `NNODES==1` => legacy single-node path unchanged. |
+
+Why one script: multi-node enablement is then a single committable file. The
+worker phase is also what the streaming-e2e supervisor invokes directly
+(single-node, already inside the container), so the production path is unchanged.
+
+```
+sbatch --nodes=N launch_slurm.sh
+        │  (batch host: orchestrate)
+        ├─ srun: provision  ──> docker container up + RDMA overlay staged   (×N nodes)
+        └─ srun: docker exec launch_slurm.sh (worker)                       (×N nodes)
+                   │ in container: topology + NCCL/RDMA env + LD overlay
+                   └─ python train_ranker  ──> 8 local ranks  ──> RCCL rendezvous over RDMA
+```
+
+---
+
+## The hard problems (lessons learned)
+
+### 1. RDMA provider ABI mismatch — the core blocker
+
+**Symptom:** multi-node RCCL died at init with
+`ibv_create_qp ... Bad address`.
+
+**Root cause:** the container image (`rocm/primus:v26.3`) ships an **older**
+userspace `rdma-core` (v34, `libbnxt_re-rdmav34.so`) than the **host kernel**
+bnxt_re driver's uapi (host `rdma-core` v61 / `libbnxt_re-rdmav59.so`). The v34
+provider enumerates the HCAs and creates *shallow* QPs fine, but **faults when
+creating a deep send queue** — RCCL uses `max_send_wr=256`. Verified with a
+parameterized verbs probe: v34 `create_qp` is OK at depth ≤16 and faults at ≥64;
+the host v59 provider works at **every** depth. So it is purely the **userspace
+provider**, not the kernel or the fabric (a 2-node RoCEv2 RDMA-write test passes
+on the stock stack, and bare-metal RCCL benchmarks run fine with the host libs).
+
+**Fix (no container surgery):** the `provision` phase stages the host's matched
+`rdma-core` on shared NFS (`$OVERLAY`):
+
+```
+$OVERLAY/lib/libibverbs.so.1          # host libibverbs v61
+$OVERLAY/lib/libibverbs.so -> .so.1   # UNVERSIONED symlink (critical, see below)
+$OVERLAY/lib/libnl-3.so.200, libnl-route-3.so.200
+$OVERLAY/lib/libibverbs/<providers>.so   # incl. libbnxt_re-rdmav59.so
+```
+
+The `worker` phase makes RCCL load it at runtime:
+
+```bash
+export LD_LIBRARY_PATH="$OVERLAY/lib:$OVERLAY/lib/libibverbs:$LD_LIBRARY_PATH"
+export LD_PRELOAD="$OVERLAY/lib/libibverbs.so.1:$LD_PRELOAD"
+```
+
+We do **not** modify the container's system libs — only this process tree's
+`LD_*`. Single-node and other users keep the stock stack.
+
+### 2. The UNVERSIONED `libibverbs.so` symlink is mandatory
+
+An earlier overlay attempt set `LD_LIBRARY_PATH` but still failed with
+`Bad address`. Reason: at `import torch` the ROCm stack pulls in the
+**unversioned** soname `libibverbs.so` (not `libibverbs.so.1`). If the overlay
+only has `libibverbs.so.1`, that unversioned lookup misses the overlay, falls
+through to the **container's** old lib, which then occupies the `libibverbs.so.1`
+slot — so RCCL's later `dlopen("libibverbs.so.1")` binds the v34 stack and
+`create_qp(256)` faults again. The overlay **must** expose
+`libibverbs.so -> libibverbs.so.1`. With it (verified via `/proc/<pid>/maps`),
+the process maps **only** the host lib. `LD_PRELOAD` is belt-and-braces so the
+host lib claims the soname slot first.
+
+### 3. Two network planes — pin TCP bootstrap, RDMA for data
+
+The container is `--network=host`, so RCCL sees **all** host interfaces and, left
+to auto-detect, picks the wrong one. These nodes expose:
+- `benic1p1..benic8p1` — per-GPU point-to-point RoCE links on `192.168.{1..8}.x/31`.
+  These are **not node-routable** for plain TCP; the very first bring-up **hung**
+  in `init_process_group` because RCCL tried the TCP bootstrap over a
+  non-routable `192.168.x` backend addr.
+- `fenic0` — the routable front-end (`10.190.x`).
+
+So we split the planes explicitly:
+- `NCCL_SOCKET_IFNAME=fenic0` → TCP bootstrap/rendezvous over the routable NIC.
+- `NCCL_IB_HCA=bnxt_re0..7` → RDMA **data** over the 8 RoCE HCAs (the RoCEv2
+  fabric *is* reachable rail-to-rail at the RDMA layer even though plain IP is not).
+
+### 4. Minimal proven bnxt_re NCCL config
+
+The minimal set proven on these nodes (matches cmcknigh's bare-metal RCCL
+benchmarks): `NCCL_IB_GID_INDEX=3` (RoCEv2 IPv4 GID), `NCCL_IB_TC=104` (RoCE
+lossless / PFC traffic class). **Do not** add the heavy
+`QPS_PER_CONNECTION / ECE / DMABUF` block — that belongs to a different
+(ionic AINIC) fabric and is counterproductive on bnxt_re. GPU-Direct RDMA
+(`NCCL_NET_GDR_LEVEL`) is left **off**: it needs DMABUF/peermem, unavailable
+in-container here, so RCCL stages through host memory (still real RDMA).
+
+### 5. Rendezvous must be resolved on the host
+
+The container image has **no SLURM client** (`scontrol` absent). So the
+`orchestrate` phase resolves `MASTER_ADDR` (first host of the allocation) and a
+deterministic `MASTER_PORT` (`20000 + job_id % 20000`, same on all nodes) **on
+the host** and forwards them into the container via `docker exec -e`.
+
+### 6. Global rank derivation (Python)
+
+`mp.start_processes` hands out a node-local `local_rank` (0..7). Every downstream
+consumer (data sharding, checkpoint I/O, metrics) needs the **global** rank:
+
+```python
+rank = node_rank * gpus_per_node + local_rank   # train_ranker._main_func
+device = torch.device(f"cuda:{local_rank}")      # CUDA device stays node-local
+```
+
+Also: `make_optimizer_and_shard(local_world_size=gpus_per_node)` so the TorchRec
+planner respects the intra-node GPU count, and `MetricsLogger(world_size=...)`
+gets the live world size (the gin default of 8 would mis-normalize multi-node).
+`NNODES==1` makes `rank == local_rank` — identical to the old single-node path.
+
+### 7. `$0` is the staged `slurm_script`, not the repo path
+
+For an sbatch batch script, `$0` =
+`/var/spool/slurmd/job<ID>/slurm_script` (node-local), so deriving the script /
+repo path from `$0` gives a path that **doesn't exist on other nodes** (`bash
+$SELF` → "No such file", and the worker's `cd $REPO` → exit 127). The
+`orchestrate` phase instead resolves the real shared-NFS path from SLURM:
+
+```bash
+SCRIPT_PATH=$(scontrol show job "$SLURM_JOB_ID" | grep -oP 'Command=\K\S+')
+# fallbacks: $SLURM_SUBMIT_DIR/scripts/launch_slurm.sh, then $SELF
+REPO=$(cd "$(dirname "$SCRIPT_PATH")/.." && pwd)
+```
+
+### 8. `srun ... bash -c "…"` host-vs-remote expansion
+
+Inside the double-quoted srun command string, **plain `$VAR` expands now on the
+batch host** (values computed in orchestrate: `$MASTER_ADDR`, `$SCRIPT_PATH`, …)
+while **`\$VAR` is deferred to each compute node** (`\$SLURM_NODEID`,
+`\$(hostname)`) where the per-node SLURM env lives. Mixing these up sends every
+rank the wrong node id.
+
+### 9. `memlock` ulimit for QP registration
+
+`docker run --ulimit memlock=-1:-1` is **required** — RDMA QP memory
+registration needs unlimited locked memory. A container started with the default
+8 MB memlock fails QP creation regardless of the overlay.
+
+### 10. Provisioning & the image-bake caveat
+
+Fresh nodes otherwise re-download a **6.1 GB** ROCm torch wheel + pip + build
+torchrec-from-git every time. The script supports a pre-baked image
+(`docker commit` → NFS tar → `docker load` offline). **Caveat:** the committed
+image is **~127 GB** (ROCm base is huge), so the full-image NFS tar is impractical
+(loading it can be slower than re-downloading 6 GB). For true download-avoidance
+prefer a **local pip wheelhouse** (`pip install --no-index --find-links` from
+~8 GB of NFS wheels) or a **local registry** (ships only the ~35 GB delta layer).
+The bake hook is left in (`BAKE_IMAGE=1`) but defaults off; provisioning falls
+back to base-image + pip.
+
+### Debunked theory (do not re-introduce)
+
+An earlier claim that the container's rdma-core was "too old → 0 devices /
+Bad address" and needed an **in-place lib copy** was a red herring: the "0
+devices" came from a *broken in-place copy* of the host EL9 libs (mixing v34
+tooling that links `IBVERBS_PRIVATE_34` with host v61 libs breaks symbol-version
+lookup). The stock container enumerates all 8 HCAs fine. The real issue is only
+the deep-QP create path; the fix is the **LD overlay**, never in-place surgery.
+
+---
+
+## Cluster-specific settings — change these when the fabric/hardware changes
+
+All are env-overridable and tagged `[CLUSTER-SPECIFIC]` in `launch_slurm.sh`
+(`grep '\[CLUSTER-SPECIFIC\]' scripts/launch_slurm.sh`).
+
+| Setting | Default (meta64) | What it is | How to find the right value |
+|---|---|---|---|
+| `#SBATCH --partition` | `meta64` | scheduler partition | `sinfo` |
+| bind mounts + default paths | `/home/chcai`, `/apps/chcai` | repo + scratch, **must be shared/NFS on all nodes** | `df -h`, cluster docs |
+| `IMAGE` | `rocm/primus:v26.3` | base container (GPU arch + ROCm version) | vendor image registry |
+| docker `--device` | `/dev/kfd /dev/dri` (AMD) | GPU passthrough | NVIDIA: `--gpus all` / nvidia runtime |
+| `--ulimit memlock` | `-1` | locked mem for RDMA QP | keep `-1` for any RDMA fabric |
+| `TORCH_IDX` / torch,vision,audio | `rocm7.2`, `2.12.0+rocm7.2` … | ROCm-version'd wheels | `download.pytorch.org/whl/<rocmX>` |
+| `FBGEMM_WHL` | gfx950 wheel on NFS | GPU-arch fbgemm | build/stage per arch |
+| `NCCL_SOCKET_IFNAME` | `fenic0` | **routable** host NIC for TCP bootstrap | `ip -br addr` (pick the routable one; NOT the per-GPU RDMA NICs) |
+| `NCCL_IB_HCA` | `bnxt_re0..7` | RDMA HCA device names | `ibv_devices` (vendor: `mlx5_*`, `ionic_*`, …) |
+| `NCCL_IB_GID_INDEX` | `3` | RoCEv2 IPv4 GID index | `show_gids` (v1/v2 & IPv4/IPv6 differ per port) |
+| `NCCL_IB_TC` | `104` | RoCE lossless / PFC traffic class | fabric/switch admin |
+| `RDMA_OVERLAY` (+ provider .so) | `/apps/chcai/rdma_host_el9_new` | host rdma-core overlay | only needed if container rdma-core < host kernel uapi; else set `RDMA_OVERLAY=` to disable. Stage the host's matching `/usr/lib64/libibverbs/<provider>.so` |
+
+**Different NIC vendor (e.g. Mellanox `mlx5`)** typically means: change
+`NCCL_IB_HCA` names, re-check `NCCL_IB_GID_INDEX`/`NCCL_IB_TC`, and the RDMA
+overlay is often **unnecessary** (Mellanox userspace in the image usually matches
+the host) — set `RDMA_OVERLAY=` to skip it.
+
+**Emergency fallback:** `NCCL_NET_TRANSPORT=socket` disables IB and runs
+allreduce over TCP (`fenic0`). Functional but ~100–200× slower; use only to
+isolate a fabric problem.
diff --git a/recommendation_v4/docs/perf_opt.md b/recommendation_v4/docs/perf_opt.md
index 627ab74aa..7799848ad 100644
--- a/recommendation_v4/docs/perf_opt.md
+++ b/recommendation_v4/docs/perf_opt.md
@@ -67,7 +67,7 @@ rocm-smi -d 0 --showclocks        # expect sclk ~2000+ MHz under load
 rocm-smi --setperflevel auto      # restore boost
 ```
 
-`scripts/launch_smoke_8gpu.sh` now logs the perf level + a live `sclk` sample on
+`scripts/launch_slurm.sh` (worker phase) now logs the perf level + a live `sclk` sample on
 every launch, auto-restores `auto` if it finds a `perf_determinism`/`manual`/`low`
 lock, and warns (to reset from the host) if it lacks permission inside the
 container. **Always sanity-check `sclk ≈ 2000+ MHz` before trusting a benchmark.**
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 6ae88eba2..55eece518 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -49,14 +49,27 @@
 
 
 def _main_func(
-    rank: int,
+    local_rank: int,
     world_size: int,
+    node_rank: int,
+    gpus_per_node: int,
+    master_addr: str,
     master_port: int,
     gin_file: str,
     mode: str,
 ) -> None:
-    device = torch.device(f"cuda:{rank}")
-    logger.info(f"rank: {rank}, world_size: {world_size}, device: {device}")
+    # `local_rank` is the index handed out by mp.start_processes (0..gpus_per_node-1)
+    # and indexes this node's GPUs. The GLOBAL rank is what every downstream
+    # consumer wants (data sharding via StreamingWindowSampler, checkpoint I/O,
+    # metrics), so derive it once and pass it through as `rank`. Only the CUDA
+    # device must be node-local. Single-node (node_rank=0) → rank == local_rank,
+    # exactly as before.
+    rank = node_rank * gpus_per_node + local_rank
+    device = torch.device(f"cuda:{local_rank}")
+    logger.info(
+        f"rank: {rank} (node_rank={node_rank} local_rank={local_rank}), "
+        f"world_size: {world_size}, device: {device}"
+    )
     # Phase 1: parse gin early with skip_unknown=True so env-bootstrap
     # bindings take effect BEFORE any module-level @gin.configurable
     # discovers itself. This is required because triton @triton.autotune
@@ -89,6 +102,7 @@ def _main_func(
     setup(
         rank=rank,
         world_size=world_size,
+        master_addr=master_addr,
         master_port=master_port,
         device=device,
     )
@@ -101,7 +115,10 @@ def _main_func(
 
     model, model_configs, embedding_table_configs = make_model()
     model, optimizer = make_optimizer_and_shard(
-        model=model, device=device, world_size=world_size
+        model=model,
+        device=device,
+        world_size=world_size,
+        local_world_size=gpus_per_node,
     )
     train_dataloader, test_dataloader = make_train_test_dataloaders(
         hstu_config=model_configs,
@@ -129,6 +146,10 @@ def _main_func(
         window_size=2500,
         device=device,
         rank=rank,
+        # Pass the live world_size so metric normalization is correct at any
+        # node count; the gin's MetricsLogger.world_size default (=8) is only a
+        # single-node fallback and would mis-normalize a multi-node run.
+        world_size=world_size,
         num_flops_per_sample=num_flops_per_sample,
         gpu_peak_flops=gpu_peak_flops,
         model=model,
@@ -171,6 +192,7 @@ def _main_func(
                 window_size=1000,
                 device=device,
                 rank=rank,
+                world_size=world_size,
             )
             eval_loop(
                 rank=rank,
@@ -236,14 +258,38 @@ def main() -> None:
         "train-eval",
         "streaming-train-eval",
     ], f"Unsupported mode: {args.mode}"
-    WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1))
-    MASTER_PORT = str(get_free_port())
+    # Distributed topology (single-node defaults reproduce the legacy behavior):
+    #   GPUS_PER_NODE  local procs to spawn on THIS node (default: all visible GPUs)
+    #   NNODES/NODE_RANK  multi-node fan-out, set by the SLURM launcher
+    #   WORLD_SIZE     global rank count = NNODES * GPUS_PER_NODE
+    #   MASTER_ADDR/PORT  rank-0 rendezvous; the port MUST match across nodes, so
+    #                     honor it from the env when set and only fall back to a
+    #                     random free port for the standalone single-node path.
+    GPUS_PER_NODE = int(os.environ.get("GPUS_PER_NODE", 0)) or torch.cuda.device_count()
+    NNODES = int(os.environ.get("NNODES", 1))
+    NODE_RANK = int(os.environ.get("NODE_RANK", 0))
+    WORLD_SIZE = NNODES * GPUS_PER_NODE
+    MASTER_ADDR = os.environ.get("MASTER_ADDR", "localhost")
+    MASTER_PORT = str(os.environ.get("MASTER_PORT") or get_free_port())
     gin_path = f"{os.path.dirname(__file__)}/gin/{SUPPORTED_CONFIGS[args.dataset]}"
+    logger.info(
+        f"launching: nnodes={NNODES} node_rank={NODE_RANK} "
+        f"gpus_per_node={GPUS_PER_NODE} world_size={WORLD_SIZE} "
+        f"master={MASTER_ADDR}:{MASTER_PORT}"
+    )
 
     mp.start_processes(
         _main_func,
-        args=(WORLD_SIZE, MASTER_PORT, gin_path, args.mode),
-        nprocs=WORLD_SIZE,
+        args=(
+            WORLD_SIZE,
+            NODE_RANK,
+            GPUS_PER_NODE,
+            MASTER_ADDR,
+            MASTER_PORT,
+            gin_path,
+            args.mode,
+        ),
+        nprocs=GPUS_PER_NODE,
         join=True,
         start_method="spawn",
     )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 8b06152a1..c27f38761 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -77,9 +77,15 @@
 
 
 def setup(
-    rank: int, world_size: int, master_port: int, device: torch.device
+    rank: int,
+    world_size: int,
+    master_port: int,
+    device: torch.device,
+    master_addr: str = "localhost",
 ) -> dist.ProcessGroup:
-    os.environ["MASTER_ADDR"] = "localhost"
+    # Default "localhost" keeps the single-node path unchanged; multi-node
+    # launches pass the rank-0 host so every node rendezvouses at the same addr.
+    os.environ["MASTER_ADDR"] = master_addr
     os.environ["MASTER_PORT"] = str(master_port)
 
     BACKEND = dist.Backend.NCCL
@@ -339,6 +345,7 @@ def make_optimizer_and_shard(
     model: torch.nn.Module,
     device: torch.device,
     world_size: int,
+    local_world_size: Optional[int] = None,
     hbm_cap_gb: int = 260,
 ) -> Tuple[DistributedModelParallel, torch.optim.Optimizer]:
     dense_opt_cls, dense_opt_args, dense_opt_factory = (
@@ -357,9 +364,12 @@ def make_optimizer_and_shard(
                         sparse_opt_cls, [param], sparse_opt_args
                     )
     sharders = get_default_sharders()
+    # local_world_size = GPUs per node so the planner respects the intra-node
+    # (xGMI/NVLink) vs inter-node hierarchy when placing shards. Defaults to
+    # world_size for the single-node case (no behavior change).
     planner = EmbeddingShardingPlanner(
         topology=Topology(
-            local_world_size=world_size,
+            local_world_size=local_world_size or world_size,
             world_size=world_size,
             compute_device="cuda",
             hbm_cap=hbm_cap_gb * 1024 * 1024 * 1024,
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
new file mode 100755
index 000000000..0b5547497
--- /dev/null
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -0,0 +1,475 @@
+#!/bin/bash
+#SBATCH --job-name=yambda_slurm
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --exclusive
+#SBATCH --partition=meta64          # [CLUSTER-SPECIFIC] partition name
+#SBATCH --time=01:10:00
+#SBATCH --output=/apps/chcai/yambda_slurm.%j.out
+# =============================================================================
+# launch_slurm.sh — single entry point for the yambda-5b trainer on N>=1 nodes.
+#
+# Consolidates what used to be three separate files so multi-node enablement is
+# ONE committable script (plus the train_ranker.py / utils.py python changes):
+#   * sbatch_smoke_multinode.sh   -> the `orchestrate` phase (host SLURM glue)
+#   * _provision_yambda_primus.sh -> the `provision`   phase (container + RDMA)
+#   * launch_smoke_8gpu.sh        -> the `worker`      phase (in-container train)
+#
+# PHASES (auto-detected from context; force with LAUNCH_SLURM_PHASE=<phase>):
+#   orchestrate  Runs on the SLURM batch host (no /.dockerenv). Resolves the
+#                rendezvous (MASTER_ADDR/PORT), ensures the container on every
+#                node (provision phase), then `docker exec`s the worker phase on
+#                every node, one task per node.
+#   provision    Runs on a compute-node host. Ensures the `yambda_primus`
+#                container is up (loads the pre-baked image if present — no
+#                internet/pip — else builds from the base image) and stages the
+#                host RDMA userspace overlay on shared NFS.
+#   worker       Runs INSIDE the container. Sets the distributed topology +
+#                NCCL/RDMA env and spawns this node's GPU ranks via train_ranker.
+#                N==1 transparently uses the legacy single-node path (localhost,
+#                node_rank 0), byte-for-byte as before, so the streaming-e2e
+#                supervisor's direct `bash scripts/launch_slurm.sh` is unchanged.
+#
+# USAGE
+#   Multi-node (N>=1):  sbatch --nodes=2 scripts/launch_slurm.sh
+#   Single-node direct: bash scripts/launch_slurm.sh   (already inside container;
+#                       what run_streaming_e2e.sh invokes per relaunch)
+#   Perf pair:
+#     LOG=/apps/chcai/perf_1node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
+#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
+#       sbatch --nodes=1 --job-name=y1 scripts/launch_slurm.sh
+#     LOG=/apps/chcai/perf_2node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
+#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
+#       sbatch --nodes=2 --job-name=y2 scripts/launch_slurm.sh
+#     # then: bash scripts/compare_node_perf.sh /apps/chcai/perf_1node.log /apps/chcai/perf_2node.log
+#
+# ONE-TIME IMAGE BAKE (so fresh nodes skip the multi-GB torch download + pip):
+#   BAKE_IMAGE=1 LAUNCH_SLURM_PHASE=provision bash scripts/launch_slurm.sh
+#   (commits the deps-installed container to $BAKED_IMAGE and `docker save`s it to
+#    $BAKED_TAR on NFS; subsequent provisions `docker load` it offline.)
+#
+# -----------------------------------------------------------------------------
+# PORTABILITY — what to change for a DIFFERENT cluster / network / hardware.
+# Every such knob is also tagged inline with "[CLUSTER-SPECIFIC]" (grep for it).
+# All are env-overridable, so you can adapt without editing this file.
+#
+#  A) SLURM / scheduler
+#     - #SBATCH --partition=meta64  : partition name. CHANGE per cluster.
+#     - #SBATCH --time / --exclusive : policy; adjust to taste.
+#
+#  B) Filesystems (must be shared/NFS across ALL nodes — this script re-invokes
+#     itself and reads the overlay + data from these paths cluster-wide)
+#     - /home/chcai (repo + this script) and /apps/chcai (scratch: logs, overlay,
+#       baked tar, data, pip tarball). CHANGE both the bind mounts in the
+#       `docker run` (provision) and the default LOG/BAKED_TAR/OVERLAY/PIP_* paths.
+#
+#  C) Container image / GPU software stack (tied to the GPU arch + ROCm version)
+#     - IMAGE=rocm/primus:v26.3        : base image. ROCm/AMD-specific.
+#     - docker run --device=/dev/kfd --device=/dev/dri --group-add video : AMD ROCm
+#       device passthrough. For NVIDIA this is --gpus all / nvidia runtime instead.
+#     - --ulimit memlock=-1 : REQUIRED for RDMA QP registration (do not drop).
+#     - TORCH_IDX (rocm7.2), torch/vision/audio ==*+rocm7.2, FBGEMM_WHL (a gfx950
+#       wheel), torchrec pin : the whole deps set is arch/ROCm-version-specific.
+#
+#  D) Network fabric — THE trickiest part; defaults are PROVEN on meta64 cv350
+#     (Broadcom bnxt_re RoCEv2). On a different fabric these almost certainly change
+#     (see the worker-phase block for the full rationale):
+#     - NCCL_SOCKET_IFNAME=fenic0 : the ONE routable host NIC for TCP bootstrap.
+#       Find yours with `ip -br addr`; the per-GPU RDMA NICs are usually NOT
+#       routable for plain TCP, so auto-detect hangs init — you MUST pin this.
+#     - NCCL_IB_HCA=bnxt_re0..7 : the RDMA HCA device names. List with `ibv_devices`.
+#       Different NIC vendor (e.g. mlx5_*, ionic_*) => different names AND a
+#       different userspace provider, which changes the RDMA overlay below.
+#     - NCCL_IB_GID_INDEX=3 : RoCEv2 IPv4 GID index. Check `show_gids`; v1/v2 and
+#       IPv4/IPv6 live at different indices per port.
+#     - NCCL_IB_TC=104 : RoCE lossless (PFC) traffic class. Fabric/switch-specific.
+#     - RDMA overlay (provision phase): only needed when the CONTAINER's rdma-core
+#       is older than the HOST kernel driver's uapi (our bnxt_re v34-vs-v59 case).
+#       Different NIC/host => different /usr/lib64 provider .so to stage, or the
+#       overlay may be unnecessary entirely (set RDMA_OVERLAY= to disable). If RDMA
+#       can't be made to work, NCCL_NET_TRANSPORT=socket falls back to TCP.
+#
+#  E) Not cluster-specific (auto-derived): GPUS_PER_NODE (torch.cuda.device_count),
+#     NNODES/NODE_RANK/MASTER_ADDR (from SLURM), WORLD_SIZE.
+# =============================================================================
+set -uo pipefail
+
+# Absolute path to THIS script so the orchestrate phase can re-invoke it on every
+# node (home is shared NFS, so the same path resolves cluster-wide).
+SELF=$(cd "$(dirname "$0")" && pwd)/$(basename "$0")
+REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
+
+# ---- phase detection --------------------------------------------------------
+PHASE="${LAUNCH_SLURM_PHASE:-}"
+if [ -z "$PHASE" ]; then
+  if [ -f /.dockerenv ]; then PHASE=worker; else PHASE=orchestrate; fi
+fi
+
+# ---- shared config (env-overridable) ----------------------------------------
+CONTAINER=${CONTAINER:-yambda_primus}
+REPO=${REPO:-$REPO_ROOT}                       # repo path inside the container
+IMAGE=${IMAGE:-rocm/primus:v26.3}              # [CLUSTER-SPECIFIC] ROCm/arch base image
+BAKED_IMAGE=${BAKED_IMAGE:-yambda_primus_baked:latest}
+BAKED_TAR=${BAKED_TAR:-/apps/chcai/yambda_primus_baked.tar}   # [CLUSTER-SPECIFIC] shared-NFS path
+USE_BAKED=${USE_BAKED:-1}
+OVERLAY=${RDMA_OVERLAY:-/apps/chcai/rdma_host_el9_new}        # [CLUSTER-SPECIFIC] shared-NFS RDMA overlay
+
+# =============================================================================
+# PHASE: orchestrate  (SLURM batch host)
+# =============================================================================
+orchestrate() {
+  # When run as the SLURM batch script, $0 is the node-local staged copy
+  # (/var/spool/slurmd/job<ID>/slurm_script), so $SELF / $REPO_ROOT are WRONG
+  # here (they don't exist on other nodes). Resolve the REAL shared-NFS script
+  # path + repo root from SLURM so we can re-invoke this script on every node and
+  # `cd` to the right repo inside the container.
+  SCRIPT_PATH=$(scontrol show job "${SLURM_JOB_ID:-0}" 2>/dev/null | grep -oP 'Command=\K\S+')
+  [ -f "${SCRIPT_PATH:-}" ] || SCRIPT_PATH="${SLURM_SUBMIT_DIR:-$REPO_ROOT}/scripts/launch_slurm.sh"
+  [ -f "$SCRIPT_PATH" ] || SCRIPT_PATH="$SELF"
+  REPO=$(cd "$(dirname "$SCRIPT_PATH")/.." && pwd)
+
+  LOG=${LOG:-/apps/chcai/yambda_slurm.${SLURM_JOB_ID:-manual}.log}
+
+  # Smoke defaults — override via env for a perf run (see header USAGE).
+  MODE=${MODE:-streaming-train-eval}
+  START_TS=${START_TS:-150}
+  NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
+  NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
+  NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
+  EVAL_EACH_WINDOW=${EVAL_EACH_WINDOW:-1}
+  EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
+  METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
+  FORCE_PROVISION=${FORCE_PROVISION:-0}
+
+  : > "$LOG"
+  echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
+  echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
+  echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ" | tee -a "$LOG"
+
+  # Rendezvous resolved on the HOST (the container image has no SLURM client).
+  MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
+  MASTER_ADDR=${MASTER_ADDR:-localhost}
+  MASTER_PORT=$(( 20000 + ${SLURM_JOB_ID:-0} % 20000 ))
+  echo "[$(date)] rendezvous: MASTER_ADDR=$MASTER_ADDR MASTER_PORT=$MASTER_PORT" | tee -a "$LOG"
+
+  # Optional NCCL/RCCL fabric overrides — forwarded into the container only when
+  # set at submit time (docker exec does NOT inherit the srun task env). The
+  # worker phase applies its own validated multi-node bnxt_re defaults when these
+  # are unset. Common: NCCL_NET_TRANSPORT=socket (TCP fallback), NCCL_DEBUG=INFO.
+  NCCL_ENV_ARGS=""
+  for v in NCCL_NET_TRANSPORT NCCL_DEBUG NCCL_SOCKET_IFNAME NCCL_IB_HCA NCCL_IB_GID_INDEX \
+           NCCL_IB_TC NCCL_IB_TIMEOUT NCCL_IGNORE_CPU_AFFINITY RCCL_MSCCL_ENABLE NCCL_NET_GDR_LEVEL \
+           NCCL_IB_PCI_RELAXED_ORDERING NCCL_IB_USE_INLINE NCCL_IB_QPS_PER_CONNECTION \
+           NCCL_IB_ECE_ENABLE NCCL_DMABUF_ENABLE NCCL_GDRCOPY_ENABLE NCCL_GDR_FLUSH_DISABLE \
+           NCCL_PXN_DISABLE NCCL_CHECKS_DISABLE NCCL_CROSS_NIC RDMA_OVERLAY; do
+    eval "val=\${$v:-}"
+    if [ -n "$val" ]; then NCCL_ENV_ARGS="$NCCL_ENV_ARGS -e $v=$val"; fi
+  done
+
+  # TRICKY — variable expansion inside the `srun ... bash -c "..."` blocks below:
+  # the string is double-quoted, so PLAIN $VAR expands NOW on the batch host (e.g.
+  # $MASTER_ADDR, $CONTAINER, $SCRIPT_PATH — values computed above), while
+  # BACKSLASH-escaped \$VAR is passed through literally and expands LATER on each
+  # compute node inside the srun task (e.g. \$SLURM_NODEID, \$(hostname)) where the
+  # per-node SLURM_* env actually lives. Mixing these up sends every rank the
+  # wrong node id or breaks the docker exec — keep the \$ on per-node values.
+
+  # --- step 1: ensure the container is up on every node ----------------------
+  echo "[$(date)] ensuring container '$CONTAINER' on all nodes (force=$FORCE_PROVISION)" | tee -a "$LOG"
+  srun --ntasks-per-node=1 bash -c "
+    if [ \"$FORCE_PROVISION\" = \"1\" ] || ! docker exec $CONTAINER true >/dev/null 2>&1; then
+      echo \"[\$(hostname)] (re)provisioning container\"
+      LAUNCH_SLURM_PHASE=provision CONTAINER=$CONTAINER IMAGE=$IMAGE \
+        BAKED_IMAGE=$BAKED_IMAGE BAKED_TAR=$BAKED_TAR USE_BAKED=$USE_BAKED \
+        BAKE_IMAGE=${BAKE_IMAGE:-0} RDMA_OVERLAY=$OVERLAY REPO=$REPO bash $SCRIPT_PATH
+    else
+      echo \"[\$(hostname)] container already up\"
+    fi
+  " 2>&1 | tee -a "$LOG"
+
+  # --- step 2: launch the worker (trainer) inside the container on every node -
+  echo "[$(date)] launching trainer (worker phase) on all nodes" | tee -a "$LOG"
+  srun --ntasks-per-node=1 bash -c "
+    docker exec \
+      -e LAUNCH_SLURM_PHASE=worker \
+      -e SLURM_NNODES=\$SLURM_NNODES \
+      -e SLURM_NODEID=\$SLURM_NODEID \
+      -e SLURM_PROCID=\$SLURM_PROCID \
+      -e SLURM_JOB_NODELIST=\"\$SLURM_JOB_NODELIST\" \
+      -e SLURM_JOB_ID=\$SLURM_JOB_ID \
+      -e MASTER_ADDR=$MASTER_ADDR \
+      -e MASTER_PORT=$MASTER_PORT \
+      -e HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-TRITON} \
+      -e MODE=$MODE \
+      -e START_TS=$START_TS \
+      -e NUM_TRAIN_TS=$NUM_TRAIN_TS \
+      -e EVAL_EACH_WINDOW=$EVAL_EACH_WINDOW \
+      -e EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS \
+      -e NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
+      -e NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
+      -e METRIC_LOG_FREQ=$METRIC_LOG_FREQ \
+      -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-0.90} \
+      -e SPLIT_SALT=${SPLIT_SALT:-0} \
+      -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
+      -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \
+      ${RUN_NAME:+-e RUN_NAME=$RUN_NAME} \
+      ${TENSORBOARD_LOG_PATH:+-e TENSORBOARD_LOG_PATH=$TENSORBOARD_LOG_PATH} \
+      ${CKPT_PATH:+-e CKPT_PATH=$CKPT_PATH} \
+      -e LOG=$LOG \
+      $NCCL_ENV_ARGS \
+      $CONTAINER bash -lc 'cd $REPO && LAUNCH_SLURM_PHASE=worker bash scripts/launch_slurm.sh'
+  " 2>&1 | tee -a "$LOG"
+  rc=${PIPESTATUS[0]}
+  echo "[$(date)] launch_slurm/orchestrate finished rc=$rc" | tee -a "$LOG"
+  exit $rc
+}
+
+# =============================================================================
+# PHASE: provision  (compute-node host)
+# =============================================================================
+provision() {
+  export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH:-}"
+  DOCKER=$(command -v docker 2>/dev/null || true); DOCKER=${DOCKER:-/usr/bin/docker}
+  FBGEMM_WHL=${FBGEMM_WHL:-/apps/chcai/FBGEMM/fbgemm_gpu/dist/fbgemm_gpu_nightly_rocm-2026.6.2-cp312-cp312-linux_x86_64.whl}  # [CLUSTER-SPECIFIC] gfx950/ROCm wheel
+  TORCH_IDX=${TORCH_IDX:-https://download.pytorch.org/whl/rocm7.2}  # [CLUSTER-SPECIFIC] ROCm version index
+  echo "[provision] host=$(hostname) container=$CONTAINER docker=$DOCKER"
+
+  # Resolve which image to run + whether deps must be installed. Prefer a pre-baked
+  # image (deps already installed) to skip the multi-GB torch download + pip /
+  # torchrec-from-git build on every fresh node:
+  #   1) baked image in this node's docker -> use it, skip deps
+  #   2) baked image tar on NFS            -> docker load (local, no internet)
+  #   3) neither                           -> base image + pip (slow path, which
+  #                                           can then be baked via BAKE_IMAGE=1)
+  NEED_DEPS=1
+  RUN_IMAGE="$IMAGE"
+  if [ "$USE_BAKED" = "1" ]; then
+    if "$DOCKER" image inspect "$BAKED_IMAGE" >/dev/null 2>&1; then
+      echo "[provision] using baked image $BAKED_IMAGE (deps preinstalled, no download)"
+      RUN_IMAGE="$BAKED_IMAGE"; NEED_DEPS=0
+    elif [ -f "$BAKED_TAR" ]; then
+      echo "[provision] loading baked image from $BAKED_TAR (local, no internet)..."
+      if "$DOCKER" load -i "$BAKED_TAR" >/dev/null 2>&1 && "$DOCKER" image inspect "$BAKED_IMAGE" >/dev/null 2>&1; then
+        RUN_IMAGE="$BAKED_IMAGE"; NEED_DEPS=0; echo "[provision] baked image loaded"
+      else
+        echo "[provision] WARNING: docker load failed; falling back to base-image + pip"
+      fi
+    fi
+  fi
+  if ! "$DOCKER" image inspect "$RUN_IMAGE" >/dev/null 2>&1; then
+    echo "[provision] pulling $RUN_IMAGE (this can take a while)..."; "$DOCKER" pull "$RUN_IMAGE"
+  fi
+
+  echo "[provision] (re)starting container $CONTAINER from $RUN_IMAGE"
+  "$DOCKER" rm -f "$CONTAINER" >/dev/null 2>&1 || true
+  "$DOCKER" run -d --name "$CONTAINER" \
+    --network=host --ipc=host --shm-size=64g \
+    --device=/dev/kfd --device=/dev/dri --group-add video \
+    `# [CLUSTER-SPECIFIC] AMD ROCm device passthrough; NVIDIA uses --gpus all / nvidia runtime` \
+    --cap-add=SYS_PTRACE --cap-add=CAP_SYS_ADMIN --cap-add=IPC_LOCK \
+    --ulimit memlock=-1:-1 --ulimit stack=67108864:67108864 \
+    `# memlock=-1 is REQUIRED for RDMA QP memory registration — do not drop` \
+    --security-opt seccomp=unconfined --privileged \
+    -v /home/chcai:/home/chcai \
+    -v /apps/chcai:/apps/chcai \
+    `# [CLUSTER-SPECIFIC] shared-NFS bind mounts: repo + scratch (overlay/logs/data)` \
+    -w "$REPO" \
+    "$RUN_IMAGE" sleep infinity
+
+  # --- RDMA userspace overlay for in-container RCCL (bnxt_re) -----------------
+  # The image (rocm/primus, rdma-core 50/libbnxt_re-rdmav34) ships an OLDER RDMA
+  # userspace than the host kernel bnxt_re driver. The stock v34 provider faults
+  # RCCL's deep-queue create_qp (max_send_wr=256) against the newer kernel uapi
+  # -> "ibv_create_qp ... Bad address". Fix: stage the host's matched rdma-core
+  # (libibverbs v61 + libbnxt_re-rdmav59 + libnl) on NFS so the worker phase makes
+  # RCCL load it via LD_PRELOAD + LD_LIBRARY_PATH. The UNVERSIONED libibverbs.so
+  # symlink is essential (import torch pulls the unversioned soname; without it
+  # the lookup falls through to the container v34 lib and the fix regresses).
+  if [ "${FORCE_OVERLAY:-0}" != "1" ] && ls "$OVERLAY/lib/libibverbs/"libbnxt_re-rdmav*.so >/dev/null 2>&1 && [ -L "$OVERLAY/lib/libibverbs.so" ]; then
+    echo "[provision] host RDMA overlay already staged at $OVERLAY (shared NFS) — skipping"
+  else
+    echo "[provision] staging host RDMA userspace overlay -> $OVERLAY"
+    rm -rf "${OVERLAY}.tmp" 2>/dev/null
+    mkdir -p "${OVERLAY}.tmp/lib/libibverbs" "${OVERLAY}.tmp/libibverbs.d"
+    cp -L /usr/lib64/libibverbs.so.1 /usr/lib64/libnl-3.so.200 /usr/lib64/libnl-route-3.so.200 "${OVERLAY}.tmp/lib/" 2>/dev/null || true
+    ln -sf libibverbs.so.1 "${OVERLAY}.tmp/lib/libibverbs.so"
+    cp -L /usr/lib64/libibverbs/*.so "${OVERLAY}.tmp/lib/libibverbs/" 2>/dev/null || true
+    cp /etc/libibverbs.d/*.driver "${OVERLAY}.tmp/libibverbs.d/" 2>/dev/null || true
+    if ls "${OVERLAY}.tmp/lib/libibverbs/"libbnxt_re-rdmav*.so >/dev/null 2>&1; then
+      rm -rf "$OVERLAY" 2>/dev/null
+      mv "${OVERLAY}.tmp" "$OVERLAY" 2>/dev/null || { mkdir -p "$OVERLAY"; cp -a "${OVERLAY}.tmp/." "$OVERLAY/"; }
+      echo "[provision] host RDMA overlay staged: $(ls "$OVERLAY/lib/libibverbs" | wc -l) providers + libibverbs.so symlink"
+    else
+      echo "[provision] WARNING: host bnxt_re provider not found at /usr/lib64/libibverbs — multi-node RDMA will fail 'Bad address'; use NCCL_NET_TRANSPORT=socket"
+    fi
+  fi
+
+  if [ "$NEED_DEPS" = "0" ]; then
+    echo "[provision] baked image — deps preinstalled; verifying imports only"
+    "$DOCKER" exec "$CONTAINER" bash -lc '
+python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print(\"imports OK,\", torch.__version__, torch.version.hip, torch.cuda.device_count(), \"gpus\")"
+' || echo "[provision] WARNING: baked-image import smoke failed"
+  else
+    echo "[provision] installing recipe deps (base image, slow path)"
+    # Install misc deps FIRST, then pin the rocm torch stack + fbgemm + torchrec
+    # LAST with --no-deps so nothing pulls a CUDA torch over the rocm build.
+    "$DOCKER" exec "$CONTAINER" bash -lc '
+set -e
+echo "=== native torch ==="; python -c "import torch;print(torch.__version__)" || true
+echo "=== misc python deps ==="
+pip install --no-cache-dir polars-u64-idx pyarrow pyyaml tqdm psutil numba xxhash gin-config \
+  absl-py pandas tensorboard torchmetrics tensordict pyre-extensions iopath typing-inspect 2>&1 | tail -3 || true
+echo "=== rocm torch stack (force, no-deps, LAST) ==="
+pip install --force-reinstall --no-deps --index-url '"$TORCH_IDX"' \
+  torch==2.12.0+rocm7.2 torchvision==0.27.0+rocm7.2 torchaudio==2.11.0+rocm7.2
+echo "=== fbgemm (local gfx950 wheel) ==="
+pip install --force-reinstall --no-deps '"$FBGEMM_WHL"'
+echo "=== torchrec v2026.06.01.00 (force, no-deps) ==="
+pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"
+echo "=== import smoke ==="
+python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print(\"imports OK,\", torch.__version__, torch.version.hip, torch.cuda.device_count(), \"gpus\")"
+'
+  fi
+
+  # --- one-time bake: snapshot the deps-installed container into a reusable image
+  # and save it to NFS so future nodes skip the download/pip path entirely.
+  if [ "${BAKE_IMAGE:-0}" = "1" ]; then
+    echo "[provision] baking: docker commit $CONTAINER -> $BAKED_IMAGE"
+    if "$DOCKER" commit "$CONTAINER" "$BAKED_IMAGE" >/dev/null; then
+      echo "[provision] saving $BAKED_IMAGE -> $BAKED_TAR (one-time, tens of GB)"
+      if "$DOCKER" save "$BAKED_IMAGE" -o "${BAKED_TAR}.tmp.$$" && mv -f "${BAKED_TAR}.tmp.$$" "$BAKED_TAR"; then
+        echo "[provision] bake done: $(ls -lh "$BAKED_TAR" 2>/dev/null | awk '{print $5}')"
+      else
+        echo "[provision] WARNING: docker save failed"; rm -f "${BAKED_TAR}.tmp.$$" 2>/dev/null
+      fi
+    else
+      echo "[provision] WARNING: docker commit failed"
+    fi
+  fi
+  echo "[provision] DONE"
+}
+
+# =============================================================================
+# PHASE: worker  (inside the container)
+# =============================================================================
+worker() {
+  cd "$REPO_ROOT"
+  LOG=${LOG:-/apps/chcai/yambda_5b_8gpu.log}
+  # Append (not truncate): under the streaming-e2e supervisor a run may relaunch
+  # many times into the SAME $LOG; the supervisor initializes it once at run start.
+  echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee -a "$LOG"
+
+  # polars-u64-idx (NOT stock polars) — yambda parquet's flat-explode overruns
+  # 32-bit row index. Reserved node has no outbound DNS, so install from a
+  # pre-staged tarball under /apps/chcai/. Override PIP_LOCAL_TGZ for other hosts.
+  PIP_LOCAL_TGZ=${PIP_LOCAL_TGZ:-/apps/chcai/pip_local_yambda.tgz}   # [CLUSTER-SPECIFIC] shared-NFS path
+  PIP_LOCAL_DIR=${PIP_LOCAL_DIR:-/tmp/pip_local}
+  if [ ! -f "$PIP_LOCAL_DIR/lib/python3.12/site-packages/polars/__init__.py" ]; then
+    rm -rf "$PIP_LOCAL_DIR"
+    mkdir -p "$PIP_LOCAL_DIR" && tar xzf "$PIP_LOCAL_TGZ" -C "$(dirname "$PIP_LOCAL_DIR")" 2>&1 | tail -3 | tee -a "$LOG"
+  fi
+
+  export PYTHONPATH="$PIP_LOCAL_DIR/lib/python3.12/site-packages:$REPO_ROOT:${PYTHONPATH:-}"
+  export HOME=${HOME:-/tmp}
+  echo "[$(date)] PYTHONPATH=$PYTHONPATH" | tee -a "$LOG"
+  python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print('imports OK,', torch.__version__, torch.cuda.device_count(),'gpus')" 2>&1 | tee -a "$LOG"
+
+  export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
+  export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
+
+  # --- distributed topology ---------------------------------------------------
+  GPUS_PER_NODE=$(python -c "import torch; print(torch.cuda.device_count())")
+  # Multi-node when launched one-task-per-node under SLURM (SLURM_NNODES>1);
+  # otherwise fall through to legacy single-node defaults (localhost, node_rank 0).
+  if [ "${SLURM_NNODES:-1}" -gt 1 ] && [ -n "${SLURM_JOB_NODELIST:-}" ]; then
+    NNODES=${SLURM_NNODES}
+    NODE_RANK=${SLURM_NODEID:-${SLURM_PROCID:-0}}
+    # PREFER a MASTER_ADDR/PORT forwarded from the orchestrate phase (resolved on
+    # the host, which has scontrol); the container image carries no SLURM client.
+    if [ -z "${MASTER_ADDR:-}" ]; then
+      MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
+      MASTER_ADDR=${MASTER_ADDR:-localhost}
+    fi
+    MASTER_PORT=${MASTER_PORT:-$(( 20000 + ${SLURM_JOB_ID:-0} % 20000 ))}
+  else
+    NNODES=${NNODES:-1}
+    NODE_RANK=${NODE_RANK:-0}
+    MASTER_ADDR=${MASTER_ADDR:-localhost}
+    MASTER_PORT=${MASTER_PORT:-}     # empty => train_ranker picks a free port
+  fi
+  export NNODES NODE_RANK GPUS_PER_NODE MASTER_ADDR MASTER_PORT
+  export WORLD_SIZE=$(( NNODES * GPUS_PER_NODE ))
+  echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node=$GPUS_PER_NODE world_size=$WORLD_SIZE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}" | tee -a "$LOG"
+
+  # RCCL/NCCL cross-node knobs (multi-node only; single-node leaves auto-detect).
+  # The container is --network=host so RCCL sees ALL host interfaces; split the
+  # two planes explicitly: TCP bootstrap over the routable fenic0, RDMA data over
+  # the 8 Broadcom bnxt_re RoCE HCAs (the per-GPU benic* 192.168.x/31 links are
+  # NOT node-routable for TCP — auto-detect there hangs init).
+  if [ "$NNODES" -gt 1 ]; then
+    # [CLUSTER-SPECIFIC] routable host NIC for TCP bootstrap (find via `ip -br addr`).
+    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
+    NCCL_NET_TRANSPORT=${NCCL_NET_TRANSPORT:-ib}
+    if [ "$NCCL_NET_TRANSPORT" = "socket" ]; then
+      export NCCL_IB_DISABLE=1
+      echo "[$(date)] NCCL: IB disabled — allreduce over TCP (fenic0). Functional, not RDMA-fast." | tee -a "$LOG"
+    else
+      # bnxt_re userspace provider ABI overlay (REQUIRED for RCCL). The stock v34
+      # provider faults RCCL's create_qp (256 WRs) against the host kernel uapi
+      # ("Bad address"); the host v61/v59 set staged by the provision phase works.
+      # The libibverbs.so (UNVERSIONED) symlink + LD_PRELOAD are both required so
+      # the torch process maps ONLY the host lib (see provision phase comment).
+      if [ -e "$OVERLAY/lib/libibverbs.so.1" ]; then
+        [ -e "$OVERLAY/lib/libibverbs.so" ] || ln -sf libibverbs.so.1 "$OVERLAY/lib/libibverbs.so" 2>/dev/null || true
+        export LD_LIBRARY_PATH="$OVERLAY/lib:$OVERLAY/lib/libibverbs:${LD_LIBRARY_PATH:-}"
+        export LD_PRELOAD="$OVERLAY/lib/libibverbs.so.1${LD_PRELOAD:+:$LD_PRELOAD}"
+        echo "[$(date)] NCCL: bnxt_re provider overlay -> $OVERLAY (host rdma-core v61/v59; symlink+LD_PRELOAD so RCCL binds the host lib for QP creation)" | tee -a "$LOG"
+      else
+        echo "[$(date)] WARNING: RDMA overlay $OVERLAY missing — RCCL QP creation will fail 'Bad address' on stock v34 provider; set RDMA_OVERLAY or use NCCL_NET_TRANSPORT=socket" | tee -a "$LOG"
+      fi
+      # MINIMAL bnxt_re set PROVEN on these meta64 cv350 nodes (cmcknigh RCCL
+      # benchmarks + confirmed e2e here). NCCL_IB_TC=104 (RoCE lossless PFC class)
+      # is required; do NOT add the ionic-AINIC QPS/ECE/DMABUF block.
+      # [CLUSTER-SPECIFIC] RDMA HCA names (`ibv_devices`); other vendors => mlx5_*/ionic_*
+      export NCCL_IB_HCA=${NCCL_IB_HCA:-bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7}
+      export NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX:-3}    # [CLUSTER-SPECIFIC] RoCEv2 IPv4 GID idx (`show_gids`)
+      export NCCL_IB_TC=${NCCL_IB_TC:-104}                # [CLUSTER-SPECIFIC] RoCE lossless/PFC traffic class
+      export NCCL_IB_TIMEOUT=${NCCL_IB_TIMEOUT:-14}
+      export NCCL_IGNORE_CPU_AFFINITY=${NCCL_IGNORE_CPU_AFFINITY:-1}
+      export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
+      # GPU-Direct RDMA needs DMABUF/peermem (neither in-container here) — leave
+      # GDR off so RCCL stages through host memory (still real RDMA over bnxt_re).
+      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-0}
+      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
+    fi
+  fi
+  export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
+  export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-}
+  export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
+
+  # --- GPU clock sanity guard -------------------------------------------------
+  # A leftover perf_determinism cap (half clock) silently slows every kernel ~1.9x.
+  # Log the perf level + a live sclk sample and try to restore boost (non-fatal).
+  if command -v rocm-smi >/dev/null 2>&1; then
+    echo "[$(date)] GPU perf-level check:" | tee -a "$LOG"
+    rocm-smi --showperflevel 2>/dev/null | grep -iE "GPU\[[0-9]+\]" | tee -a "$LOG" || true
+    if rocm-smi --showperflevel 2>/dev/null | grep -iqE "Performance Level: *(perf_determinism|manual|low)"; then
+      echo "[$(date)] WARNING: GPUs not in 'auto' perf level — attempting --setperflevel auto" | tee -a "$LOG"
+      rocm-smi --setperflevel auto 2>/dev/null | grep -iE "set to auto" | tee -a "$LOG" \
+        || echo "[$(date)] WARNING: could not set perf level (no permission?). Run 'rocm-smi --setperflevel auto' on the HOST before benchmarking — clocks may be capped." | tee -a "$LOG"
+    fi
+    echo "[$(date)] sclk sample (GPU0):$(rocm-smi -d 0 --showclocks 2>/dev/null | grep -i 'sclk clock level' | sed -E 's/.*sclk clock level//')" | tee -a "$LOG" || true
+  fi
+
+  echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
+  python -m generative_recommenders.dlrm_v3.train.train_ranker \
+      --dataset yambda-5b --mode "${MODE:-streaming-train-eval}" 2>&1 | tee -a "$LOG"
+}
+
+# ---- dispatch ---------------------------------------------------------------
+case "$PHASE" in
+  orchestrate) orchestrate ;;
+  provision)   provision ;;
+  worker)      worker ;;
+  *) echo "launch_slurm.sh: unknown LAUNCH_SLURM_PHASE='$PHASE'" >&2; exit 2 ;;
+esac
diff --git a/recommendation_v4/scripts/launch_smoke_8gpu.sh b/recommendation_v4/scripts/launch_smoke_8gpu.sh
deleted file mode 100755
index eaa0aa19b..000000000
--- a/recommendation_v4/scripts/launch_smoke_8gpu.sh
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/bin/bash
-# 8-GPU yambda-5b run. Resolves the package root from this script's location,
-# so it works from any container mount point. Dataset path is in the gin file
-# (generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin).
-set -uo pipefail
-
-REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
-cd "$REPO_ROOT"
-
-LOG=${LOG:-/apps/chcai/yambda_5b_8gpu.log}
-# Append (not truncate): under the streaming-e2e supervisor a run may relaunch
-# many times into the SAME $LOG, and we want the full NE/AUC history preserved
-# across attempts. The supervisor initializes ($LOG) once at run start. For a
-# standalone invocation, set a fresh $LOG (or truncate it yourself) per run.
-echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee -a "$LOG"
-
-# polars-u64-idx (NOT stock polars) — yambda parquet's flat-explode overruns
-# 32-bit row index. Reserved node has no outbound DNS, so we install from a
-# pre-staged tarball under /apps/chcai/. Override PIP_LOCAL_TGZ for other hosts.
-PIP_LOCAL_TGZ=${PIP_LOCAL_TGZ:-/apps/chcai/pip_local_yambda.tgz}
-PIP_LOCAL_DIR=${PIP_LOCAL_DIR:-/tmp/pip_local}
-if [ ! -f "$PIP_LOCAL_DIR/lib/python3.12/site-packages/polars/__init__.py" ]; then
-  rm -rf "$PIP_LOCAL_DIR"
-  mkdir -p "$PIP_LOCAL_DIR" && tar xzf "$PIP_LOCAL_TGZ" -C "$(dirname "$PIP_LOCAL_DIR")" 2>&1 | tail -3 | tee -a "$LOG"
-fi
-
-export PYTHONPATH="$PIP_LOCAL_DIR/lib/python3.12/site-packages:$REPO_ROOT:${PYTHONPATH:-}"
-export HOME=${HOME:-/tmp}
-echo "[$(date)] PYTHONPATH=$PYTHONPATH" | tee -a "$LOG"
-python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print('imports OK,', torch.__version__, torch.cuda.device_count(),'gpus')" 2>&1 | tee -a "$LOG"
-
-export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
-export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
-export WORLD_SIZE=$(python -c "import torch; print(torch.cuda.device_count())")
-# HSTU attention backend is selected in the gin (make_model.hammer_kernel),
-# defaulting to TRITON — fused/flash-style, so it avoids the dense [B,H,N,N]
-# score tensor the PYTORCH path materializes (~32 GiB at N=2048/bs=1024) and is
-# both faster and far lighter on HBM. Only export HSTU_HAMMER_KERNEL=PYTORCH
-# before launch for a one-off fallback (e.g. a ROCm Triton PassManager error).
-export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-}
-export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
-
-# --- GPU clock sanity guard ---------------------------------------------------
-# Leftover node state once pinned all 8 GPUs into `perf_determinism` at half
-# clock (1093 vs 2200 MHz max). That uniformly slowed every Triton kernel ~1.9x
-# and silently masked real perf changes for an entire debugging session. Always
-# log the perf level + a live sclk sample so a capped run is obvious from the
-# log, and try to restore boost. Fully non-fatal (rocm-smi may be absent or
-# lack permission inside the container — in that case reset from the host).
-if command -v rocm-smi >/dev/null 2>&1; then
-  echo "[$(date)] GPU perf-level check:" | tee -a "$LOG"
-  rocm-smi --showperflevel 2>/dev/null | grep -iE "GPU\[[0-9]+\]" | tee -a "$LOG" || true
-  if rocm-smi --showperflevel 2>/dev/null | grep -iqE "Performance Level: *(perf_determinism|manual|low)"; then
-    echo "[$(date)] WARNING: GPUs not in 'auto' perf level — attempting --setperflevel auto" | tee -a "$LOG"
-    rocm-smi --setperflevel auto 2>/dev/null | grep -iE "set to auto" | tee -a "$LOG" \
-      || echo "[$(date)] WARNING: could not set perf level (no permission?). Run 'rocm-smi --setperflevel auto' on the HOST before benchmarking — clocks may be capped." | tee -a "$LOG"
-  fi
-  echo "[$(date)] sclk sample (GPU0):$(rocm-smi -d 0 --showclocks 2>/dev/null | grep -i 'sclk clock level' | sed -E 's/.*sclk clock level//')" | tee -a "$LOG" || true
-fi
-# -----------------------------------------------------------------------------
-
-echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
-
-python -m generative_recommenders.dlrm_v3.train.train_ranker \
-    --dataset yambda-5b --mode "${MODE:-streaming-train-eval}" 2>&1 | tee -a "$LOG"
diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index 5009ecbce..f97a0e483 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -543,7 +543,7 @@ launch() {
         RUN_NAME=$RUN_NAME \
         TENSORBOARD_LOG_PATH=/apps/chcai/tb/$RUN_NAME/ \
         LOG=$LOG \
-        bash scripts/launch_smoke_8gpu.sh;
+        bash scripts/launch_slurm.sh;
         echo \"E2E_RUN_EXIT=\$? \$(date '+%F %T')\" >> $LOG
     "
 }
@@ -568,7 +568,7 @@ sup "failover: allow=$ALLOW_FAILOVER partition=$PARTITION reservation=${RESERVAT
 reap_failover_holds ""
 
 cexec "mkdir -p '$CKPT_PATH' '/apps/chcai/tb/$RUN_NAME'"
-# Initialize this run's metrics log ONCE. launch_smoke_8gpu.sh appends (tee -a),
+# Initialize this run's metrics log ONCE. launch_slurm.sh (worker) appends (tee -a),
 # so every relaunch attempt accumulates into this single file — the full-run
 # NE/AUC history survives crashes and node failover instead of being truncated
 # on each relaunch. (Starting the supervisor = starting a fresh run.) In ATTACH

From 15c4a00ba18dad6567bce6ac3eaf942a4e327193 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 10 Jun 2026 14:10:32 -0500
Subject: [PATCH 046/113] local dlrmv4 changes: docker setup, run scripts,
 walkthrough docs, smoke log path

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/Dockerfile                  |  80 +++
 recommendation_v4/Dockerfile.nvidia           |  92 +++
 .../docs/v4_vs_v2_and_hstu_walkthrough.md     | 534 ++++++++++++++++++
 recommendation_v4/scripts/run_docker.sh       |  59 ++
 4 files changed, 765 insertions(+)
 create mode 100644 recommendation_v4/Dockerfile
 create mode 100644 recommendation_v4/Dockerfile.nvidia
 create mode 100644 recommendation_v4/docs/v4_vs_v2_and_hstu_walkthrough.md
 create mode 100755 recommendation_v4/scripts/run_docker.sh

diff --git a/recommendation_v4/Dockerfile b/recommendation_v4/Dockerfile
new file mode 100644
index 000000000..112d605df
--- /dev/null
+++ b/recommendation_v4/Dockerfile
@@ -0,0 +1,80 @@
+# MI350X path — implements docs/training_recipe.md §"MI350X".
+
+FROM rocm/primus:v26.3
+
+ENV PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+
+WORKDIR /workspace/recommendation_v4
+
+# torch / torchvision / torchaudio — training_recipe.md:38-40.
+RUN pip install --upgrade --no-deps \
+        --index-url https://download.pytorch.org/whl/rocm7.2 \
+        torch==2.12.0+rocm7.2 \
+        torchvision==0.27.0+rocm7.2 \
+        torchaudio==2.11.0+rocm7.2
+
+# torchrec — training_recipe.md:43.
+RUN pip install --force-reinstall --no-deps \
+        "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"
+
+# fbgemm_gpu — training_recipe.md:42. Build from FBGEMM commit 10b77573 for
+# gfx950 against the replaced torch. ~30-60 min.
+RUN apt-get update && apt-get install -y --no-install-recommends git build-essential && \
+    rm -rf /var/lib/apt/lists/* && \
+    git clone --recursive https://github.com/pytorch/FBGEMM.git /tmp/FBGEMM && \
+    cd /tmp/FBGEMM && \
+    git checkout 10b775730212923f65f7b78f79b6a01d80cf3c29 && \
+    git submodule update --init --recursive && \
+    cd fbgemm_gpu && \
+    # Filter `fairscale` and the torch family from fbgemm's requirements.txt:
+    # fairscale pulls a CPU torch that would clobber the +rocm7.2 wheel installed
+    # above. fairscale is a distributed-training lib used by fbgemm tests, not
+    # by the build itself.
+    grep -v -E '^(fairscale|torch|torchvision|torchaudio)([<>=!]|$)' requirements.txt > /tmp/req.txt && \
+    pip install -r /tmp/req.txt && \
+    python setup.py -j 32 bdist_wheel \
+        --build-target=default \
+        --build-variant=rocm \
+        -DHIP_ROOT_DIR=/opt/rocm \
+        -DAMDGPU_TARGETS=gfx950 && \
+    pip install --force-reinstall --no-deps dist/fbgemm_gpu_nightly_rocm*.whl && \
+    cd / && rm -rf /tmp/FBGEMM
+
+# polars-u64-idx — training_recipe.md:44 (mandatory; yambda-5b > 4.29 B rows).
+# Remaining packages — training_recipe.md:156-159 ("Additional Python deps") plus
+# `datasets` + `huggingface_hub`, which the recipe does not list but
+# preprocess_public_data.py:278 imports to download yambda from HuggingFace.
+RUN pip install \
+        polars-u64-idx==1.33.1 \
+        gin-config \
+        absl-py \
+        datasets \
+        huggingface_hub \
+        pyre-extensions \
+        iopath \
+        typing-inspect \
+        psutil \
+        tqdm \
+        pyyaml \
+        lightning-utilities && \
+    # torchmetrics and tensordict declare `torch` as a dep; without --no-deps
+    # pip pulls torch==2.12.0+cu130 from PyPI which clobbers the +rocm7.2 wheel
+    # we installed above (libtorch_hip.so disappears, fbgemm_gpu fails to load).
+    pip install --no-deps \
+        torchmetrics==1.0.3 \
+        tensordict
+
+# Smoke-test the 6 imports the launch script checks at
+# scripts/launch_smoke_8gpu.sh:26.
+RUN python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; \
+print('torch', torch.__version__, '| hip', getattr(torch.version, 'hip', None))"
+
+COPY . /workspace/recommendation_v4
+
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+    HSTU_HAMMER_KERNEL=TRITON \
+    DLRM_DATA_PATH=/data/mlperf_dlrm_v4
+
+CMD ["bash"]
diff --git a/recommendation_v4/Dockerfile.nvidia b/recommendation_v4/Dockerfile.nvidia
new file mode 100644
index 000000000..388ab8e5e
--- /dev/null
+++ b/recommendation_v4/Dockerfile.nvidia
@@ -0,0 +1,92 @@
+# B200 path — implements docs/training_recipe.md §"B200".
+
+FROM nvcr.io/nvidia/pytorch:26.04-py3
+
+ENV PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+
+WORKDIR /workspace/recommendation_v4
+
+# torch / triton — training_recipe.md:137-138, 149-150. Native to the image
+# and must NOT be reinstalled (CUPTI / sm_100 support depends on it).
+
+# torchrec — training_recipe.md:152. Nightly cu130 wheel, --no-deps.
+RUN pip install --force-reinstall --no-deps \
+        --index-url https://download.pytorch.org/whl/nightly/cu130 \
+        torchrec==1.7.0.dev20260601+cu130
+
+# fbgemm_gpu — training_recipe.md:151. Build from FBGEMM commit 10b77573 for
+# sm_100 against the image's native torch. ~55 min (sm_100 TBE-forward via ptxas).
+# NOTE: --nvml_lib_path diverges from training_recipe.md:151. The recipe points
+# at /usr/lib/x86_64-linux-gnu/libnvidia-ml.so, which is mounted only at
+# `docker run --gpus all` time. During `docker build` no GPU runtime is
+# attached, so we link against the NVML stub that ships inside the CUDA SDK in
+# the NGC image; the real driver-side libnvidia-ml.so is used at runtime.
+RUN apt-get update && apt-get install -y --no-install-recommends git build-essential && \
+    rm -rf /var/lib/apt/lists/* && \
+    git clone --recursive https://github.com/pytorch/FBGEMM.git /tmp/FBGEMM && \
+    cd /tmp/FBGEMM && \
+    git checkout 10b775730212923f65f7b78f79b6a01d80cf3c29 && \
+    git submodule update --init --recursive && \
+    cd fbgemm_gpu && \
+    # Filter `fairscale` and the torch family from fbgemm's requirements.txt:
+    # fairscale pulls a CPU torch that would clobber the image's native torch.
+    # fairscale is a distributed-training lib used by fbgemm tests, not by the
+    # build itself.
+    grep -v -E '^(fairscale|torch|torchvision|torchaudio)([<>=!]|$)' requirements.txt > /tmp/req.txt && \
+    pip install -r /tmp/req.txt && \
+    TORCH_CUDA_ARCH_LIST=10.0 python setup.py bdist_wheel \
+        --build-target default \
+        --build-variant cuda \
+        --package_channel nightly \
+        --nvml_lib_path /usr/local/cuda/lib64/stubs/libnvidia-ml.so && \
+    pip install --force-reinstall --no-deps dist/fbgemm_gpu_nightly-*.whl && \
+    cd / && rm -rf /tmp/FBGEMM
+
+# polars-u64-idx — training_recipe.md:153 (mandatory; yambda-5b > 4.29 B rows).
+# Remaining packages — training_recipe.md:156-159 ("Additional Python deps") plus
+# `datasets` + `huggingface_hub`, which the recipe does not list but
+# preprocess_public_data.py:278 imports to download yambda from HuggingFace.
+RUN pip install \
+        polars-u64-idx==1.33.1 \
+        gin-config \
+        absl-py \
+        datasets \
+        huggingface_hub \
+        pyre-extensions \
+        iopath \
+        typing-inspect \
+        psutil \
+        tqdm \
+        pyyaml \
+        lightning-utilities && \
+    # torchmetrics and tensordict declare `torch` as a dep; without --no-deps
+    # pip resolves and reinstalls torch, clobbering the image's native NGC
+    # torch (which would break CUPTI + sm_100 support per training_recipe.md:199).
+    pip install --no-deps \
+        torchmetrics==1.0.3 \
+        tensordict
+
+# Smoke-test that packages are installed at the right versions. Cannot dlopen
+# fbgemm_gpu / torchrec here because their SONAME deps (libnvidia-ml.so.1, etc.)
+# only resolve when the container runs with `--gpus all` — which docker build
+# can't do. The real 6-import check at scripts/launch_smoke_8gpu.sh:26 runs at
+# `docker run` time when the driver is mounted in.
+RUN python -c "import torch, polars, xxhash, gin; \
+print('torch', torch.__version__, '| cuda', getattr(torch.version, 'cuda', None)); \
+import importlib.metadata as m; \
+print('fbgemm_gpu installed:', m.version('fbgemm_gpu_nightly')); \
+print('torchrec installed:  ', m.version('torchrec'))"
+
+COPY . /workspace/recommendation_v4
+
+# B200 runtime env — training_recipe.md:184-195.
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+    HSTU_HAMMER_KERNEL=TRITON \
+    TORCH_CUDA_ARCH_LIST=10.0 \
+    HBM_CAP_GB=150 \
+    TRITON_CACHE_DIR=/workspace/recommendation_v4/.triton_cache \
+    DLRM_DATA_PATH=/data/mlperf_dlrm_v4
+
+CMD ["bash"]
diff --git a/recommendation_v4/docs/v4_vs_v2_and_hstu_walkthrough.md b/recommendation_v4/docs/v4_vs_v2_and_hstu_walkthrough.md
new file mode 100644
index 000000000..4ef46c069
--- /dev/null
+++ b/recommendation_v4/docs/v4_vs_v2_and_hstu_walkthrough.md
@@ -0,0 +1,534 @@
+# recommendation_v4 (HSTU + Yambda-5b) — reference
+
+A walkthrough of what the proposed `recommendation_v4` MLPerf-training benchmark
+is, how it differs from `recommendation_v2`, what the HSTU model is composed of,
+and how to download the dataset and run training as-is.
+
+All claims below are grounded in code/config paths inside this tree. Every
+numeric constant cites a `file:line` source. Where doc and source disagree, the
+source wins and the discrepancy is called out.
+
+---
+
+## 0. Sources of truth
+
+The following files were read to assemble this document. If you change any of
+them, audit this doc against the change.
+
+- `training/recommendation_v2/torchrec_dlrm/README.MD` (v2 reference)
+- `training/recommendation_v4/README.MD` (v4 fork overview)
+- `training/recommendation_v4/docs/training_recipe.md` (v4 stacks/configs)
+- `training/recommendation_v4/generative_recommenders/modules/stu.py` (HSTU layer)
+- `training/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py` (top-level `DlrmHSTU` + config dataclass)
+- `training/recommendation_v4/generative_recommenders/modules/hstu_transducer.py` (preprocessor → STU stack → postprocessor)
+- `training/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_attention.py` (reference HSTU attention math in plain PyTorch)
+- `training/recommendation_v4/generative_recommenders/ops/hstu_attention.py` (kernel dispatcher: PYTORCH / TRITON / TRITON_CC)
+- `training/recommendation_v4/generative_recommenders/dlrm_v3/configs.py` (per-dataset HSTU config + embedding tables)
+- `training/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin` (run config)
+- `training/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py` (Yambda HuggingFace downloader/preprocessor)
+- `training/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py` (dataset feeding HSTU)
+- `training/recommendation_v4/generative_recommenders/dlrm_v3/utils.py` (metrics logger / `auc_threshold` consumer)
+- `training/recommendation_v4/scripts/launch_smoke_8gpu.sh` (run wrapper)
+
+---
+
+## 1. `recommendation_v4` vs `recommendation_v2`
+
+v4 is **not** an evolution of v2 — it replaces a tabular CTR benchmark
+(DLRMv2 + DCN on Criteo 1 TB) with a **sequential generative-recommender
+benchmark** (HSTU on Yandex Yambda-5b). Codebase, dataset, task, loss
+labeling, hyperparameters, and software stack are all different. They share
+basically nothing except the "recommendation" label.
+
+### 1.1 Upstream codebase / repo origin
+
+| | v2 | v4 |
+|---|---|---|
+| Upstream repo | `pytorch/torchrec` examples (DLRM) | fork of `meta-recsys/generative-recommenders` (`README.MD:3`) |
+| Layout | single dir: `torchrec_dlrm/` with `dlrm_main.py` | full repo tree: `generative_recommenders/`, `configs/`, `scripts/`, `main.py`, `setup.py`, gin-driven |
+| Config style | argparse CLI flags | gin-config files under `generative_recommenders/dlrm_v3/train/gin/` (e.g. `yambda_5b.gin`) |
+
+### 1.2 Model architecture
+
+| | v2 | v4 |
+|---|---|---|
+| Model | **DLRM v2** — dense MLP + sparse embeddings + feature interaction (paper: Naumov et al. 1906.00091) | **HSTU** — Hierarchical Sequential Transducer Units (ICML'24 *Actions Speak Louder than Words*) (`README.MD:3`) |
+| Interaction arch | DCN v2: `--interaction_type=dcn --dcn_num_layers=3 --dcn_low_rank_dim=512` (`recommendation_v2/torchrec_dlrm/README.MD:167-169`) | Transformer-style sequential self-attention over a User Interaction History (UIH) of length 2048, jagged-attention TRITON kernel (`README.MD:114, 132`; `training_recipe.md:57, 71`) |
+| Embedding dim | 128 (`recommendation_v2/torchrec_dlrm/README.MD:157`) | 512 (`dlrm_v3/configs.py:33, 353`) |
+| Pipeline | TorchRec model-parallel embeddings + data-parallel MLP, overlapped (`recommendation_v2/torchrec_dlrm/README.MD:3`) | TorchRec sharded embeddings + HSTU ranker; per-GPU HBM cap 260 GiB MI350X / 150 GiB B200 (`training_recipe.md:59, 176`) |
+
+### 1.3 Dataset
+
+| | v2 | v4 |
+|---|---|---|
+| Dataset | **Criteo 1 TB click logs** → multi-hot preprocessed variant (~3.8 TB materialized) (`recommendation_v2/torchrec_dlrm/README.MD:142-146`) | **Yambda-5b** (Yandex music, HuggingFace `yandex/yambda`, 5b variant) (`README.MD:3, 28`) |
+| Domain | CTR prediction on tabular ads features (26 categorical + 13 dense) | Sequential music-recommendation events (listen / like / skip / dislike / unlike / undislike) per-user timelines |
+| Size | `TOTAL_TRAINING_SAMPLES=4,195,197,692` rows (`recommendation_v2/torchrec_dlrm/README.MD:153`) | 4.76 B events, 1.00 M users, 9.39 M items; 3.23 B usable training anchors (`README.MD:62-69`) |
+| Storage layout | numpy contiguous shuffled `.npy` (or preprocessed multi-hot bin) | parquet: `train_sessions.parquet` 47 GB, `test_events.parquet` 152 MB, etc. (`README.MD:40-52`) |
+| Preprocessing | `process_Criteo_1TB_Click_Logs_dataset.sh` — 700 GB RAM, 1–2 days, then `materialize_synthetic_multihot_dataset.py` | `generative_recommenders.dlrm_v3.preprocess_public_data --dataset yambda-5b` — ~53 min end-to-end for 5b (`README.MD:54`) |
+| Embedding cardinalities | `num_embeddings_per_feature` 26-vec, top entries 40 M (`recommendation_v2/torchrec_dlrm/README.MD:161`) | item 9.39 M, artist 1.29 M, album 3.37 M, uid 1.00 M, + 7 cross-features up to 100 M (`dlrm_v3/configs.py:40-48, 686-722`) |
+| Required pre-processor pkg | none unusual | **`polars-u64-idx`** because yambda-5b exceeds polars' 32-bit row index (`training_recipe.md:44, 102-103`) |
+
+### 1.4 Task formulation / supervision
+
+| | v2 | v4 |
+|---|---|---|
+| Task | binary CTR (click / no-click) | sequential next-action ranking: given UIH, predict whether the candidate LISTEN event will be a "listen_plus" (`played_ratio ≥ 50%`) (`README.MD:103`) |
+| Label | Criteo click label | `action_weight` bitmask on the candidate; supervision masked to `(supervision_bitmask & task_weight) > 0` with `task_weight = 1` (LP bit) → only `listen_plus` candidates supervise (`README.MD:103`) |
+| Loss | BCE | BCE on `listen_plus` task |
+
+### 1.5 Target metric
+
+| | v2 | v4 |
+|---|---|---|
+| Target | **AUROC ≥ 0.80275** within 1 epoch on Criteo (`recommendation_v2/torchrec_dlrm/README.MD:173`) | `MetricsLogger.auc_threshold = 0.80275` (`yambda_5b.gin:107`). Same numeric value as v2 — likely inherited from the upstream DLRM-DCNv2 reporting convention rather than independently chosen for HSTU. Consumed in `dlrm_v3/utils.py:587-608` to log `time_to_auc_0.80275_sec` as soon as the `listen_plus` task's AUC crosses the threshold. Confirm with the proposing team whether this is the intended final benchmark target or a placeholder. |
+
+### 1.6 Training hyperparameters
+
+| | v2 (MLPerf example, 8 GPU) | v4 (`yambda_5b.gin`, 8 GPU) |
+|---|---|---|
+| Global batch | 65,536 (`recommendation_v2/torchrec_dlrm/README.MD:154`) | **8,192** (`batch_size=1024 × world_size=8`) (`yambda_5b.gin:1, 44`). Note `docs/training_recipe.md:65, 182` shows `32 × 8 = 256` — that doc has drifted; the gin file is the launch source of truth. |
+| Epochs | 1 (`recommendation_v2/torchrec_dlrm/README.MD:163`) | 1 (`yambda_5b.gin:81`) |
+| Dense optimizer | Adagrad, lr 0.005 (`recommendation_v2/torchrec_dlrm/README.MD:170-171`) | **Adam**, lr 1e-3, betas (0.95, 0.999), eps 1e-8 (`yambda_5b.gin:19-24`) |
+| Sparse optimizer | (Adagrad on embeddings via TorchRec) | **RowWiseAdagrad**, lr 1e-3, betas (0.95, 0.999), eps 1e-8 (`yambda_5b.gin:27-32`) |
+| Precision | fp32 (no bf16 flag in v2 example) | **bf16** mixed precision, gated on the TRITON HSTU kernel (`yambda_5b.gin:8`; `training_recipe.md:58, 109-111`) |
+| Sequence length | n/a (non-sequential model) | `history_length=2039`, `max_seq_len=2048` (`yambda_5b.gin:74, 78`) |
+
+### 1.7 Software stack
+
+| | v2 | v4 (MI350X) | v4 (B200) |
+|---|---|---|---|
+| Container | none specified (bare AWS p4d, CUDA 11.0, NCCL 2.10.3) (`recommendation_v2/torchrec_dlrm/README.MD:37`) | `rocm/primus:v26.3` (`training_recipe.md:24`) | `nvcr.io/nvidia/pytorch:26.04-py3` (`training_recipe.md:132`) |
+| GPU target | A100 40 GB | **MI350X** (`gfx950`, ROCm 7.2.1, 288 GiB HBM3e) | **B200** (`sm_100`, ~183 GiB HBM) |
+| torch | TorchRec example era; CUDA 11.0 | `2.12.0+rocm7.2` (`training_recipe.md:38`) | `2.12.0a0` native NGC (CUDA 13.2) (`training_recipe.md:149`) |
+| triton | not central | `3.6.0` (image native; required for HSTU TRITON backend) (`training_recipe.md:41`) | `3.6.0` (`training_recipe.md:150`) |
+| fbgemm_gpu | TorchRec default | `fbgemm_gpu_nightly_rocm-2026.6.2` built from FBGEMM `10b77573` for `gfx950` (`training_recipe.md:42`) | same SHA, built for `sm_100` (`training_recipe.md:151`) |
+| torchrec | (whatever TorchRec was current) | `1.7.0a0+bf55480` (`v2026.06.01.00`) (`training_recipe.md:43`) | `1.7.0.dev20260601+cu130` (`training_recipe.md:152`) |
+| Launcher | `torchx … dist.ddp` | `scripts/launch_smoke_8gpu.sh` | `scripts/launch_smoke_8gpu.sh` |
+| Key kernel | TorchRec EmbeddingBag + DCN | **HSTU TRITON jagged-attention** (`HSTU_HAMMER_KERNEL=TRITON`) (`training_recipe.md:71`) | same (`training_recipe.md:188`) |
+
+---
+
+## 2. HSTU model walkthrough
+
+### 2.1 What HSTU is, in one paragraph
+
+**HSTU = Hierarchical Sequential Transducer Units**, from the Meta paper
+*Actions Speak Louder than Words* (ICML'24). It is a **decoder-only Transformer
+variant**, redesigned for *recommendation* sequences (very long, very ragged,
+heavy on categorical features). The block looks like a standard transformer
+block superficially — attention + MLP — but two things are different from
+GPT/SASRec attention:
+
+1. **Pointwise SiLU instead of softmax** in the attention non-linearity (no
+   log-sum-exp normalization).
+2. **Gated output**: an extra projected stream `U` multiplies the attention
+   output before the residual.
+
+Everything else (residual connections, layer-norm, multi-head, positional
+encoding, causal masking, KV-cache) is conventional transformer. The "S" in
+STU = "Sequential Transducer Unit" = one HSTU block.
+
+### 2.2 The composition: top-level model (DLRM-v3 / `DlrmHSTU`)
+
+The full thing in `dlrm_hstu.py` is a small pipeline. Top-down:
+
+```
+KeyedJaggedTensor of raw ids
+         │
+         ▼
+[1] TorchRec EmbeddingCollection           (≈150 G sparse params, sharded across GPUs)
+         │  emits per-feature jagged embedding lookups
+         ▼
+[2] ContextualPreprocessor                 (interleaves UIH + appends candidate, adds
+                                            positional / action / timestamp encodings)
+         │  output: jagged sequence of length L per user, dim = transducer_embedding_dim
+         ▼
+[3] HSTUTransducer  ── STUStack of N HSTULayers (the "HSTU" attention blocks)
+         │  output: contextualized per-position embedding
+         ▼
+[4] DefaultMultitaskModule                 (linear → BCE on listen_plus bit)
+         │
+         ▼
+Per-anchor logit  →  BCE loss
+```
+
+For yambda-5b the per-dataset overrides in `dlrm_v3/configs.py:78-90, 346-425`
+give:
+
+| component | value | source |
+|---|---|---|
+| embedding tables | `item_id` 9.39 M × 512, `artist_id` 1.29 M × 512, `album_id` 3.37 M × 512, `uid` 1.00 M × 512, + 7 cross-features (e.g. `user_x_artist` 100 M × 512) | `dlrm_v3/configs.py:686-722` |
+| embedding dim | 512 (`HSTU_EMBEDDING_DIM`) | `dlrm_v3/configs.py:33, 353` |
+| HSTU layers | **5** (`hstu_attn_num_layers=5`) | `dlrm_v3/configs.py:82` |
+| attention heads | 4 | `dlrm_v3/configs.py:79` |
+| Q/K dim per head | 128 | `dlrm_v3/configs.py:81` |
+| V/U (linear) dim per head | 128 | `dlrm_v3/configs.py:80` |
+| transducer embedding dim | 512 | `dlrm_v3/configs.py:85, 354` |
+| dropout | input 0.2, linear 0.1 | `dlrm_v3/configs.py:87-88` |
+| max attention budget (model) | 8192 (yambda default; gin further caps to 2048 via `get_hstu_configs.max_seq_len = 2048` in `yambda_5b.gin:78`) | `dlrm_v3/configs.py:355` |
+| task | `listen_plus`, BINARY_CLASSIFICATION, BCE | `dlrm_v3/configs.py:419-424` |
+
+**Sparse-side parameter count, by table** (just the explicit ones; cross-features
+add 282 M more rows × 512 dim ≈ 144 G params, which dominate):
+
+```
+item_id   :  9_390_624 × 512 ≈   4.81 B
+artist_id :  1_293_395 × 512 ≈     662 M
+album_id  :  3_367_692 × 512 ≈    1.72 B
+uid       :  1_000_001 × 512 ≈     512 M
+crosses   :       ~282 M × 512 ≈ 144.4 B   ← dominant
+```
+
+This is overwhelmingly an embedding-bound model — the dense HSTU stack (5
+layers × ~1 M parameters each) is a rounding error next to the embedding
+tables, which is why `make_optimizer_and_shard.hbm_cap_gb = 260` and why
+TorchRec sharding is central.
+
+### 2.3 Inside one STU (HSTU) layer
+
+From `modules/stu.py:182-246, 292-355`. A single STU layer holds **four**
+weight matrices, not the usual two (QKV + out):
+
+```
+_uvqk_weight   : (E, (hidden_dim·2 + attn_dim·2) · num_heads)
+_uvqk_beta     : (...,)                bias for the above
+_input_norm    : LayerNorm(E)
+_output_weight : (hidden_dim · num_heads · 3, E)
+_output_norm   : LayerNorm
+```
+
+Forward pass on input `x` of shape `[L, E]` (jagged):
+
+#### 2.3.1 Fused U/V/Q/K projection
+
+```
+normed = LayerNorm(x)
+[U | V | Q | K] = normed @ _uvqk_weight + _uvqk_beta     # one GEMM, then split
+                                                          # U, V ∈ R^{H·hidden_dim}
+                                                          # Q, K ∈ R^{H·attn_dim}
+```
+
+Compared to a regular transformer, you get an **extra projected stream `U`**.
+`U` will gate the attention output later.
+
+#### 2.3.2 HSTU attention (the core difference vs softmax attention)
+
+Reference math, exactly as written in
+`ops/pytorch/pt_hstu_attention.py:151, 167, 179, 182`:
+
+```python
+qk_attn = einsum("bhxa,bhya->bhxy", Q, K) * alpha   # alpha = 1 / sqrt(attn_dim)
+qk_attn = F.silu(qk_attn) / max_seq_len             # ← pointwise SiLU, scalar divide
+qk_attn = qk_attn * valid_attn_mask                 # mask (see 2.3.3)
+attn    = einsum("bhxd,bhdv->bhxv", qk_attn, V)
+```
+
+Contrast with a vanilla transformer:
+
+```python
+qk   = (Q @ K.T) / sqrt(d)
+qk   = softmax(qk + mask, dim=-1)                   # ← softmax normalises rows
+attn = qk @ V
+```
+
+Two consequences of dropping softmax:
+
+- **No row-wise normalization** → the per-key contribution is decoupled across
+  positions. The paper argues this is *better* for recommendation, because a
+  5-year-old "like" event shouldn't have its weight diluted just because the
+  user has a longer history (which softmax would do).
+- **Numerically more delicate**: the recipe warns *"`pt_hstu_attention`'s QK
+  einsum backward overflows in bf16 at N > 1k and produces NaN at step 1; bf16
+  is only safe with TRITON"* (`docs/training_recipe.md:109-111`). The TRITON
+  kernel handles bf16 accumulation carefully; the reference PyTorch path
+  doesn't.
+
+#### 2.3.3 Custom attention mask (`_get_valid_attn_mask`, `pt_hstu_attention.py:32-84`)
+
+HSTU supports four mask-combination knobs simultaneously:
+
+- **causal**: lower triangle only (standard).
+- **target-aware** (`num_targets`): the last `num_targets` positions are the
+  candidate targets; their "row index" is clamped so all targets see the same
+  prefix (the user's UIH) but cannot peek at each other.
+- **max_attn_len** (sliding window): each position attends only to the previous
+  `max_attn_len` events — bounds the receptive field for very long histories.
+- **contextual_seq_len**: the first `contextual_seq_len` positions are
+  *contextual* tokens (uid + cross-features). They are allowed to attend to
+  everything (and everything attends back to them), regardless of causal order.
+  This is how `uid` / `user_x_artist` etc. get full visibility despite living
+  at the head of the sequence.
+
+#### 2.3.4 Output: gated MLP
+
+From `stu.py:336-354` → `hstu_compute_output`:
+
+```
+y = SwishLayerNorm(attn)             # SiLU(x · sigmoid(x)) then LN
+y = concat([y, U]) @ _output_weight  # gating with the U stream
+y = y · x + dropout                  # residual back to original x
+```
+
+The `U · y` gating is the second non-standard piece. It is reminiscent of
+GLU / SwiGLU but applied to the *attention output*, not just an MLP.
+
+#### 2.3.5 Stack
+
+`STUStack` (`stu.py:426`) is just `nn.ModuleList` of N `STULayer`s applied
+sequentially with the same jagged-tensor convention. No cross-layer fanciness.
+
+### 2.4 "Transformer-style sequential attention over a UIH" — what the inputs actually look like
+
+UIH = **User Interaction History**. For yambda, the input to one training
+sample is one **anchor LISTEN event** plus that user's history. From
+`README.MD:88-101` and `dlrm_v3/configs.py:399-418`:
+
+```
+sequence position:    0 .. 7   |   8 .. (L-2)                |  L-1
+                      ─────────┼─────────────────────────────┼──────────
+content:              contextual│  UIH (interleaved 3 pools)  │  candidate
+                                │                              │
+features per position:           uid, 7 cross-features (length-1 each)
+                                  item_id, artist_id, album_id,
+                                  action_weight (LP/LIKE/SKIP bitmask),
+                                  action_timestamp, dummy_watch_time
+                                                              candidate's:
+                                                                item_candidate_id,
+                                                                item_candidate_artist_id,
+                                                                item_candidate_album_id,
+                                                                item_query_time,
+                                                                item_action_weight,
+                                                                item_dummy_watchtime
+```
+
+The HSTU stack runs causal attention over this `L = 2048` sequence. The label
+is the candidate's `listen_plus` bit (1 if `played_ratio ≥ 50%`, else 0), and
+BCE is taken on the logit emitted at position `L-1`. So "transformer-style
+sequential attention over UIH" literally means: the user's last ~2 k actions
+are tokens, the candidate song is the last token, and a 5-layer HSTU
+transformer predicts whether that candidate will be a `listen_plus`.
+
+This is the conceptual jump from DLRMv2:
+
+| | DLRMv2 (Criteo, v2) | HSTU (Yambda, v4) |
+|---|---|---|
+| Input shape | flat: 26 categorical + 13 dense features per ad impression | sequence of ~2 k past events per user, each a structured tuple |
+| Mixing op | DCN: cross-products of feature vectors, then MLP | self-attention across positions (SiLU-gated, multi-head, causal) |
+| Temporal modelling | none (each ad impression is i.i.d.) | central — masks, timestamps, action types are first-class |
+| Depth | 1-shot (interaction arch + over-arch MLP) | 5 stacked HSTU blocks |
+| "Why is the candidate good?" | low-rank cross of user/ad embeddings | attention over user's relevant past songs/artists/albums |
+
+DLRMv2 is *wide-and-shallow* over tabular features. HSTU is *narrow-and-deep*
+over a temporal sequence. Different paradigm.
+
+### 2.5 Jagged attention — what it is and why it's used
+
+A user's history length varies — yambda median is 2,695 events, max is 27,738
+(`README.MD:65`). For a single training step you have a batch of B users with
+very different sequence lengths. Two ways to lay this out on the GPU:
+
+**Padded layout (standard transformer):**
+
+```
+input shape: [B, N_max, D]           e.g. [1024, 2048, 512]
+```
+
+This wastes compute proportional to `(N_max − N_user) / N_max` per row. On
+yambda the average fill is ~1402/2037 ≈ 69%, so ~30% of every kernel is
+multiplying zeros.
+
+**Jagged layout (what HSTU uses):**
+
+```
+flat values  : [L_total, D]          L_total = Σ user_lengths   (≤ B · N_max)
+offsets      : [B + 1]               cumulative starts, so user i occupies
+                                     values[offsets[i] : offsets[i+1]]
+```
+
+`pt_hstu_attention.py:148, 183` shows the round-trip:
+
+- `torch.ops.fbgemm.jagged_to_padded_dense(...)` only when calling into a dense
+  einsum
+- `torch.ops.fbgemm.dense_to_jagged(...)` on the way out
+
+That's the reference path. The **TRITON jagged-attention kernel**
+(`ops/triton/triton_hstu_attention.py`, dispatched in `ops/hstu_attention.py:27,
+71`) skips the padded intermediate entirely: each Triton program handles one
+user's `[N_user, N_user]` attention block directly, so:
+
+- **No wasted FLOPs.** Empty positions never enter a GEMM.
+- **No wasted memory.** No padded `[B, H, N_max, N_max]` attention scores buffer
+  — that buffer alone would be `1024 · 4 · 2048 · 2048 · 2 bytes ≈ 34 GB` per
+  step (at global batch 1024 × bf16).
+- **Variable-length backward is correct without masking tricks.** The kernel
+  iterates `[offsets[i], offsets[i+1])` per program; the gradient never touches
+  non-existent positions.
+
+This is *the* enabling optimization for the under-filled `like` pool to be
+cheap. The README notes (`README.MD:132`): *"With the TRITON jagged-attention
+backend the GPU only does work for the actual events, so the under-fill costs
+sequence budget but not GPU compute"*. With a padded kernel, the unused 31% of
+every sequence would cost real FLOPs.
+
+Practically: jagged attention is a generic technique (it shows up in
+FlashAttention's varlen variants too); HSTU's TRITON kernel is its
+specialization with SiLU + gated output + the four-way mask.
+
+---
+
+## 3. Yambda-5b — size, contents, download, run
+
+### 3.1 What's in it
+
+[`yandex/yambda`](https://huggingface.co/datasets/yandex/yambda) on HuggingFace.
+From `dlrm_v3/preprocess_public_data.py:233-245` + `README.MD:56-81`:
+
+| field | value |
+|---|---|
+| Provider | Yandex Music recommendation logs |
+| Sizes | yambda-50m, yambda-500m, **yambda-5b** (v4 uses 5b) |
+| Events | 4.76 B interactions across 300 days |
+| Users | 1.00 M unique |
+| Items | 9.39 M songs (+ 1.29 M artists, 3.37 M albums) |
+| Event types | `listen` / `like` / `dislike` / `unlike` / `undislike` (encoded as uint8 0–4) |
+| Listen events also carry | `played_ratio` (used to derive the `listen_plus` label at 50% threshold) |
+| Train / test split | Global Temporal Split: 300 days train, 30-min gap, 1 day test |
+
+### 3.2 On-disk footprint after preprocessing (`README.MD:39-52`)
+
+```
+<DLRM_DATA_PATH>/
+├── raw/5b/multi_event.parquet                    50 GB   (downloaded)
+├── shared_metadata/
+│   ├── artist_item_mapping.parquet               60 MB
+│   ├── album_item_mapping.parquet                76 MB
+│   └── embeddings.parquet                        18 GB   (unused by HSTU training)
+└── processed_5b/
+    ├── train_sessions.parquet                    47 GB   ← main training input
+    ├── test_events.parquet                       152 MB
+    ├── session_index.parquet                     600 MB
+    ├── item_popularity.npy                        75 MB
+    └── split_meta.json                            anchor + boundary stats
+```
+
+Plan for **~115 GB free disk** to do everything end-to-end (raw + shared +
+processed). If you skip the unused `embeddings.parquet` (which the script
+downloads anyway), you still need ~97 GB.
+
+### 3.3 Download + preprocess
+
+Both happen in one command. Download is via the `datasets` library
+(HuggingFace), so you need internet and `pip install datasets`. From
+`dlrm_v3/preprocess_public_data.py:276-317`:
+
+```bash
+pip install datasets polars-u64-idx pyarrow xxhash gin-config absl-py pandas
+
+export DLRM_DATA_PATH=/your/big/disk/dlrm_data
+mkdir -p "$DLRM_DATA_PATH"
+
+cd /home/suachong/training/recommendation_v4
+python3 -m generative_recommenders.dlrm_v3.preprocess_public_data \
+  --dataset yambda-5b \
+  --data-path "$DLRM_DATA_PATH"
+```
+
+Per `README.MD:54`: **~53 minutes end-to-end** for the 5b variant on a
+reasonable box. For a quick smoke test substitute `--dataset yambda-50m`
+(~2 min, ~1 GB on disk).
+
+Critical: **install `polars-u64-idx`, not stock `polars`.** yambda-5b has
+>4.29 B rows and overflows polars' default 32-bit row index silently
+(`training_recipe.md:102-103`, `scripts/launch_smoke_8gpu.sh:13-20`).
+
+### 3.4 Run training (8-GPU smoke)
+
+From `scripts/launch_smoke_8gpu.sh` and `README.MD:9-22`.
+
+**Inside the validated container** (recommended; everything's pre-staged):
+
+```bash
+docker exec yambda_8gpu bash -c \
+  'cd /workspace/recommendation_v4 && bash scripts/launch_smoke_8gpu.sh'
+```
+
+Override data path / run name without editing the gin:
+
+```bash
+DLRM_DATA_PATH=/your/big/disk/dlrm_data \
+RUN_NAME=my_experiment \
+bash scripts/launch_smoke_8gpu.sh
+```
+
+**From scratch on a bare host**, you need to assemble the stack per
+`docs/training_recipe.md`. The hard requirements are:
+
+- **ROCm path**: `rocm/primus:v26.3`, torch `2.12.0+rocm7.2`, triton `3.6.0`,
+  fbgemm_gpu built from commit `10b77573` for `gfx950`, torchrec
+  `1.7.0a0+bf55480`. See `training_recipe.md:30-45`.
+- **CUDA path**: `nvcr.io/nvidia/pytorch:26.04-py3`, native torch (do NOT
+  reinstall), fbgemm_gpu built from same commit for `sm_100`, torchrec
+  `1.7.0.dev20260601+cu130`. See `training_recipe.md:147-155`.
+
+In both cases the actual launch is just:
+
+```bash
+python -m generative_recommenders.dlrm_v3.train.train_ranker \
+    --dataset yambda-5b --mode train-eval
+```
+
+(plus `HSTU_HAMMER_KERNEL=TRITON` for CUDA; `=PYTORCH` is forced on ROCm in
+the smoke script because the Triton kernel hits PassManager errors on some
+shapes there — see `scripts/launch_smoke_8gpu.sh:31-33`. The PYTORCH fallback
+gives ~190 ms/step baseline, not the ~52 ms primus-pinned number.)
+
+### 3.5 What you'll see
+
+Per `training_recipe.md:84-91`, on 8× MI350X in the optimal config:
+
+- ~52 ms/step at global batch 256 (per the doc; gin says 8,192 — see §1.6 note)
+- ~4,970 samples/sec
+- ~7.6 days for one epoch over 3.23 B training anchors
+
+If `auc_threshold = 0.80275` is the real benchmark target (still TBD),
+`time_to_auc_0.80275_sec` will be logged as soon as the eval AUC on
+`listen_plus` crosses it (`dlrm_v3/utils.py:587-608`).
+
+---
+
+## 4. TL;DR
+
+- **HSTU ≈ decoder-only Transformer** with two tweaks: SiLU/N replaces softmax
+  in attention, and a `U`-gated output replaces the standard MLP block.
+- **DLRMv3 (yambda) = TorchRec embeddings → contextual preprocessor → 5
+  stacked HSTU layers → BCE head on `listen_plus`.** Sparse tables (≈150 G
+  params) dominate the model; the dense HSTU stack is tiny by comparison.
+- **UIH = user interaction history.** Each sample is one anchor LISTEN event
+  plus that user's last ~2 k events (LISTEN_PLUS / LIKE / SKIP, interleaved
+  chronologically, gathered with a `L//3`-per-pool cap), and HSTU does causal
+  self-attention across them.
+- **Jagged attention** packs variable-length per-user sequences as
+  `(flat_values, offsets)` instead of padding to `N_max`, so the Triton kernel
+  never spends FLOPs on empty positions — essential because the average
+  sequence is only 69% full on yambda.
+- **Yambda-5b** is a 4.76 B-event / 1 M-user Yandex Music dataset on
+  HuggingFace (`yandex/yambda`); downloading + preprocessing takes ~53 min and
+  ~115 GB disk; run via
+  `python -m generative_recommenders.dlrm_v3.train.train_ranker --dataset yambda-5b --mode train-eval`
+  (or `scripts/launch_smoke_8gpu.sh`).
+
+---
+
+## 5. Open questions to bring back to the proposing team
+
+1. **Target metric.** `yambda_5b.gin:107` reuses DLRMv2's `0.80275` AUC
+   threshold. Is this the intended final v4 benchmark target, or a placeholder
+   inherited from upstream? An HSTU model on a different dataset would
+   normally need its own threshold chosen from a reference run.
+2. **Batch size canonicalization.** `yambda_5b.gin:1` = `1024` per rank
+   (global 8,192); `docs/training_recipe.md:65, 182` says `32` per rank
+   (global 256). Which is the submission config?
+3. **Convergence reference runs.** No `reference_results.md`-style table exists
+   yet under `training/recommendation_v4`. Submission-quality v4 will need
+   reference epochs-to-target numbers per dataset variant.
diff --git a/recommendation_v4/scripts/run_docker.sh b/recommendation_v4/scripts/run_docker.sh
new file mode 100755
index 000000000..bb1d58fca
--- /dev/null
+++ b/recommendation_v4/scripts/run_docker.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+# Launch a yambda_8gpu container from rocm/mlperf:dlrm_v3_mi355 with the repo
+# and data directories bind-mounted at matching host/container paths.
+#
+# Usage:
+#   bash scripts/run_docker.sh                   # interactive shell
+#   bash scripts/run_docker.sh -- bash scripts/launch_smoke_8gpu.sh   # one-shot
+#
+# Overrides (export before invoking):
+#   IMAGE          docker image                 (default: rocm/mlperf:dlrm_v3_mi355)
+#   CONTAINER_NAME container name               (default: yambda_8gpu)
+#   REPO_HOST      host path to repo            (default: this script's parent)
+#   DATA_HOST      host path to dataset root    (default: /data/mlperf_dlrm_v4)
+
+set -euo pipefail
+
+IMAGE=${IMAGE:-rocm/mlperf:dlrm_v3_mi355}
+CONTAINER_NAME=${CONTAINER_NAME:-mlperf-recommendation-v4}
+REPO_HOST=${REPO_HOST:-$(cd "$(dirname "$0")/.." && pwd)}
+DATA_HOST=${DATA_HOST:-/data/mlperf_dlrm_v4}
+
+# Mount host paths at the same string inside the container so DLRM_DATA_PATH
+# can be set from either side and resolve identically (env_path() in
+# dlrm_v3/utils.py:641-653 does a literal os.environ.get).
+REPO_CONT=/workspace/recommendation_v4
+DATA_CONT=${DATA_HOST}
+
+if [ ! -d "${DATA_HOST}" ]; then
+  echo "warning: ${DATA_HOST} does not exist on host. Run preprocess_public_data first or override DATA_HOST." >&2
+fi
+
+# If a container with this name is already running, exec into it instead of
+# starting a new one. Matches the `docker exec yambda_8gpu ...` pattern in
+# README.MD:9-12.
+if docker ps --format '{{.Names}}' | grep -qx "${CONTAINER_NAME}"; then
+  echo "container ${CONTAINER_NAME} already running; exec'ing in" >&2
+  exec docker exec -it "${CONTAINER_NAME}" "${@:-bash}"
+fi
+
+# Remove a stopped container with the same name so --name doesn't collide.
+if docker ps -a --format '{{.Names}}' | grep -qx "${CONTAINER_NAME}"; then
+  docker rm "${CONTAINER_NAME}" >/dev/null
+fi
+
+exec docker run --rm -it \
+  --name "${CONTAINER_NAME}" \
+  --device=/dev/kfd --device=/dev/dri \
+  --group-add video --group-add render \
+  --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+  --ipc=host --network=host \
+  --shm-size=64g --ulimit memlock=-1 --ulimit stack=67108864 \
+  -v "${REPO_HOST}:${REPO_CONT}" \
+  -v "${DATA_HOST}:${DATA_CONT}" \
+  -e DLRM_DATA_PATH="${DATA_CONT}" \
+  -e HSTU_HAMMER_KERNEL="${HSTU_HAMMER_KERNEL:-TRITON}" \
+  -e RUN_NAME="${RUN_NAME:-default}" \
+  -w "${REPO_CONT}" \
+  "${IMAGE}" \
+  "${@:-bash}"

From d03fb2888002917b5b633da8f0c716f30a1cd8c2 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 10 Jun 2026 21:26:12 +0000
Subject: [PATCH 047/113] dlrmv4: decouple multi-node launch from per-user
 paths for portable baseline

Run the consolidated launch/streaming flow from any $HOME without editing
another user's tree:
- Derive REPO_MOUNT/DATA_MOUNT/SCRATCH from $HOME with env overrides; keep
  the shared read-only dataset path intact (no data duplication).
- Per-user container name (yambda_$USER) to avoid collisions.
- In-repo provisioning via launch_slurm.sh (drop external _provision script dep).
- chmod log files world-writable so the container (nobody) can tee-append
  under NFS root_squash; fixes spurious pipefail rc=1.
- Neutral in-repo TensorBoard default path.

Validated: 2-node ROCm/RDMA smoke (world_size 16) completed rc=0, 20-batch
train window with metrics logged.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 12 +++--
 .../train/tests/streaming_resume_test.sh      | 14 +++--
 .../generative_recommenders/dlrm_v3/utils.py  |  2 +-
 recommendation_v4/scripts/launch_slurm.sh     | 53 ++++++++++++++-----
 .../scripts/run_streaming_e2e.sh              | 44 +++++++++------
 5 files changed, 86 insertions(+), 39 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index e7eed30cb..5b5ec03e9 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -206,13 +206,15 @@ Profiler.active = 5
 Profiler.trace_dir = @run_results_dir()
 
 # logger variables
-# TensorBoard event dir. Default lives on shared NFS (not container-local /tmp,
-# which is wiped on node failover) so the NE/AUC scalars survive relaunches and
-# failover. Override per-run via $TENSORBOARD_LOG_PATH (the supervisor sets it
-# to /apps/chcai/tb/$RUN_NAME/).
+# TensorBoard event dir, driven by $TENSORBOARD_LOG_PATH. launch_slurm.sh (worker)
+# always sets it to a writable NFS scratch path ($SCRATCH/tb/...) and the e2e
+# supervisor pins a per-run path, so this literal default is only a last-resort
+# fallback for a bare `train_ranker` invocation. It is RELATIVE on purpose (lands
+# under the trainer's cwd = the repo, which is writable) so no user/site path is
+# baked into the gin; set $TENSORBOARD_LOG_PATH for anything else.
 MetricsLogger.tensorboard_log_path = @tbp/env_path()
 tbp/env_path.key     = "TENSORBOARD_LOG_PATH"
-tbp/env_path.default = "/apps/chcai/tb/yambda_5b/"
+tbp/env_path.default = "tb/yambda_5b/"
 MetricsLogger.world_size = 8
 MetricsLogger.auc_threshold = 0.80275
 # Lifetime-AUC backend, selectable independently for the train cumulative AUC and
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
index c3690e652..7c4bc8179 100755
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -17,7 +17,7 @@
 #
 # Usage:
 #   bash generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh --jobid <slurm-jobid>
-#       [--container yambda_primus]
+#       [--container yambda_suachong]
 #       [--num-train-batches 200]
 #       [--die-at-step 350]
 #       [--keep]    # retain LOG_DIR + CKPT after run for inspection
@@ -25,7 +25,7 @@
 set -uo pipefail
 
 JOBID=""
-CONTAINER="yambda_primus"
+CONTAINER="yambda_${USER:-$(id -un)}"
 NUM_TRAIN_BATCHES=200
 DIE_AT_STEP=350
 IN_WINDOW_FREQ=50
@@ -40,9 +40,13 @@ KEEP=0
 # correctness gates are the functional-invariant checks below (RNG restored,
 # resumed-at-correct-step, atomic/keep_last_n), not this number.
 ATOL=0.15
-CKPT_ROOT=/apps/chcai/ckpts_resume_test
-LOG_DIR=/apps/chcai/streaming_resume_test
-REPO=/home/chcai/training/recommendation_v4
+# Writable scratch ($HOME-derived) + repo root (this file is at
+# <repo>/generative_recommenders/dlrm_v3/train/tests/, i.e. 4 levels deep).
+# Both env-overridable; nothing is hardwired to a specific user/site.
+SCRATCH=${SCRATCH:-$HOME/yambda_runs}
+CKPT_ROOT=${CKPT_ROOT:-$SCRATCH/ckpts_resume_test}
+LOG_DIR=${LOG_DIR:-$SCRATCH/streaming_resume_test}
+REPO=${REPO:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)}
 
 while [[ $# -gt 0 ]]; do
     case $1 in
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index a9c060324..1996323a8 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -807,7 +807,7 @@ class Profiler:
 
     All knobs are gin-tunable, e.g. in a gin file::
 
-        Profiler.trace_dir = "/apps/chcai/dlrm_runs/exp42/trace"
+        Profiler.trace_dir = "/path/to/results/exp42/trace"
         Profiler.trace_steps = [500, 1000, 5000]
         Profiler.warmup = 5
         Profiler.active = 10
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 0b5547497..098a69529 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -5,7 +5,10 @@
 #SBATCH --exclusive
 #SBATCH --partition=meta64          # [CLUSTER-SPECIFIC] partition name
 #SBATCH --time=01:10:00
-#SBATCH --output=/apps/chcai/yambda_slurm.%j.out
+#SBATCH --output=yambda_slurm.%j.out
+# ^ relative to the submit dir (SLURM parses #SBATCH before any shell runs, so it
+#   cannot expand env vars). The real consolidated run log is $LOG (see below),
+#   which defaults under $SCRATCH; this file just captures the batch stdout.
 # =============================================================================
 # launch_slurm.sh — single entry point for the yambda-5b trainer on N>=1 nodes.
 #
@@ -59,9 +62,10 @@
 #
 #  B) Filesystems (must be shared/NFS across ALL nodes — this script re-invokes
 #     itself and reads the overlay + data from these paths cluster-wide)
-#     - /home/chcai (repo + this script) and /apps/chcai (scratch: logs, overlay,
-#       baked tar, data, pip tarball). CHANGE both the bind mounts in the
-#       `docker run` (provision) and the default LOG/BAKED_TAR/OVERLAY/PIP_* paths.
+#     - REPO_MOUNT (repo + this script, e.g. /home/suachong) is bind-mounted rw;
+#       DATA_MOUNT (e.g. /apps/chcai) holds the read-only dataset + overlay +
+#       baked tar + pip tarball; SCRATCH (e.g. /home/suachong/yambda_runs) is the
+#       writable log/output root. Override any via env — nothing is user-hardwired.
 #
 #  C) Container image / GPU software stack (tied to the GPU arch + ROCm version)
 #     - IMAGE=rocm/primus:v26.3        : base image. ROCm/AMD-specific.
@@ -106,13 +110,24 @@ if [ -z "$PHASE" ]; then
 fi
 
 # ---- shared config (env-overridable) ----------------------------------------
-CONTAINER=${CONTAINER:-yambda_primus}
+CONTAINER=${CONTAINER:-yambda_${USER:-$(id -un)}}   # per-user container name (do NOT reuse another user's container — its bind mounts differ)
 REPO=${REPO:-$REPO_ROOT}                       # repo path inside the container
 IMAGE=${IMAGE:-rocm/primus:v26.3}              # [CLUSTER-SPECIFIC] ROCm/arch base image
 BAKED_IMAGE=${BAKED_IMAGE:-yambda_primus_baked:latest}
-BAKED_TAR=${BAKED_TAR:-/apps/chcai/yambda_primus_baked.tar}   # [CLUSTER-SPECIFIC] shared-NFS path
+BAKED_TAR=${BAKED_TAR:-/apps/chcai/yambda_primus_baked.tar}   # [CLUSTER-SPECIFIC] shared-NFS path (read-only build asset)
 USE_BAKED=${USE_BAKED:-1}
-OVERLAY=${RDMA_OVERLAY:-/apps/chcai/rdma_host_el9_new}        # [CLUSTER-SPECIFIC] shared-NFS RDMA overlay
+OVERLAY=${RDMA_OVERLAY:-/apps/chcai/rdma_host_el9_new}        # [CLUSTER-SPECIFIC] shared-NFS RDMA overlay (read-only, already staged)
+
+# Bind mounts + scratch — all on shared NFS, identical path on every node.
+#   REPO_MOUNT : NFS home root that contains THIS repo (bind-mounted rw).
+#   DATA_MOUNT : NFS root with the (shared, read-only) dataset + RDMA overlay +
+#                pip/fbgemm build assets. Kept as-is so the dataset is NOT
+#                duplicated. You only need read access here.
+#   SCRATCH    : this run's WRITABLE output root (logs / tb / traces).
+# All env-overridable, so nothing is hardwired to one user's home.
+REPO_MOUNT=${REPO_MOUNT:-$HOME}              # NFS home holding the repo (must contain $REPO); override if your repo lives elsewhere
+DATA_MOUNT=${DATA_MOUNT:-/apps/chcai}        # shared dataset + RDMA overlay + pip/fbgemm assets (read-only)
+SCRATCH=${SCRATCH:-$HOME/yambda_runs}        # writable output root (logs / tb / traces)
 
 # =============================================================================
 # PHASE: orchestrate  (SLURM batch host)
@@ -128,7 +143,8 @@ orchestrate() {
   [ -f "$SCRIPT_PATH" ] || SCRIPT_PATH="$SELF"
   REPO=$(cd "$(dirname "$SCRIPT_PATH")/.." && pwd)
 
-  LOG=${LOG:-/apps/chcai/yambda_slurm.${SLURM_JOB_ID:-manual}.log}
+  mkdir -p "$SCRATCH" 2>/dev/null || true
+  LOG=${LOG:-$SCRATCH/yambda_slurm.${SLURM_JOB_ID:-manual}.log}
 
   # Smoke defaults — override via env for a perf run (see header USAGE).
   MODE=${MODE:-streaming-train-eval}
@@ -142,6 +158,11 @@ orchestrate() {
   FORCE_PROVISION=${FORCE_PROVISION:-0}
 
   : > "$LOG"
+  # World-writable so the in-container worker (running as root, squashed to
+  # `nobody` over root-squashed NFS) can append via `tee -a $LOG`. Without this
+  # the worker's tee opens the file read-only-denied and exits non-zero, which
+  # pipefail turns into a spurious rc=1 even when training succeeds.
+  chmod 666 "$LOG" 2>/dev/null || true
   echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
   echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
   echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ" | tee -a "$LOG"
@@ -181,7 +202,8 @@ orchestrate() {
       echo \"[\$(hostname)] (re)provisioning container\"
       LAUNCH_SLURM_PHASE=provision CONTAINER=$CONTAINER IMAGE=$IMAGE \
         BAKED_IMAGE=$BAKED_IMAGE BAKED_TAR=$BAKED_TAR USE_BAKED=$USE_BAKED \
-        BAKE_IMAGE=${BAKE_IMAGE:-0} RDMA_OVERLAY=$OVERLAY REPO=$REPO bash $SCRIPT_PATH
+        BAKE_IMAGE=${BAKE_IMAGE:-0} RDMA_OVERLAY=$OVERLAY REPO=$REPO \
+        REPO_MOUNT=$REPO_MOUNT DATA_MOUNT=$DATA_MOUNT SCRATCH=$SCRATCH bash $SCRIPT_PATH
     else
       echo \"[\$(hostname)] container already up\"
     fi
@@ -192,6 +214,7 @@ orchestrate() {
   srun --ntasks-per-node=1 bash -c "
     docker exec \
       -e LAUNCH_SLURM_PHASE=worker \
+      -e SCRATCH=$SCRATCH \
       -e SLURM_NNODES=\$SLURM_NNODES \
       -e SLURM_NODEID=\$SLURM_NODEID \
       -e SLURM_PROCID=\$SLURM_PROCID \
@@ -270,9 +293,9 @@ provision() {
     --ulimit memlock=-1:-1 --ulimit stack=67108864:67108864 \
     `# memlock=-1 is REQUIRED for RDMA QP memory registration — do not drop` \
     --security-opt seccomp=unconfined --privileged \
-    -v /home/chcai:/home/chcai \
-    -v /apps/chcai:/apps/chcai \
-    `# [CLUSTER-SPECIFIC] shared-NFS bind mounts: repo + scratch (overlay/logs/data)` \
+    -v "$REPO_MOUNT:$REPO_MOUNT" \
+    -v "$DATA_MOUNT:$DATA_MOUNT" \
+    `# shared-NFS bind mounts: repo home (REPO_MOUNT, rw) + dataset/build assets (DATA_MOUNT)` \
     -w "$REPO" \
     "$RUN_IMAGE" sleep infinity
 
@@ -354,7 +377,11 @@ python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print(\"impo
 # =============================================================================
 worker() {
   cd "$REPO_ROOT"
-  LOG=${LOG:-/apps/chcai/yambda_5b_8gpu.log}
+  mkdir -p "$SCRATCH" 2>/dev/null || true
+  LOG=${LOG:-$SCRATCH/yambda_5b_8gpu.log}
+  # TensorBoard under the writable scratch root unless the caller (e.g. the e2e
+  # supervisor) pinned a per-run path. Keeps the gin default from ever being used.
+  export TENSORBOARD_LOG_PATH=${TENSORBOARD_LOG_PATH:-$SCRATCH/tb/yambda_5b}
   # Append (not truncate): under the streaming-e2e supervisor a run may relaunch
   # many times into the SAME $LOG; the supervisor initializes it once at run start.
   echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee -a "$LOG"
diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index f97a0e483..6eb8ee319 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -26,8 +26,9 @@
 #   lived docker container ($CONTAINER) on the compute node held by a SLURM
 #   allocation ($JOBID). All control flow is `srun --jobid <id> --overlap
 #   docker exec ...` into that container. The container bind-mounts shared NFS
-#   (/home/chcai = code, /apps/chcai = checkpoints+logs), which is what makes
-#   node failover possible: any node in $PARTITION sees the same code+state.
+#   (REPO_MOUNT = code, e.g. /home/suachong; DATA_MOUNT = dataset/build assets;
+#   SCRATCH = checkpoints+logs), which is what makes node failover possible: any
+#   node in $PARTITION sees the same code+state.
 #
 # MAIN LOOP (state machine, up to --max-relaunch attempts)
 #   for each attempt:
@@ -95,12 +96,12 @@
 # EXAMPLE
 #   nohup bash scripts/run_streaming_e2e.sh \
 #       --jobid 12074 \
-#       --ckpt-path /apps/chcai/ckpts/yambda_5b_e2e \
-#       --run-name yambda_5b_e2e --log /apps/chcai/yambda_5b_e2e.log \
+#       --ckpt-path "$HOME/yambda_runs/ckpts/yambda_5b_e2e" \
+#       --run-name yambda_5b_e2e --log "$HOME/yambda_runs/yambda_5b_e2e.log" \
 #       --start-ts 150 --num-train-ts 149 --eval-every 10 \
 #       --ckpt-time-interval 3600 --keep-last-n 1 --max-relaunch 100 \
 #       --reservation NAN_issue_debug \
-#       > /apps/chcai/yambda_5b_e2e.supervisor.console.log 2>&1 &
+#       > "$HOME/yambda_runs/yambda_5b_e2e.supervisor.console.log" 2>&1 &
 #   (--reservation makes node-death failover re-acquire from that reservation;
 #    omit it to fall back to the open $PARTITION pool.)
 # =============================================================================
@@ -108,8 +109,12 @@
 set -uo pipefail
 
 JOBID=11367
-CONTAINER=yambda_primus
-REPO=/home/chcai/training/recommendation_v4
+CONTAINER=yambda_${USER:-$(id -un)}
+# Repo (NFS path, identical inside the bind-mounted container) and the writable
+# output root. Both derive from $HOME so nothing is hardwired to one user; override
+# REPO/SCRATCH via env if your checkout or scratch lives elsewhere.
+REPO=${REPO:-$HOME/training/recommendation_v4}
+SCRATCH=${SCRATCH:-$HOME/yambda_runs}
 
 # Direct-SSH fallback so the supervisor can probe the node even while the SLURM
 # control plane is unreachable — a transient controller outage must NOT be
@@ -128,9 +133,13 @@ START_TS=150
 EVAL_EVERY=5
 CKPT_TIME_INTERVAL=7200
 KEEP_LAST_N=1
-CKPT_PATH=/apps/chcai/ckpts/yambda_5b_e2e
+# NOTE: a full DMP checkpoint is ~560 GB, which a typical ~100 GB home quota
+# cannot hold — point --ckpt-path at a large writable volume (and lower
+# --min-free-gib accordingly) before relying on checkpointing. The $SCRATCH
+# default below is fine for short/uncheckpointed runs.
+CKPT_PATH=$SCRATCH/ckpts/yambda_5b_e2e
 RUN_NAME=yambda_5b_e2e
-LOG=/apps/chcai/yambda_5b_e2e.log
+LOG=$SCRATCH/yambda_5b_e2e.log
 MAX_RELAUNCH=50
 NUM_TRAIN_BATCHES=0     # 0 = full window (only capped for validation/tests)
 NUM_EVAL_BATCHES=0      # 0 = full holdout eval (only capped for validation)
@@ -154,14 +163,17 @@ CTRL_WAIT_MAX=3600      # max seconds to wait for an unreachable SLURM controlle
 # --- node failover ----------------------------------------------------------
 # If the current allocation/node goes away, acquire a FRESH node, (re)provision
 # the container on it, and resume — checkpoints + code live on shared NFS
-# (/apps/chcai, /home/chcai), so any node in the partition can continue.
+# (SCRATCH + REPO_MOUNT), so any node in the partition can continue.
 PARTITION=meta64
 RESERVATION=""                        # if set, failover acquires from this SLURM
                                       # reservation (e.g. NAN_issue_debug) so a
                                       # replacement node comes from the same pool.
 ALLOC_TIME=7-00:00:00                 # SLURM --time for a failover hold job
 ALLOW_FAILOVER=1                      # 0 = never acquire a new node
-PROVISION_SCRIPT=/home/chcai/_provision_yambda_primus.sh
+# Failover (re)provisioning reuses launch_slurm.sh's own `provision` phase
+# (LAUNCH_SLURM_PHASE=provision, set in provision_node) — no dependency on any
+# out-of-repo provisioning script.
+PROVISION_SCRIPT="$REPO/scripts/launch_slurm.sh"
 ACQUIRE_WAIT_MAX=1800                 # max seconds to wait for the OPEN-POOL
                                       # (tier-2) failover hold job to reach
                                       # RUNNING (tolerates brief queueing).
@@ -311,7 +323,9 @@ provision_node() {
     local jid="$1" node
     node=$(squeue -h -j "$jid" -o '%N' 2>/dev/null | head -1)
     sup "provisioning container '$CONTAINER' on job $jid (node ${node:-?}) — cold node can take ~15 min"
-    srun --jobid="$jid" --overlap bash "$PROVISION_SCRIPT" >> "${LOG%.log}.provision.log" 2>&1
+    srun --jobid="$jid" --overlap env LAUNCH_SLURM_PHASE=provision \
+        CONTAINER="$CONTAINER" REPO="$REPO" bash "$PROVISION_SCRIPT" \
+        >> "${LOG%.log}.provision.log" 2>&1
     container_up "$jid"
 }
 
@@ -541,7 +555,7 @@ launch() {
         EVAL_HOLDOUT_NUM_WINDOWS=$EVAL_HOLDOUT_NUM_WINDOWS \
         METRIC_LOG_FREQ=50 \
         RUN_NAME=$RUN_NAME \
-        TENSORBOARD_LOG_PATH=/apps/chcai/tb/$RUN_NAME/ \
+        TENSORBOARD_LOG_PATH=$SCRATCH/tb/$RUN_NAME/ \
         LOG=$LOG \
         bash scripts/launch_slurm.sh;
         echo \"E2E_RUN_EXIT=\$? \$(date '+%F %T')\" >> $LOG
@@ -567,7 +581,7 @@ sup "failover: allow=$ALLOW_FAILOVER partition=$PARTITION reservation=${RESERVAT
 # this, such an orphan keeps pinning a second reservation node indefinitely.
 reap_failover_holds ""
 
-cexec "mkdir -p '$CKPT_PATH' '/apps/chcai/tb/$RUN_NAME'"
+cexec "mkdir -p '$CKPT_PATH' '$SCRATCH/tb/$RUN_NAME'"
 # Initialize this run's metrics log ONCE. launch_slurm.sh (worker) appends (tee -a),
 # so every relaunch attempt accumulates into this single file — the full-run
 # NE/AUC history survives crashes and node failover instead of being truncated
@@ -579,7 +593,7 @@ else
     cexec ": > '$LOG'"
     sup "metrics log initialized (relaunch-append): $LOG"
 fi
-sup "tensorboard (NFS): /apps/chcai/tb/$RUN_NAME/"
+sup "tensorboard (NFS): $SCRATCH/tb/$RUN_NAME/"
 
 attempt=0
 while (( attempt < MAX_RELAUNCH )); do

From c55736eaf1f4ae94a19e28830ede898a72cc193e Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 10 Jun 2026 18:52:49 -0500
Subject: [PATCH 048/113] dlrmv4: deterministic in-window shuffle diversity
 dial + opt-in diagnostics

Add STREAMING_SHUFFLE_FRACTION (0..1) as a config-invariant control over
in-window embedding diversity, replacing the legacy block/buffer knobs. Full
shuffle is the deterministic, seeded default (fraction=1.0). Add an opt-in
unique-embedding diagnostic (DIAG_UNIQUE_EMB) and gate chrome-trace capture
behind OUTPUT_TRACE; both default off to keep production runs overhead-free.
Forward the new env knobs through launch_slurm.sh.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/datasets/yambda.py                | 66 +++++++++++++++++--
 .../dlrm_v3/train/gin/yambda_5b.gin           | 48 ++++++++++++--
 .../dlrm_v3/train/utils.py                    | 39 +++++++++++
 .../generative_recommenders/dlrm_v3/utils.py  |  7 ++
 recommendation_v4/scripts/launch_slurm.sh     |  6 ++
 5 files changed, 156 insertions(+), 10 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
index ad24513f4..6627033bc 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
@@ -209,6 +209,8 @@ def __init__(
         is_inference: bool = False,
         streaming_window_seconds: int = 86400,
         streaming_sort_within_window: bool = False,
+        streaming_shuffle_fraction: float = 0.0,
+        streaming_shuffle_seed: int = 0,
         train_split_percentage: float = 1.0,
         split_salt: int = 0,
         *args,
@@ -225,6 +227,17 @@ def __init__(
         # is byte-for-byte unaffected.
         self._streaming_window_seconds: int = streaming_window_seconds
         self._streaming_sort_within_window: bool = streaming_sort_within_window
+        # In-window shuffle dial in [0, 1] to break user-major batching (default
+        # 0.0 = off, user-major order preserved for page-local mmap scans). Maps to
+        # a within-segment shuffle with K = round(fraction * per-window train-anchor
+        # count): 1.0 = full per-element shuffle (max user diversity per batch),
+        # intermediate = interpolation. Computed from the global anchor count BEFORE
+        # round-robin striding, so a given fraction yields the same diversity
+        # regardless of world_size / #nodes / batch_size (config-invariant). The
+        # permutation is a pure function of (seed, ts) so the per-rank round-robin
+        # slice + mid-window resume skip stay deterministic across restarts.
+        self._streaming_shuffle_fraction: float = streaming_shuffle_fraction
+        self._streaming_shuffle_seed: int = streaming_shuffle_seed
         # User-level train:eval split. `train_split_percentage >= 1.0` means no
         # holdout (legacy behavior: every anchor is trainable). Otherwise the
         # top `1 - train_split_percentage` fraction of users (by a deterministic
@@ -555,20 +568,63 @@ def _eval_anchor_mask(self, anchor_idx: np.ndarray) -> np.ndarray:
         uids = self.store.flat_uid[self._positions[anchor_idx]]
         return _uid_unit_hash(uids, self._split_salt) >= self._train_split_percentage
 
+    def _shuffle_window(self, idx: np.ndarray, ts: int) -> np.ndarray:
+        """Optionally break user-major ordering within a train window.
+
+        ``streaming_shuffle_fraction`` (0..1) is the single diversity dial. It
+        maps to a within-segment shuffle with ``K = round(fraction * N)`` where
+        ``N`` is this window's train-anchor count:
+
+          - 0.0  -> off: return ``idx`` unchanged (user-major, page-local scans).
+          - 1.0  -> full per-element shuffle (max user diversity per batch).
+          - else -> permute WITHIN each contiguous size-K segment (segment order
+            preserved). A per-rank batch then draws across a bounded user-major
+            region, so diversity scales with the fraction while the concurrently
+            touched mmap working set stays within ~one K-segment (page locality).
+
+        Because ``N`` is a property of the dataset/window (not the compute layout)
+        and the permutation is applied BEFORE the per-rank round-robin striding, a
+        given fraction yields the same diversity across world_size / #nodes /
+        batch_size (config-invariant).
+
+        The permutation is a pure function of ``(seed, ts)`` via
+        ``np.random.default_rng(seed + ts)``, so every (re)run of this window
+        yields the IDENTICAL order. This keeps the per-rank round-robin slice and
+        the mid-window resume ``skip_samples`` offset consistent across restarts,
+        exactly like the unshuffled path.
+        """
+        frac = self._streaming_shuffle_fraction
+        if idx.size <= 1 or not frac or frac <= 0.0:
+            return idx
+        rng = np.random.default_rng(self._streaming_shuffle_seed + ts)
+        if frac >= 1.0:
+            return idx[rng.permutation(idx.size)]
+        # Within-segment shuffle (K = round(fraction * N)): a single vectorized
+        # lexsort over per-element random keys, stable within each size-K segment
+        # so elements never cross a segment boundary (bounds the working set). O(N
+        # log K), run once per window in the background prep thread.
+        n = idx.size
+        k = max(1, int(round(frac * n)))
+        seg = np.arange(n, dtype=np.int64) // k
+        keys = rng.random(n)
+        order = np.lexsort((keys, seg))
+        return idx[order]
+
     def train_window_indices(self, ts: int) -> np.ndarray:
         """Global anchor indices for TRAIN in window ``ts``: ``window_indices``
-        with held-out eval users removed. Identical across resume because both
-        ``window_indices`` and the uid hash are pure functions, so the per-rank
-        round-robin slice (and the mid-window skip offset) stay consistent."""
+        with held-out eval users removed. Identical across resume because
+        ``window_indices``, the uid hash, and the (seed,ts)-keyed in-window
+        shuffle are all pure functions, so the per-rank round-robin slice (and
+        the mid-window skip offset) stay consistent."""
         idx = self.window_indices(ts)
         if self._train_split_percentage >= 1.0:
-            return idx
+            return self._shuffle_window(idx, ts)
         kept = idx[~self._eval_anchor_mask(idx)]
         logger.warning(
             f"train_window_indices({ts}): {idx.size:,} -> {kept.size:,} anchors "
             f"(holdout tsp={self._train_split_percentage}, salt={self._split_salt})"
         )
-        return kept
+        return self._shuffle_window(kept, ts)
 
     def eval_holdout_indices(self, start_ts: int, num_windows: int = 1) -> np.ndarray:
         """Fixed eval set: held-out users' anchors over windows
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index e7eed30cb..92c2404fc 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -1,6 +1,13 @@
 batch_size = 1024
-num_workers = 4
-prefetch_factor = 8
+# Dataloader parallelism. Env-overridable so a perf sweep can probe whether the
+# shuffle steady-state cost is CPU-gather latency (hidden by more workers) vs
+# GPU-side embedding work (not). Defaults preserve prior behavior (4 / 8).
+num_workers = @nw/env_int()
+nw/env_int.key     = "NUM_WORKERS"
+nw/env_int.default = 4
+prefetch_factor = @pf/env_int()
+pf/env_int.key     = "PREFETCH_FACTOR"
+pf/env_int.default = 8
 dataset = "yambda-5b"
 
 # model parameters
@@ -110,6 +117,28 @@ msl/env_int.default = 4096
 # to the dataset's available window count at runtime; override via $NUM_TRAIN_TS.
 get_dataset.streaming_window_seconds = 86400
 get_dataset.streaming_sort_within_window = False
+# In-window shuffle to break user-major batching (consecutive sliding-window
+# anchors otherwise come from the same few users -> few unique embedding reads).
+# Diversity dial in [0,1] -- the AGREED, config-invariant benchmark knob. Maps to
+# a within-segment shuffle with K = round(fraction * per-window train-anchor
+# count): 0 = off (user-major, page-local mmap scans), 1 = full per-element
+# shuffle (max diversity), intermediate = interpolation. Same fraction => same
+# diversity regardless of world_size / #nodes / batch_size.
+#
+# Default 1.0 (full shuffle) so the standard/benchmark run is maximally diverse
+# and, together with the fixed seed below, the in-window order is fully
+# DETERMINISTIC and identical across runs/resumes (pure function of (seed, ts)).
+# Override per-run via $STREAMING_SHUFFLE_FRACTION (e.g. 0.0 for user-major,
+# 0.03 for the diversity/locality sweet spot).
+streaming_shuffle_fraction = 1.0
+get_dataset.streaming_shuffle_fraction = @ssf/env_float()
+ssf/env_float.key     = "STREAMING_SHUFFLE_FRACTION"
+ssf/env_float.default = %streaming_shuffle_fraction
+# Fixed shuffle seed -> reproducible permutation. Exposed as a knob; override via
+# $STREAMING_SHUFFLE_SEED only if you deliberately want a different draw.
+get_dataset.streaming_shuffle_seed = @ssfseed/env_int()
+ssfseed/env_int.key     = "STREAMING_SHUFFLE_SEED"
+ssfseed/env_int.default = 0
 # User-level train:eval holdout for the streaming path. With tsp<1.0, the top
 # (1 - tsp) fraction of users (by a deterministic hash of uid+split_salt) are
 # held out as a FIXED eval set and never trained -> no temporal/user leakage,
@@ -139,10 +168,19 @@ sts/env_int.default = 150
 streaming_train_eval_loop.metric_log_frequency = @mlf/env_int()
 mlf/env_int.key     = "METRIC_LOG_FREQ"
 mlf/env_int.default = 50
-# Trace on by default: reuses the shared Profiler.* bindings below (5-step
+# Diagnostic: log per-batch unique/total embedding-id counts on logged steps
+# (rank 0). Quantifies the user-major batching redundancy and the realized
+# diversity from get_dataset.streaming_shuffle_fraction. Off; set $DIAG_UNIQUE_EMB=1.
+streaming_train_eval_loop.streaming_diag_unique_emb = @due/env_int()
+due/env_int.key     = "DIAG_UNIQUE_EMB"
+due/env_int.default = 0
+# Chrome trace capture: reuses the shared Profiler.* bindings below (5-step
 # window at step 52). The streaming step counter advances across train+eval
-# batches, so step 52 lands in the first (train) window's compute.
-streaming_train_eval_loop.output_trace = True
+# batches, so step 52 lands in the first (train) window's compute. Off by
+# default to avoid profiler overhead in production runs; set $OUTPUT_TRACE=1.
+streaming_train_eval_loop.output_trace = @ot/env_int()
+ot/env_int.key     = "OUTPUT_TRACE"
+ot/env_int.default = 0
 # Reuse one DataLoader (persistent workers) across windows instead of respawning
 # per window. Skip eval to isolate window-reset cost. Override via env.
 streaming_train_eval_loop.persistent_loader = @pl/env_int()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index c27f38761..076412d6e 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -688,6 +688,38 @@ def make_train_test_dataloaders(
     return train_dataloader, test_dataloader
 
 
+def _log_unique_embedding_diag(sample, rank: int, step: int) -> None:
+    """Diagnostic: log per-batch unique-vs-total embedding-id counts.
+
+    Quantifies the user-major batching concern — when consecutive sliding-window
+    anchors come from the same few users, a batch reads very few UNIQUE embedding
+    rows (low unique/total), so embedding lookups are highly redundant. In-window
+    shuffle should raise this ratio. Rank-0 only, gated by the caller (only fires
+    on logged steps), and fully non-fatal so it can never break training.
+    """
+    if rank != 0:
+        return
+    try:
+        parts = []
+        for tag, kjt in (
+            ("uih", sample.uih_features_kjt),
+            ("cand", sample.candidates_features_kjt),
+        ):
+            for key in kjt.keys():
+                if not key.endswith(("item_id", "artist_id", "album_id")):
+                    continue
+                vals = kjt[key].values()
+                total = int(vals.numel())
+                if total == 0:
+                    continue
+                uniq = int(torch.unique(vals).numel())
+                parts.append(f"{tag}.{key}={uniq}/{total} ({100.0 * uniq / total:.1f}%)")
+        if parts:
+            logger.info(f"emb-diag - Step {step}: unique/total " + " ".join(parts))
+    except Exception as e:  # diagnostic must never break training
+        logger.warning(f"emb-diag failed: {e}")
+
+
 @gin.configurable
 def train_loop(
     rank: int,
@@ -702,6 +734,7 @@ def train_loop(
     metric_log_frequency: int = 1,
     checkpoint_frequency: int = 100,
     start_batch_idx: int = 0,
+    streaming_diag_unique_emb: bool = False,
     # lr_scheduler: to-do: Add a scheduler
 ) -> None:
     model.train()
@@ -711,6 +744,8 @@ def train_loop(
     for epoch in range(num_epochs):
         dataloader.sampler.set_epoch(epoch)  # pyre-ignore [16]
         for sample in dataloader:
+            if streaming_diag_unique_emb and batch_idx % metric_log_frequency == 0:
+                _log_unique_embedding_diag(sample, rank, batch_idx)
             optimizer.zero_grad()
             sample.to(device)
             (
@@ -1201,6 +1236,8 @@ def streaming_train_eval_loop(
     # --- global step / wall-clock checkpoint cadences ---
     checkpoint_step_frequency: int = 0,
     checkpoint_time_interval_s: float = 0.0,
+    # --- diagnostic: log per-batch unique/total embedding-id counts ---
+    streaming_diag_unique_emb: bool = False,
     # --- test-only failure injection knob ---
     die_at_step: int = -1,
 ) -> None:
@@ -1457,6 +1494,8 @@ def _run_train_window(
                 break
             if _t_next is not None and first_wait is None:
                 first_wait = time.perf_counter() - _t_next
+            if streaming_diag_unique_emb and train_batch_idx % metric_log_frequency == 0:
+                _log_unique_embedding_diag(sample, rank, train_batch_idx)
             optimizer.zero_grad()
             sample.to(device)
             (
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index a9c060324..0abfaa86a 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1424,6 +1424,8 @@ def get_dataset(
     history_length: Optional[int] = None,
     streaming_window_seconds: int = 86400,
     streaming_sort_within_window: bool = False,
+    streaming_shuffle_fraction: float = 0.0,
+    streaming_shuffle_seed: int = 0,
     train_split_percentage: float = 1.0,
     split_salt: int = 0,
 ):
@@ -1556,6 +1558,11 @@ def get_dataset(
                 # streaming-train-eval; ignored by the default train-eval path).
                 "streaming_window_seconds": streaming_window_seconds,
                 "streaming_sort_within_window": streaming_sort_within_window,
+                # In-window shuffle diversity dial in [0,1]: K=round(frac*N) within-
+                # segment shuffle. 0=off/user-major, 1=full. Config-invariant and
+                # deterministic by (seed, ts).
+                "streaming_shuffle_fraction": streaming_shuffle_fraction,
+                "streaming_shuffle_seed": streaming_shuffle_seed,
                 # User-level train:eval holdout for the streaming path. 1.0 =
                 # no holdout (legacy). <1.0 holds out (1 - tsp) of users as a
                 # fixed eval set; those users are never trained.
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 0b5547497..42f8780ed 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -208,6 +208,12 @@ orchestrate() {
       -e NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
       -e NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
       -e METRIC_LOG_FREQ=$METRIC_LOG_FREQ \
+      ${STREAMING_SHUFFLE_FRACTION:+-e STREAMING_SHUFFLE_FRACTION=$STREAMING_SHUFFLE_FRACTION} \
+      ${STREAMING_SHUFFLE_SEED:+-e STREAMING_SHUFFLE_SEED=$STREAMING_SHUFFLE_SEED} \
+      ${NUM_WORKERS:+-e NUM_WORKERS=$NUM_WORKERS} \
+      ${PREFETCH_FACTOR:+-e PREFETCH_FACTOR=$PREFETCH_FACTOR} \
+      ${DIAG_UNIQUE_EMB:+-e DIAG_UNIQUE_EMB=$DIAG_UNIQUE_EMB} \
+      ${OUTPUT_TRACE:+-e OUTPUT_TRACE=$OUTPUT_TRACE} \
       -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-0.90} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \

From 851a354de5842e2163e9f1fcc5b11f8838f2b4a9 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Thu, 11 Jun 2026 21:48:52 +0000
Subject: [PATCH 049/113] dlrmv4: MLPerf training compliance logging for
 streaming-train-eval

Wire mlperf_logging (mllog) into the yambda-5b streaming-train-eval path:
rank-0-gated MLPerfLogger facade emitting the full event lifecycle
(cache_clear/init_start -> submission_info + hyperparameters ->
init_stop/run_start -> per-window block_start/stop + eval_start/accuracy/stop
-> run_stop), driven by a configurable convergence target.

Key points:
- AUC_THRESHOLD (gin, env-overridable; default 0.80275) doubles as the MLPerf
  convergence target: rank 0 decides on the global lifetime eval AUC and
  BROADCASTS the stop boolean so all ranks break in lockstep (avoids the
  ALLTOALL collective-timeout deadlock from a per-rank decision).
- MLPerfLogger uses the explicit global rank passed by train_ranker (computed
  pre-dist-init), so only true rank 0 logs and the compliance file has exactly
  one event each. Per-job MLPERF_LOG_PATH avoids stale append accumulation.
- Per-step train_loss POINT_IN_TIME event (global cross-rank mean) with
  samples_count + lr, plus a console/TensorBoard readout.
- cumulative_train_samples counter (global, checkpointed) as the samples_count
  progress unit; lifecycle gated on cold start so e2e-supervisor resumes never
  emit orphaned run boundaries.

Validated end-to-end at 1/2/4 nodes (8/16/32 GPUs): clean compliance log,
single run_stop=success, passes the common/closed_common compliance checks.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/.gitignore                  |   8 +
 .../dlrm_v3/checkpoint.py                     |   8 +
 .../dlrm_v3/train/gin/yambda_5b.gin           |   9 +-
 .../dlrm_v3/train/mlperf_logging_utils.py     | 175 ++++++++++++++
 .../dlrm_v3/train/train_ranker.py             |  82 +++++++
 .../dlrm_v3/train/utils.py                    | 225 ++++++++++++++++--
 .../generative_recommenders/dlrm_v3/utils.py  |  75 ++++++
 recommendation_v4/requirements.txt            |   1 +
 recommendation_v4/scripts/launch_slurm.sh     |  16 +-
 9 files changed, 576 insertions(+), 23 deletions(-)
 create mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py

diff --git a/recommendation_v4/.gitignore b/recommendation_v4/.gitignore
index 5edddc5b3..5a0329448 100644
--- a/recommendation_v4/.gitignore
+++ b/recommendation_v4/.gitignore
@@ -157,3 +157,11 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
+
+# SLURM batch stdout + local run artifacts (run logs are never committed;
+# the real run logs / MLPerf compliance logs live under $SCRATCH, outside the
+# repo). The trainer's #SBATCH --output lands in the submit dir as
+# yambda_slurm.<jobid>.out.
+yambda_slurm.*.out
+yambda_slurm.*.log
+compliance_checker.log
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
index 0ef223b23..031d19701 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
@@ -397,6 +397,10 @@ def save_dmp_checkpoint(
                 "class_metrics": class_metric_state_dict,
                 "reg_metrics": regression_metric_state_dict,
                 "global_step": metric_logger.global_step,
+                # MLPerf progress counter (global trained samples). Defaulted on
+                # load so pre-existing checkpoints restore as 0 and resume the
+                # count from there.
+                "cumulative_train_samples": metric_logger.cumulative_train_samples,
                 "sparse_tensor_keys": sparse_tensor_keys,
                 # Streaming resume fields. Defaulted on load so old checkpoints
                 # (pre-streaming-resume) still load as a normal restart.
@@ -525,6 +529,10 @@ def load_nonsparse_checkpoint(
         print("optimizer checkpoint successfully loaded")
     if metric_logger is not None:
         metric_logger.global_step = non_sparse_state_dict["global_step"]
+        # Defaulted for legacy checkpoints written before the counter existed.
+        metric_logger.cumulative_train_samples = non_sparse_state_dict.get(
+            "cumulative_train_samples", 0
+        )
         class_metric_state_dict = non_sparse_state_dict["class_metrics"]
         regression_metric_state_dict = non_sparse_state_dict["reg_metrics"]
         # Length-safe positional restore: if a checkpoint was written with a
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 5b5ec03e9..ba0183543 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -216,7 +216,14 @@ MetricsLogger.tensorboard_log_path = @tbp/env_path()
 tbp/env_path.key     = "TENSORBOARD_LOG_PATH"
 tbp/env_path.default = "tb/yambda_5b/"
 MetricsLogger.world_size = 8
-MetricsLogger.auc_threshold = 0.80275
+# Time-to-target AUC threshold. Doubles as the MLPerf convergence target: when
+# the cumulative ("lifetime_") listen_plus eval AUC first reaches this value the
+# streaming-train-eval run emits a SUCCESS RUN_STOP and terminates gracefully.
+# Override via $AUC_THRESHOLD (e.g. 0.5 to smoke-test the early-stop path on a
+# short run). MLPerf's DLRM-DCNv2 reference uses 0.80275.
+MetricsLogger.auc_threshold = @at/env_float()
+at/env_float.key     = "AUC_THRESHOLD"
+at/env_float.default = 0.80275
 # Lifetime-AUC backend, selectable independently for the train cumulative AUC and
 # the eval cumulative ("lifetime_*") AUC. Both default to "binned":
 #   "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an O(bins) score
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
new file mode 100644
index 000000000..51e7971b5
--- /dev/null
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2026 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# pyre-unsafe
+"""MLPerf Training compliance logging for the DLRMv3 streaming-train-eval path.
+
+Thin, rank-0-gated wrapper around ``mlperf_logging.mllog`` so the streaming
+loop can emit the MLPerf event stream (INIT/RUN/BLOCK/EVAL/RUN_STOP) without
+every call site re-checking the rank or guarding against a missing dependency.
+
+Modeled on recommendation_v2/torchrec_dlrm's inline ``submission_info`` but
+extended with rank-0 gating + optional distributed barriers (the NeMo / unet3d
+``sync`` pattern), so a multi-rank run produces exactly one valid log.
+"""
+
+import logging
+import os
+from typing import Any, Dict, Optional
+
+import gin
+import torch
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+try:
+    from mlperf_logging import mllog
+    from mlperf_logging.mllog import constants as mllog_constants
+
+    _MLLOG_AVAILABLE = True
+except Exception as e:  # pragma: no cover - import-time guard
+    mllog = None  # type: ignore[assignment]
+    mllog_constants = None  # type: ignore[assignment]
+    _MLLOG_AVAILABLE = False
+    logger.warning(
+        "mlperf_logging not importable (%s); MLPerf logging disabled. "
+        "Install via `pip install git+https://github.com/mlcommons/logging.git`.",
+        e,
+    )
+
+
+def _rank() -> int:
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        return torch.distributed.get_rank()
+    return 0
+
+
+def _barrier() -> None:
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        torch.distributed.barrier()
+
+
+class MLPerfLogger:
+    """Rank-0-gated facade over ``mllog``.
+
+    All event methods no-op on non-zero ranks and when ``mlperf_logging`` is not
+    installed, so callers never need to guard. ``sync=True`` inserts an
+    all-rank ``dist.barrier()`` before the (rank-0-only) emission so the logged
+    timestamp reflects the slowest rank reaching the boundary -- required for
+    INIT_STOP/RUN_START/RUN_STOP per the MLPerf rules.
+    """
+
+    def __init__(
+        self,
+        rank: Optional[int] = None,
+        log_path: Optional[str] = None,
+        default_stack_offset: int = 2,
+        benchmark_name: str = "hstu",
+        submitter_name: str = "reference_implementation",
+    ):
+        self.enabled: bool = _MLLOG_AVAILABLE
+        # CRITICAL: use the EXPLICIT global rank passed by the caller, not a
+        # dist.get_rank() lookup. This logger is constructed BEFORE
+        # dist.init_process_group (so the init phase can be timed), at which
+        # point torch.distributed.get_rank() is unavailable and would return 0
+        # for every process -> all 16 ranks would log everything. The caller
+        # (train_ranker) already knows the true global rank
+        # (node_rank * gpus_per_node + local_rank), so trust it. Fall back to a
+        # best-effort dist/zero lookup only when not provided.
+        self.rank: int = rank if rank is not None else _rank()
+        self.benchmark_name: str = benchmark_name
+        self.submitter_name: str = submitter_name
+        self._logger = None
+        if not self.enabled:
+            return
+        if log_path:
+            os.makedirs(os.path.dirname(log_path), exist_ok=True)
+            mllog.config(filename=log_path, default_stack_offset=default_stack_offset)
+        else:
+            mllog.config(default_stack_offset=default_stack_offset)
+        self._logger = mllog.get_mllogger()
+
+    @property
+    def constants(self):  # pyre-ignore[3]
+        return mllog_constants
+
+    def event(
+        self,
+        key: str,
+        value: Any = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        sync: bool = False,
+    ) -> None:
+        if sync:
+            _barrier()
+        if self.enabled and self.rank == 0:
+            self._logger.event(key=key, value=value, metadata=metadata or {})
+
+    def start(
+        self,
+        key: str,
+        value: Any = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        sync: bool = False,
+    ) -> None:
+        if sync:
+            _barrier()
+        if self.enabled and self.rank == 0:
+            self._logger.start(key=key, value=value, metadata=metadata or {})
+
+    def end(
+        self,
+        key: str,
+        value: Any = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        sync: bool = False,
+    ) -> None:
+        if sync:
+            _barrier()
+        if self.enabled and self.rank == 0:
+            self._logger.end(key=key, value=value, metadata=metadata or {})
+
+    def submission_info(self, benchmark_name: str, submitter_name: str) -> None:
+        """Emit the five SUBMISSION_* events required for a valid submission."""
+        if not (self.enabled and self.rank == 0):
+            return
+        c = mllog_constants
+        self.event(key=c.SUBMISSION_BENCHMARK, value=benchmark_name)
+        self.event(key=c.SUBMISSION_ORG, value=submitter_name)
+        self.event(key=c.SUBMISSION_DIVISION, value=c.CLOSED)
+        self.event(key=c.SUBMISSION_STATUS, value=c.ONPREM)
+        self.event(key=c.SUBMISSION_PLATFORM, value=submitter_name)
+
+
+@gin.configurable
+def get_mlperf_logger(
+    rank: int = 0,
+    log_path: str = "",
+    benchmark_name: str = "hstu",
+    submitter_name: str = "reference_implementation",
+) -> MLPerfLogger:
+    """Build a configured :class:`MLPerfLogger`.
+
+    ``benchmark_name`` / ``submitter_name`` are gin-configurable (and the path is
+    env-overridable via ``$MLPERF_LOG_PATH``) so a submission can stamp its own
+    benchmark string without code changes. The log path defaults to
+    ``$MLPERF_LOG_PATH`` when set, else ``""`` (mllog logs to stdout).
+    """
+    resolved_path = os.environ.get("MLPERF_LOG_PATH", log_path)
+    return MLPerfLogger(
+        rank=rank,
+        log_path=resolved_path,
+        benchmark_name=benchmark_name,
+        submitter_name=submitter_name,
+    )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 55eece518..0168add5b 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -76,10 +76,33 @@ def _main_func(
     # decorators in generative_recommenders.ops.triton.* read env vars at
     # module import time, and the heavy imports below pull those in.
     from generative_recommenders.dlrm_v3.train._env_bootstrap import apply_env_bootstrap
+    from generative_recommenders.dlrm_v3.train.mlperf_logging_utils import (
+        get_mlperf_logger,
+    )
 
     gin.parse_config_file(gin_file, skip_unknown=True)
     apply_env_bootstrap()
 
+    # MLPerf compliance logging is only wired for the streaming-train-eval
+    # (yambda-5b) benchmark path. Build the rank-0-gated logger now. No-ops on
+    # non-zero ranks and when mlperf_logging is not installed.
+    mlperf_logger = (
+        get_mlperf_logger(rank=rank) if mode == "streaming-train-eval" else None
+    )
+    # Emit the init-start boundary before setup so the init phase is measured,
+    # but ONLY when this is guaranteed to be a cold start. CKPT_PATH unset means
+    # checkpoints are disabled (the default + submission config) -> always cold
+    # start. When CKPT_PATH is set (resumable / e2e-supervisor runs) we defer the
+    # decision to resume_cold_start below and skip the pre-setup markers, so a
+    # resume relaunch never emits an orphaned INIT_START/RUN_START. The whole
+    # downstream sequence (INIT_STOP/RUN_START/blocks/eval/RUN_STOP) is gated on
+    # this flag so the log is always balanced.
+    mlperf_init_logged = False
+    if mlperf_logger is not None and not os.environ.get("CKPT_PATH", ""):
+        mlperf_logger.event(key=mlperf_logger.constants.CACHE_CLEAR, value=True)
+        mlperf_logger.start(key=mlperf_logger.constants.INIT_START)
+        mlperf_init_logged = True
+
     # Phase 2: heavy imports. Triton kernel modules evaluate their autotune
     # decorators here, using the env vars set above.
     from generative_recommenders.dlrm_v3.checkpoint import load_dmp_checkpoint
@@ -173,6 +196,62 @@ def _main_func(
         )
     )
 
+    # MLPerf: submission info + hyperparameters, then the init/run boundary.
+    # Gated on (init markers emitted AND genuine cold start) so the e2e
+    # supervisor's resume relaunches don't restart the INIT/RUN markers
+    # mid-stream (which would invalidate the single-run log), and so the log is
+    # never left with an INIT_STOP that has no matching INIT_START.
+    mlperf_run_active = (
+        mlperf_logger is not None and mlperf_init_logged and resume_cold_start
+    )
+    if mlperf_run_active:
+        c = mlperf_logger.constants
+        mlperf_logger.submission_info(
+            benchmark_name=mlperf_logger.benchmark_name,
+            submitter_name=mlperf_logger.submitter_name,
+        )
+
+        def _gin_param(name: str, default: object) -> object:
+            try:
+                return gin.query_parameter(name)
+            except (ValueError, KeyError):
+                return default
+
+        global_batch_size = world_size * int(train_dataloader.batch_size)
+        mlperf_logger.event(key=c.GLOBAL_BATCH_SIZE, value=global_batch_size)
+        mlperf_logger.event(key=c.GRADIENT_ACCUMULATION_STEPS, value=1)
+        # Seed is fixed in setup() (_SEED = 1); kept in sync here.
+        mlperf_logger.event(key=c.SEED, value=1)
+        # Dense (Adam) + sparse (RowWiseAdagrad) optimizer hyperparameters,
+        # read from the active gin bindings.
+        mlperf_logger.event(
+            key=c.OPT_NAME,
+            value=_gin_param(
+                "dense_optimizer_factory_and_class.optimizer_name", "Adam"
+            ),
+        )
+        mlperf_logger.event(
+            key=c.OPT_BASE_LR,
+            value=_gin_param(
+                "dense_optimizer_factory_and_class.learning_rate", None
+            ),
+        )
+        mlperf_logger.event(
+            key="opt_sparse_name",
+            value=_gin_param(
+                "sparse_optimizer_factory_and_class.optimizer_name",
+                "RowWiseAdagrad",
+            ),
+        )
+        mlperf_logger.event(
+            key="opt_sparse_base_learning_rate",
+            value=_gin_param(
+                "sparse_optimizer_factory_and_class.learning_rate", None
+            ),
+        )
+        mlperf_logger.end(key=c.INIT_STOP, sync=True)
+        mlperf_logger.start(key=c.RUN_START, sync=True)
+
     # train loop
     try:
         if mode == "train":
@@ -224,6 +303,9 @@ def _main_func(
                 resume_batch_idx_in_window=resume_batch_idx_in_window,
                 resume_split_contract=resume_split_contract,
                 resume_cold_start=resume_cold_start,
+                # Only pass the logger when the run boundaries were emitted, so
+                # the loop never produces orphan block/eval events.
+                mlperf_logger=mlperf_logger if mlperf_run_active else None,
             )
     except Exception as e:
         logger.info(traceback.format_exc())
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index c27f38761..96f826f84 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1203,6 +1203,10 @@ def streaming_train_eval_loop(
     checkpoint_time_interval_s: float = 0.0,
     # --- test-only failure injection knob ---
     die_at_step: int = -1,
+    # MLPerf compliance logger (rank-0-gated facade). None disables all MLPerf
+    # event emission; the loop is otherwise unchanged. Supplied by train_ranker
+    # for the streaming-train-eval benchmark path.
+    mlperf_logger: Optional[Any] = None,
 ) -> None:
     """Streaming train+eval loop with per-window (and optionally mid-window)
     checkpoints.
@@ -1558,7 +1562,9 @@ def _run_train_window(
                 f"[boundary] {label} train first-batch data-wait={first_wait * 1000:.1f}ms"
             )
 
-    def _run_eval_window(eval_data_iterator, label: Optional[str] = None) -> None:
+    def _run_eval_window(
+        eval_data_iterator, label: Optional[str] = None
+    ) -> Dict[str, float]:
         # DO NOT add a checkpoint trigger anywhere inside this function. The eval
         # data iterator's position is not serializable, so a checkpoint taken
         # mid-eval could not be resumed deterministically. `_maybe_checkpoint`
@@ -1614,7 +1620,8 @@ def _run_eval_window(eval_data_iterator, label: Optional[str] = None) -> None:
                     metric_logger.compute_and_log(mode="eval")
                 if num_eval_batches is not None and eval_batch_idx >= num_eval_batches:
                     break
-            for k, v in metric_logger.compute(mode="eval").items():
+            eval_metrics = metric_logger.compute(mode="eval")
+            for k, v in eval_metrics.items():
                 print(f"{k}: {v}")
         if label and rank == 0 and _t_enter is not None:
             _eval_total = time.perf_counter() - _t_enter
@@ -1623,6 +1630,7 @@ def _run_eval_window(eval_data_iterator, label: Optional[str] = None) -> None:
                 f"[boundary] {label} eval first-batch data-wait={_fw:.1f}ms "
                 f"total_eval={_eval_total * 1000:.1f}ms batches={eval_batch_idx}"
             )
+        return eval_metrics
 
     def _maybe_checkpoint(train_ts: int) -> None:
         if (
@@ -1688,6 +1696,152 @@ def _should_eval(i: int) -> bool:
                 dataset.dataset._train_split_percentage,  # pyre-ignore[16]
             )
 
+    # --- MLPerf progress accounting + event helpers ---------------------------
+    # `total_train_samples` is the denominator for the MLPerf samples_count /
+    # epoch_num progress unit: the total GLOBAL trainable samples over the
+    # configured window range. Computed once up-front (one O(N) anchor mask per
+    # window -- the same call the loop makes per window) and logged as
+    # TRAIN_SAMPLES. Skipped entirely when no MLPerf logger is attached, since
+    # the masks are not free. `mlperf_run_stopped` guards single RUN_STOP.
+    total_train_samples = 0
+    mlperf_run_stopped = [False]
+    if mlperf_logger is not None:
+        _twi = getattr(dataset.dataset, "train_window_indices", None)
+        _wi = getattr(dataset.dataset, "window_indices", None)
+        _idx_fn = _twi or _wi
+        if _idx_fn is not None:
+            for _ts in train_ts_list:
+                total_train_samples += int(_idx_fn(_ts).size)
+        if rank == 0:
+            logger.info(
+                "MLPerf: total_train_samples=%d over %d windows",
+                total_train_samples,
+                n_train,
+            )
+        mlperf_logger.event(
+            key=mlperf_logger.constants.TRAIN_SAMPLES, value=total_train_samples
+        )
+        if eval_global_indices is not None:
+            mlperf_logger.event(
+                key=mlperf_logger.constants.EVAL_SAMPLES,
+                value=int(eval_global_indices.size),
+            )
+        # Let MetricsLogger.compute_and_log emit the per-step MLPerf `train_loss`
+        # event (rank-0 gated) at the metric-logging cadence, stamped with the
+        # current base LR. Read param_groups[0] defensively (KeyedOptimizer
+        # exposes it; guard against any optimizer that does not).
+        metric_logger.mlperf_logger = mlperf_logger
+
+        def _current_lr() -> float:
+            return float(optimizer.param_groups[0]["lr"])
+
+        metric_logger.lr_getter = _current_lr
+
+    def _mlperf_progress() -> Dict[str, Any]:
+        samples = metric_logger.cumulative_train_samples
+        epoch_num = (samples / total_train_samples) if total_train_samples > 0 else 0.0
+        return {
+            mlperf_logger.constants.SAMPLES_COUNT: samples,
+            mlperf_logger.constants.EPOCH_NUM: epoch_num,
+        }
+
+    def _lifetime_auc(metrics: Dict[str, float]) -> Optional[float]:
+        # Convergence metric: the cumulative ("lifetime_") listen_plus AUC.
+        # Key format is `metric/{prefix}{name}/{task}` (see MetricsLogger.compute),
+        # e.g. `metric/lifetime_auc/listen_plus`. Match the `lifetime_auc` short
+        # name; ignore GAUC.
+        for key, val in metrics.items():
+            short = key.split("/")[-2] if "/" in key else key
+            if short == "lifetime_auc":
+                return float(val)
+        return None
+
+    def _mlperf_block_start() -> None:
+        if mlperf_logger is not None:
+            mlperf_logger.start(
+                key=mlperf_logger.constants.BLOCK_START, metadata=_mlperf_progress()
+            )
+
+    def _mlperf_block_stop() -> None:
+        if mlperf_logger is not None:
+            mlperf_logger.end(
+                key=mlperf_logger.constants.BLOCK_STOP, metadata=_mlperf_progress()
+            )
+
+    def _mlperf_eval_start() -> None:
+        if mlperf_logger is not None:
+            mlperf_logger.start(
+                key=mlperf_logger.constants.EVAL_START, metadata=_mlperf_progress()
+            )
+
+    def _mlperf_run_stop(status: object) -> None:
+        # Emit RUN_STOP exactly once, with an all-rank barrier so the timestamp
+        # reflects the slowest rank (MLPerf requirement).
+        if mlperf_logger is None or mlperf_run_stopped[0]:
+            return
+        mlperf_logger.end(
+            key=mlperf_logger.constants.RUN_STOP,
+            metadata={mlperf_logger.constants.STATUS: status, **_mlperf_progress()},
+            sync=True,
+        )
+        mlperf_run_stopped[0] = True
+
+    def _mlperf_eval_stop(eval_metrics: Dict[str, float]) -> bool:
+        # Emit EVAL_ACCURACY (lifetime listen_plus AUC) + EVAL_STOP, and drive an
+        # early SUCCESS RUN_STOP when the target threshold is reached. Returns
+        # True iff the run should stop now -- the SAME value on every rank.
+        #
+        # CRITICAL (deadlock avoidance): the cumulative lifetime AUC is produced
+        # by a reduce that is only valid on global rank 0, so a per-rank
+        # `lifetime >= thr` test diverges (only rank 0 sees the value) and the
+        # ranks that "stop" hit the RUN_STOP barrier while the rest march into
+        # the next window's embedding all-to-all -> NCCL collective-timeout hang
+        # (observed: 600s ALLTOALL_BASE watchdog abort). So rank 0 decides and
+        # BROADCASTS the boolean; all ranks then break (or continue) in lockstep.
+        if mlperf_logger is None:
+            return False
+        lifetime = _lifetime_auc(eval_metrics)
+        if lifetime is not None:
+            mlperf_logger.event(
+                key=mlperf_logger.constants.EVAL_ACCURACY,
+                value=lifetime,
+                metadata=_mlperf_progress(),
+            )
+        mlperf_logger.end(
+            key=mlperf_logger.constants.EVAL_STOP, metadata=_mlperf_progress()
+        )
+        thr = metric_logger.auc_threshold
+        decision = torch.zeros(1, device=device)
+        if (
+            rank == 0
+            and not mlperf_run_stopped[0]
+            and lifetime is not None
+            and thr is not None
+            and lifetime >= thr
+        ):
+            decision[0] = 1.0
+        if torch.distributed.is_initialized():
+            torch.distributed.broadcast(decision, src=0)
+        should_stop = bool(decision.item() > 0.5)
+        if should_stop:
+            # All ranks agree -> all reach the RUN_STOP barrier together.
+            _mlperf_run_stop(mlperf_logger.constants.SUCCESS)
+        return should_stop
+
+    def _mlperf_finalize(final_metrics: Dict[str, float]) -> None:
+        # End-of-run RUN_STOP when the threshold was never crossed: SUCCESS iff
+        # the final lifetime AUC meets the target, else ABORTED.
+        if mlperf_logger is None or mlperf_run_stopped[0]:
+            return
+        lifetime = _lifetime_auc(final_metrics)
+        thr = metric_logger.auc_threshold
+        success = lifetime is not None and thr is not None and lifetime >= thr
+        _mlperf_run_stop(
+            mlperf_logger.constants.SUCCESS
+            if success
+            else mlperf_logger.constants.ABORTED
+        )
+
     if persistent_loader and double_buffer:
         # Double-buffered: next window prepared in the background during the
         # current window's compute. Eval (if enabled) uses its own pre-forked
@@ -1741,16 +1895,23 @@ def _should_eval(i: int) -> bool:
                 if i == 0 and resume_batch_idx_in_window > 0
                 else 0
             )
+            _mlperf_block_start()
             _run_train_window(
                 train_data_iterator,
                 train_ts=train_ts,
                 start_batch_idx=start_batch,
                 label=f"train_ts={train_ts}",
             )
+            _mlperf_block_stop()
+            should_stop = False
             if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
                 assert eval_sampler is not None and eval_dl is not None
-                _run_eval_window(eval_iter, label=f"eval_holdout@train_ts={train_ts}")
+                _mlperf_eval_start()
+                eval_metrics = _run_eval_window(
+                    eval_iter, label=f"eval_holdout@train_ts={train_ts}"
+                )
+                should_stop = _mlperf_eval_stop(eval_metrics)
                 # Re-arm the (already-forked) eval pool for the NEXT eval. The
                 # holdout set is fixed, so the sampler window is unchanged; we
                 # only need a fresh iter() to replay it. iter() reuses the
@@ -1761,6 +1922,9 @@ def _should_eval(i: int) -> bool:
                 if next_eval_i is not None:
                     eval_iter = iter(eval_dl)
             _maybe_checkpoint(train_ts)
+            if should_stop:
+                # MLPerf target reached: RUN_STOP already emitted; stop training.
+                break
     else:
         for i, train_ts in enumerate(train_ts_list):
             dataset.dataset.is_eval = False  # pyre-ignore [16]
@@ -1770,15 +1934,19 @@ def _should_eval(i: int) -> bool:
                 if i == 0 and resume_batch_idx_in_window > 0
                 else 0
             )
+            _mlperf_block_start()
             _run_train_window(
                 _window_iter(train_ts, skip_samples=skip),
                 train_ts=train_ts,
                 start_batch_idx=start_batch,
             )
+            _mlperf_block_stop()
+            should_stop = False
             if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
+                _mlperf_eval_start()
                 if eval_global_indices is not None:
-                    _run_eval_window(
+                    eval_metrics = _run_eval_window(
                         iter(
                             make_streaming_dataloader(
                                 dataset=dataset, indices=eval_global_indices
@@ -1788,26 +1956,43 @@ def _should_eval(i: int) -> bool:
                     )
                 else:
                     # Legacy per-window eval (datasets without user holdout).
-                    _run_eval_window(
+                    eval_metrics = _run_eval_window(
                         iter(make_streaming_dataloader(dataset=dataset, ts=train_ts + 1))
                     )
+                should_stop = _mlperf_eval_stop(eval_metrics)
             _maybe_checkpoint(train_ts)
+            if should_stop:
+                # MLPerf target reached: RUN_STOP already emitted; stop training.
+                break
 
     # Final eval over the SAME fixed user-holdout set (consistent with the
     # per-window evals above). Reuses _run_eval_window so metrics are reset and
     # reported the same way. Falls back to the legacy final-window eval for
-    # datasets without user holdout.
-    dataset.dataset.is_eval = True  # pyre-ignore [16]
-    if eval_global_indices is not None:
-        _run_eval_window(
-            iter(make_streaming_dataloader(dataset=dataset, indices=eval_global_indices)),
-            label="eval_holdout@final",
-        )
-    else:
-        _run_eval_window(
-            iter(make_streaming_dataloader(dataset=dataset, ts=num_train_ts)),
-            label="eval@final",
-        )
-    if rank == 0:
-        for k, v in metric_logger.compute(mode="eval").items():
-            print(f"{k}: {v}")
+    # datasets without user holdout. Skipped if the MLPerf target was already
+    # reached mid-run (RUN_STOP already emitted, run is over).
+    if not mlperf_run_stopped[0]:
+        dataset.dataset.is_eval = True  # pyre-ignore [16]
+        _mlperf_eval_start()
+        if eval_global_indices is not None:
+            final_metrics = _run_eval_window(
+                iter(
+                    make_streaming_dataloader(
+                        dataset=dataset, indices=eval_global_indices
+                    )
+                ),
+                label="eval_holdout@final",
+            )
+        else:
+            final_metrics = _run_eval_window(
+                iter(make_streaming_dataloader(dataset=dataset, ts=num_train_ts)),
+                label="eval@final",
+            )
+        # EVAL_ACCURACY/EVAL_STOP for the final pass (may emit SUCCESS RUN_STOP
+        # if the target was met exactly at the end).
+        _mlperf_eval_stop(final_metrics)
+        if rank == 0:
+            for k, v in final_metrics.items():
+                print(f"{k}: {v}")
+        # End-of-run RUN_STOP: SUCCESS iff the final lifetime AUC met the target,
+        # else ABORTED. No-op if a SUCCESS RUN_STOP already fired above.
+        _mlperf_finalize(final_metrics)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 1996323a8..6a4f29508 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1012,6 +1012,21 @@ def _make_reg(ws: int) -> List[RecMetricComputation]:
                 self.regression_metrics["eval"] = _make_reg(window_size)
 
         self.global_step: Dict[str, int] = {"train": 0, "eval": 0}
+        # Monotonic, resume-safe count of GLOBAL trained samples (summed across
+        # ranks), used as the MLPerf `samples_count` progress unit. Distinct from
+        # the perf-only `_perf_total_samples` below (per-rank, not checkpointed):
+        # this one is persisted/restored alongside `global_step` so a resumed
+        # streaming run continues the convergence-progress count.
+        self.cumulative_train_samples: int = 0
+        self._rank: int = int(rank)
+        # Optional MLPerf logger + learning-rate accessor, wired by the streaming
+        # loop (kept duck-typed -- expects `.event(key, value, metadata)` and
+        # `.constants` -- to avoid a train-module import cycle). When set,
+        # compute_and_log emits a POINT_IN_TIME `train_loss` event (rank-0 gated
+        # inside the logger) at the metric-logging cadence, mirroring the per-step
+        # MLPerf train_loss readout other benchmarks log.
+        self.mlperf_logger: Optional[Any] = None
+        self.lr_getter: Optional[Callable[[], float]] = None
         self.tb_logger: Optional[SummaryWriter] = None
         if tensorboard_log_path != "":
             self.tb_logger = SummaryWriter(log_dir=tensorboard_log_path, purge_step=0)
@@ -1030,6 +1045,13 @@ def _make_reg(ws: int) -> List[RecMetricComputation]:
             1, dtype=torch.long, device=device
         )
 
+    @property
+    def auc_threshold(self) -> Optional[float]:
+        """Configured time-to-target AUC threshold (None if unset). Exposed so
+        the streaming loop can drive the MLPerf SUCCESS RUN_STOP off the same
+        target without reaching into the private attribute."""
+        return self._auc_threshold
+
     @property
     def all_metrics(self) -> Dict[str, List[RecMetricComputation]]:
         """
@@ -1093,6 +1115,12 @@ def update(
                 self._perf_samples_counter.dtype
             )
             self._perf_steps_in_window += 1
+            # MLPerf progress counter: global trained samples this step. Local
+            # batch sample count (num_candidates is per-rank) scaled by world
+            # size approximates the global count without an extra collective;
+            # accumulated on CPU as a plain int so it serializes trivially into
+            # the checkpoint (see save/load_nonsparse_checkpoint).
+            self.cumulative_train_samples += int(num_candidates.numel()) * self._world_size
 
     def compute(self, mode: str = "train") -> Dict[str, float]:
         """
@@ -1183,6 +1211,53 @@ def compute_and_log(
                         global_step=self.global_step[mode],
                     )
 
+        # Train-loss readout: surface a single GLOBAL (cross-rank mean) training
+        # loss on the regular console logger every `metric_log_frequency` batches,
+        # so progress is visible from step 0 instead of only at the first
+        # end-of-window eval. The per-loss-term breakdown already goes to
+        # TensorBoard above (losses/train_*); here we add the combined scalar.
+        # The all-reduce is a cheap 1-element collective run by EVERY rank at the
+        # same deterministic steps (this method is called in lockstep), so it
+        # cannot desync. Set METRIC_LOG_FREQ low (e.g. 1-5) to see it per step.
+        if mode == "train" and additional_logs is not None and "losses" in additional_logs:
+            loss_terms = additional_logs["losses"]
+            if loss_terms:
+                loss_t = torch.stack(
+                    [v.detach().float().sum() for v in loss_terms.values()]
+                ).sum()
+                if torch.distributed.is_available() and torch.distributed.is_initialized():
+                    torch.distributed.all_reduce(
+                        loss_t, op=torch.distributed.ReduceOp.SUM
+                    )
+                    loss_t = loss_t / self._world_size
+                train_loss = float(loss_t)
+                self.tb_logger.add_scalar(
+                    "train_loss", train_loss, global_step=self.global_step["train"]
+                )
+                if self._rank == 0:
+                    logger.info(
+                        f"train - Step {self.global_step['train']} "
+                        f"train_loss={train_loss:.5f}"
+                    )
+                # MLPerf POINT_IN_TIME train_loss (rank-0 gated in the logger).
+                # samples_count = cumulative GLOBAL trained samples (the same
+                # progress unit as block/eval events); lr = current base LR.
+                if self.mlperf_logger is not None:
+                    c = self.mlperf_logger.constants
+                    md: Dict[str, Any] = {
+                        c.SAMPLES_COUNT: self.cumulative_train_samples,
+                    }
+                    if self.lr_getter is not None:
+                        try:
+                            md["lr"] = float(self.lr_getter())
+                        except Exception:
+                            pass
+                    self.mlperf_logger.event(
+                        key=getattr(c, "TRAIN_LOSS", "train_loss"),
+                        value=train_loss,
+                        metadata=md,
+                    )
+
         # Throughput metrics (train only). One GPU->CPU sync per call.
         if mode == "train" and self._perf_steps_in_window > 0:
             now = time.perf_counter()
diff --git a/recommendation_v4/requirements.txt b/recommendation_v4/requirements.txt
index 023c22332..a8637bf5e 100644
--- a/recommendation_v4/requirements.txt
+++ b/recommendation_v4/requirements.txt
@@ -5,3 +5,4 @@ gin_config>=0.5.0
 pandas>=2.2.0
 tensorboard>=2.19.0
 pybind11
+git+https://github.com/mlcommons/logging.git
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 098a69529..c100a6828 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -232,11 +232,13 @@ orchestrate() {
       -e NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
       -e METRIC_LOG_FREQ=$METRIC_LOG_FREQ \
       -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-0.90} \
+      -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
       -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \
       ${RUN_NAME:+-e RUN_NAME=$RUN_NAME} \
       ${TENSORBOARD_LOG_PATH:+-e TENSORBOARD_LOG_PATH=$TENSORBOARD_LOG_PATH} \
+      ${MLPERF_LOG_PATH:+-e MLPERF_LOG_PATH=$MLPERF_LOG_PATH} \
       ${CKPT_PATH:+-e CKPT_PATH=$CKPT_PATH} \
       -e LOG=$LOG \
       $NCCL_ENV_ARGS \
@@ -382,8 +384,18 @@ worker() {
   # TensorBoard under the writable scratch root unless the caller (e.g. the e2e
   # supervisor) pinned a per-run path. Keeps the gin default from ever being used.
   export TENSORBOARD_LOG_PATH=${TENSORBOARD_LOG_PATH:-$SCRATCH/tb/yambda_5b}
-  # Append (not truncate): under the streaming-e2e supervisor a run may relaunch
-  # many times into the SAME $LOG; the supervisor initializes it once at run start.
+  # MLPerf Training compliance log (streaming-train-eval path). Lands beside the
+  # other run outputs under scratch unless the caller pins it. Rank 0 writes it;
+  # check it post-run with:
+  #   python -m mlperf_logging.compliance_checker --usage training \
+  #     --ruleset 5.0.0 "$MLPERF_LOG_PATH"
+  # Default to a PER-JOB filename so each standalone `sbatch` gets a clean
+  # compliance log: mllog opens the file in APPEND mode, so a fixed name would
+  # accumulate events across runs and fail the compliance_checker (duplicate
+  # INIT_START/RUN_START). The streaming-e2e supervisor pins MLPERF_LOG_PATH
+  # explicitly (and inits it once at run start), so its relaunch-into-same-file
+  # append semantics are preserved untouched.
+  export MLPERF_LOG_PATH=${MLPERF_LOG_PATH:-$SCRATCH/mlperf/yambda_5b_mlperf.${SLURM_JOB_ID:-manual}.log}
   echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee -a "$LOG"
 
   # polars-u64-idx (NOT stock polars) — yambda parquet's flat-explode overruns

From 2fc3cb1c31dcb400e657302ea45ce655ff6c3889 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 12 Jun 2026 20:48:18 -0500
Subject: [PATCH 050/113] dlrmv4: min_history anchor-eligibility floor
 (decoupled from history_length)

A LISTEN event qualifies as a train/eval anchor once the user has >= min_history prior events, decoupled from history_length (the gather/truncation cap) since jagged attention handles short UIH. Anchor positions/anchor_ts caches are keyed by (history_length, min_history) and built independently of the _READY-gated 150GB flat store, so changing the floor rebuilds only the cheap positions array. Default None preserves the legacy full-history behavior (which dropped ~60% of users).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/README.MD                   |  9 +-
 .../dlrm_v3/datasets/yambda.py                | 87 ++++++++++++++++++-
 .../generative_recommenders/dlrm_v3/utils.py  |  7 ++
 3 files changed, 97 insertions(+), 6 deletions(-)

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index dc1ced73b..acd59cc31 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -100,7 +100,7 @@ The `like` pool is roughly **30× rarer** than `lp` — important context for th
 
 ## 4. How data is fed to HSTU
 
-For every training anchor (a LISTEN event with ≥ `history_length` prior events), the dataset builds a `(uih_kjt, candidate_kjt)` pair:
+For every training anchor (a LISTEN event with ≥ `min_history` prior events — default `1`, i.e. ~all users; set `$MIN_HISTORY=4086` for the legacy "full `history_length` of context required" filter that dropped ~60% of users), the dataset builds a `(uih_kjt, candidate_kjt)` pair:
 
 ```
 UIH (User Interaction History):
@@ -186,8 +186,10 @@ The streaming path enforces **no future leakage** at two levels:
 Note this is a *temporal* split on the training stream — distinct from the
 preprocessing GTS split (§2) that carves off the final test day. Windows are
 indexed off the per-anchor target timestamp via a lazily-built, mmap'd
-`anchor_ts_L{H}.npy` cache (built once on first use; the default non-streaming
-path never touches it).
+`anchor_ts_L{H}[_m{MIN_HISTORY}].npy` cache (built once on first use; the
+default non-streaming path never touches it). The anchor `positions` and
+`anchor_ts` arrays are keyed by `(history_length, min_history)` so different
+floors don't collide and the expensive flat store is shared across them.
 
 ### 5.2 Knobs
 
@@ -201,6 +203,7 @@ with env overrides:
 | `PERSISTENT_LOADER` | `streaming_train_eval_loop.persistent_loader` | 1 | reuse one worker pool across windows (no per-window respawn) |
 | `DOUBLE_BUFFER` | `streaming_train_eval_loop.double_buffer` | 1 | prepare the next window in a background thread during compute |
 | `EVAL_EACH_WINDOW` | `streaming_train_eval_loop.eval_each_window` | 1 | eval window T+1 after training window T |
+| `MIN_HISTORY` | `get_dataset.min_history` | 1 | anchor-eligibility floor: min prior events for a LISTEN to be a sample (1 = ~all users; 4086 = legacy full-history filter) |
 | — | `streaming_train_eval_loop.num_train_batches` / `num_eval_batches` | unset | cap per-window steps (unset = consume full window) |
 
 ### 5.3 Hiding the window-reset overhead
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
index 6627033bc..ed98a1013 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
@@ -204,6 +204,7 @@ def __init__(
         metadata_dir: str,
         history_length: int = 2048,
         scan_window: int = 20000,
+        min_history: Optional[int] = None,
         cross_specs: Optional[Sequence[Tuple[str, Sequence[str], int, int]]] = None,
         cache_dir: Optional[str] = None,
         is_inference: bool = False,
@@ -221,6 +222,15 @@ def __init__(
         self._metadata_dir: str = metadata_dir
         self._history_length: int = history_length
         self._scan_window: int = scan_window
+        # Minimum prior-event count for a LISTEN event to qualify as an anchor.
+        # Decoupled from history_length (which is only the gather/truncation cap):
+        # jagged attention handles short UIH, so we no longer require a full
+        # history_length of context to include a sample. Default None preserves
+        # the legacy "need a full history_length of prior events" behavior (which
+        # dropped ~60% of users); set small (e.g. 1) to include ~all users.
+        self._min_history: int = (
+            history_length if min_history is None else int(min_history)
+        )
         # Streaming/temporal-order state. Everything here is LAZY: nothing is
         # built or read until the first set_ts()/num_windows() call (only the
         # streaming-train-eval loop does that), so the default train-eval path
@@ -274,15 +284,83 @@ def __init__(
         self._cache_dir = cache_dir
         self._ensure_cache_built(cache_dir, processed_dir, history_length)
         self.store: _FlatEventStore = _FlatEventStore.load_mmap(cache_dir)
-        # Mmap the positions file built alongside the flat columns.
+        # Anchor positions depend on min_history (the eligibility floor), not
+        # just history_length (the gather cap), so they live in a
+        # min_history-versioned file that shares the flat store. Built
+        # independently of the _READY sentinel so changing the floor rebuilds
+        # only this (cheap) array, not the whole 150 GB cache.
+        self._positions_name: str = self._positions_filename(
+            history_length, self._min_history
+        )
+        self._ensure_positions_built(
+            cache_dir, self._positions_name, self._min_history
+        )
         self._positions: np.ndarray = _load_npy_readonly(
-            os.path.join(cache_dir, f"positions_L{history_length}.npy")
+            os.path.join(cache_dir, self._positions_name)
         )
         logger.info(
             f"Yambda dataset ready: {self.store.total_events:,} events, "
             f"{len(self._positions):,} training positions"
         )
 
+    @staticmethod
+    def _positions_filename(history_length: int, min_history: int) -> str:
+        """Anchor-positions filename. Uses the legacy name when the floor equals
+        the gather cap (the historical "full history required" behavior) so
+        existing caches are reused as-is; otherwise a min_history-tagged name."""
+        if min_history == history_length:
+            return f"positions_L{history_length}.npy"
+        return f"positions_L{history_length}_m{min_history}.npy"
+
+    @staticmethod
+    def _ensure_positions_built(
+        cache_dir: str, positions_name: str, min_history: int
+    ) -> None:
+        """Build the anchor-positions array for ``min_history`` if absent.
+
+        Anchors are LISTEN events whose user-local offset is >= ``min_history``
+        (i.e. the user already has that many prior events). This is decoupled
+        from the _READY-gated flat-store build so a new floor only rebuilds this
+        (cheap, ~one int64 scan) array rather than the whole 150 GB cache.
+        Multi-rank safe via an exclusive lock + atomic rename; all ranks then
+        mmap the result read-only.
+        """
+        import fcntl
+
+        positions_path = os.path.join(cache_dir, positions_name)
+        if os.path.exists(positions_path):
+            return
+        lock_path = os.path.join(cache_dir, "_positions_lock")
+        with open(lock_path, "w") as lf:
+            logger.info(f"Acquiring positions build lock for {positions_path}...")
+            fcntl.flock(lf, fcntl.LOCK_EX)
+            try:
+                if os.path.exists(positions_path):
+                    return
+                flat_uid = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_uid.npy")
+                )
+                event_types = _load_npy_readonly(
+                    os.path.join(cache_dir, "flat_event_types.npy")
+                )
+                user_start = _load_npy_readonly(
+                    os.path.join(cache_dir, "user_start.npy")
+                )
+                idx = np.arange(len(flat_uid), dtype=np.int64)
+                keep = (idx - user_start[flat_uid] >= min_history) & (
+                    event_types == LISTEN_TYPE
+                )
+                positions = np.where(keep)[0].astype(np.int64)
+                tmp = positions_path + ".tmp.npy"
+                np.save(tmp, positions)
+                os.replace(tmp, positions_path)
+                logger.info(
+                    f"Wrote {positions_name}: {len(positions):,} anchors "
+                    f"(min_history={min_history})"
+                )
+            finally:
+                fcntl.flock(lf, fcntl.LOCK_UN)
+
     @staticmethod
     def _ensure_cache_built(
         cache_dir: str, processed_dir: str, history_length: int
@@ -496,8 +574,11 @@ def _ensure_streaming_index(self) -> None:
         import fcntl
 
         assert self._cache_dir is not None
+        # Target-ts array is per-anchor, so it must track the same min_history
+        # versioning as the positions file it indexes into.
         anchor_path = os.path.join(
-            self._cache_dir, f"anchor_ts_L{self._history_length}.npy"
+            self._cache_dir,
+            self._positions_name.replace("positions_", "anchor_ts_", 1),
         )
         if not os.path.exists(anchor_path):
             lock_path = os.path.join(self._cache_dir, "_anchor_ts_lock")
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 0abfaa86a..dcda51365 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1422,6 +1422,7 @@ def get_dataset(
     name: str,
     new_path_prefix: str = "",
     history_length: Optional[int] = None,
+    min_history: Optional[int] = None,
     streaming_window_seconds: int = 86400,
     streaming_sort_within_window: bool = False,
     streaming_shuffle_fraction: float = 0.0,
@@ -1553,6 +1554,12 @@ def get_dataset(
                 # Override via `get_dataset.history_length = N` in gin.
                 "history_length": history_length if history_length is not None else 4096,
                 "scan_window": 20000,
+                # Anchor-eligibility floor: a LISTEN event qualifies once the
+                # user has >= min_history prior events. Decoupled from
+                # history_length (gather cap) since jagged attention handles
+                # short UIH. None = legacy (require a full history_length).
+                # Override via `get_dataset.min_history = N` / $MIN_HISTORY.
+                "min_history": min_history,
                 "cross_specs": YAMBDA_5B_CROSS_SPECS,
                 # Temporal-streaming knobs (only used under --mode
                 # streaming-train-eval; ignored by the default train-eval path).

From e5214b8b8ddfea6a3b5270a4aef385bfecd8b4af Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 12 Jun 2026 20:48:18 -0500
Subject: [PATCH 051/113] =?UTF-8?q?dlrmv4:=20gin=20defaults=20=E2=80=94=20?=
 =?UTF-8?q?min=5Fhistory=3D1,=20BATCH=5FSIZE=20env=20knob,=20default=20to?=
 =?UTF-8?q?=20no=20in-window=20shuffle?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bind get_dataset.min_history to $MIN_HISTORY (default 1 = ~all users). Make batch_size env-overridable via $BATCH_SIZE (default 1024). Change default streaming_shuffle_fraction 1.0 -> 0.0 (user-major, production streaming order); override per-run via $STREAMING_SHUFFLE_FRACTION (1.0 = full shuffle).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 31 ++++++++++++++-----
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 92c2404fc..3bfccc510 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -1,4 +1,9 @@
-batch_size = 1024
+# Per-rank batch size. Env-overridable so a denser per-sample shape can fit in
+# HBM by lowering it without editing gin. Default 1024 preserves prior runs;
+# streaming train+eval both read this macro (make_*streaming_dataloader.batch_size).
+batch_size = @bs/env_int()
+bs/env_int.key     = "BATCH_SIZE"
+bs/env_int.default = 1024
 # Dataloader parallelism. Env-overridable so a perf sweep can probe whether the
 # shuffle steady-state cost is CPU-gather latency (hidden by more workers) vs
 # GPU-side embedding work (not). Defaults preserve prior behavior (4 / 8).
@@ -100,6 +105,18 @@ get_dataset.history_length = @hl/env_int()
 hl/env_int.key     = "HISTORY_LENGTH"
 hl/env_int.default = 4086
 
+# Anchor-eligibility floor: a LISTEN event becomes a trainable/eval anchor once
+# the user has >= MIN_HISTORY prior events. Decoupled from history_length (which
+# is only the gather/truncation cap) — jagged attention handles short UIH, so we
+# no longer need a full history_length of context to include a sample. The legacy
+# behavior (require a full history_length, which dropped ~60% of users) is
+# MIN_HISTORY=4086; the default below (1) includes ~all users with any history.
+# Positions/anchor-ts caches are keyed by (L, MIN_HISTORY) so floors don't
+# collide. Override via $MIN_HISTORY.
+get_dataset.min_history = @mh/env_int()
+mh/env_int.key     = "MIN_HISTORY"
+mh/env_int.default = 1
+
 # Model-side attention budget. Dataset truncates UIH to fit this value if
 # `history_length + contextual + candidate` would overflow. Override via
 # $MAX_SEQ_LEN (default 4096, the 4k-no-truncation shape paired with
@@ -125,12 +142,12 @@ get_dataset.streaming_sort_within_window = False
 # shuffle (max diversity), intermediate = interpolation. Same fraction => same
 # diversity regardless of world_size / #nodes / batch_size.
 #
-# Default 1.0 (full shuffle) so the standard/benchmark run is maximally diverse
-# and, together with the fixed seed below, the in-window order is fully
-# DETERMINISTIC and identical across runs/resumes (pure function of (seed, ts)).
-# Override per-run via $STREAMING_SHUFFLE_FRACTION (e.g. 0.0 for user-major,
-# 0.03 for the diversity/locality sweet spot).
-streaming_shuffle_fraction = 1.0
+# Default 0.0 (no shuffle -> user-major, page-local mmap scans) so the standard
+# run matches the production streaming order. Together with the fixed seed below
+# the in-window order is fully DETERMINISTIC and identical across runs/resumes.
+# Override per-run via $STREAMING_SHUFFLE_FRACTION (e.g. 1.0 for a full
+# per-element shuffle / max diversity, 0.03 for the diversity/locality sweet spot).
+streaming_shuffle_fraction = 0.0
 get_dataset.streaming_shuffle_fraction = @ssf/env_float()
 ssf/env_float.key     = "STREAMING_SHUFFLE_FRACTION"
 ssf/env_float.default = %streaming_shuffle_fraction

From 5fdf70c0e7e5817a81edb1e775b8e135d65fb4c9 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 12 Jun 2026 20:48:19 -0500
Subject: [PATCH 052/113] dlrmv4: launch_slurm container hygiene + readiness
 gating + env passthroughs

Reap stale foreign GPU containers and restart our own to reclaim leaked HBM before launch; gate the worker exec on container State.Running + a probe with retry to fix container-restart races (which caused NCCL TCPStore 600s timeouts); APPEND_LOG=1 appends the metrics log on resume. Forward MIN_HISTORY/MAX_SEQ_LEN/HISTORY_LENGTH/BATCH_SIZE/CKPT_TIME_INTERVAL_S/DIAG_EMB_STEPS into the container. Drop the hardcoded --time; tidy header comments after launch_smoke_8gpu removal.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/scripts/launch_slurm.sh | 148 +++++++++++++++++++---
 1 file changed, 133 insertions(+), 15 deletions(-)

diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 42f8780ed..6b5ba7607 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -4,16 +4,15 @@
 #SBATCH --ntasks-per-node=1
 #SBATCH --exclusive
 #SBATCH --partition=meta64          # [CLUSTER-SPECIFIC] partition name
-#SBATCH --time=01:10:00
 #SBATCH --output=/apps/chcai/yambda_slurm.%j.out
 # =============================================================================
 # launch_slurm.sh — single entry point for the yambda-5b trainer on N>=1 nodes.
 #
-# Consolidates what used to be three separate files so multi-node enablement is
+# Consolidates what used to be separate scripts so multi-node enablement is
 # ONE committable script (plus the train_ranker.py / utils.py python changes):
-#   * sbatch_smoke_multinode.sh   -> the `orchestrate` phase (host SLURM glue)
-#   * _provision_yambda_primus.sh -> the `provision`   phase (container + RDMA)
-#   * launch_smoke_8gpu.sh        -> the `worker`      phase (in-container train)
+#   * orchestrate phase (host SLURM glue) — formerly sbatch_smoke_multinode.sh
+#   * provision   phase (container + RDMA) — formerly _provision_yambda_primus.sh
+#   * worker      phase (in-container train) — now inlined below
 #
 # PHASES (auto-detected from context; force with LAUNCH_SLURM_PHASE=<phase>):
 #   orchestrate  Runs on the SLURM batch host (no /.dockerenv). Resolves the
@@ -141,7 +140,15 @@ orchestrate() {
   METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
   FORCE_PROVISION=${FORCE_PROVISION:-0}
 
-  : > "$LOG"
+  # Truncate the metrics log on a FRESH run; APPEND on a supervised relaunch
+  # (APPEND_LOG=1) so the full-run NE/AUC history survives crash/node-failover
+  # resubmits instead of being wiped on every attempt (mirrors the single-node
+  # supervisor's init-once/append model).
+  if [ "${APPEND_LOG:-0}" = "1" ]; then
+    echo "[$(date)] === resume: appending to existing $LOG (APPEND_LOG=1) ===" >> "$LOG"
+  else
+    : > "$LOG"
+  fi
   echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
   echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
   echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ" | tee -a "$LOG"
@@ -177,19 +184,78 @@ orchestrate() {
   # --- step 1: ensure the container is up on every node ----------------------
   echo "[$(date)] ensuring container '$CONTAINER' on all nodes (force=$FORCE_PROVISION)" | tee -a "$LOG"
   srun --ntasks-per-node=1 bash -c "
+    # Reap stale/foreign GPU containers from prior jobs BEFORE (re)provisioning.
+    # The node is allocated --exclusive, so any GPU container other than
+    # '$CONTAINER' is an orphan left by a previous job (its container outlives the
+    # SLURM allocation). We remove every such container that has GPU access
+    # (/dev/kfd or /dev/dri) — running OR stopped, whether or not it currently
+    # pins VRAM ('docker ps -aq' includes stopped ones) — since idle orphans can
+    # still hold device handles or wake up; leaked HBM from these has caused both
+    # OOMs and RCCL collective hangs. We deliberately SKIP non-GPU containers
+    # (e.g. 'k8s-node-services-*' and other cluster system services) so we don't
+    # disrupt node infrastructure. docker teardown lets the driver reclaim HBM.
+    for _c in \$(docker ps -aq 2>/dev/null); do
+      _nm=\$(docker inspect -f '{{.Name}}' \"\$_c\" 2>/dev/null | sed 's#^/##')
+      [ \"\$_nm\" = \"$CONTAINER\" ] && continue
+      _dev=\$(docker inspect -f '{{range .HostConfig.Devices}}{{.PathOnHost}} {{end}}' \"\$_c\" 2>/dev/null)
+      case \"\$_dev\" in
+        *kfd*|*dri*)
+          echo \"[\$(hostname)] reaping stale GPU container \$_nm (\$_c)\"
+          docker rm -f \"\$_c\" >/dev/null 2>&1 || true ;;
+        *)
+          echo \"[\$(hostname)] keeping non-GPU/system container \$_nm (\$_c)\" ;;
+      esac
+    done
     if [ \"$FORCE_PROVISION\" = \"1\" ] || ! docker exec $CONTAINER true >/dev/null 2>&1; then
       echo \"[\$(hostname)] (re)provisioning container\"
       LAUNCH_SLURM_PHASE=provision CONTAINER=$CONTAINER IMAGE=$IMAGE \
         BAKED_IMAGE=$BAKED_IMAGE BAKED_TAR=$BAKED_TAR USE_BAKED=$USE_BAKED \
         BAKE_IMAGE=${BAKE_IMAGE:-0} RDMA_OVERLAY=$OVERLAY REPO=$REPO bash $SCRIPT_PATH
     else
-      echo \"[\$(hostname)] container already up\"
+      # Container persists across jobs; the reap above only removes FOREIGN GPU
+      # containers, so our own '$CONTAINER' can still pin HBM via stray trainer
+      # ranks left by a prior OOM/crash (this caused repeated 'CUDA out of memory'
+      # on relaunch onto the same node). Restart it to kill every exec'd proc and
+      # let the driver reclaim HBM — cheap (keeps the installed deps in the
+      # container fs; NFS RDMA overlay also persists), no full re-provision.
+      echo \"[\$(hostname)] container already up — restarting to free any leaked HBM before launch\"
+      docker restart $CONTAINER >/dev/null 2>&1 || true
+      # Readiness gate: a bare 'docker exec true' can pass while the runtime is
+      # still settling, so the SUBSEQUENT (heavier) worker exec races the restart
+      # and dies with 'container is not running' / OCI 'setns' errors (observed on
+      # c07-08 and e08-08 -> the peer never joins rendezvous -> master 600s
+      # TCPStore timeout). Require State.Running=true AND a successful probe, then
+      # a short settle, before considering the container ready.
+      for _w in \$(seq 1 30); do
+        [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
+          && docker exec $CONTAINER true >/dev/null 2>&1 && break
+        sleep 2
+      done
+      sleep 2
+      echo \"[\$(hostname)] container restarted (HBM reclaimed; running=\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null))\"
     fi
   " 2>&1 | tee -a "$LOG"
 
   # --- step 2: launch the worker (trainer) inside the container on every node -
   echo "[$(date)] launching trainer (worker phase) on all nodes" | tee -a "$LOG"
   srun --ntasks-per-node=1 bash -c "
+    # Pre-flight readiness gate (per node): step 1 ran in a SEPARATE srun, so the
+    # container can still be settling here. Wait for State.Running=true + a probe
+    # before the worker exec so we don't race a just-restarted container.
+    for _w in \$(seq 1 30); do
+      [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
+        && docker exec $CONTAINER true >/dev/null 2>&1 && break
+      [ \$_w -eq 1 ] && echo \"[\$(hostname)] worker pre-flight: waiting for container to be ready...\"
+      sleep 2
+    done
+    # Retry wrapper: docker exec startup failures (rc 125 daemon 'container is not
+    # running', 126/127 OCI/setns 'exec failed') mean the container wasn't ready,
+    # NOT that the trainer ran and failed. Restart + re-gate + retry a few times.
+    # Any OTHER rc (the trainer actually started and exited) is propagated so the
+    # supervisor's resume-from-checkpoint logic owns real failures.
+    _wattempt=0
+    while : ; do
+    _wattempt=\$((_wattempt+1))
     docker exec \
       -e LAUNCH_SLURM_PHASE=worker \
       -e SLURM_NNODES=\$SLURM_NNODES \
@@ -213,7 +279,16 @@ orchestrate() {
       ${NUM_WORKERS:+-e NUM_WORKERS=$NUM_WORKERS} \
       ${PREFETCH_FACTOR:+-e PREFETCH_FACTOR=$PREFETCH_FACTOR} \
       ${DIAG_UNIQUE_EMB:+-e DIAG_UNIQUE_EMB=$DIAG_UNIQUE_EMB} \
+      ${DIAG_EMB_STEPS:+-e DIAG_EMB_STEPS=$DIAG_EMB_STEPS} \
       ${OUTPUT_TRACE:+-e OUTPUT_TRACE=$OUTPUT_TRACE} \
+      ${MIN_HISTORY:+-e MIN_HISTORY=$MIN_HISTORY} \
+      ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN=$MAX_SEQ_LEN} \
+      ${HISTORY_LENGTH:+-e HISTORY_LENGTH=$HISTORY_LENGTH} \
+      ${BATCH_SIZE:+-e BATCH_SIZE=$BATCH_SIZE} \
+      ${CKPT_TIME_INTERVAL_S:+-e CKPT_TIME_INTERVAL_S=$CKPT_TIME_INTERVAL_S} \
+      ${KEEP_LAST_N:+-e KEEP_LAST_N=$KEEP_LAST_N} \
+      ${IN_WINDOW_CKPT_FREQ:+-e IN_WINDOW_CKPT_FREQ=$IN_WINDOW_CKPT_FREQ} \
+      ${CKPT_STEP_FREQ:+-e CKPT_STEP_FREQ=$CKPT_STEP_FREQ} \
       -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-0.90} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
@@ -224,6 +299,20 @@ orchestrate() {
       -e LOG=$LOG \
       $NCCL_ENV_ARGS \
       $CONTAINER bash -lc 'cd $REPO && LAUNCH_SLURM_PHASE=worker bash scripts/launch_slurm.sh'
+    _wrc=\$?
+    if { [ \$_wrc -eq 125 ] || [ \$_wrc -eq 126 ] || [ \$_wrc -eq 127 ]; } && [ \$_wattempt -lt 5 ]; then
+      echo \"[\$(hostname)] worker exec failed to START (rc=\$_wrc, attempt \$_wattempt/5) — container not ready; restarting + retrying\"
+      docker restart $CONTAINER >/dev/null 2>&1 || true
+      for _w in \$(seq 1 30); do
+        [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
+          && docker exec $CONTAINER true >/dev/null 2>&1 && break
+        sleep 2
+      done
+      sleep 3
+      continue
+    fi
+    exit \$_wrc
+    done
   " 2>&1 | tee -a "$LOG"
   rc=${PIPESTATUS[0]}
   echo "[$(date)] launch_slurm/orchestrate finished rc=$rc" | tee -a "$LOG"
@@ -400,21 +489,29 @@ worker() {
   else
     NNODES=${NNODES:-1}
     NODE_RANK=${NODE_RANK:-0}
-    MASTER_ADDR=${MASTER_ADDR:-localhost}
+    # Single-node: all ranks live on THIS host, so rendezvous over loopback and
+    # do NOT use the SLURM hostname. On some nodes the hostname resolves to a
+    # non-routable per-GPU RoCE /31 (benic 192.168.x) address; using it makes the
+    # NCCL bootstrap fail with "No route to host". localhost is node-independent.
+    MASTER_ADDR=localhost
     MASTER_PORT=${MASTER_PORT:-}     # empty => train_ranker picks a free port
   fi
   export NNODES NODE_RANK GPUS_PER_NODE MASTER_ADDR MASTER_PORT
   export WORLD_SIZE=$(( NNODES * GPUS_PER_NODE ))
   echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node=$GPUS_PER_NODE world_size=$WORLD_SIZE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}" | tee -a "$LOG"
 
-  # RCCL/NCCL cross-node knobs (multi-node only; single-node leaves auto-detect).
-  # The container is --network=host so RCCL sees ALL host interfaces; split the
-  # two planes explicitly: TCP bootstrap over the routable fenic0, RDMA data over
-  # the 8 Broadcom bnxt_re RoCE HCAs (the per-GPU benic* 192.168.x/31 links are
-  # NOT node-routable for TCP — auto-detect there hangs init).
+  # NCCL bootstrap NIC — pin for BOTH single- and multi-node. The container is
+  # --network=host so RCCL sees ALL host interfaces; if left to auto-detect, NCCL
+  # can pick a non-routable per-GPU RoCE /31 (benic* 192.168.x) link and fail
+  # bootstrap with "No route to host" (this is node-dependent: it happened to
+  # work on some nodes and not others, causing repetitive single-node init
+  # failures). Pinning the routable host NIC fixes it everywhere.
+  # [CLUSTER-SPECIFIC] routable host NIC for TCP bootstrap (find via `ip -br addr`).
+  export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
+
+  # Multi-node additionally needs the RDMA data-plane (bnxt_re HCAs) configured;
+  # single-node uses intra-node P2P (XGMI/PCIe) so only the bootstrap NIC matters.
   if [ "$NNODES" -gt 1 ]; then
-    # [CLUSTER-SPECIFIC] routable host NIC for TCP bootstrap (find via `ip -br addr`).
-    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
     NCCL_NET_TRANSPORT=${NCCL_NET_TRANSPORT:-ib}
     if [ "$NCCL_NET_TRANSPORT" = "socket" ]; then
       export NCCL_IB_DISABLE=1
@@ -467,6 +564,27 @@ worker() {
     echo "[$(date)] sclk sample (GPU0):$(rocm-smi -d 0 --showclocks 2>/dev/null | grep -i 'sclk clock level' | sed -E 's/.*sclk clock level//')" | tee -a "$LOG" || true
   fi
 
+  # --- stray-trainer / leaked-VRAM guard -------------------------------------
+  # The trainer runs via `docker exec` into a long-lived container, so its procs
+  # live in the container PID namespace, NOT the SLURM job cgroup. If a prior job
+  # OOM'd/crashed, a rank can leak and keep holding ~half of every GPU's VRAM,
+  # which persists across jobs (container survives) and guarantees the next
+  # attempt OOMs. Before launching, reap any pre-existing trainer procs (there
+  # should be none at this point) and wait for VRAM to drain. [g]-guard avoids
+  # self-match. Non-fatal.
+  if pgrep -f '[g]enerative_recommenders' >/dev/null 2>&1; then
+    echo "[$(date)] WARNING: leaked trainer procs found pre-launch — killing." | tee -a "$LOG"
+    pkill -9 -f '[g]enerative_recommenders' 2>/dev/null || true
+    for _i in $(seq 1 15); do
+      pgrep -f '[g]enerative_recommenders' >/dev/null 2>&1 || break
+      sleep 2
+    done
+    sleep 5  # let the driver release VRAM after process exit
+    if command -v rocm-smi >/dev/null 2>&1; then
+      echo "[$(date)] post-cleanup GPU0 used GiB:$(rocm-smi --showmeminfo vram 2>/dev/null | awk -F: '/Used/{printf " %.0f", $3/1073741824; exit}')" | tee -a "$LOG"
+    fi
+  fi
+
   echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
   python -m generative_recommenders.dlrm_v3.train.train_ranker \
       --dataset yambda-5b --mode "${MODE:-streaming-train-eval}" 2>&1 | tee -a "$LOG"

From 6182ab46dce96c3b52ccdcd60c429ebd078f5566 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 12 Jun 2026 20:48:19 -0500
Subject: [PATCH 053/113] dlrmv4: durable per-boundary eval metrics JSONL sink
 + aggregate emb-diag
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Write one JSON line per eval boundary to <LOG>.metrics.jsonl capturing the end-of-pass metrics over the fixed holdout (append-only, rank 0, survives restarts/resumes) — no interim/averaging ambiguity. Rework the unique-embedding diagnostic into an aggregate over DIAG_EMB_STEPS batches covering the cross-feature tables.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/utils.py                    | 178 ++++++++++++++++--
 1 file changed, 162 insertions(+), 16 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 076412d6e..6208214fb 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -688,34 +688,129 @@ def make_train_test_dataloaders(
     return train_dataloader, test_dataloader
 
 
-def _log_unique_embedding_diag(sample, rank: int, step: int) -> None:
-    """Diagnostic: log per-batch unique-vs-total embedding-id counts.
+# THROWAWAY DIAG state: per-embedding-table lookup stats accumulated across a
+# fixed window of steps (DIAG_EMB_STEPS, default 100) so the reported numbers are
+# averaged/aggregated rather than a single noisy batch. Rank-0 only.
+_EMB_DIAG_ACC: Dict[str, Dict[str, Any]] = {}
+_EMB_DIAG_NBATCH: int = 0
+# Cap (per-batch lookups) below which we also track a TRUE global-unique set
+# across the whole window (cheap for contextual/cross tables, total == batch
+# size). Sequential tables (item/artist/album) blow past this and only get the
+# per-batch averages.
+_EMB_DIAG_GLOBAL_CAP: int = 1 << 17  # 131072
+
+
+def _log_unique_embedding_diag(
+    sample, rank: int, step: int, max_steps: int = 100, log_every: int = 50
+) -> None:
+    """Diagnostic: aggregate per-embedding-table lookup stats over a step window.
 
     Quantifies the user-major batching concern — when consecutive sliding-window
     anchors come from the same few users, a batch reads very few UNIQUE embedding
-    rows (low unique/total), so embedding lookups are highly redundant. In-window
-    shuffle should raise this ratio. Rank-0 only, gated by the caller (only fires
-    on logged steps), and fully non-fatal so it can never break training.
+    rows (low unique/total), so embedding lookups are highly redundant. Covers the
+    base id tables AND the cross-feature tables (user_x_* / *_x_hour hashed
+    combos); the user_x_* tables should be the most redundant under shuffle OFF.
+
+    Accumulates over ``max_steps`` batches and emits an aggregate summary (mean
+    per-batch unique%/hot%/top10%, plus a true global-unique over the whole
+    window for the small contextual/cross tables). Rank-0 only, non-fatal.
     """
     if rank != 0:
         return
+    global _EMB_DIAG_NBATCH
     try:
-        parts = []
+        from generative_recommenders.dlrm_v3.configs import YAMBDA_5B_CROSS_SPECS
+
+        cross_caps = {name: n for (name, _k, n, _s) in YAMBDA_5B_CROSS_SPECS}
+
+        def _table_of(key: str):
+            # cross tables match by exact name; resolve BEFORE substring fallbacks
+            # so e.g. 'user_x_artist' isn't misread as the artist_id table.
+            if key in cross_caps:
+                return key, cross_caps[key]
+            if key == "uid" or key.endswith("_uid") or key.endswith(".uid"):
+                return "uid", 0
+            if "artist" in key:
+                return "artist_id", 0
+            if "album" in key:
+                return "album_id", 0
+            if "item" in key and key.endswith("id"):
+                return "item_id", 0
+            return None, 0
+
         for tag, kjt in (
             ("uih", sample.uih_features_kjt),
             ("cand", sample.candidates_features_kjt),
         ):
             for key in kjt.keys():
-                if not key.endswith(("item_id", "artist_id", "album_id")):
+                table, cap = _table_of(key)
+                if table is None:
                     continue
                 vals = kjt[key].values()
                 total = int(vals.numel())
                 if total == 0:
                     continue
-                uniq = int(torch.unique(vals).numel())
-                parts.append(f"{tag}.{key}={uniq}/{total} ({100.0 * uniq / total:.1f}%)")
-        if parts:
-            logger.info(f"emb-diag - Step {step}: unique/total " + " ".join(parts))
+                u, counts = torch.unique(vals, return_counts=True)
+                uniq = int(u.numel())
+                hot1 = int(counts.max().item())
+                k = min(10, uniq)
+                topk = int(torch.topk(counts, k).values.sum().item())
+
+                slot = _EMB_DIAG_ACC.setdefault(
+                    f"{tag}.{key}",
+                    {
+                        "table": table,
+                        "cap": cap,
+                        "n": 0,
+                        "tot": 0,
+                        "uniq": 0,
+                        "upct": 0.0,
+                        "upct_min": 100.0,
+                        "upct_max": 0.0,
+                        "hot1pct": 0.0,
+                        "topkpct": 0.0,
+                        "glob": None,  # running global-unique id tensor (small tables)
+                    },
+                )
+                upct = 100.0 * uniq / total
+                slot["n"] += 1
+                slot["tot"] += total
+                slot["uniq"] += uniq
+                slot["upct"] += upct
+                slot["upct_min"] = min(slot["upct_min"], upct)
+                slot["upct_max"] = max(slot["upct_max"], upct)
+                slot["hot1pct"] += 100.0 * hot1 / total
+                slot["topkpct"] += 100.0 * topk / total
+                if total <= _EMB_DIAG_GLOBAL_CAP:
+                    prev = slot["glob"]
+                    merged = u if prev is None else torch.cat([prev, u])
+                    slot["glob"] = torch.unique(merged)
+
+        _EMB_DIAG_NBATCH += 1
+        n = _EMB_DIAG_NBATCH
+        if n % log_every == 0 or n >= max_steps:
+            lines = [f"emb-diag AGGREGATE over {n} batches (step<= {step}):"]
+            for name in sorted(_EMB_DIAG_ACC):
+                s = _EMB_DIAG_ACC[name]
+                c = max(1, s["n"])
+                cap_s = f" cap={s['cap']/1e6:.0f}M" if s["cap"] else ""
+                glob_s = ""
+                if s["glob"] is not None:
+                    g = int(s["glob"].numel())
+                    glob_s = (
+                        f" | global_uniq={g} over {s['tot']} seen "
+                        f"({s['tot']/max(1,g):.1f}x reuse)"
+                    )
+                lines.append(
+                    f"  {name}[{s['table']}]{cap_s}: "
+                    f"avg_tot={s['tot']/c:.0f} "
+                    f"avg_uniq%={s['upct']/c:.1f} "
+                    f"(min={s['upct_min']:.1f} max={s['upct_max']:.1f}) "
+                    f"avg_hot1%={s['hot1pct']/c:.1f} "
+                    f"avg_top10%={s['topkpct']/c:.1f}"
+                    f"{glob_s}"
+                )
+            logger.info("\n".join(lines))
     except Exception as e:  # diagnostic must never break training
         logger.warning(f"emb-diag failed: {e}")
 
@@ -744,8 +839,16 @@ def train_loop(
     for epoch in range(num_epochs):
         dataloader.sampler.set_epoch(epoch)  # pyre-ignore [16]
         for sample in dataloader:
-            if streaming_diag_unique_emb and batch_idx % metric_log_frequency == 0:
-                _log_unique_embedding_diag(sample, rank, batch_idx)
+            if streaming_diag_unique_emb and batch_idx < int(
+                os.environ.get("DIAG_EMB_STEPS", "100")
+            ):
+                _log_unique_embedding_diag(
+                    sample,
+                    rank,
+                    batch_idx,
+                    max_steps=int(os.environ.get("DIAG_EMB_STEPS", "100")),
+                    log_every=metric_log_frequency,
+                )
             optimizer.zero_grad()
             sample.to(device)
             (
@@ -1494,8 +1597,16 @@ def _run_train_window(
                 break
             if _t_next is not None and first_wait is None:
                 first_wait = time.perf_counter() - _t_next
-            if streaming_diag_unique_emb and train_batch_idx % metric_log_frequency == 0:
-                _log_unique_embedding_diag(sample, rank, train_batch_idx)
+            if streaming_diag_unique_emb and train_batch_idx < int(
+                os.environ.get("DIAG_EMB_STEPS", "100")
+            ):
+                _log_unique_embedding_diag(
+                    sample,
+                    rank,
+                    train_batch_idx,
+                    max_steps=int(os.environ.get("DIAG_EMB_STEPS", "100")),
+                    log_every=metric_log_frequency,
+                )
             optimizer.zero_grad()
             sample.to(device)
             (
@@ -1653,7 +1764,8 @@ def _run_eval_window(eval_data_iterator, label: Optional[str] = None) -> None:
                     metric_logger.compute_and_log(mode="eval")
                 if num_eval_batches is not None and eval_batch_idx >= num_eval_batches:
                     break
-            for k, v in metric_logger.compute(mode="eval").items():
+            _eval_metrics = metric_logger.compute(mode="eval")
+            for k, v in _eval_metrics.items():
                 print(f"{k}: {v}")
         if label and rank == 0 and _t_enter is not None:
             _eval_total = time.perf_counter() - _t_enter
@@ -1662,6 +1774,40 @@ def _run_eval_window(eval_data_iterator, label: Optional[str] = None) -> None:
                 f"[boundary] {label} eval first-batch data-wait={_fw:.1f}ms "
                 f"total_eval={_eval_total * 1000:.1f}ms batches={eval_batch_idx}"
             )
+            # Dedicated per-eval metrics sink. One JSON line per eval boundary
+            # capturing the END-OF-PASS metric over the FIXED holdout -- the single
+            # correct value for that eval point (no interim/averaging ambiguity).
+            # Rank 0 only; append-only so it survives restarts and the trajectory
+            # accumulates across resumes. Written next to the main run log
+            # ("<LOG>.metrics.jsonl"), falling back to cwd if LOG is unset.
+            import json
+            import re as _re
+
+            _log = os.environ.get("LOG")
+            if _log:
+                _base = _log[:-4] if _log.endswith(".log") else _log
+                _metrics_path = f"{_base}.metrics.jsonl"
+            else:
+                _metrics_path = "streaming_eval_metrics.jsonl"
+            _ts_m = _re.search(r"train_ts=(\d+)", label)
+            _rec = {
+                "label": label,
+                "train_ts": int(_ts_m.group(1)) if _ts_m else None,
+                "global_step": int(metric_logger.global_step.get("train", -1)),
+                "eval_batches": eval_batch_idx,
+                "total_eval_ms": round(_eval_total * 1000, 1),
+                "wall_time": time.time(),
+            }
+            for _k, _v in _eval_metrics.items():
+                try:
+                    _rec[_k] = float(_v)
+                except (TypeError, ValueError):
+                    pass
+            try:
+                with open(_metrics_path, "a") as _f:
+                    _f.write(json.dumps(_rec) + "\n")
+            except OSError as _e:
+                logger.warning("failed to write metrics sink %s: %s", _metrics_path, _e)
 
     def _maybe_checkpoint(train_ts: int) -> None:
         if (

From 51e6e5f1dfd6aa182a4762ab71fbe629834af027 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 12 Jun 2026 20:53:11 -0500
Subject: [PATCH 054/113] dlrmv4: default MIN_HISTORY=0 (include cold-start
 first events) to match min0 runs

All live min0 runs already export MIN_HISTORY=0; make the gin default match so future runs without an override include each user's cold-start first event (zero prior context) instead of requiring >=1 prior event.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin                       | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 3bfccc510..e6922f594 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -110,12 +110,12 @@ hl/env_int.default = 4086
 # is only the gather/truncation cap) — jagged attention handles short UIH, so we
 # no longer need a full history_length of context to include a sample. The legacy
 # behavior (require a full history_length, which dropped ~60% of users) is
-# MIN_HISTORY=4086; the default below (1) includes ~all users with any history.
-# Positions/anchor-ts caches are keyed by (L, MIN_HISTORY) so floors don't
-# collide. Override via $MIN_HISTORY.
+# MIN_HISTORY=4086; the default below (0) includes ~all users AND their
+# cold-start first event (zero prior context). Positions/anchor-ts caches are
+# keyed by (L, MIN_HISTORY) so floors don't collide. Override via $MIN_HISTORY.
 get_dataset.min_history = @mh/env_int()
 mh/env_int.key     = "MIN_HISTORY"
-mh/env_int.default = 1
+mh/env_int.default = 0
 
 # Model-side attention budget. Dataset truncates UIH to fit this value if
 # `history_length + contextual + candidate` would overflow. Override via

From 03a5be13268c5102354fa98183215a93210456cf Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 12 Jun 2026 23:46:39 -0500
Subject: [PATCH 055/113] dlrmv4: default NUM_TRAIN_TS=149 to sweep full
 ts=150..298 streaming range

Matches the long e2e runs (start_ts=150 + 149 daily windows). EVAL_EACH_WINDOW
and EVAL_EVERY_N_WINDOWS already default to 1. Clamped to the dataset's
available window count at runtime; override via $NUM_TRAIN_TS.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin  | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index e6922f594..ad268aec7 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -173,7 +173,10 @@ make_persistent_streaming_dataloader.num_workers = %num_workers
 make_persistent_streaming_dataloader.prefetch_factor = %prefetch_factor
 streaming_train_eval_loop.num_train_ts = @nts/env_int()
 nts/env_int.key = "NUM_TRAIN_TS"
-nts/env_int.default = 30
+# 149 daily windows -> with start_ts=150 the run sweeps ts=150..298, the full
+# dense range of the corpus (matches the long e2e runs). Clamped to the
+# dataset's available window count at runtime. Override via $NUM_TRAIN_TS.
+nts/env_int.default = 149
 # Anchors need >= history_length prior events, so the first ~130 daily windows
 # are near-empty warm-up; start at a dense window. Override via $START_TS.
 streaming_train_eval_loop.start_ts = @sts/env_int()

From 20f691776fff68b04ca1cb0ce0cc21039c88182c Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 12 Jun 2026 23:57:09 -0500
Subject: [PATCH 056/113] dlrmv4: consolidate eval cadence into single
 EVAL_EVERY_N_WINDOWS knob

Drop the redundant EVAL_EACH_WINDOW on/off boolean and fold "disable eval"
into the cadence int: 0 = eval off (train-only / resume test), 1 = every
window, N>1 = every Nth window (anchored to the absolute ts grid, stable
across resume). Updates streaming_train_eval_loop, the gin bindings, and the
resume test harness accordingly. No behavior change for the live runs
(EVAL_EVERY_N_WINDOWS=1).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 14 +++++-----
 .../train/tests/streaming_resume_test.sh      |  8 +++---
 .../dlrm_v3/train/utils.py                    | 26 ++++++++++---------
 3 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index ad268aec7..0b2c884bd 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -206,13 +206,13 @@ ot/env_int.default = 0
 streaming_train_eval_loop.persistent_loader = @pl/env_int()
 pl/env_int.key = "PERSISTENT_LOADER"
 pl/env_int.default = 1
-streaming_train_eval_loop.eval_each_window = @ev/env_int()
-ev/env_int.key = "EVAL_EACH_WINDOW"
-ev/env_int.default = 1
-# Full-holdout eval cadence: run eval every Nth train window (and always on the
-# final window) instead of every window. 1 (default) = eval every window (no
-# behavior change). Set >1 (e.g. 5 via $EVAL_EVERY_N_WINDOWS) to amortize the
-# cost of consuming the full next-day eval window over several train windows.
+# Full-holdout eval cadence (single knob; replaces the old EVAL_EACH_WINDOW
+# on/off switch). 0 = eval disabled (train-only, e.g. perf benchmarking or the
+# resume test; the eval dataloader isn't even built). 1 (default) = eval after
+# every window. N>1 (e.g. 5 via $EVAL_EVERY_N_WINDOWS) = eval every Nth window
+# (and always the final one) to amortize the cost of consuming the full next-day
+# eval window. The cadence is anchored to the absolute ts grid so eval points
+# stay stable across a mid-run resume.
 streaming_train_eval_loop.eval_every_n_windows = @evn/env_int()
 evn/env_int.key = "EVAL_EVERY_N_WINDOWS"
 evn/env_int.default = 1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
index c3690e652..e14e557e8 100755
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -13,7 +13,7 @@
 #
 # Driven entirely via env-driven gin knobs defined in yambda_5b.gin:
 #   NUM_TRAIN_TS / NUM_TRAIN_BATCHES / IN_WINDOW_CKPT_FREQ / DIE_AT_STEP /
-#   CKPT_PATH / KEEP_LAST_N / EVAL_EACH_WINDOW
+#   CKPT_PATH / KEEP_LAST_N / EVAL_EVERY_N_WINDOWS
 #
 # Usage:
 #   bash generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh --jobid <slurm-jobid>
@@ -127,7 +127,7 @@ run_phase() {
 clean_ckpt
 run_phase baseline \
     "NUM_TRAIN_TS=1" \
-    "EVAL_EACH_WINDOW=0" \
+    "EVAL_EVERY_N_WINDOWS=0" \
     "METRIC_LOG_FREQ=1" \
     "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
     "DIE_AT_STEP=-1"
@@ -140,7 +140,7 @@ cleanup_workers
 clean_ckpt
 run_phase interrupt \
     "NUM_TRAIN_TS=1" \
-    "EVAL_EACH_WINDOW=0" \
+    "EVAL_EVERY_N_WINDOWS=0" \
     "METRIC_LOG_FREQ=1" \
     "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
     "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" \
@@ -158,7 +158,7 @@ echo "Saved checkpoints after interrupt: $SAVED"
 # === Phase 3: resume ===
 run_phase resume \
     "NUM_TRAIN_TS=1" \
-    "EVAL_EACH_WINDOW=0" \
+    "EVAL_EVERY_N_WINDOWS=0" \
     "METRIC_LOG_FREQ=1" \
     "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
     "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" \
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 6208214fb..9ef251951 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1316,7 +1316,6 @@ def streaming_train_eval_loop(
     checkpoint_frequency: int = 100,
     start_ts: int = 0,
     persistent_loader: bool = False,
-    eval_each_window: bool = True,
     eval_every_n_windows: int = 1,
     double_buffer: bool = False,
     # --- fixed user-holdout eval set ---
@@ -1838,18 +1837,21 @@ def _maybe_checkpoint(train_ts: int) -> None:
     def _should_eval(i: int) -> bool:
         """Whether to run the full-holdout eval after training window index `i`.
 
-        `eval_every_n_windows<=1` (default) preserves the per-window cadence.
-        For K>1 we eval when the ABSOLUTE window ts is on the grid anchored at
-        `eval_anchor_ts` (the original start_ts), i.e. ts in {anchor, anchor+K,
-        anchor+2K, ...}, and ALWAYS on the final window so the trajectory ends
-        with an eval point. Anchoring to the absolute ts (not the per-call loop
-        index `i`) keeps the eval grid (e.g. 150,160,170,...) stable across a
-        mid-run resume, which rebases start_ts/`train_ts_list` to the resume
-        window. Gated by `eval_each_window`.
+        Single cadence knob `eval_every_n_windows`:
+          * <=0 -> eval disabled entirely (train-only; e.g. perf benchmarking or
+            the resume test). The eval dataloader is not even built.
+          * 1 (default) -> eval after every window.
+          * K>1 -> eval when the ABSOLUTE window ts is on the grid anchored at
+            `eval_anchor_ts` (the original start_ts), i.e. ts in {anchor,
+            anchor+K, anchor+2K, ...}, and ALWAYS on the final window so the
+            trajectory ends with an eval point. Anchoring to the absolute ts
+            (not the per-call loop index `i`) keeps the eval grid (e.g.
+            150,160,170,...) stable across a mid-run resume, which rebases
+            start_ts/`train_ts_list` to the resume window.
         """
-        if not eval_each_window:
+        if eval_every_n_windows <= 0:
             return False
-        if eval_every_n_windows <= 1:
+        if eval_every_n_windows == 1:
             return True
         return (train_ts_list[i] - eval_anchor_ts) % eval_every_n_windows == 0 or i == n_train - 1
 
@@ -1892,7 +1894,7 @@ def _should_eval(i: int) -> bool:
         # sample content depends only on the sampler window, not is_eval, so
         # prefetching during train is safe.
         eval_iter: Optional[Iterator] = None
-        if eval_each_window and len(train_ts_list) > 0:
+        if eval_every_n_windows > 0 and len(train_ts_list) > 0:
             eval_sampler = StreamingWindowSampler(rank, world_size)
             eval_dl = make_persistent_streaming_dataloader(
                 dataset=dataset, sampler=eval_sampler

From 9171e4f4871b59a94374fa171e1be3e5433b9fb0 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Mon, 15 Jun 2026 06:45:10 +0000
Subject: [PATCH 057/113] dlrmv4: random per-run seed + graceful teardown + no
 double-logging

- setup(): init the process group first, then draw a fresh random seed per
  run (rank 0 broadcasts so all ranks agree), export it to $SEED, and log it;
  pin $SEED to reproduce a run exactly. Data order/split are unaffected (still
  time-deterministic + $SPLIT_SALT-governed); the seed governs dense weight
  init. MLPerf SEED event now logs the actual chosen value, not hardcoded 1.
- train_ranker(): move distributed teardown into a finally block so a clean
  finish also barriers + destroy_process_group()s in lockstep, silencing the
  noisy TCPStore "broken pipe" / "should dump" warnings at exit. Best-effort
  so teardown never masks a real error.
- launch_slurm.sh: orchestrate sets WORKER_TEE=0 so the worker points its own
  file sink at /dev/null (stdout is already tee'd upstream), avoiding every log
  line being written twice. Direct single-node (e2e supervisor) keeps WORKER_TEE
  unset and still writes $LOG itself.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/train_ranker.py             | 26 +++++++++--
 .../dlrm_v3/train/utils.py                    | 46 ++++++++++++++-----
 recommendation_v4/scripts/launch_slurm.sh     |  9 ++++
 3 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 0168add5b..d22d09a30 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -220,8 +220,9 @@ def _gin_param(name: str, default: object) -> object:
         global_batch_size = world_size * int(train_dataloader.batch_size)
         mlperf_logger.event(key=c.GLOBAL_BATCH_SIZE, value=global_batch_size)
         mlperf_logger.event(key=c.GRADIENT_ACCUMULATION_STEPS, value=1)
-        # Seed is fixed in setup() (_SEED = 1); kept in sync here.
-        mlperf_logger.event(key=c.SEED, value=1)
+        # Log the ACTUAL seed chosen in setup() (random per-run unless $SEED is
+        # pinned; setup() exports the chosen value to $SEED).
+        mlperf_logger.event(key=c.SEED, value=int(os.environ.get("SEED", "1")))
         # Dense (Adam) + sparse (RowWiseAdagrad) optimizer hyperparameters,
         # read from the active gin bindings.
         mlperf_logger.event(
@@ -309,8 +310,27 @@ def _gin_param(name: str, default: object) -> object:
             )
     except Exception as e:
         logger.info(traceback.format_exc())
-        cleanup()
         raise Exception(e)
+    finally:
+        # Graceful distributed teardown (runs on BOTH success and failure).
+        # Previously cleanup() ran only in the except branch, so a clean finish
+        # returned without destroying the process group: ranks that returned
+        # first let rank 0's TCPStore close while peers' ProcessGroupNCCL
+        # heartbeat-monitor threads were still polling it, emitting the noisy
+        # (but harmless) "Failed to check the 'should dump' flag on TCPStore /
+        # Broken pipe" warnings + C++ stack traces at exit. Barrier first so all
+        # ranks reach the end in lockstep, then destroy_process_group() stops
+        # each rank's monitor thread and closes NCCL/the store in order. Both
+        # steps are guarded/best-effort so teardown never masks a real error.
+        if torch.distributed.is_initialized():
+            try:
+                torch.distributed.barrier()
+            except Exception:
+                logger.info("teardown barrier failed (non-fatal)")
+            try:
+                cleanup()
+            except Exception:
+                logger.info("teardown destroy_process_group failed (non-fatal)")
 
 
 def get_args():  # pyre-ignore [3]
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 96f826f84..79a8571b0 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -96,27 +96,49 @@ def setup(
     # leaving stale allocations and triggering OOMs on rank 0.
     torch.cuda.set_device(device)
 
-    # Seed all RNGs so weight init (make_model, called after setup) is
-    # reproducible across runs. Same seed on every rank → dense params are
-    # initialized identically across ranks; sharded embeddings are init'd from
-    # the meta device by DMP. Fixed seed makes pipeline-vs-non-pipeline an
-    # init-matched A/B (data order is already deterministic via the sampler).
     import random
 
     import numpy as np
 
-    _SEED = 1
-    random.seed(_SEED)
-    np.random.seed(_SEED)
-    torch.manual_seed(_SEED)
-    torch.cuda.manual_seed_all(_SEED)
-
-    # initialize the process group
+    # initialize the process group FIRST so ranks can agree on a shared seed
+    # (the seed MUST be identical on every rank, else dense weight init diverges
+    # across ranks and DDP/AllReduce trains garbage).
     if not dist.is_initialized():
         dist.init_process_group(
             "nccl", rank=rank, world_size=world_size, device_id=device
         )
 
+    # Seed selection. By default we draw a FRESH RANDOM seed every run so each
+    # launch explores a different dense weight init; pin $SEED to reproduce a
+    # specific run exactly. rank 0 draws the seed and broadcasts it so all ranks
+    # share one value; the chosen seed is exported to $SEED and logged so any run
+    # can be reproduced after the fact. NOTE (streaming-train-eval): the data
+    # ORDER and train/holdout split do NOT depend on this seed — order is
+    # time-deterministic (StreamingWindowSampler) and the split is governed by
+    # $SPLIT_SALT. The seed governs dense weight init + global-RNG stochastic ops.
+    env_seed = os.environ.get("SEED", "").strip()
+    if env_seed:
+        seed = int(env_seed)
+    else:
+        seed = int.from_bytes(os.urandom(4), "little") if rank == 0 else 0
+        _seed_t = torch.tensor([seed], dtype=torch.int64, device=device)
+        dist.broadcast(_seed_t, src=0)
+        seed = int(_seed_t.item())
+    os.environ["SEED"] = str(seed)
+
+    # Seed all RNGs with the agreed value so weight init (make_model, called
+    # after setup) is identical across ranks; sharded embeddings are init'd from
+    # the meta device by DMP.
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    if rank == 0:
+        logger.info(
+            f"[seed] using seed={seed} "
+            f"({'pinned via $SEED' if env_seed else 'random per-run; set $SEED to reproduce'})"
+        )
+
     pg = dist.new_group(
         backend=BACKEND,
         timeout=timedelta(seconds=TIMEOUT),
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index c100a6828..bcb14442b 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -214,6 +214,7 @@ orchestrate() {
   srun --ntasks-per-node=1 bash -c "
     docker exec \
       -e LAUNCH_SLURM_PHASE=worker \
+      -e WORKER_TEE=0 \
       -e SCRATCH=$SCRATCH \
       -e SLURM_NNODES=\$SLURM_NNODES \
       -e SLURM_NODEID=\$SLURM_NODEID \
@@ -381,6 +382,14 @@ worker() {
   cd "$REPO_ROOT"
   mkdir -p "$SCRATCH" 2>/dev/null || true
   LOG=${LOG:-$SCRATCH/yambda_5b_8gpu.log}
+  # Avoid double-logging. When launched by the orchestrate phase, our stdout is
+  # ALREADY captured into the real $LOG by orchestrate's `tee` (and, multi-node,
+  # funneled through one srun pipe). Re-`tee`ing $LOG here would write every line
+  # twice. Orchestrate sets WORKER_TEE=0 to point our own file sink at /dev/null:
+  # we still echo to stdout (captured upstream) but don't duplicate the file.
+  # Direct single-node invocation (the streaming-e2e supervisor) leaves
+  # WORKER_TEE unset, so the worker keeps writing $LOG itself.
+  [ "${WORKER_TEE:-1}" = "0" ] && LOG=/dev/null
   # TensorBoard under the writable scratch root unless the caller (e.g. the e2e
   # supervisor) pinned a per-run path. Keeps the gin default from ever being used.
   export TENSORBOARD_LOG_PATH=${TENSORBOARD_LOG_PATH:-$SCRATCH/tb/yambda_5b}

From ded9c30e081ee5e42d19aed177f00f0202e80924 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Mon, 15 Jun 2026 02:52:25 -0500
Subject: [PATCH 058/113] dlrmv4: fix single-node-without-slurm via
 run_docker.sh
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

run_docker.sh: strip an optional leading `--` so the documented
`run_docker.sh -- bash scripts/launch_slurm.sh` form works (it was
forwarded verbatim to `docker run` and failed exec: "--" not found);
wire run_docker.sh to the launch_slurm worker flow (CONTAINER_NAME,
LOG/MODE/MAX_SEQ_LEN/HISTORY_LENGTH passthrough) and forward
NCCL_SOCKET_IFNAME so the bootstrap NIC is host-overridable.

launch_slurm.sh: default NCCL_SOCKET_IFNAME=lo for single-node
(NNODES==1) instead of the meta64-only fenic0 — loopback is reachable
by all local ranks on any host (data plane is intra-node XGMI/PCIe),
so the single-node path now runs out-of-the-box on dev boxes with no
fenic0. Multi-node keeps the fenic0 default; both stay overridable.

Verified: 8-GPU single-node streaming-train-eval smoke runs clean
(RCCL init over lo, train+eval, MLPerf run_stop) on a lone MI355 node.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/scripts/launch_slurm.sh | 28 ++++++++++++++++-------
 recommendation_v4/scripts/run_docker.sh   | 24 ++++++++++++++++---
 2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index b035ac671..3ab70078a 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -549,14 +549,26 @@ worker() {
   export WORLD_SIZE=$(( NNODES * GPUS_PER_NODE ))
   echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node=$GPUS_PER_NODE world_size=$WORLD_SIZE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}" | tee -a "$LOG"
 
-  # NCCL bootstrap NIC — pin for BOTH single- and multi-node. The container is
-  # --network=host so RCCL sees ALL host interfaces; if left to auto-detect, NCCL
-  # can pick a non-routable per-GPU RoCE /31 (benic* 192.168.x) link and fail
-  # bootstrap with "No route to host" (this is node-dependent: it happened to
-  # work on some nodes and not others, causing repetitive single-node init
-  # failures). Pinning the routable host NIC fixes it everywhere.
-  # [CLUSTER-SPECIFIC] routable host NIC for TCP bootstrap (find via `ip -br addr`).
-  export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
+  # NCCL bootstrap NIC. The container is --network=host so RCCL sees ALL host
+  # interfaces; if left to auto-detect, NCCL can pick a non-routable per-GPU RoCE
+  # /31 (benic* 192.168.x) link and fail bootstrap with "No route to host" (this
+  # is node-dependent: it worked on some nodes and not others, causing repetitive
+  # single-node init failures). Pin it explicitly to avoid that.
+  #   * Single-node (NNODES==1): all ranks are on THIS host, so only the bootstrap
+  #     control-plane crosses the socket NIC (data plane is intra-node XGMI/PCIe,
+  #     see below). Loopback is reachable by every local rank on ANY host and is
+  #     node-independent — same rationale as MASTER_ADDR=localhost above — so it
+  #     "just works" on dev boxes that have no fenic0 (e.g. a single MI355 node).
+  #   * Multi-node (NNODES>1): needs a routable host NIC shared across nodes for
+  #     the cross-node TCP rendezvous; default to the meta64 fenic0.
+  # Both remain ${NCCL_SOCKET_IFNAME:-...}-overridable for other fabrics.
+  # [CLUSTER-SPECIFIC] multi-node routable host NIC for TCP bootstrap (find via `ip -br addr`).
+  if [ "$NNODES" -gt 1 ]; then
+    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
+  else
+    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-lo}
+  fi
+  echo "[$(date)] NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME (nnodes=$NNODES)" | tee -a "$LOG"
 
   # Multi-node additionally needs the RDMA data-plane (bnxt_re HCAs) configured;
   # single-node uses intra-node P2P (XGMI/PCIe) so only the bootstrap NIC matters.
diff --git a/recommendation_v4/scripts/run_docker.sh b/recommendation_v4/scripts/run_docker.sh
index bb1d58fca..864746550 100755
--- a/recommendation_v4/scripts/run_docker.sh
+++ b/recommendation_v4/scripts/run_docker.sh
@@ -3,14 +3,21 @@
 # and data directories bind-mounted at matching host/container paths.
 #
 # Usage:
-#   bash scripts/run_docker.sh                   # interactive shell
-#   bash scripts/run_docker.sh -- bash scripts/launch_smoke_8gpu.sh   # one-shot
+#   bash scripts/run_docker.sh                                      # interactive shell
+#   bash scripts/run_docker.sh -- bash scripts/launch_slurm.sh      # one-shot single-node train
+#
+# Inside the container /.dockerenv exists, so launch_slurm.sh auto-selects its
+# SLURM-free `worker` phase (NNODES=1) — identical to the old launch_smoke_8gpu.sh.
 #
 # Overrides (export before invoking):
 #   IMAGE          docker image                 (default: rocm/mlperf:dlrm_v3_mi355)
-#   CONTAINER_NAME container name               (default: yambda_8gpu)
+#   CONTAINER_NAME container name               (default: mlperf-recommendation-v4)
 #   REPO_HOST      host path to repo            (default: this script's parent)
 #   DATA_HOST      host path to dataset root    (default: /data/mlperf_dlrm_v4)
+#   LOG            in-container train log path  (default: /workspace/recommendation_v4/mlperf_dlrm_v4.log)
+#   MODE           launch_slurm.sh mode         (default: launcher default = streaming-train-eval; set train-eval for classic)
+#   MAX_SEQ_LEN / HISTORY_LENGTH                seq shape; set 2048 / 2039 for the previous 2k shape
+#   NCCL_SOCKET_IFNAME  NCCL bootstrap NIC      (default: launch_slurm picks lo single-node / fenic0 multi-node; override per host)
 
 set -euo pipefail
 
@@ -29,6 +36,12 @@ if [ ! -d "${DATA_HOST}" ]; then
   echo "warning: ${DATA_HOST} does not exist on host. Run preprocess_public_data first or override DATA_HOST." >&2
 fi
 
+# Drop an optional `--` separating this script's invocation from the in-container
+# command (the documented `run_docker.sh -- bash scripts/launch_slurm.sh` form).
+# Without this, `--` is forwarded verbatim to `docker run` as the command and
+# fails with: exec: "--": executable file not found.
+if [ "${1:-}" = "--" ]; then shift; fi
+
 # If a container with this name is already running, exec into it instead of
 # starting a new one. Matches the `docker exec yambda_8gpu ...` pattern in
 # README.MD:9-12.
@@ -54,6 +67,11 @@ exec docker run --rm -it \
   -e DLRM_DATA_PATH="${DATA_CONT}" \
   -e HSTU_HAMMER_KERNEL="${HSTU_HAMMER_KERNEL:-TRITON}" \
   -e RUN_NAME="${RUN_NAME:-default}" \
+  -e LOG="${LOG:-/workspace/recommendation_v4/mlperf_dlrm_v4.log}" \
+  ${MODE:+-e MODE="${MODE}"} \
+  ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN="${MAX_SEQ_LEN}"} \
+  ${HISTORY_LENGTH:+-e HISTORY_LENGTH="${HISTORY_LENGTH}"} \
+  ${NCCL_SOCKET_IFNAME:+-e NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME}"} \
   -w "${REPO_CONT}" \
   "${IMAGE}" \
   "${@:-bash}"

From 380b1cabd6ce0e4d0067b956a67dd18746380f76 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 15 Jun 2026 23:11:25 -0500
Subject: [PATCH 059/113] dlrmv4: gin-configurable quantized (bf16/fp16)
 embedding all-to-all

Quantize the bandwidth-bound embedding-shuffle all-to-all via TorchRec
QCommsConfig on the sequence EmbeddingCollectionSharder. Exposed as two
gin knobs on make_optimizer_and_shard (env-overridable):
  sparse_a2a_precision           = fp32 (off, default) | bf16 | fp16
  sparse_a2a_quantize_backward   = 1 (default) | 0 (forward-only)
Default fp32 keeps the path byte-for-byte identical to trunk. launch_slurm
forwards the env overrides and reuses a stopped container instead of
destructively re-provisioning.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 15 ++++
 .../dlrm_v3/train/utils.py                    | 82 +++++++++++++++++++
 recommendation_v4/scripts/launch_slurm.sh     | 19 +++++
 3 files changed, 116 insertions(+)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 0b2c884bd..2bbf0dabf 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -83,6 +83,21 @@ make_optimizer_and_shard.hbm_cap_gb = @env_int()
 env_int.key = "HBM_CAP_GB"
 env_int.default = 260
 
+# Sparse embedding all-to-all wire precision. The embedding shuffle is the
+# dominant, bandwidth-bound (esp. multi-node) collective; quantizing it via
+# TorchRec QCommsConfig halves (bf16/fp16) the wire volume. "fp32" = off
+# (default; numerically untouched). Set "bf16" (or "fp16") to enable.
+# Override via $DLRMV4_SPARSE_A2A_PRECISION.
+make_optimizer_and_shard.sparse_a2a_precision = @saap/env_str()
+saap/env_str.key     = "DLRMV4_SPARSE_A2A_PRECISION"
+saap/env_str.default = "fp32"
+# Also quantize the backward (gradient) a2a. 1 = yes (default), 0 = forward-only
+# (keeps gradients fp32 for a more conservative numerical profile).
+# Override via $DLRMV4_BF16_SPARSE_BWD.
+make_optimizer_and_shard.sparse_a2a_quantize_backward = @saab/env_int()
+saab/env_int.key     = "DLRMV4_BF16_SPARSE_BWD"
+saab/env_int.default = 1
+
 get_dataset.name = %dataset
 get_dataset.new_path_prefix = %DATA_PATH
 # Total user-interaction-history (UIH) budget per sample, distributed evenly
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 9ef251951..fcb562b7d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -340,6 +340,80 @@ def sparse_optimizer_factory_and_class(
     return optimizer_cls, kwargs, optimizer_factory
 
 
+def _maybe_apply_qcomm_a2a(
+    sharders: List[Any],
+    device: torch.device,
+    precision: str = "fp32",
+    quantize_backward: bool = True,
+) -> List[Any]:
+    """Optionally quantize the embedding all-to-all payload via TorchRec qcomm.
+
+    The yambda-5b embedding shuffle is the dominant, bandwidth-bound (multi-node)
+    collective (~14.5 GB/rank fp32); BF16/FP16 forward(+backward) halves the wire
+    volume. Quant/dequant happen inside the comm op, transparent to the lookup
+    consumer. Ported from the DLRMv2 R2 lever, retargeted from
+    ``EmbeddingBagCollectionSharder`` to the sequence ``EmbeddingCollectionSharder``
+    this model uses.
+
+    Args (set via gin on ``make_optimizer_and_shard``, env-overridable):
+      precision: ``fp32`` (off, default) | ``bf16`` | ``fp16`` — the a2a wire dtype.
+      quantize_backward: also quantize the gradient a2a (default True).
+    """
+    precision = (precision or "fp32").strip().lower()
+    rank0 = (not dist.is_initialized()) or dist.get_rank() == 0
+    if precision in ("fp32", "", "off", "none"):
+        return sharders
+    if precision not in ("bf16", "fp16"):
+        if rank0:
+            logger.warning(
+                "DLRMV4 qcomm a2a: unknown precision %r (want fp32|bf16|fp16); "
+                "using fp32 a2a",
+                precision,
+            )
+        return sharders
+    try:
+        from torchrec.distributed.embedding import EmbeddingCollectionSharder
+        from torchrec.distributed.fbgemm_qcomm_codec import (
+            CommType,
+            get_qcomm_codecs_registry,
+            QCommsConfig,
+        )
+
+        fwd_prec = {"bf16": CommType.BF16, "fp16": CommType.FP16}[precision]
+        qcfg = QCommsConfig(
+            forward_precision=fwd_prec,
+            backward_precision=fwd_prec if quantize_backward else CommType.FP32,
+        )
+        registry = get_qcomm_codecs_registry(qcfg, device=device)
+        new_sharders = []
+        replaced = False
+        for s in sharders:
+            if type(s).__name__ == "EmbeddingCollectionSharder" and not replaced:
+                new_sharders.append(
+                    EmbeddingCollectionSharder(qcomm_codecs_registry=registry)
+                )
+                replaced = True
+            else:
+                new_sharders.append(s)
+        if rank0:
+            logger.info(
+                "DLRMV4 qcomm a2a ENABLED: forward=%s backward=%s "
+                "replaced_ec_sharder=%s",
+                fwd_prec.value,
+                fwd_prec.value if quantize_backward else "fp32",
+                replaced,
+            )
+        return new_sharders
+    except Exception as e:  # noqa: BLE001 — fall back to fp32 a2a on any failure
+        if rank0:
+            logger.warning(
+                "DLRMV4 qcomm a2a: failed to enable (%s: %s); using fp32 a2a",
+                type(e).__name__,
+                e,
+            )
+        return sharders
+
+
 @gin.configurable
 def make_optimizer_and_shard(
     model: torch.nn.Module,
@@ -347,6 +421,8 @@ def make_optimizer_and_shard(
     world_size: int,
     local_world_size: Optional[int] = None,
     hbm_cap_gb: int = 260,
+    sparse_a2a_precision: str = "fp32",
+    sparse_a2a_quantize_backward: bool = True,
 ) -> Tuple[DistributedModelParallel, torch.optim.Optimizer]:
     dense_opt_cls, dense_opt_args, dense_opt_factory = (
         dense_optimizer_factory_and_class()
@@ -364,6 +440,12 @@ def make_optimizer_and_shard(
                         sparse_opt_cls, [param], sparse_opt_args
                     )
     sharders = get_default_sharders()
+    sharders = _maybe_apply_qcomm_a2a(
+        sharders,
+        device,
+        precision=sparse_a2a_precision,
+        quantize_backward=bool(sparse_a2a_quantize_backward),
+    )
     # local_world_size = GPUs per node so the planner respects the intra-node
     # (xGMI/NVLink) vs inter-node hierarchy when placing shards. Defaults to
     # world_size for the single-node case (no behavior change).
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 6b5ba7607..22beb886b 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -206,6 +206,11 @@ orchestrate() {
           echo \"[\$(hostname)] keeping non-GPU/system container \$_nm (\$_c)\" ;;
       esac
     done
+    # Reuse a STOPPED '$CONTAINER' (its installed deps persist in the container
+    # fs) instead of destructively re-provisioning from the base image + pip.
+    # Harmless no-op on a fresh node (no such container) -> falls through to
+    # provision below. Repo code is bind-mounted, so live edits are still picked up.
+    docker start $CONTAINER >/dev/null 2>&1 || true
     if [ \"$FORCE_PROVISION\" = \"1\" ] || ! docker exec $CONTAINER true >/dev/null 2>&1; then
       echo \"[\$(hostname)] (re)provisioning container\"
       LAUNCH_SLURM_PHASE=provision CONTAINER=$CONTAINER IMAGE=$IMAGE \
@@ -293,9 +298,13 @@ orchestrate() {
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
       -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \
+      ${WORKER_CMD:+-e WORKER_CMD=\"$WORKER_CMD\"} \
       ${RUN_NAME:+-e RUN_NAME=$RUN_NAME} \
       ${TENSORBOARD_LOG_PATH:+-e TENSORBOARD_LOG_PATH=$TENSORBOARD_LOG_PATH} \
       ${CKPT_PATH:+-e CKPT_PATH=$CKPT_PATH} \
+      ${DLRMV4_BF16_SPARSE_A2A:+-e DLRMV4_BF16_SPARSE_A2A=$DLRMV4_BF16_SPARSE_A2A} \
+      ${DLRMV4_SPARSE_A2A_PRECISION:+-e DLRMV4_SPARSE_A2A_PRECISION=$DLRMV4_SPARSE_A2A_PRECISION} \
+      ${DLRMV4_BF16_SPARSE_BWD:+-e DLRMV4_BF16_SPARSE_BWD=$DLRMV4_BF16_SPARSE_BWD} \
       -e LOG=$LOG \
       $NCCL_ENV_ARGS \
       $CONTAINER bash -lc 'cd $REPO && LAUNCH_SLURM_PHASE=worker bash scripts/launch_slurm.sh'
@@ -585,6 +594,16 @@ worker() {
     fi
   fi
 
+  # WORKER_CMD override: run an arbitrary in-container command (e.g. an a2a/RCCL
+  # micro-benchmark) instead of the trainer, REUSING all the NCCL/RDMA/topology
+  # setup above so it exercises the exact transport the trainer uses. The
+  # supervisor never sets WORKER_CMD, so the training path is unchanged.
+  if [ -n "${WORKER_CMD:-}" ]; then
+    echo "[$(date)] WORKER_CMD override (WORLD_SIZE=$WORLD_SIZE): $WORKER_CMD" | tee -a "$LOG"
+    bash -lc "cd $REPO_ROOT && $WORKER_CMD" 2>&1 | tee -a "$LOG"
+    return
+  fi
+
   echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
   python -m generative_recommenders.dlrm_v3.train.train_ranker \
       --dataset yambda-5b --mode "${MODE:-streaming-train-eval}" 2>&1 | tee -a "$LOG"

From 3e776a8e0776f77cc1d2312a227db8183050ade5 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 15 Jun 2026 23:35:02 -0500
Subject: [PATCH 060/113] dlrmv4: split quantized a2a into independent fwd/bwd
 precision knobs

Replace the single sparse-a2a precision knob with separate forward and
backward precision settings ($SPARSE_A2A_FWD / $SPARSE_A2A_BWD, each
fp32|bf16|fp16; both fp32 = off, identical to baseline). This enables the
TorchRec golden_training recommended mix (fwd=fp16, bwd=bf16): fp16's
mantissa suits bounded forward activations while bf16's wider exponent
range avoids gradient overflow. 2-node A/B shows fp16/bf16 at perf parity
with bf16/bf16 (both 2-byte wire), so it's a numerical-safety win at zero
perf cost.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 25 +++----
 .../dlrm_v3/train/utils.py                    | 70 +++++++++++--------
 recommendation_v4/scripts/launch_slurm.sh     |  5 +-
 3 files changed, 55 insertions(+), 45 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 2bbf0dabf..49a94e573 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -85,18 +85,19 @@ env_int.default = 260
 
 # Sparse embedding all-to-all wire precision. The embedding shuffle is the
 # dominant, bandwidth-bound (esp. multi-node) collective; quantizing it via
-# TorchRec QCommsConfig halves (bf16/fp16) the wire volume. "fp32" = off
-# (default; numerically untouched). Set "bf16" (or "fp16") to enable.
-# Override via $DLRMV4_SPARSE_A2A_PRECISION.
-make_optimizer_and_shard.sparse_a2a_precision = @saap/env_str()
-saap/env_str.key     = "DLRMV4_SPARSE_A2A_PRECISION"
-saap/env_str.default = "fp32"
-# Also quantize the backward (gradient) a2a. 1 = yes (default), 0 = forward-only
-# (keeps gradients fp32 for a more conservative numerical profile).
-# Override via $DLRMV4_BF16_SPARSE_BWD.
-make_optimizer_and_shard.sparse_a2a_quantize_backward = @saab/env_int()
-saab/env_int.key     = "DLRMV4_BF16_SPARSE_BWD"
-saab/env_int.default = 1
+# TorchRec QCommsConfig halves (bf16/fp16, both 2 bytes) the wire volume.
+# Forward and backward are set independently (each: "fp32" | "bf16" | "fp16").
+# Both "fp32" = off (default; numerically identical to baseline trunk).
+# Per TorchRec golden_training, fwd=fp16 / bwd=bf16 is the recommended quantized
+# mix: fp16's mantissa suits bounded forward activations, while bf16's wider
+# exponent range avoids overflow on gradients.
+# Override via $SPARSE_A2A_FWD / $SPARSE_A2A_BWD.
+make_optimizer_and_shard.sparse_a2a_forward_precision = @saaf/env_str()
+saaf/env_str.key     = "SPARSE_A2A_FWD"
+saaf/env_str.default = "fp32"
+make_optimizer_and_shard.sparse_a2a_backward_precision = @saab/env_str()
+saab/env_str.key     = "SPARSE_A2A_BWD"
+saab/env_str.default = "fp32"
 
 get_dataset.name = %dataset
 get_dataset.new_path_prefix = %DATA_PATH
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index fcb562b7d..b9610e18f 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -343,33 +343,44 @@ def sparse_optimizer_factory_and_class(
 def _maybe_apply_qcomm_a2a(
     sharders: List[Any],
     device: torch.device,
-    precision: str = "fp32",
-    quantize_backward: bool = True,
+    forward_precision: str = "fp32",
+    backward_precision: str = "fp32",
 ) -> List[Any]:
     """Optionally quantize the embedding all-to-all payload via TorchRec qcomm.
 
     The yambda-5b embedding shuffle is the dominant, bandwidth-bound (multi-node)
-    collective (~14.5 GB/rank fp32); BF16/FP16 forward(+backward) halves the wire
-    volume. Quant/dequant happen inside the comm op, transparent to the lookup
-    consumer. Ported from the DLRMv2 R2 lever, retargeted from
-    ``EmbeddingBagCollectionSharder`` to the sequence ``EmbeddingCollectionSharder``
-    this model uses.
-
-    Args (set via gin on ``make_optimizer_and_shard``, env-overridable):
-      precision: ``fp32`` (off, default) | ``bf16`` | ``fp16`` — the a2a wire dtype.
-      quantize_backward: also quantize the gradient a2a (default True).
+    collective (~14.5 GB/rank fp32); a bf16/fp16 wire dtype halves it. Quant/
+    dequant happen inside the comm op, transparent to the lookup consumer. Ported
+    from the DLRMv2 R2 lever, retargeted from ``EmbeddingBagCollectionSharder`` to
+    the sequence ``EmbeddingCollectionSharder`` this model uses.
+
+    Forward and backward are configured independently because they have different
+    numerical needs (TorchRec golden_training/train_dlrm.py recommends
+    forward=fp16, backward=bf16): the forward carries bounded embedding
+    activations where fp16's extra mantissa helps, while gradients have a wider
+    range that can overflow fp16, so bf16 (fp32 exponent range) is safer there.
+    bf16 and fp16 are both 2 bytes, so the wire volume / perf is identical — the
+    choice is purely numerical.
+
+    Args (set via gin on ``make_optimizer_and_shard``, env-overridable). Each is
+    one of ``fp32`` (that direction unquantized) | ``bf16`` | ``fp16``. If BOTH
+    are fp32 the sharders are returned untouched (identical to baseline trunk).
     """
-    precision = (precision or "fp32").strip().lower()
+    _COMM = {"bf16": "BF16", "fp16": "FP16", "fp32": "FP32"}
+    fwd = (forward_precision or "fp32").strip().lower()
+    bwd = (backward_precision or "fp32").strip().lower()
     rank0 = (not dist.is_initialized()) or dist.get_rank() == 0
-    if precision in ("fp32", "", "off", "none"):
-        return sharders
-    if precision not in ("bf16", "fp16"):
-        if rank0:
-            logger.warning(
-                "DLRMV4 qcomm a2a: unknown precision %r (want fp32|bf16|fp16); "
-                "using fp32 a2a",
-                precision,
-            )
+    for name, p in (("forward", fwd), ("backward", bwd)):
+        if p not in _COMM:
+            if rank0:
+                logger.warning(
+                    "DLRMV4 qcomm a2a: unknown %s precision %r (want "
+                    "fp32|bf16|fp16); using fp32 a2a",
+                    name,
+                    p,
+                )
+            return sharders
+    if fwd == "fp32" and bwd == "fp32":
         return sharders
     try:
         from torchrec.distributed.embedding import EmbeddingCollectionSharder
@@ -379,10 +390,9 @@ def _maybe_apply_qcomm_a2a(
             QCommsConfig,
         )
 
-        fwd_prec = {"bf16": CommType.BF16, "fp16": CommType.FP16}[precision]
         qcfg = QCommsConfig(
-            forward_precision=fwd_prec,
-            backward_precision=fwd_prec if quantize_backward else CommType.FP32,
+            forward_precision=getattr(CommType, _COMM[fwd]),
+            backward_precision=getattr(CommType, _COMM[bwd]),
         )
         registry = get_qcomm_codecs_registry(qcfg, device=device)
         new_sharders = []
@@ -399,8 +409,8 @@ def _maybe_apply_qcomm_a2a(
             logger.info(
                 "DLRMV4 qcomm a2a ENABLED: forward=%s backward=%s "
                 "replaced_ec_sharder=%s",
-                fwd_prec.value,
-                fwd_prec.value if quantize_backward else "fp32",
+                fwd,
+                bwd,
                 replaced,
             )
         return new_sharders
@@ -421,8 +431,8 @@ def make_optimizer_and_shard(
     world_size: int,
     local_world_size: Optional[int] = None,
     hbm_cap_gb: int = 260,
-    sparse_a2a_precision: str = "fp32",
-    sparse_a2a_quantize_backward: bool = True,
+    sparse_a2a_forward_precision: str = "fp32",
+    sparse_a2a_backward_precision: str = "fp32",
 ) -> Tuple[DistributedModelParallel, torch.optim.Optimizer]:
     dense_opt_cls, dense_opt_args, dense_opt_factory = (
         dense_optimizer_factory_and_class()
@@ -443,8 +453,8 @@ def make_optimizer_and_shard(
     sharders = _maybe_apply_qcomm_a2a(
         sharders,
         device,
-        precision=sparse_a2a_precision,
-        quantize_backward=bool(sparse_a2a_quantize_backward),
+        forward_precision=sparse_a2a_forward_precision,
+        backward_precision=sparse_a2a_backward_precision,
     )
     # local_world_size = GPUs per node so the planner respects the intra-node
     # (xGMI/NVLink) vs inter-node hierarchy when placing shards. Defaults to
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 22beb886b..fe82a1dda 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -302,9 +302,8 @@ orchestrate() {
       ${RUN_NAME:+-e RUN_NAME=$RUN_NAME} \
       ${TENSORBOARD_LOG_PATH:+-e TENSORBOARD_LOG_PATH=$TENSORBOARD_LOG_PATH} \
       ${CKPT_PATH:+-e CKPT_PATH=$CKPT_PATH} \
-      ${DLRMV4_BF16_SPARSE_A2A:+-e DLRMV4_BF16_SPARSE_A2A=$DLRMV4_BF16_SPARSE_A2A} \
-      ${DLRMV4_SPARSE_A2A_PRECISION:+-e DLRMV4_SPARSE_A2A_PRECISION=$DLRMV4_SPARSE_A2A_PRECISION} \
-      ${DLRMV4_BF16_SPARSE_BWD:+-e DLRMV4_BF16_SPARSE_BWD=$DLRMV4_BF16_SPARSE_BWD} \
+      ${SPARSE_A2A_FWD:+-e SPARSE_A2A_FWD=$SPARSE_A2A_FWD} \
+      ${SPARSE_A2A_BWD:+-e SPARSE_A2A_BWD=$SPARSE_A2A_BWD} \
       -e LOG=$LOG \
       $NCCL_ENV_ARGS \
       $CONTAINER bash -lc 'cd $REPO && LAUNCH_SLURM_PHASE=worker bash scripts/launch_slurm.sh'

From 5f76993ad9800e4e774ae95f608d2d246b03b20a Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Tue, 16 Jun 2026 19:08:07 +0000
Subject: [PATCH 061/113] dlrmv4: enable GPUDirect RDMA by default in slurm
 worker

Set NCCL_NET_GDR_LEVEL=5 and NCCL_DMABUF_ENABLE=1 by default so RCCL does
true GPU<->NIC DMA over bnxt_re instead of host-memory staging. The brcmrdma
host kernel ships the inbox peer-memory client, so GDR works with no
container/host changes. Measured ~+22% throughput at 2 nodes (65.7%->79.8%
weak-scaling efficiency). Overridable via NCCL_NET_GDR_LEVEL=0 and non-fatal
(falls back to host staging if peermem is absent).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/scripts/launch_slurm.sh | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 3ab70078a..53982923d 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -601,10 +601,20 @@ worker() {
       export NCCL_IB_TIMEOUT=${NCCL_IB_TIMEOUT:-14}
       export NCCL_IGNORE_CPU_AFFINITY=${NCCL_IGNORE_CPU_AFFINITY:-1}
       export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
-      # GPU-Direct RDMA needs DMABUF/peermem (neither in-container here) — leave
-      # GDR off so RCCL stages through host memory (still real RDMA over bnxt_re).
-      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-0}
-      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
+      # GPU-Direct RDMA: ENABLED by default. The brcmrdma host kernel ships the
+      # inbox peer-memory client (`ib_register_peer_memory_client` in
+      # /proc/kallsyms), so RCCL does true GPU<->NIC DMA over bnxt_re instead of
+      # bouncing through host memory. Measured ~+22% throughput at 2 nodes
+      # (65.7%->79.8% weak-scaling efficiency) vs the old host-staged path.
+      # GDR_LEVEL=5 (most permissive) is required so GDR is used even when the GPU
+      # and NIC cross the CPU root complex. NCCL_DMABUF_ENABLE=1 is a harmless
+      # no-op here (kernel lacks CONFIG_DMABUF_MOVE_NOTIFY/CONFIG_PCI_P2PDMA, so
+      # peermem carries it). Enabling is non-fatal: if peermem is ever absent RCCL
+      # just logs "GDR 0" and falls back to host staging. Override with
+      # NCCL_NET_GDR_LEVEL=0 to force the legacy host-staged path.
+      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-5}
+      export NCCL_DMABUF_ENABLE=${NCCL_DMABUF_ENABLE:-1}
+      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}, DMABUF=${NCCL_DMABUF_ENABLE}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
     fi
   fi
   export NCCL_DEBUG=${NCCL_DEBUG:-WARN}

From a34facc3308fc5d4760f29676908c9a38fc84551 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Tue, 16 Jun 2026 17:51:12 -0500
Subject: [PATCH 062/113] dlrmv4: gin-configurable RNG seed ($SEED) + default
 TRAIN_SPLIT_PERCENTAGE=1.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a gin-configurable global seed so weight init (dense params + embedding
tables) is reproducible run-to-run and runs can be init-matched A/Bs. The seed
is bound via $SEED (seed_everything.seed = @seed/env_int(), default 1) and
applied by a new seed_everything() called in train_ranker right before
make_model() — after the full gin parse, so the binding resolves in the second
parse where env_int is registered. Move the old hardcoded seed=1 out of setup()
(too early to be gin-configurable). Forward $SEED through launch_slurm.sh.

Flip the default TRAIN_SPLIT_PERCENTAGE 0.90 -> 1.0 (all users trained AND
evaluated, matching the alleval/qa2a production runs) in both the gin default
and the launch_slurm.sh fallback.

Validated with two short 1-node runs: SEED=1 and SEED=2 each log their seed on
all ranks, and tsp=1.0 is applied without exporting TRAIN_SPLIT_PERCENTAGE.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 14 ++++++-
 .../dlrm_v3/train/train_ranker.py             |  6 +++
 .../dlrm_v3/train/utils.py                    | 39 ++++++++++++-------
 recommendation_v4/scripts/launch_slurm.sh     |  3 +-
 4 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 49a94e573..93f88920f 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -32,6 +32,16 @@ make_model.hammer_kernel = "TRITON"
 # pinned constants in ops/triton/_autotune_pinning.py call sites.
 apply_env_bootstrap.TRITON_FULL_AUTOTUNE = False
 
+# Global RNG seed for reproducible weight init (dense params + embedding tables)
+# and any seeded RNG consumers. Same seed on every rank => identical dense init;
+# fixing it makes runs an init-matched A/B (data order is already deterministic
+# via the sampler). seed_everything() is called right before make_model() in
+# train_ranker (after the full gin parse), so this binding is resolved in the
+# second parse where env_int is registered. Override per-run via $SEED.
+seed_everything.seed = @seed/env_int()
+seed/env_int.key     = "SEED"
+seed/env_int.default = 1
+
 # dense model optimizer
 dense_optimizer_factory_and_class.learning_rate = 0.001
 dense_optimizer_factory_and_class.optimizer_name = "Adam"
@@ -63,9 +73,11 @@ data/env_path.default = "/apps/chcai/dlrm_data"
 # positional split) and the streaming path (get_dataset, an explicit by-user
 # hash split), so one value configures the holdout in either mode.
 # 1.0 = no holdout (legacy streaming behavior). Override via $TRAIN_SPLIT_PERCENTAGE.
+# Default 1.0: all users are trained AND evaluated (full-coverage eval), matching
+# the alleval/qa2a production runs; set <1.0 (e.g. 0.90) for a clean held-out cohort.
 TRAIN_SPLIT_PERCENTAGE = @tsp/env_float()
 tsp/env_float.key     = "TRAIN_SPLIT_PERCENTAGE"
-tsp/env_float.default = 0.90
+tsp/env_float.default = 1.0
 
 # dataloader configs
 make_train_test_dataloaders.batch_size = %batch_size
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 55eece518..dc2a9a8a9 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -89,6 +89,7 @@ def _main_func(
         make_model,
         make_optimizer_and_shard,
         make_train_test_dataloaders,
+        seed_everything,
         setup,
         streaming_train_eval_loop,
         train_eval_loop,
@@ -113,6 +114,11 @@ def _main_func(
     # make_train_test_dataloaders, etc.
     gin.parse_config_file(gin_file)
 
+    # Seed all RNGs (gin-configurable $SEED) BEFORE make_model() so weight init
+    # is reproducible run-to-run. Must follow the full parse above so the binding
+    # is wired, and precede make_model() below.
+    seed_everything(rank=rank)
+
     model, model_configs, embedding_table_configs = make_model()
     model, optimizer = make_optimizer_and_shard(
         model=model,
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index b9610e18f..92b09f3f1 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -76,6 +76,27 @@
 }
 
 
+@gin.configurable
+def seed_everything(seed: int = 1, rank: int = 0) -> None:
+    """Seed all RNGs so weight init (make_model) is reproducible across runs.
+
+    Same seed on every rank => dense params are initialized identically across
+    ranks; sharded embeddings are init'd from the meta device by DMP. Fixing the
+    seed makes runs an init-matched A/B (data order is already deterministic via
+    the sampler). gin-configurable via $SEED (yambda_5b.gin: seed_everything.seed);
+    call this right before make_model(), after the full gin parse.
+    """
+    import random
+
+    import numpy as np
+
+    logger.info(f"[rank {rank}] seeding all RNGs with SEED={seed}")
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
 def setup(
     rank: int,
     world_size: int,
@@ -96,20 +117,10 @@ def setup(
     # leaving stale allocations and triggering OOMs on rank 0.
     torch.cuda.set_device(device)
 
-    # Seed all RNGs so weight init (make_model, called after setup) is
-    # reproducible across runs. Same seed on every rank → dense params are
-    # initialized identically across ranks; sharded embeddings are init'd from
-    # the meta device by DMP. Fixed seed makes pipeline-vs-non-pipeline an
-    # init-matched A/B (data order is already deterministic via the sampler).
-    import random
-
-    import numpy as np
-
-    _SEED = 1
-    random.seed(_SEED)
-    np.random.seed(_SEED)
-    torch.manual_seed(_SEED)
-    torch.cuda.manual_seed_all(_SEED)
+    # NOTE: RNG seeding for reproducible weight init lives in seed_everything(),
+    # which train_ranker calls right before make_model() (after the full gin
+    # parse, so the gin-configurable $SEED is bound). Seeding here would be too
+    # early to be gin-configurable and is redundant with that call.
 
     # initialize the process group
     if not dist.is_initialized():
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index fe82a1dda..f639215e7 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -287,6 +287,7 @@ orchestrate() {
       ${DIAG_EMB_STEPS:+-e DIAG_EMB_STEPS=$DIAG_EMB_STEPS} \
       ${OUTPUT_TRACE:+-e OUTPUT_TRACE=$OUTPUT_TRACE} \
       ${MIN_HISTORY:+-e MIN_HISTORY=$MIN_HISTORY} \
+      ${SEED:+-e SEED=$SEED} \
       ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN=$MAX_SEQ_LEN} \
       ${HISTORY_LENGTH:+-e HISTORY_LENGTH=$HISTORY_LENGTH} \
       ${BATCH_SIZE:+-e BATCH_SIZE=$BATCH_SIZE} \
@@ -294,7 +295,7 @@ orchestrate() {
       ${KEEP_LAST_N:+-e KEEP_LAST_N=$KEEP_LAST_N} \
       ${IN_WINDOW_CKPT_FREQ:+-e IN_WINDOW_CKPT_FREQ=$IN_WINDOW_CKPT_FREQ} \
       ${CKPT_STEP_FREQ:+-e CKPT_STEP_FREQ=$CKPT_STEP_FREQ} \
-      -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-0.90} \
+      -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
       -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \

From 40c696a9c4a6c55ea232d50a37052d3493554fde Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 17 Jun 2026 08:14:06 -0500
Subject: [PATCH 063/113] dlrmv4: env-configurable dense/sparse LR + optimizer
 LR logging

Make the dense ($DENSE_LR) and sparse ($SPARSE_LR) optimizer learning rates
overridable per-run via env (defaults unchanged at 0.001), and log the
resolved LR at optimizer construction so runs are self-documenting. Forward
both vars through launch_slurm.sh into the container.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin                    | 10 ++++++++--
 .../generative_recommenders/dlrm_v3/train/utils.py     |  8 ++++++++
 recommendation_v4/scripts/launch_slurm.sh              |  2 ++
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 93f88920f..61bdd70e4 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -43,7 +43,10 @@ seed/env_int.key     = "SEED"
 seed/env_int.default = 1
 
 # dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
+# Learning rate is env-overridable via $DENSE_LR (default 0.001, unchanged).
+dense_optimizer_factory_and_class.learning_rate = @dlr/env_float()
+dlr/env_float.key     = "DENSE_LR"
+dlr/env_float.default = 0.001
 dense_optimizer_factory_and_class.optimizer_name = "Adam"
 dense_optimizer_factory_and_class.momentum = 0
 dense_optimizer_factory_and_class.weight_decay = 0
@@ -51,7 +54,10 @@ dense_optimizer_factory_and_class.eps = 1e-8
 dense_optimizer_factory_and_class.betas = (0.95, 0.999)
 
 # sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
+# Learning rate is env-overridable via $SPARSE_LR (default 0.001, unchanged).
+sparse_optimizer_factory_and_class.learning_rate = @slr/env_float()
+slr/env_float.key     = "SPARSE_LR"
+slr/env_float.default = 0.001
 sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
 sparse_optimizer_factory_and_class.momentum = 0
 sparse_optimizer_factory_and_class.weight_decay = 0
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 92b09f3f1..aaed8c2a0 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -308,6 +308,10 @@ def dense_optimizer_factory_and_class(
 
     optimizer_factory = lambda params: optimizer_cls(params, **kwargs)
 
+    logger.info(
+        f"[dense optimizer] {optimizer_name} learning_rate={learning_rate} "
+        f"(resolved from gin; override via $DENSE_LR)"
+    )
     return optimizer_cls, kwargs, optimizer_factory
 
 
@@ -348,6 +352,10 @@ def sparse_optimizer_factory_and_class(
 
     optimizer_factory = lambda params: optimizer_cls(params, **kwargs)
 
+    logger.info(
+        f"[sparse optimizer] {optimizer_name} learning_rate={learning_rate} "
+        f"(resolved from gin; override via $SPARSE_LR)"
+    )
     return optimizer_cls, kwargs, optimizer_factory
 
 
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index f639215e7..c6b73cb8d 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -288,6 +288,8 @@ orchestrate() {
       ${OUTPUT_TRACE:+-e OUTPUT_TRACE=$OUTPUT_TRACE} \
       ${MIN_HISTORY:+-e MIN_HISTORY=$MIN_HISTORY} \
       ${SEED:+-e SEED=$SEED} \
+      ${DENSE_LR:+-e DENSE_LR=$DENSE_LR} \
+      ${SPARSE_LR:+-e SPARSE_LR=$SPARSE_LR} \
       ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN=$MAX_SEQ_LEN} \
       ${HISTORY_LENGTH:+-e HISTORY_LENGTH=$HISTORY_LENGTH} \
       ${BATCH_SIZE:+-e BATCH_SIZE=$BATCH_SIZE} \

From d646b2acd095936752216ced1706f7c548921401 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 17 Jun 2026 08:14:49 -0500
Subject: [PATCH 064/113] dlrmv4: env-configurable HSTU transformer depth
 ($HSTU_NUM_LAYERS)

Make the HSTU attention layer count overridable per-run via $HSTU_NUM_LAYERS
(default 5, unchanged), resolved in the full gin parse and forwarded through
launch_slurm.sh. Changing depth alters model shape, so a run with a new depth
must use a fresh CKPT_PATH (incompatible with existing checkpoints).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin                    | 10 ++++++++++
 recommendation_v4/scripts/launch_slurm.sh              |  1 +
 2 files changed, 11 insertions(+)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 61bdd70e4..3099cd67e 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -160,6 +160,16 @@ get_hstu_configs.max_seq_len = @msl/env_int()
 msl/env_int.key     = "MAX_SEQ_LEN"
 msl/env_int.default = 4096
 
+# HSTU transformer depth (number of attention layers). Default 5 (unchanged).
+# Override per-run via $HSTU_NUM_LAYERS. NOTE: changing depth changes the model
+# shape, so a run with a new depth MUST use a FRESH CKPT_PATH (incompatible with
+# 5-layer checkpoints). Resolved in the full gin parse (get_hstu_configs is not
+# registered during the early skip_unknown parse), so the @env_int reference is
+# skipped on the first pass — same safe path as the LR knobs.
+get_hstu_configs.hstu_attn_num_layers = @nl/env_int()
+nl/env_int.key     = "HSTU_NUM_LAYERS"
+nl/env_int.default = 5
+
 # --- streaming (temporal-order) training -------------------------------------
 # Only consumed under `--mode streaming-train-eval`; the default train-eval
 # path above is unaffected. Trains time window T then evals window T+1,
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index c6b73cb8d..f20934285 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -290,6 +290,7 @@ orchestrate() {
       ${SEED:+-e SEED=$SEED} \
       ${DENSE_LR:+-e DENSE_LR=$DENSE_LR} \
       ${SPARSE_LR:+-e SPARSE_LR=$SPARSE_LR} \
+      ${HSTU_NUM_LAYERS:+-e HSTU_NUM_LAYERS=$HSTU_NUM_LAYERS} \
       ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN=$MAX_SEQ_LEN} \
       ${HISTORY_LENGTH:+-e HISTORY_LENGTH=$HISTORY_LENGTH} \
       ${BATCH_SIZE:+-e BATCH_SIZE=$BATCH_SIZE} \

From 922aeec9db0eb013c79e4b261377e5d58d3ca568 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Thu, 18 Jun 2026 19:07:38 +0000
Subject: [PATCH 065/113] dlrmv4: env-overridable dense/sparse LRs for sweeps +
 holdout default 1.0

Make dense (Adam) and sparse (RowWiseAdagrad) learning rates overridable via
$DENSE_LR / $SPARSE_LR with gin defaults preserved at 0.001, so LR sweeps don't
require editing gin. Resolve gin macro references in the MLPerf param logger so
env-overridden LRs are logged as real numbers. Default TRAIN_SPLIT_PERCENTAGE
to 1.0 (no holdout) and log the resolved LR overrides at orchestration time.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin            | 18 +++++++++++++++---
 .../dlrm_v3/train/train_ranker.py              | 13 ++++++++++++-
 recommendation_v4/scripts/launch_slurm.sh      |  5 ++++-
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 8623fb21d..984fb20c3 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -33,7 +33,13 @@ make_model.hammer_kernel = "TRITON"
 apply_env_bootstrap.TRITON_FULL_AUTOTUNE = False
 
 # dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
+# Learning rate is env-overridable (default 0.001 preserves prior runs) so an LR
+# sweep can probe the dense Adam rate without editing gin. Scoped (`dlr/`) so
+# this env_float binding doesn't collide with the other env_float call sites.
+# Override via $DENSE_LR.
+dense_optimizer_factory_and_class.learning_rate = @dlr/env_float()
+dlr/env_float.key     = "DENSE_LR"
+dlr/env_float.default = 0.001
 dense_optimizer_factory_and_class.optimizer_name = "Adam"
 dense_optimizer_factory_and_class.momentum = 0
 dense_optimizer_factory_and_class.weight_decay = 0
@@ -41,7 +47,13 @@ dense_optimizer_factory_and_class.eps = 1e-8
 dense_optimizer_factory_and_class.betas = (0.95, 0.999)
 
 # sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
+# Learning rate is env-overridable (default 0.001 preserves prior runs) so an LR
+# sweep can probe the sparse RowWiseAdagrad rate without editing gin. Scoped
+# (`slr/`) so this env_float binding doesn't collide with the other env_float
+# call sites. Override via $SPARSE_LR.
+sparse_optimizer_factory_and_class.learning_rate = @slr/env_float()
+slr/env_float.key     = "SPARSE_LR"
+slr/env_float.default = 0.001
 sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
 sparse_optimizer_factory_and_class.momentum = 0
 sparse_optimizer_factory_and_class.weight_decay = 0
@@ -65,7 +77,7 @@ data/env_path.default = "/apps/chcai/dlrm_data"
 # 1.0 = no holdout (legacy streaming behavior). Override via $TRAIN_SPLIT_PERCENTAGE.
 TRAIN_SPLIT_PERCENTAGE = @tsp/env_float()
 tsp/env_float.key     = "TRAIN_SPLIT_PERCENTAGE"
-tsp/env_float.default = 0.90
+tsp/env_float.default = 1.0
 
 # dataloader configs
 make_train_test_dataloaders.batch_size = %batch_size
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index d22d09a30..6df73f351 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -213,9 +213,20 @@ def _main_func(
 
         def _gin_param(name: str, default: object) -> object:
             try:
-                return gin.query_parameter(name)
+                value = gin.query_parameter(name)
             except (ValueError, KeyError):
                 return default
+            # When a binding is a gin macro/configurable reference (e.g.
+            # `@dlr/env_float()`), query_parameter returns the unevaluated
+            # reference object, which the MLPerf logger cannot encode. Resolve
+            # it to its actual value so env-overridden LRs are logged as real
+            # numbers. Plain literals pass through unchanged.
+            if hasattr(value, "scoped_configurable_fn"):
+                try:
+                    return value.scoped_configurable_fn()
+                except Exception:
+                    return default
+            return value
 
         global_batch_size = world_size * int(train_dataloader.batch_size)
         mlperf_logger.event(key=c.GLOBAL_BATCH_SIZE, value=global_batch_size)
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 53982923d..a71370143 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -174,6 +174,7 @@ orchestrate() {
   echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
   echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
   echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ" | tee -a "$LOG"
+  echo "[$(date)] lr-override: DENSE_LR=${DENSE_LR:-<unset:gin default 0.001>} SPARSE_LR=${SPARSE_LR:-<unset:gin default 0.001>}" | tee -a "$LOG"
 
   # Rendezvous resolved on the HOST (the container image has no SLURM client).
   MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
@@ -310,11 +311,13 @@ orchestrate() {
       ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN=$MAX_SEQ_LEN} \
       ${HISTORY_LENGTH:+-e HISTORY_LENGTH=$HISTORY_LENGTH} \
       ${BATCH_SIZE:+-e BATCH_SIZE=$BATCH_SIZE} \
+      ${DENSE_LR:+-e DENSE_LR=$DENSE_LR} \
+      ${SPARSE_LR:+-e SPARSE_LR=$SPARSE_LR} \
       ${CKPT_TIME_INTERVAL_S:+-e CKPT_TIME_INTERVAL_S=$CKPT_TIME_INTERVAL_S} \
       ${KEEP_LAST_N:+-e KEEP_LAST_N=$KEEP_LAST_N} \
       ${IN_WINDOW_CKPT_FREQ:+-e IN_WINDOW_CKPT_FREQ=$IN_WINDOW_CKPT_FREQ} \
       ${CKPT_STEP_FREQ:+-e CKPT_STEP_FREQ=$CKPT_STEP_FREQ} \
-      -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-0.90} \
+      -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
       -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \

From e0d6e46d9c4e7d1f9d8c49ffabf5da4b4d3b292e Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 18 Jun 2026 14:17:43 -0500
Subject: [PATCH 066/113] dlrmv4: disable TensorBoard by default (no-op writer)

The shared-NFS tfevents writer was the only metrics sink whose I/O error
was uncaught, and it repeatedly crashed trainers on transient /apps
Errno 121 (Remote I/O) hiccups. Default TENSORBOARD_LOG_PATH is now empty,
which installs a _NoOpSummaryWriter so the metrics path (compute +
text-log + .metrics.jsonl sinks) runs unchanged and never crashes on TB
file I/O. Nothing we consume reads TensorBoard. Re-enable per-run by
setting $TENSORBOARD_LOG_PATH to a non-empty dir.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 13 ++++++----
 .../generative_recommenders/dlrm_v3/utils.py  | 24 +++++++++++++++++++
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 3099cd67e..09ebf296e 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -308,13 +308,16 @@ Profiler.active = 5
 Profiler.trace_dir = @run_results_dir()
 
 # logger variables
-# TensorBoard event dir. Default lives on shared NFS (not container-local /tmp,
-# which is wiped on node failover) so the NE/AUC scalars survive relaunches and
-# failover. Override per-run via $TENSORBOARD_LOG_PATH (the supervisor sets it
-# to /apps/chcai/tb/$RUN_NAME/).
+# TensorBoard event dir. DISABLED BY DEFAULT (empty path): the shared-NFS
+# tfevents writer is the only metrics sink whose I/O error is uncaught, and it
+# repeatedly crashed trainers on transient /apps `Errno 121` (Remote I/O) hiccups.
+# Nothing we consume reads TensorBoard — eval-window AUCs come from the durable
+# `<LOG>.metrics.jsonl` sink (try/except-guarded) and the text run log. An empty
+# path makes MetricsLogger install a no-op writer (see _NoOpSummaryWriter).
+# Re-enable for a run by setting $TENSORBOARD_LOG_PATH to a non-empty dir.
 MetricsLogger.tensorboard_log_path = @tbp/env_path()
 tbp/env_path.key     = "TENSORBOARD_LOG_PATH"
-tbp/env_path.default = "/apps/chcai/tb/yambda_5b/"
+tbp/env_path.default = ""
 MetricsLogger.world_size = 8
 MetricsLogger.auc_threshold = 0.80275
 # Lifetime-AUC backend, selectable independently for the train cumulative AUC and
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index dcda51365..10c107bdc 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -857,6 +857,25 @@ def step(self) -> None:
         self._profiler.step()
 
 
+class _NoOpSummaryWriter:
+    """Drop-in stand-in for SummaryWriter used when TensorBoard is disabled
+    (empty ``tensorboard_log_path``). All scalar writes become no-ops so the
+    metrics path (compute + text-log + ``.metrics.jsonl`` sinks) runs unchanged
+    and never crashes on TB file I/O. The shared ``/apps`` tfevents writer was a
+    crash source under transient filer ``Errno 121`` (Remote I/O) errors, and
+    nothing we consume reads TensorBoard — eval-window AUCs come from the JSONL
+    sink and the text log."""
+
+    def add_scalar(self, *args, **kwargs) -> None:
+        pass
+
+    def flush(self) -> None:
+        pass
+
+    def close(self) -> None:
+        pass
+
+
 @gin.configurable
 class MetricsLogger:
     """
@@ -1016,6 +1035,11 @@ def _make_reg(ws: int) -> List[RecMetricComputation]:
         if tensorboard_log_path != "":
             self.tb_logger = SummaryWriter(log_dir=tensorboard_log_path, purge_step=0)
             self.tb_logger.flush()
+        else:
+            # TB disabled: use a no-op writer so the existing call sites (and the
+            # `assert self.tb_logger is not None` in compute_and_log) keep working
+            # while no tfevents are written to the fragile shared filer.
+            self.tb_logger = _NoOpSummaryWriter()
 
         # Throughput / time-to-target tracking. Counters are train-only; eval
         # samples are not relevant for headline samples/sec numbers.

From bc743b580be38b0bf651007d7d7c0e51b45cc936 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 18 Jun 2026 14:18:31 -0500
Subject: [PATCH 067/113] dlrmv4: default dense/sparse LR=1e-5 and HSTU depth=3

Change the gin defaults to the configuration validated by the recent
power-user (min4086) ht299 runs: dense+sparse LR 0.001 -> 1e-5 and HSTU
attention depth 5 -> 3 (reaches ~0.78-0.81 holdout-299 AUC at ~1.4x
faster training than the 5-layer/5e-5 setup). Both remain env-overridable
via $DENSE_LR / $SPARSE_LR / $HSTU_NUM_LAYERS.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin            | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 09ebf296e..f18c866a2 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -43,10 +43,10 @@ seed/env_int.key     = "SEED"
 seed/env_int.default = 1
 
 # dense model optimizer
-# Learning rate is env-overridable via $DENSE_LR (default 0.001, unchanged).
+# Learning rate is env-overridable via $DENSE_LR (default 1e-5).
 dense_optimizer_factory_and_class.learning_rate = @dlr/env_float()
 dlr/env_float.key     = "DENSE_LR"
-dlr/env_float.default = 0.001
+dlr/env_float.default = 0.00001
 dense_optimizer_factory_and_class.optimizer_name = "Adam"
 dense_optimizer_factory_and_class.momentum = 0
 dense_optimizer_factory_and_class.weight_decay = 0
@@ -54,10 +54,10 @@ dense_optimizer_factory_and_class.eps = 1e-8
 dense_optimizer_factory_and_class.betas = (0.95, 0.999)
 
 # sparse model optimizer
-# Learning rate is env-overridable via $SPARSE_LR (default 0.001, unchanged).
+# Learning rate is env-overridable via $SPARSE_LR (default 1e-5).
 sparse_optimizer_factory_and_class.learning_rate = @slr/env_float()
 slr/env_float.key     = "SPARSE_LR"
-slr/env_float.default = 0.001
+slr/env_float.default = 0.00001
 sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
 sparse_optimizer_factory_and_class.momentum = 0
 sparse_optimizer_factory_and_class.weight_decay = 0
@@ -160,15 +160,15 @@ get_hstu_configs.max_seq_len = @msl/env_int()
 msl/env_int.key     = "MAX_SEQ_LEN"
 msl/env_int.default = 4096
 
-# HSTU transformer depth (number of attention layers). Default 5 (unchanged).
+# HSTU transformer depth (number of attention layers). Default 3.
 # Override per-run via $HSTU_NUM_LAYERS. NOTE: changing depth changes the model
 # shape, so a run with a new depth MUST use a FRESH CKPT_PATH (incompatible with
-# 5-layer checkpoints). Resolved in the full gin parse (get_hstu_configs is not
-# registered during the early skip_unknown parse), so the @env_int reference is
-# skipped on the first pass — same safe path as the LR knobs.
+# checkpoints of a different depth). Resolved in the full gin parse
+# (get_hstu_configs is not registered during the early skip_unknown parse), so
+# the @env_int reference is skipped on the first pass — same safe path as the LR knobs.
 get_hstu_configs.hstu_attn_num_layers = @nl/env_int()
 nl/env_int.key     = "HSTU_NUM_LAYERS"
-nl/env_int.default = 5
+nl/env_int.default = 3
 
 # --- streaming (temporal-order) training -------------------------------------
 # Only consumed under `--mode streaming-train-eval`; the default train-eval

From b4ce3ba40acb746cf71b0f4221de5838e72674f7 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Thu, 18 Jun 2026 19:24:49 +0000
Subject: [PATCH 068/113] dlrmv4: report EVAL_ACCURACY as per-window AUC
 (configurable, default window)

The MLPerf EVAL_ACCURACY event and the convergence decision (early SUCCESS
RUN_STOP + end-of-run finalize) now use the per-pass full-holdout "window_auc"
instead of the cumulative "lifetime_auc". Made selectable via a new gin knob
streaming_train_eval_loop.eval_accuracy_auc_mode ($EVAL_ACCURACY_AUC_MODE),
default "window"; set "lifetime" to restore prior behavior. Both AUCs are still
computed and logged to TensorBoard. The rank-0-decides-then-broadcast deadlock
guard is preserved. Forward $EVAL_ACCURACY_AUC_MODE through launch_slurm.sh.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 17 +++--
 .../dlrm_v3/train/utils.py                    | 70 ++++++++++++-------
 recommendation_v4/scripts/launch_slurm.sh     |  1 +
 3 files changed, 60 insertions(+), 28 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 6dc2358fc..6efa64244 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -328,13 +328,22 @@ tbp/env_path.key     = "TENSORBOARD_LOG_PATH"
 tbp/env_path.default = "tb/yambda_5b/"
 MetricsLogger.world_size = 8
 # Time-to-target AUC threshold. Doubles as the MLPerf convergence target: when
-# the cumulative ("lifetime_") listen_plus eval AUC first reaches this value the
-# streaming-train-eval run emits a SUCCESS RUN_STOP and terminates gracefully.
-# Override via $AUC_THRESHOLD (e.g. 0.5 to smoke-test the early-stop path on a
-# short run). MLPerf's DLRM-DCNv2 reference uses 0.80275.
+# the selected listen_plus eval AUC (see eval_accuracy_auc_mode below; default
+# the per-pass "window_" AUC) first reaches this value the streaming-train-eval
+# run emits a SUCCESS RUN_STOP and terminates gracefully. Override via
+# $AUC_THRESHOLD (e.g. 0.5 to smoke-test the early-stop path on a short run).
+# MLPerf's DLRM-DCNv2 reference uses 0.80275.
 MetricsLogger.auc_threshold = @at/env_float()
 at/env_float.key     = "AUC_THRESHOLD"
 at/env_float.default = 0.80275
+# Which eval AUC is reported as EVAL_ACCURACY and drives the convergence /
+# SUCCESS RUN_STOP decision: "window" (per-pass full-holdout AUC, reset each eval
+# pass; the default) or "lifetime" (cumulative across all eval passes). Both AUCs
+# are still computed and logged to TensorBoard regardless; this only selects the
+# one used for MLPerf EVAL_ACCURACY + early-stop. Override via $EVAL_ACCURACY_AUC_MODE.
+streaming_train_eval_loop.eval_accuracy_auc_mode = @eaam/env_str()
+eaam/env_str.key     = "EVAL_ACCURACY_AUC_MODE"
+eaam/env_str.default = "window"
 # Lifetime-AUC backend, selectable independently for the train cumulative AUC and
 # the eval cumulative ("lifetime_*") AUC. Both default to "binned":
 #   "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an O(bins) score
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index ca6291ffc..d28f2ebec 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1488,6 +1488,12 @@ def streaming_train_eval_loop(
     # event emission; the loop is otherwise unchanged. Supplied by train_ranker
     # for the streaming-train-eval benchmark path.
     mlperf_logger: Optional[Any] = None,
+    # Which eval AUC drives the reported EVAL_ACCURACY and the convergence
+    # decision (early SUCCESS RUN_STOP + end-of-run finalize): "window" = the
+    # per-pass full-holdout AUC (reset each eval pass; the default), or
+    # "lifetime" = the cumulative AUC across all eval passes. Override via
+    # $EVAL_ACCURACY_AUC_MODE.
+    eval_accuracy_auc_mode: str = "window",
 ) -> None:
     """Streaming train+eval loop with per-window (and optionally mid-window)
     checkpoints.
@@ -2076,14 +2082,29 @@ def _mlperf_progress() -> Dict[str, Any]:
             mlperf_logger.constants.EPOCH_NUM: epoch_num,
         }
 
-    def _lifetime_auc(metrics: Dict[str, float]) -> Optional[float]:
-        # Convergence metric: the cumulative ("lifetime_") listen_plus AUC.
-        # Key format is `metric/{prefix}{name}/{task}` (see MetricsLogger.compute),
-        # e.g. `metric/lifetime_auc/listen_plus`. Match the `lifetime_auc` short
-        # name; ignore GAUC.
+    # Convergence/EVAL_ACCURACY metric short name, selected by
+    # eval_accuracy_auc_mode: "window_auc" (per-pass full-holdout AUC, default)
+    # or "lifetime_auc" (cumulative across eval passes).
+    _eval_auc_short = (
+        "lifetime_auc"
+        if str(eval_accuracy_auc_mode).strip().lower() == "lifetime"
+        else "window_auc"
+    )
+    if rank == 0 and mlperf_logger is not None:
+        logger.info(
+            f"[mlperf] EVAL_ACCURACY / convergence metric = {_eval_auc_short} "
+            f"(eval_accuracy_auc_mode={eval_accuracy_auc_mode!r})"
+        )
+
+    def _eval_target_auc(metrics: Dict[str, float]) -> Optional[float]:
+        # Convergence metric: the listen_plus eval AUC selected by
+        # eval_accuracy_auc_mode (window vs lifetime). Key format is
+        # `metric/{prefix}{name}/{task}` (see MetricsLogger.compute), e.g.
+        # `metric/window_auc/listen_plus`. Match the selected short name;
+        # ignore GAUC.
         for key, val in metrics.items():
             short = key.split("/")[-2] if "/" in key else key
-            if short == "lifetime_auc":
+            if short == _eval_auc_short:
                 return float(val)
         return None
 
@@ -2118,24 +2139,25 @@ def _mlperf_run_stop(status: object) -> None:
         mlperf_run_stopped[0] = True
 
     def _mlperf_eval_stop(eval_metrics: Dict[str, float]) -> bool:
-        # Emit EVAL_ACCURACY (lifetime listen_plus AUC) + EVAL_STOP, and drive an
-        # early SUCCESS RUN_STOP when the target threshold is reached. Returns
-        # True iff the run should stop now -- the SAME value on every rank.
+        # Emit EVAL_ACCURACY (the selected eval listen_plus AUC) + EVAL_STOP, and
+        # drive an early SUCCESS RUN_STOP when the target threshold is reached.
+        # Returns True iff the run should stop now -- the SAME value on every rank.
         #
-        # CRITICAL (deadlock avoidance): the cumulative lifetime AUC is produced
-        # by a reduce that is only valid on global rank 0, so a per-rank
-        # `lifetime >= thr` test diverges (only rank 0 sees the value) and the
-        # ranks that "stop" hit the RUN_STOP barrier while the rest march into
-        # the next window's embedding all-to-all -> NCCL collective-timeout hang
-        # (observed: 600s ALLTOALL_BASE watchdog abort). So rank 0 decides and
-        # BROADCASTS the boolean; all ranks then break (or continue) in lockstep.
+        # CRITICAL (deadlock avoidance): the eval AUC is produced by a reduce
+        # that is only guaranteed valid on global rank 0, so a per-rank
+        # `eval_auc >= thr` test could diverge (only rank 0 sees the value) and
+        # the ranks that "stop" hit the RUN_STOP barrier while the rest march
+        # into the next window's embedding all-to-all -> NCCL collective-timeout
+        # hang (observed: 600s ALLTOALL_BASE watchdog abort). So rank 0 decides
+        # and BROADCASTS the boolean; all ranks then break (or continue) in
+        # lockstep.
         if mlperf_logger is None:
             return False
-        lifetime = _lifetime_auc(eval_metrics)
-        if lifetime is not None:
+        eval_auc = _eval_target_auc(eval_metrics)
+        if eval_auc is not None:
             mlperf_logger.event(
                 key=mlperf_logger.constants.EVAL_ACCURACY,
-                value=lifetime,
+                value=eval_auc,
                 metadata=_mlperf_progress(),
             )
         mlperf_logger.end(
@@ -2146,9 +2168,9 @@ def _mlperf_eval_stop(eval_metrics: Dict[str, float]) -> bool:
         if (
             rank == 0
             and not mlperf_run_stopped[0]
-            and lifetime is not None
+            and eval_auc is not None
             and thr is not None
-            and lifetime >= thr
+            and eval_auc >= thr
         ):
             decision[0] = 1.0
         if torch.distributed.is_initialized():
@@ -2161,12 +2183,12 @@ def _mlperf_eval_stop(eval_metrics: Dict[str, float]) -> bool:
 
     def _mlperf_finalize(final_metrics: Dict[str, float]) -> None:
         # End-of-run RUN_STOP when the threshold was never crossed: SUCCESS iff
-        # the final lifetime AUC meets the target, else ABORTED.
+        # the final eval AUC meets the target, else ABORTED.
         if mlperf_logger is None or mlperf_run_stopped[0]:
             return
-        lifetime = _lifetime_auc(final_metrics)
+        eval_auc = _eval_target_auc(final_metrics)
         thr = metric_logger.auc_threshold
-        success = lifetime is not None and thr is not None and lifetime >= thr
+        success = eval_auc is not None and thr is not None and eval_auc >= thr
         _mlperf_run_stop(
             mlperf_logger.constants.SUCCESS
             if success
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 045495b22..50b235dd2 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -326,6 +326,7 @@ orchestrate() {
       ${CKPT_STEP_FREQ:+-e CKPT_STEP_FREQ=$CKPT_STEP_FREQ} \
       -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
       -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
+      ${EVAL_ACCURACY_AUC_MODE:+-e EVAL_ACCURACY_AUC_MODE=$EVAL_ACCURACY_AUC_MODE} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
       -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \

From c5469a4faf4a9212f609c2f6c5f13e7df2c88521 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 18 Jun 2026 16:52:38 -0500
Subject: [PATCH 069/113] dlrmv4: consolidate streaming e2e supervisor to one
 sbatch-wrapping script

Replace the single-node docker-exec supervisor with the sbatch-job-level
model (formerly run_streaming_e2e_multinode.sh), which handles 1..N nodes
via launch_slurm.sh. Node replacement is now SLURM's job on resubmit, so
the in-place node-acquisition/provision/exec-sentinel logic is dropped.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../scripts/run_streaming_e2e.sh              | 836 +++++-------------
 1 file changed, 221 insertions(+), 615 deletions(-)

diff --git a/recommendation_v4/scripts/run_streaming_e2e.sh b/recommendation_v4/scripts/run_streaming_e2e.sh
index f97a0e483..c913bac7b 100755
--- a/recommendation_v4/scripts/run_streaming_e2e.sh
+++ b/recommendation_v4/scripts/run_streaming_e2e.sh
@@ -1,689 +1,295 @@
 #!/bin/bash
 # =============================================================================
-# run_streaming_e2e.sh — self-healing supervisor for the long-run yambda-5b
-#                        streaming train+eval (NE/AUC over the full ~5B dataset)
+# run_streaming_e2e.sh — self-healing supervisor for a yambda-5b streaming
+#   train+eval run (sbatch-job level). Works for 1..N nodes.
 # =============================================================================
 #
-# WHAT IT DOES
-#   Owns a multi-day "streaming-train-eval" run and keeps it alive unattended
-#   across the three failure modes that actually kill long runs:
-#       1. trainer process crash / OOM / nonzero exit
-#       2. silent death (the whole process group gets SIGKILLed — no exit code)
-#       3. the SLURM node itself going away (down / drained / job ended)
-#   In every case it relaunches the trainer from the latest on-disk checkpoint
-#   (failing over to a brand-new node for case 3) until the run finishes.
+# WHAT IT SUPERVISES
+#   The run is an `sbatch [--nodes=N] scripts/launch_slurm.sh` BATCH job. That
+#   batch script is fully self-contained: it runs orchestrate -> provision
+#   (container + RDMA) -> worker (in-container trainer) on EVERY node, so it
+#   handles single-node (--nodes=1) and multi-node (world_size=8N) identically.
+#   This supervisor wraps THAT job: it monitors it and, on crash / node-failure
+#   / hang, RESUBMITS it (which resumes from the latest checkpoint via load
+#   auto-latest), bounded by --max-relaunch. There is no docker-exec lifecycle
+#   or in-place node failover here — node replacement is SLURM's job on resubmit.
 #
-# WHY A RELAUNCH "JUST WORKS" (resume model)
-#   The training stack already implements exact-once resume: on startup it picks
-#   the latest numeric checkpoint subdir under $CKPT_PATH, restores model +
-#   optimizer + per-rank RNG, and (for mid-window in-window saves) skips the
-#   batches already trained in the partially-done window. So relaunching with
-#   the SAME --ckpt-path transparently continues from where it died — no manual
-#   bookkeeping here beyond pointing every attempt at the same base dir.
+# RESUME MODEL (why a resubmit "just works")
+#   The trainer checkpoints to $CKPT_PATH and on startup load_dmp_checkpoint
+#   auto-resolves to the highest-numbered subdir, restoring model+optimizer+RNG
+#   and skipping already-trained batches of a partial window. So resubmitting the
+#   SAME submit-script (same CKPT_PATH/LOG) continues from where it died.
+#   Resubmits set APPEND_LOG=1 so the metrics log is preserved across attempts.
 #
-# WHERE IT RUNS / HOW IT DRIVES WORK
-#   This script runs on the SLURM HEAD node. The trainer runs inside a long-
-#   lived docker container ($CONTAINER) on the compute node held by a SLURM
-#   allocation ($JOBID). All control flow is `srun --jobid <id> --overlap
-#   docker exec ...` into that container. The container bind-mounts shared NFS
-#   (/home/chcai = code, /apps/chcai = checkpoints+logs), which is what makes
-#   node failover possible: any node in $PARTITION sees the same code+state.
+# WHAT IT DETECTS (poll every --poll-s)
+#   * job left the queue  -> read sacct State/ExitCode:
+#       COMPLETED+0   => run finished (success, exit 0)
+#       CANCELLED     => user intent (stop, exit 0 — NOT our place to resubmit)
+#       FAILED/NODE_FAIL/TIMEOUT/OUT_OF_MEMORY/BOOT_FAIL/PREEMPTED => relaunch
+#   * hang watchdog: job RUNNING but LOG frozen >= --stall-s AND no trainer
+#       process alive on ANY node (cross-node pgrep) => scancel + relaunch.
+#   * disk guard before each (re)submit: require --min-free-gib on the ckpt vol.
 #
-# MAIN LOOP (state machine, up to --max-relaunch attempts)
-#   for each attempt:
-#     ensure_ready   — guarantee a healthy allocation whose container is up,
-#                      failing over to a freshly-provisioned node if not.
-#     disk_guard     — sweep crash-orphaned *.tmp/*.old saves; abort if the
-#                      ckpt volume has < --min-free-gib free.
-#     cleanup_workers— kill any stragglers from a previous attempt.
-#     launch         — detached `docker exec -d` of the trainer; a trailing
-#                      echo appends an `E2E_RUN_EXIT=<code>` sentinel to the log
-#                      when the trainer returns (clean OR crash).
-#     monitor loop (every --poll-s):
-#         * node watchdog  — if $JOBID stops being healthy mid-run, break and
-#                            let the next attempt fail over.
-#         * exit sentinel  — E2E_RUN_EXIT=0 => success (done); nonzero => relaunch.
-#         * stall watchdog — if the log stops growing AND no trainer process is
-#                            alive for --stall-s, treat as silent death=>relaunch.
-#                            (Long blocking saves keep the process alive, so they
-#                            never false-trip this.)
+# WHERE IT RUNS
+#   On the SLURM head node (NFS-mounted /home/chcai code + /apps/chcai
+#   ckpts/logs are visible here for squeue/sacct/df and the cross-node pgrep).
 #
-# NODE FAILOVER (case 3, the --allow-failover path)
-#   ensure_ready -> acquire_node: submit an `sbatch` hold job (`--wrap "sleep
-#   infinity"`, bounded by --time=$ALLOC_TIME) for a fresh exclusive node on
-#   $PARTITION, optionally from --reservation $RESERVATION; wait for RUNNING,
-#   then provision_node runs $PROVISION_SCRIPT on it via `srun --jobid --overlap`
-#   (docker pull + container create + dep install; ~15 min on a cold node).
-#   sbatch (not salloc) because interactive salloc on some partitions (e.g.
-#   meta64) is capped at 240 min, which a multi-day hold would exceed. Jobs WE
-#   create are tracked and `scancel`ed (container removed first) on success via
-#   release_acquired; the user's original --jobid is never cancelled.
-#   Checkpoints on shared NFS make the resume seamless.
+# USAGE
+#   # Submit a fresh job from the launch script, then supervise it:
+#   nohup bash scripts/run_streaming_e2e.sh \
+#       --submit-script /apps/chcai/yambda_5b_e2e/<run>/launch_1node.sh \
+#       --log         /apps/chcai/yambda_5b_e2e/<run>/<run>.log \
+#       --ckpt-path   /apps/chcai/yambda_5b_e2e/<run>/ckpts \
+#       --run-name    <run> \
+#       > /apps/chcai/yambda_5b_e2e/<run>/<run>.supervisor.console.log 2>&1 &
 #
-# CHECKPOINTS / DISK
-#   The trainer saves atomically (write to <ts>.tmp, fsync, rename to <ts>) and
-#   prunes to keep_last_n newest. One checkpoint is ~560 GB; a save blocks the
-#   step it fires on for ~83 s (measured, no NFS contention). Cadence is driven
-#   by --ckpt-time-interval (time-based) and optional --in-window-freq.
+#   # Adopt an already-submitted job instead of submitting a new one:
+#   nohup bash scripts/run_streaming_e2e.sh --jobid 13235 \
+#       --submit-script .../launch_2node.sh --log .../run.log \
+#       --ckpt-path .../ckpts --run-name <run> > .../console.log 2>&1 &
 #
-# ARGS (all optional; defaults target the full production run)
-#   run shape:   --jobid --container --start-ts --num-train-ts --eval-every
-#   ckpt:        --ckpt-path --keep-last-n --ckpt-time-interval --in-window-freq
-#   logging:     --run-name --log
-#   resilience:  --max-relaunch --min-free-gib --stall-s
-#   failover:    --partition --reservation --alloc-time --allow-failover
-#                --provision-script --acquire-wait-max --resv-wait-max
-#                --orig-recover-wait
-#                (failover holds <=1 reservation node: stray/leaked e2e_failover
-#                 holds are reaped, and a lost ORIGINAL job is waited on for SLURM
-#                 requeue and reused before a SEPARATE node is acquired.)
-#   validation:  --num-train-batches --num-eval-batches  (>0 caps batches/window
-#                for fast tests; 0 = full window / full-holdout eval)
-#   test-only:   --die-at-step  (>=0 injects a crash at that global step)
+#   The node count, partition, and reservation all live in the --submit-script's
+#   sbatch line (launch_1node.sh / launch_2node.sh / ...), not here.
 #
 # EXIT CODES
-#   0  run completed (E2E_RUN_EXIT=0 — all windows + final eval done)
-#   1  exhausted --max-relaunch without completing
-#   3  disk guard tripped (insufficient free space)
-#   4  could not secure a healthy allocation (failover failed / disabled)
-#
-# OUTPUTS (next to --log)
-#   <log>                      trainer stdout/stderr + E2E_RUN_EXIT sentinels
-#   <log:.log>.supervisor.log  this supervisor's own timeline
-#   <log:.log>.provision.log   node-provisioning output (failover only)
-#
-# EXAMPLE
-#   nohup bash scripts/run_streaming_e2e.sh \
-#       --jobid 12074 \
-#       --ckpt-path /apps/chcai/ckpts/yambda_5b_e2e \
-#       --run-name yambda_5b_e2e --log /apps/chcai/yambda_5b_e2e.log \
-#       --start-ts 150 --num-train-ts 149 --eval-every 10 \
-#       --ckpt-time-interval 3600 --keep-last-n 1 --max-relaunch 100 \
-#       --reservation NAN_issue_debug \
-#       > /apps/chcai/yambda_5b_e2e.supervisor.console.log 2>&1 &
-#   (--reservation makes node-death failover re-acquire from that reservation;
-#    omit it to fall back to the open $PARTITION pool.)
+#   0  run completed (COMPLETED+0) or user-cancelled
+#   1  exhausted --max-relaunch without completion (or submit failed)
+#   3  disk guard tripped
 # =============================================================================
-
 set -uo pipefail
 
-JOBID=11367
+JOBID=""                       # adopt this job; empty => submit fresh
+SUBMIT_SCRIPT=""
+LOG=""
+CKPT_PATH=""
+RUN_NAME="yambda_5b_e2e"
 CONTAINER=yambda_primus
-REPO=/home/chcai/training/recommendation_v4
-
-# Direct-SSH fallback so the supervisor can probe the node even while the SLURM
-# control plane is unreachable — a transient controller outage must NOT be
-# mistaken for node death (which would needlessly tear down a healthy run).
-SSH_OPTS="-o BatchMode=yes -o ConnectTimeout=10 -o StrictHostKeyChecking=no"
-LAST_NODE=""   # last known node hostname for $JOBID (cached for direct probes)
-
-# Defaults are sized from measurement: ~560 GB/checkpoint, ~83 s/save (blocking,
-# attributed to the step it fires on), ~650 ms/train step @ global batch 8192,
-# ~1465 steps (~16 min) per full ~12M-anchor window, full-holdout eval
-# ~6-7 min/window. A ~2h time-based checkpoint interval keeps save overhead ~1%
-# while bounding crash-loss to ~2h of compute; eval every N windows
-# (EVAL_EVERY_N_WINDOWS) amortizes the full-holdout eval cost.
-NUM_TRAIN_TS=149
-START_TS=150
-EVAL_EVERY=5
-CKPT_TIME_INTERVAL=7200
-KEEP_LAST_N=1
-CKPT_PATH=/apps/chcai/ckpts/yambda_5b_e2e
-RUN_NAME=yambda_5b_e2e
-LOG=/apps/chcai/yambda_5b_e2e.log
 MAX_RELAUNCH=50
-NUM_TRAIN_BATCHES=0     # 0 = full window (only capped for validation/tests)
-NUM_EVAL_BATCHES=0      # 0 = full holdout eval (only capped for validation)
-DIE_AT_STEP=-1          # >=0 = test-only failure injection
-# Train:eval split (fraction of USERS trained; 1 - this held out as a FIXED,
-# never-trained eval set). Passed on EVERY relaunch so the split stays an
-# immutable run contract — a changed split would abort on resume (validated in
-# the loop) to prevent skip-offset desync and held-out users leaking into train.
-TRAIN_SPLIT_PERCENTAGE=0.90
-SPLIT_SALT=0
-EVAL_HOLDOUT_TS=-1          # <0 = window just past training (start_ts+num_train_ts)
-EVAL_HOLDOUT_NUM_WINDOWS=1
-IN_WINDOW_FREQ=0        # >0 = also save every N batches within a window
-ATTACH=0                # 1 = (re)attach to an already-running trainer without
-                        #     killing it or truncating its log — used to restore
-                        #     supervision over a trainer that outlived a previous
-                        #     supervisor (e.g. one a control-plane outage killed).
-CTRL_WAIT_MAX=3600      # max seconds to wait for an unreachable SLURM controller
-                        # to recover before concluding failover is needed.
-
-# --- node failover ----------------------------------------------------------
-# If the current allocation/node goes away, acquire a FRESH node, (re)provision
-# the container on it, and resume — checkpoints + code live on shared NFS
-# (/apps/chcai, /home/chcai), so any node in the partition can continue.
-PARTITION=meta64
-RESERVATION=""                        # if set, failover acquires from this SLURM
-                                      # reservation (e.g. NAN_issue_debug) so a
-                                      # replacement node comes from the same pool.
-ALLOC_TIME=7-00:00:00                 # SLURM --time for a failover hold job
-ALLOW_FAILOVER=1                      # 0 = never acquire a new node
-PROVISION_SCRIPT=/home/chcai/_provision_yambda_primus.sh
-ACQUIRE_WAIT_MAX=1800                 # max seconds to wait for the OPEN-POOL
-                                      # (tier-2) failover hold job to reach
-                                      # RUNNING (tolerates brief queueing).
-RESV_WAIT_MAX=300                     # max seconds to wait for a RESERVATION
-                                      # (tier-1) node before giving up on it and
-                                      # falling back to the open $PARTITION pool.
-                                      # Short, since a free reservation node
-                                      # starts ~immediately; a longer wait just
-                                      # means the reservation is currently full.
-ORIG_RECOVER_WAIT=600                 # when the user's ORIGINAL reservation job
-                                      # is lost, wait this long for SLURM to
-                                      # auto-requeue it back to RUNNING before
-                                      # acquiring a SEPARATE node. Reusing the
-                                      # requeued original keeps us at <=1
-                                      # reservation node and skips a redundant
-                                      # acquire (observed requeue latency ~2 min).
-
-# Disk guard: require at least this many GiB free on the ckpt volume before a
-# (re)launch. One checkpoint is ~560 GB. A save writes a fresh .tmp BEFORE the
-# old copy is pruned, so peak transient usage is (keep_last_n + 1) copies. With
-# keep_last_n=1 that is ~1120 GB; require ~1200 GiB free at launch so the run
-# never wedges mid-save on a near-full shared NFS volume.
 MIN_FREE_GIB=1200
-# Stall watchdog: if the log hasn't grown AND no trainer process is alive for
-# this many seconds with no exit sentinel, treat it as a silent death. Comfortably
-# exceeds one blocking checkpoint save (~83 s); and because a save keeps the
-# trainer process alive, an in-progress save never trips the watchdog anyway.
-STALL_S=1200
+STALL_S=2400                   # 40 min: comfortably exceeds a full-holdout eval
+                               # window + a blocking ckpt save; only trips when
+                               # the log is frozen AND no trainer proc is alive.
 POLL_S=30
 
 while [[ $# -gt 0 ]]; do
     case $1 in
         --jobid) JOBID="$2"; shift 2;;
-        --container) CONTAINER="$2"; shift 2;;
-        --num-train-ts) NUM_TRAIN_TS="$2"; shift 2;;
-        --start-ts) START_TS="$2"; shift 2;;
-        --eval-every) EVAL_EVERY="$2"; shift 2;;
-        --ckpt-time-interval) CKPT_TIME_INTERVAL="$2"; shift 2;;
-        --keep-last-n) KEEP_LAST_N="$2"; shift 2;;
+        --submit-script) SUBMIT_SCRIPT="$2"; shift 2;;
+        --log) LOG="$2"; shift 2;;
         --ckpt-path) CKPT_PATH="$2"; shift 2;;
         --run-name) RUN_NAME="$2"; shift 2;;
-        --log) LOG="$2"; shift 2;;
+        --container) CONTAINER="$2"; shift 2;;
         --max-relaunch) MAX_RELAUNCH="$2"; shift 2;;
-        --num-train-batches) NUM_TRAIN_BATCHES="$2"; shift 2;;
-        --num-eval-batches) NUM_EVAL_BATCHES="$2"; shift 2;;
-        --die-at-step) DIE_AT_STEP="$2"; shift 2;;
-        --in-window-freq) IN_WINDOW_FREQ="$2"; shift 2;;
-        --attach) ATTACH="$2"; shift 2;;
-        --ctrl-wait-max) CTRL_WAIT_MAX="$2"; shift 2;;
         --min-free-gib) MIN_FREE_GIB="$2"; shift 2;;
         --stall-s) STALL_S="$2"; shift 2;;
-        --partition) PARTITION="$2"; shift 2;;
-        --reservation) RESERVATION="$2"; shift 2;;
-        --alloc-time) ALLOC_TIME="$2"; shift 2;;
-        --allow-failover) ALLOW_FAILOVER="$2"; shift 2;;
-        --provision-script) PROVISION_SCRIPT="$2"; shift 2;;
-        --acquire-wait-max) ACQUIRE_WAIT_MAX="$2"; shift 2;;
-        --resv-wait-max) RESV_WAIT_MAX="$2"; shift 2;;
-        --orig-recover-wait) ORIG_RECOVER_WAIT="$2"; shift 2;;
+        --poll-s) POLL_S="$2"; shift 2;;
         *) echo "Unknown arg: $1"; exit 1;;
     esac
 done
 
-ORIGINAL_JOBID="$JOBID"   # never scancel the user's own hold allocation
-ACQUIRED_JOBIDS=()        # failover allocations WE created (released on success)
+[[ -n "$SUBMIT_SCRIPT" && -f "$SUBMIT_SCRIPT" ]] || { echo "FATAL: --submit-script required and must exist"; exit 1; }
+[[ -n "$LOG" ]] || { echo "FATAL: --log required"; exit 1; }
 
 SUP_LOG="${LOG%.log}.supervisor.log"
-
 sup() { echo "[$(date '+%F %T')] [supervisor] $*" | tee -a "$SUP_LOG"; }
 
-# Run a command inside the allocation's container, capturing its stdout. Wrapped
-# in `timeout` so a hung control plane / NFS can never wedge the supervisor.
-cexec() { timeout 90 srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc "$1" 2>/dev/null; }
-
-# Is the SLURM control plane reachable right now?
-controller_up() { timeout 12 sinfo -h -o '%P' >/dev/null 2>&1; }
-
-# Refresh + echo the node hostname for $JOBID (cached in LAST_NODE for direct
-# probes that must work even while the controller is down).
-refresh_node() {
-    local n; n=$(timeout 12 squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)
-    [[ -n "$n" ]] && LAST_NODE="$n"
-    echo "$LAST_NODE"
-}
-
-# Run a (simple) command in the container by SSHing the node DIRECTLY, bypassing
-# SLURM — the only way to observe the trainer during a controller outage. Needs a
-# previously-cached LAST_NODE. Keep "$1" free of embedded double quotes.
-dexec() {
-    [[ -z "$LAST_NODE" ]] && return 1
-    timeout 40 ssh $SSH_OPTS "$LAST_NODE" "docker exec $CONTAINER bash -lc '$1'" 2>/dev/null
-}
-
-# Block (with backoff) until the controller is reachable again, up to
-# CTRL_WAIT_MAX. A controller outage leaves RUNNING jobs running, so waiting it
-# out is almost always preferable to abandoning a healthy node.
-wait_for_controller() {
-    local waited=0
-    controller_up && return 0
-    while ! controller_up; do
-        if (( waited >= CTRL_WAIT_MAX )); then
-            sup "controller still unreachable after ${waited}s (max ${CTRL_WAIT_MAX}s) — proceeding."
-            return 1
-        fi
-        sup "SLURM controller unreachable; waiting for recovery (${waited}s/${CTRL_WAIT_MAX}s)…"
-        sleep 30; waited=$((waited + 30))
-    done
-    sup "SLURM controller reachable again after ${waited}s."
-    return 0
-}
-
-cleanup_workers() {
-    # The trainer spawns 8 rank processes + dataloader workers whose cmdlines
-    # don't all match `train_ranker`/`spawn_main`, so target them, then fall
-    # back to `pkill python` — safe because this container is dedicated to this
-    # training (only the trainer runs python here during a supervised run).
-    srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
-        "pkill -9 -f train_ranker 2>/dev/null; pkill -9 -f multiprocessing 2>/dev/null; \
-         sleep 2; pkill -9 python 2>/dev/null; sleep 3; true" 2>/dev/null || true
-}
-
-# --- node-failover helpers ---------------------------------------------------
-
-# Healthy = the job is RUNNING and its node is not down/drained/failing.
-alloc_healthy() {
-    local jid="$1"
-    [[ -z "$jid" ]] && return 1
-    local st node nstate
-    st=$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)
-    [[ "$st" != "RUNNING" ]] && return 1
-    node=$(squeue -h -j "$jid" -o '%N' 2>/dev/null | head -1)
-    [[ -z "$node" ]] && return 1
-    nstate=$(sinfo -h -n "$node" -o '%t' 2>/dev/null | head -1)
-    case "$nstate" in
-        *down*|*drain*|*fail*|*unk*|*boot*|"") return 1;;
-    esac
-    return 0
-}
-
-# Can we actually exec in the training container on this allocation?
-container_up() {
-    timeout 30 srun --jobid="$1" --overlap docker exec "$CONTAINER" true >/dev/null 2>&1
-}
-
-# (Re)create + dep-install the container on the given allocation's node.
-provision_node() {
-    local jid="$1" node
-    node=$(squeue -h -j "$jid" -o '%N' 2>/dev/null | head -1)
-    sup "provisioning container '$CONTAINER' on job $jid (node ${node:-?}) — cold node can take ~15 min"
-    srun --jobid="$jid" --overlap bash "$PROVISION_SCRIPT" >> "${LOG%.log}.provision.log" 2>&1
-    container_up "$jid"
-}
-
-# Submit an sbatch hold job that merely pins one exclusive node (`sleep
-# infinity`, bounded by --time=$ALLOC_TIME); echoes the jobid. $1 = extra sbatch
-# args (e.g. "--reservation=NAN_issue_debug" or ""). sbatch (not salloc) because
-# interactive salloc on some partitions (meta64) is capped at 240 min, which an
-# $ALLOC_TIME multi-day hold exceeds. The container is provisioned afterward by
-# provision_node via `srun --jobid --overlap`.
-_submit_hold_job() {
-    local extra="$1" out
-    out=$(sbatch --parsable --partition="$PARTITION" $extra --nodes=1 --exclusive \
-                 --time="$ALLOC_TIME" --job-name=e2e_failover \
-                 --output="${LOG%.log}.failover_hold.%j.log" \
-                 --wrap="echo \"[failover-hold] node=\$(hostname) jobid=\$SLURM_JOB_ID start=\$(date -Is)\"; sleep infinity" 2>&1)
-    # --parsable => "<jobid>" or "<jobid>;<cluster>"; strip whitespace + cluster.
-    echo "$out" | tr -d ' ' | cut -d';' -f1
-}
-
-# Wait up to $2 seconds for job $1 to reach RUNNING. Returns 0 if RUNNING.
-_wait_running() {
-    local jid="$1" max="$2" waited=0 st
-    while (( waited < max )); do
-        st=$(squeue -h -j "$jid" -o '%T' 2>/dev/null | head -1)
-        [[ "$st" == "RUNNING" ]] && return 0
-        sleep 10; waited=$((waited + 10))
+# Is the job in the queue right now (single read)?
+job_in_queue() { [[ -n "$(squeue -h -j "$1" -o '%T' 2>/dev/null | head -1)" ]]; }
+job_state()  { squeue -h -j "$1" -o '%T' 2>/dev/null | head -1; }
+
+# Is the job still active? squeue/the SLURM control plane can transiently return
+# empty during an NFS/controller blip even though the job is alive (this once
+# killed all supervisors at once: empty squeue -> sacct said RUNNING -> a bogus
+# "relaunch"). So a SINGLE empty read is not trusted: re-check a few times before
+# believing the job is really gone.
+job_active() {
+    job_in_queue "$1" && return 0
+    local k
+    for k in 1 2 3; do
+        sleep 10
+        job_in_queue "$1" && return 0
     done
     return 1
 }
 
-# Acquire a fresh exclusive node and set global JOBID on success. Two-tier:
-#   tier 1 (preferred): the SLURM --reservation $RESERVATION, if configured.
-#     Waited on for only RESV_WAIT_MAX — a free reservation node starts almost
-#     immediately, so a longer wait means the reservation is currently full.
-#   tier 2 (fallback): the open $PARTITION pool (no reservation), waited on for
-#     ACQUIRE_WAIT_MAX. Used when no reservation is set, or the reservation had
-#     no node free within RESV_WAIT_MAX (the pending reservation job is
-#     cancelled before we resubmit so we never end up holding two nodes).
-acquire_node() {
-    if [[ "$ALLOW_FAILOVER" != "1" ]]; then
-        sup "failover disabled (--allow-failover 0); cannot acquire a new node"; return 1
-    fi
-    # Release any prior/leaked failover hold BEFORE grabbing a new one, so we
-    # never transiently pin two reservation nodes (e.g. a dead tier-1 hold + the
-    # replacement we are about to submit).
-    reap_failover_holds ""
-    local jid
-
-    # --- tier 1: reservation (preferred) -------------------------------------
-    if [[ -n "$RESERVATION" ]]; then
-        sup "failover tier-1: requesting a node from reservation=$RESERVATION (exclusive, time=$ALLOC_TIME)"
-        jid=$(_submit_hold_job "--reservation=$RESERVATION")
-        if [[ "$jid" =~ ^[0-9]+$ ]]; then
-            ACQUIRED_JOBIDS+=("$jid")   # track for cleanup even if it never starts
-            sup "reservation hold job jobid=$jid submitted; waiting up to ${RESV_WAIT_MAX}s for RUNNING"
-            if _wait_running "$jid" "$RESV_WAIT_MAX"; then
-                JOBID="$jid"
-                sup "new node ready (reservation $RESERVATION): jobid=$JOBID node=$(squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)"
-                return 0
-            fi
-            sup "reservation $RESERVATION has no free node within ${RESV_WAIT_MAX}s — cancelling pending $jid and falling back to open pool"
-            scancel "$jid" 2>/dev/null || true
-        else
-            sup "reservation sbatch did not return a jobid ($jid) — falling back to open pool"
-        fi
-    fi
-
-    # --- tier 2: open partition pool (fallback) ------------------------------
-    sup "failover tier-2: requesting a node from open partition=$PARTITION (exclusive, time=$ALLOC_TIME)"
-    jid=$(_submit_hold_job "")
-    if ! [[ "$jid" =~ ^[0-9]+$ ]]; then
-        sup "FATAL: open-pool sbatch did not return a jobid: $jid"; return 1
-    fi
-    ACQUIRED_JOBIDS+=("$jid")
-    sup "open-pool hold job jobid=$jid submitted; waiting up to ${ACQUIRE_WAIT_MAX}s for RUNNING"
-    if _wait_running "$jid" "$ACQUIRE_WAIT_MAX"; then
-        JOBID="$jid"
-        sup "new node ready (open $PARTITION): jobid=$JOBID node=$(squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)"
-        return 0
-    fi
-    sup "FATAL: open-pool hold job $jid never reached RUNNING (waited ${ACQUIRE_WAIT_MAX}s)"
-    return 1
-}
+# Terminal State + ExitCode from accounting once the job has left the queue.
+job_final() { sacct -j "$1" -X -n -o State,ExitCode 2>/dev/null | head -1 | tr -s ' '; }
 
-# Ensure $JOBID is a healthy allocation with the container up, failing over to a
-# fresh provisioned node if not. Resume is automatic: the latest checkpoint is
-# on shared NFS, reachable from whatever node we end up on.
-ensure_ready() {
-    # A controller outage leaves RUNNING jobs running; wait it out before deciding
-    # anything is wrong, so we never abandon a healthy node over a transient blip.
-    wait_for_controller || true
-    if alloc_healthy "$JOBID"; then
-        refresh_node >/dev/null
-        if container_up "$JOBID"; then return 0; fi
-        sup "alloc $JOBID healthy but container '$CONTAINER' not up — (re)provisioning"
-        provision_node "$JOBID" && return 0
-        sup "provisioning on $JOBID failed; will try a fresh node"
-    else
-        sup "current allocation $JOBID unavailable (job not RUNNING or node down/drained)"
-        # Prefer the SLURM-requeued original over acquiring a SEPARATE node, so we
-        # stay at <=1 reservation node. (No-op once we've already failed over off
-        # the original.)
-        if wait_for_original_recover; then
-            JOBID="$ORIGINAL_JOBID"
-            refresh_node >/dev/null
-            if container_up "$JOBID"; then sup "reusing recovered original jobid=$JOBID"; return 0; fi
-            sup "recovered original $JOBID up but container '$CONTAINER' not present — (re)provisioning"
-            provision_node "$JOBID" && return 0
-            sup "provisioning recovered original $JOBID failed; will acquire a fresh node"
-        fi
-    fi
-    acquire_node || return 1
-    provision_node "$JOBID" || { sup "provisioning new node $JOBID failed"; return 1; }
-    sup "failover complete — now running on jobid=$JOBID"
-    return 0
-}
-
-release_acquired() {
-    local jid
-    for jid in "${ACQUIRED_JOBIDS[@]:-}"; do
-        [[ -n "$jid" && "$jid" != "$ORIGINAL_JOBID" ]] || continue
-        # docker is independent of SLURM, so remove the container before freeing
-        # the node, otherwise it lingers for the next tenant.
-        srun --jobid="$jid" --overlap docker rm -f "$CONTAINER" >/dev/null 2>&1 || true
-        scancel "$jid" 2>/dev/null && sup "released failover allocation $jid (container removed)"
-    done
-}
-
-# Enforce "at most ONE reservation node held by this run at a time" and reap
-# orphans. Every node WE acquire is an `sbatch --job-name=e2e_failover` hold, so
-# all our holds are discoverable by name even across a supervisor restart — which
-# is how a previous supervisor that died mid-failover (e.g. on a provisioning
-# error) can leave a hold pinning a second reservation node. Cancels every
-# e2e_failover hold owned by us EXCEPT $1 (the one to keep) and the user's
-# ORIGINAL_JOBID (never ours to cancel). Containers are removed before the node
-# is freed so they don't linger for the next tenant.
-reap_failover_holds() {
-    local keep="${1:-}" me jid
-    me=$(id -un 2>/dev/null)
-    [[ -z "$me" ]] && return 0
-    while read -r jid; do
-        [[ -z "$jid" ]] && continue
-        [[ "$jid" == "$keep" || "$jid" == "$ORIGINAL_JOBID" ]] && continue
-        sup "reaping stray failover hold $jid (enforcing <=1 reservation node held by this run)"
-        srun --jobid="$jid" --overlap docker rm -f "$CONTAINER" >/dev/null 2>&1 || true
-        scancel "$jid" 2>/dev/null || true
-    done < <(squeue -h -u "$me" -n e2e_failover -o '%i' 2>/dev/null)
-}
-
-# When the user's ORIGINAL reservation job is lost, SLURM typically auto-requeues
-# it back onto a (fresh) reservation node within a couple of minutes. Waiting for
-# that and REUSING it — rather than immediately acquiring a SEPARATE node — is
-# what keeps us at <=1 reservation node (the alternative is the original requeue
-# AND a failover hold both pinning reservation nodes) and skips a redundant
-# acquire+provision. Only meaningful while we are still on the original job.
-wait_for_original_recover() {
-    [[ "$JOBID" != "$ORIGINAL_JOBID" ]] && return 1
-    local waited=0
-    while (( waited < ORIG_RECOVER_WAIT )); do
-        if alloc_healthy "$ORIGINAL_JOBID"; then
-            sup "original job $ORIGINAL_JOBID is RUNNING again (SLURM requeue) after ${waited}s — reusing it (no second node)"
-            return 0
-        fi
-        sup "waiting for original job $ORIGINAL_JOBID to requeue before acquiring a separate node (${waited}s/${ORIG_RECOVER_WAIT}s)…"
-        sleep 15; waited=$((waited + 15))
-    done
-    sup "original job $ORIGINAL_JOBID did not recover within ${ORIG_RECOVER_WAIT}s — acquiring a fresh node"
-    return 1
+# sacct/SLURM states that mean the job is STILL ALIVE (not terminal). If we see
+# one of these after the monitor loop exits, squeue lied (transient) — resume
+# monitoring instead of relaunching (which could spawn a DUPLICATE job).
+is_active_state() {
+    case "$1" in
+        RUNNING|PENDING|CONFIGURING|COMPLETING|REQUEUED|RESIZING|SUSPENDED|REQUEUE_HOLD|REQUEUE_FED|SIGNALING|STAGE_OUT) return 0;;
+        *) return 1;;
+    esac
 }
 
-# Returns 0 (true) if a trainer process is alive in the container. Uses SLURM
-# (srun) when the controller is up, else falls back to a direct SSH probe so a
-# control-plane outage can't make a live trainer look dead.
+# Any trainer process alive on ANY node of the allocation? (cross-node pgrep via
+# overlap srun into each node's container). [g]enerative self-match guard avoids
+# pgrep matching its own command line.
 trainer_alive() {
-    local n
-    # `set -f; pgrep -f [g]enerative...` is the classic self-match guard: the
-    # probe shell's OWN cmdline contains the pattern, so a naive `pgrep -f
-    # generative_recommenders` ALWAYS matches itself and returns >=1 even when
-    # the trainer is dead — which would defeat the stall watchdog and make
-    # ATTACH mode falsely "adopt" a nonexistent trainer. The [g] char-class
-    # matches "generative" in real trainer cmdlines but NOT the literal
-    # "[g]enerative" in the probe's cmdline; `set -f` keeps the bracket from
-    # being glob-expanded (works under both bash -lc wrappers, no quotes).
-    if controller_up; then
-        n=$(cexec "set -f; pgrep -f [g]enerative_recommenders | wc -l" | tr -d ' ')
-    else
-        n=$(dexec "set -f; pgrep -f [g]enerative_recommenders | wc -l" | tr -d ' ')
-    fi
+    local jid="$1" n
+    n=$(timeout 70 srun --jobid="$jid" --overlap --ntasks-per-node=1 bash -c \
+        "docker exec $CONTAINER bash -lc 'set -f; pgrep -f [g]enerative_recommenders | wc -l' 2>/dev/null" 2>/dev/null \
+        | awk '{s+=$1} END{print s+0}')
     [[ "${n:-0}" -gt 0 ]]
 }
 
+# Free GiB on the ckpt volume (NFS is mounted on this head node, so df locally).
+disk_free_gib() {
+    df -BG --output=avail "$CKPT_PATH" 2>/dev/null | tail -1 | tr -dc '0-9'
+}
+
 disk_guard() {
-    # Sweep crash-orphaned partial saves, then check free space.
-    cexec "for d in '$CKPT_PATH'/*.tmp '$CKPT_PATH'/*.old; do [ -e \"\$d\" ] && rm -rf \"\$d\" && echo swept \"\$d\"; done; true"
-    local free_gib
-    free_gib=$(cexec "df -BG --output=avail '$CKPT_PATH' 2>/dev/null | tail -1 | tr -dc '0-9'")
-    free_gib=${free_gib:-0}
-    sup "disk guard: ${free_gib} GiB free on $CKPT_PATH (min ${MIN_FREE_GIB})"
-    if (( free_gib < MIN_FREE_GIB )); then
-        sup "FATAL: insufficient free space (${free_gib} < ${MIN_FREE_GIB} GiB). Aborting."
+    [[ -z "$CKPT_PATH" ]] && return 0
+    local free; free=$(disk_free_gib); free=${free:-0}
+    sup "disk guard: ${free} GiB free on $CKPT_PATH (min ${MIN_FREE_GIB})"
+    if (( free < MIN_FREE_GIB )); then
+        sup "FATAL: insufficient free space (${free} < ${MIN_FREE_GIB} GiB). Aborting."
         return 1
     fi
     return 0
 }
 
-launch() {
-    # Detached launch. The trailing echo appends a definitive exit sentinel to
-    # the log once the trainer returns (clean finish OR crash with nonzero rc).
-    srun --jobid="$JOBID" --overlap docker exec -d "$CONTAINER" bash -lc "
-        cd $REPO &&
-        HSTU_HAMMER_KERNEL=TRITON \
-        MODE=streaming-train-eval \
-        START_TS=$START_TS \
-        NUM_TRAIN_TS=$NUM_TRAIN_TS \
-        EVAL_EACH_WINDOW=1 \
-        EVAL_EVERY_N_WINDOWS=$EVAL_EVERY \
-        CKPT_PATH=$CKPT_PATH \
-        KEEP_LAST_N=$KEEP_LAST_N \
-        CKPT_TIME_INTERVAL_S=$CKPT_TIME_INTERVAL \
-        IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ \
-        NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
-        NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
-        DIE_AT_STEP=$DIE_AT_STEP \
-        TRAIN_SPLIT_PERCENTAGE=$TRAIN_SPLIT_PERCENTAGE \
-        SPLIT_SALT=$SPLIT_SALT \
-        EVAL_HOLDOUT_TS=$EVAL_HOLDOUT_TS \
-        EVAL_HOLDOUT_NUM_WINDOWS=$EVAL_HOLDOUT_NUM_WINDOWS \
-        METRIC_LOG_FREQ=50 \
-        RUN_NAME=$RUN_NAME \
-        TENSORBOARD_LOG_PATH=/apps/chcai/tb/$RUN_NAME/ \
-        LOG=$LOG \
-        bash scripts/launch_slurm.sh;
-        echo \"E2E_RUN_EXIT=\$? \$(date '+%F %T')\" >> $LOG
-    "
+# Resubmit the run; resumes from latest checkpoint. APPEND_LOG=1 preserves the
+# metrics log. Echoes the new jobid.
+resubmit() {
+    local out newjid
+    out=$(APPEND_LOG=1 bash "$SUBMIT_SCRIPT" 2>&1)
+    newjid=$(echo "$out" | grep -oE 'Submitted batch job [0-9]+' | grep -oE '[0-9]+' | head -1)
+    echo "$out" | sed 's/^/    /' >> "$SUP_LOG"
+    echo "$newjid"
 }
 
-# Returns the exit code from the most recent E2E_RUN_EXIT sentinel APPENDED
-# since `since_marker` bytes, or empty if none yet.
-last_exit_since() {
-    local since_line="$1"
-    cexec "tail -n +$since_line '$LOG' 2>/dev/null | grep -aoE 'E2E_RUN_EXIT=[0-9]+' | tail -1 | cut -d= -f2"
+# Submit with retries+backoff. A transient NFS / control-plane error (e.g.
+# "sbatch: error: ... I/O error writing script/environment to file") must NOT
+# kill the supervisor — it leaves runs unsupervised / unlaunched. Echoes a jobid
+# on success, or empty after all retries.
+submit_retry() {
+    local cand sub_try
+    for sub_try in $(seq 1 12); do
+        cand=$(resubmit)
+        if [[ "$cand" =~ ^[0-9]+$ ]]; then echo "$cand"; return 0; fi
+        sup "submit attempt $sub_try/12 failed (transient sbatch/NFS error) — backing off."
+        sleep $(( sub_try < 5 ? 30 : 120 ))
+    done
+    return 1
 }
 
 sup "=== streaming e2e supervisor start ==="
-sup "jobid=$JOBID container=$CONTAINER repo=$REPO"
-sup "start_ts=$START_TS num_train_ts=$NUM_TRAIN_TS eval_every=$EVAL_EVERY"
-sup "ckpt_path=$CKPT_PATH keep_last_n=$KEEP_LAST_N ckpt_time_interval=${CKPT_TIME_INTERVAL}s in_window_freq=$IN_WINDOW_FREQ"
-sup "log=$LOG num_train_batches=$NUM_TRAIN_BATCHES die_at_step=$DIE_AT_STEP max_relaunch=$MAX_RELAUNCH"
-sup "failover: allow=$ALLOW_FAILOVER partition=$PARTITION reservation=${RESERVATION:-<none>} alloc_time=$ALLOC_TIME"
-
-# Reap any failover hold(s) leaked by a PREVIOUS supervisor that died mid-failover
-# (e.g. exited on a provisioning error before release_acquired could run). Without
-# this, such an orphan keeps pinning a second reservation node indefinitely.
-reap_failover_holds ""
-
-cexec "mkdir -p '$CKPT_PATH' '/apps/chcai/tb/$RUN_NAME'"
-# Initialize this run's metrics log ONCE. launch_slurm.sh (worker) appends (tee -a),
-# so every relaunch attempt accumulates into this single file — the full-run
-# NE/AUC history survives crashes and node failover instead of being truncated
-# on each relaunch. (Starting the supervisor = starting a fresh run.) In ATTACH
-# mode we are adopting an already-running trainer, so we KEEP its existing log.
-if [[ "$ATTACH" == "1" ]]; then
-    sup "ATTACH mode: adopting existing run — keeping metrics log intact: $LOG"
-else
-    cexec ": > '$LOG'"
-    sup "metrics log initialized (relaunch-append): $LOG"
-fi
-sup "tensorboard (NFS): /apps/chcai/tb/$RUN_NAME/"
+sup "run=$RUN_NAME submit=$SUBMIT_SCRIPT log=$LOG ckpt=$CKPT_PATH"
+sup "max_relaunch=$MAX_RELAUNCH min_free_gib=$MIN_FREE_GIB stall_s=$STALL_S poll_s=$POLL_S"
 
 attempt=0
-while (( attempt < MAX_RELAUNCH )); do
-    attempt=$((attempt + 1))
-    sup "--- attempt $attempt/$MAX_RELAUNCH ---"
-
-    # Make sure we have a live, container-ready node (failover + provision if the
-    # current allocation/node has gone away).
-    if ! ensure_ready; then
-        sup "FATAL: could not secure a healthy allocation (failover failed)."
-        exit 4
-    fi
-    refresh_node >/dev/null   # cache LAST_NODE for direct probes during outages
-
-    # ATTACH (first attempt only): if a trainer is already running for this run,
-    # adopt it in place — DON'T disk-guard (its sweep would delete an in-flight
-    # .tmp save), DON'T cleanup_workers (would kill it), DON'T launch. Just begin
-    # monitoring. Any subsequent relaunch is a normal launch from the checkpoint.
-    adopt=0
-    if [[ "$ATTACH" == "1" ]] && trainer_alive; then
-        adopt=1; ATTACH=0
-        sup "ATTACH mode: trainer already alive on ${LAST_NODE:-node} — monitoring in place (no relaunch/kill/sweep)."
-    fi
-
-    if (( adopt )); then
-        # Mark current end of log so we only read sentinels produced from here on.
-        start_line=$(cexec "wc -l < '$LOG' 2>/dev/null" | tr -d ' '); start_line=${start_line:-0}
-        start_line=$((start_line + 1))
-        sup "monitoring adopted run (reading sentinels from log line $start_line)"
-    else
-        if ! disk_guard; then exit 3; fi
-        cleanup_workers
-        # Mark current end of log so we only read sentinels produced by THIS attempt.
-        start_line=$(cexec "wc -l < '$LOG' 2>/dev/null" | tr -d ' '); start_line=${start_line:-0}
-        start_line=$((start_line + 1))
-        sup "launching (reading sentinels from log line $start_line)"
-        launch
-        sleep 15  # let docker exec spin up the process
-    fi
-
-    # Monitor loop.
-    last_size=0
-    stall_accum=0
-    hb=0
-    while true; do
-        # Node/allocation watchdog: if the node we're on goes down/drains or the
-        # job ends, bail out of the monitor — the next attempt's ensure_ready
-        # will fail over to a fresh node and resume from the latest checkpoint.
-        hb=$((hb + 1))
-        if (( hb % 4 == 0 )) && ! alloc_healthy "$JOBID"; then
-            if ! controller_up; then
-                # Control plane unreachable != node down. If the trainer is still
-                # alive on the node (direct SSH probe), this is a transient blip —
-                # keep monitoring rather than tearing down a healthy run.
-                if trainer_alive; then
-                    sup "control plane unreachable but trainer still alive on ${LAST_NODE:-node} — transient; continuing to monitor."
-                else
-                    sup "control plane unreachable AND trainer absent on ${LAST_NODE:-node} — relaunching with failover."
-                    break
+if [[ -z "$JOBID" ]]; then
+    if ! disk_guard; then exit 3; fi
+    sup "no --jobid given; submitting a fresh job"
+    JOBID=$(submit_retry)
+    [[ "$JOBID" =~ ^[0-9]+$ ]] || { sup "FATAL: could not submit after 12 retries — aborting."; exit 1; }
+fi
+attempt=1
+sup "supervising jobid=$JOBID (attempt $attempt/$MAX_RELAUNCH)"
+
+while (( attempt <= MAX_RELAUNCH )); do
+    # --- wait for the job to be schedulable / running ---
+    wait_pend=0
+    while job_active "$JOBID" && [[ "$(job_state "$JOBID")" != "RUNNING" ]]; do
+        (( wait_pend % 10 == 0 )) && sup "job $JOBID state=$(job_state "$JOBID") — waiting to run…"
+        sleep "$POLL_S"; wait_pend=$((wait_pend + 1))
+    done
+    [[ "$(job_state "$JOBID")" == "RUNNING" ]] && sup "job $JOBID RUNNING on $(squeue -h -j "$JOBID" -o '%N' 2>/dev/null | head -1)"
+
+    # --- monitor loop ---
+    last_size=0; stall_accum=0; hb=0; self_cancelled=0
+    while job_active "$JOBID"; do
+        st=$(job_state "$JOBID")
+        if [[ "$st" == "RUNNING" ]]; then
+            cur_size=$(stat -c %s "$LOG" 2>/dev/null || echo 0)
+            if [[ "$cur_size" == "$last_size" ]]; then
+                # frozen log: only count as a stall if no trainer proc is alive
+                # (a long eval / blocking save keeps the process up -> not a stall)
+                hb=$((hb + 1))
+                if (( hb % 4 == 0 )); then
+                    if trainer_alive "$JOBID"; then
+                        stall_accum=0
+                    else
+                        stall_accum=$((stall_accum + POLL_S * 4))
+                        sup "log frozen + no trainer alive (${stall_accum}s/${STALL_S}s)"
+                        if (( stall_accum >= STALL_S )); then
+                            sup "STALL: hung run — scancel $JOBID and relaunch."
+                            self_cancelled=1
+                            scancel "$JOBID" 2>/dev/null || true
+                            sleep 20
+                            break
+                        fi
+                    fi
                 fi
             else
-                sup "allocation $JOBID lost mid-run (node down/job ended) — relaunching with failover."
-                break
+                stall_accum=0; last_size=$cur_size
             fi
         fi
+        sleep "$POLL_S"
+    done
 
-        rc=$(last_exit_since "$start_line")
-        if [[ -n "$rc" ]]; then
-            if [[ "$rc" == "0" ]]; then
-                sup "RUN COMPLETED CLEANLY (E2E_RUN_EXIT=0) on attempt $attempt."
-                cleanup_workers
-                final_ckpts=$(cexec "ls '$CKPT_PATH' 2>/dev/null | grep -E '^[0-9]+$' | tr '\n' ' '")
-                sup "final checkpoints retained: ${final_ckpts:-<none>}"
-                release_acquired
-                sup "=== streaming e2e supervisor done (success) ==="
+    # --- job has left the queue (or we scancel'd it): decide ---
+    sleep 5
+    final=$(job_final "$JOBID")
+    state=$(echo "$final" | awk '{print $1}')
+    code=$(echo "$final" | awk '{print $2}')
+    sup "job $JOBID ended: state='${state:-?}' exit='${code:-?}'"
+
+    # The monitor loop only exits when squeue has been empty across several
+    # confirming reads. If accounting STILL reports an active state, the job is
+    # actually alive (squeue/control-plane blip) — resume monitoring rather than
+    # relaunching, which would create a duplicate job.
+    if is_active_state "$state"; then
+        sup "sacct reports still-active state '$state' — transient squeue blip; resuming monitoring (NOT relaunching)."
+        sleep "$POLL_S"
+        continue
+    fi
+
+    case "$state" in
+        COMPLETED)
+            if [[ "$code" == "0:0" ]]; then
+                sup "RUN COMPLETED CLEANLY on attempt $attempt."
+                sup "=== supervisor done (success) ==="
                 exit 0
             fi
-            sup "trainer exited nonzero (E2E_RUN_EXIT=$rc). Will relaunch from latest checkpoint."
-            break
-        fi
-
-        # Stall watchdog: track log growth; if frozen and no trainer alive, die.
-        cur_size=$(cexec "wc -c < '$LOG' 2>/dev/null" | tr -d ' '); cur_size=${cur_size:-0}
-        if [[ "$cur_size" == "$last_size" ]]; then
-            if trainer_alive; then
-                stall_accum=0  # alive but quiet (e.g. long save / eval) — ok
+            sup "COMPLETED but nonzero exit ($code) — relaunching."
+            ;;
+        CANCELLED*)
+            if (( self_cancelled )); then
+                sup "job CANCELLED by our own stall recovery — relaunching from latest checkpoint."
             else
-                stall_accum=$((stall_accum + POLL_S))
-                if (( stall_accum >= STALL_S )); then
-                    sup "STALL: log frozen ${stall_accum}s and no trainer alive — silent death. Relaunching."
-                    break
-                fi
+                sup "job CANCELLED (user/admin intent) — NOT resubmitting. Stopping supervisor."
+                sup "=== supervisor done (cancelled) ==="
+                exit 0
             fi
-        else
-            stall_accum=0
-            last_size=$cur_size
-        fi
-        sleep "$POLL_S"
-    done
+            ;;
+        FAILED|NODE_FAIL|TIMEOUT|OUT_OF_MEMORY|BOOT_FAIL|PREEMPTED|"")
+            sup "failure state '${state:-unknown}' — will relaunch from latest checkpoint."
+            ;;
+        *)
+            sup "unrecognized terminal state '${state}' — relaunching to be safe."
+            ;;
+    esac
 
-    cleanup_workers
-    sleep $(( attempt < 5 ? 20 : 60 ))  # small backoff
+    if (( attempt >= MAX_RELAUNCH )); then break; fi
+    if ! disk_guard; then exit 3; fi
+    sleep $(( attempt < 5 ? 20 : 60 ))   # small backoff
+    # Resubmit with retries. A transient NFS / control-plane error (e.g.
+    # "sbatch: error: Batch job submission failed: I/O error writing
+    # script/environment to file") must NOT kill the supervisor — that once
+    # left a live run permanently unsupervised. Retry with backoff first.
+    JOBID=$(submit_retry)
+    if ! [[ "$JOBID" =~ ^[0-9]+$ ]]; then
+        sup "FATAL: resubmit failed after 12 retries — aborting."; exit 1
+    fi
+    attempt=$((attempt + 1))
+    sup "relaunched as jobid=$JOBID (attempt $attempt/$MAX_RELAUNCH)"
 done
 
 sup "FATAL: exhausted MAX_RELAUNCH=$MAX_RELAUNCH without completion."
-sup "=== streaming e2e supervisor done (failure) ==="
+sup "=== supervisor done (failure) ==="
 exit 1

From be892d8209e020b7141e4c0fcdffdc06bc60147a Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 19 Jun 2026 00:22:01 +0000
Subject: [PATCH 070/113] dlrmv4: add non-SLURM local launcher + self-healing
 supervisor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

launch_local.sh: single-host, scheduler-free analog of launch_slurm.sh's
worker phase — same train_ranker entry point / yambda_5b.gin config, smoke
or full run on a GPU host with no SLURM/docker/RDMA overlay.

run_streaming_e2e_local.sh: local analog of run_streaming_e2e.sh. Backgrounds
a per-run submit script (so wait $PID yields the trainer's real exit code),
relaunches from the latest checkpoint on crash/nonzero-exit/hang, with a
hang watchdog (frozen log + no trainer proc) and a pre-launch disk guard.
cleanup_container waits for GPU HBM to actually drain before relaunching so an
OOM/crash can't cascade into a dirty-GPU OOM loop.
---
 recommendation_v4/scripts/launch_local.sh     | 154 ++++++++++++++
 .../scripts/run_streaming_e2e_local.sh        | 200 ++++++++++++++++++
 2 files changed, 354 insertions(+)
 create mode 100755 recommendation_v4/scripts/launch_local.sh
 create mode 100755 recommendation_v4/scripts/run_streaming_e2e_local.sh

diff --git a/recommendation_v4/scripts/launch_local.sh b/recommendation_v4/scripts/launch_local.sh
new file mode 100755
index 000000000..9a69a36e5
--- /dev/null
+++ b/recommendation_v4/scripts/launch_local.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+# =============================================================================
+# launch_local.sh — single-host, NON-SLURM launcher for the yambda-5b trainer.
+#
+# This is the SLURM-free analog of scripts/launch_slurm.sh's `worker` phase:
+# it sets the single-node distributed topology + sane env and invokes the SAME
+# entry point (`train_ranker.py --dataset yambda-5b`) reading the SAME
+# train/gin/yambda_5b.gin config. No scheduler, no docker, no RDMA overlay —
+# everything runs directly on this host against an already-prepared dataset.
+#
+# Use it to:
+#   * Smoke-test the launch path on a single GPU box (SMOKE=1, the default —
+#     a few train/eval batches of one streaming window), or
+#   * Run the full gin-default workload (SMOKE=0 — consumes whole windows).
+#
+# PREREQUISITES
+#   1) Data prepared (run once, CPU-only — no GPU needed):
+#        python generative_recommenders/dlrm_v3/preprocess_public_data.py \
+#            --dataset yambda-5b --data-path "$DLRM_DATA_PATH"
+#      producing  $DLRM_DATA_PATH/processed_5b/{train_sessions.parquet,...}
+#             and $DLRM_DATA_PATH/shared_metadata/{artist,album}_item_mapping.parquet
+#   2) The train_recipe GPU stack importable by $PYTHON (see docs/training_recipe.md):
+#        torch (rocm or cuda build), fbgemm_gpu, torchrec, polars-u64-idx,
+#        gin-config, xxhash, pandas, tensorboard, ...
+#      This box must have visible GPUs (the trainer shards embeddings onto HBM).
+#
+# USAGE
+#   # smoke (default): one window, 20 train + 10 eval batches
+#   DLRM_DATA_PATH=/home/chcai/dlrm_data bash scripts/launch_local.sh
+#
+#   # full gin-default run (whole windows; long)
+#   SMOKE=0 DLRM_DATA_PATH=/home/chcai/dlrm_data bash scripts/launch_local.sh
+#
+#   # restrict to 2 GPUs, custom log, plain (non-streaming) train-eval
+#   GPUS_PER_NODE=2 MODE=train-eval LOG=/tmp/y.log bash scripts/launch_local.sh
+#
+# Every knob below is env-overridable; defaults reproduce launch_slurm.sh's
+# single-node smoke path so a local run matches the known-good cluster path.
+# =============================================================================
+set -uo pipefail
+
+REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
+cd "$REPO_ROOT"
+
+# ---- interpreter ------------------------------------------------------------
+# Default to the venv created for data prep if present, else system python3.
+# Override with PYTHON=/path/to/python (e.g. the in-container recipe python).
+DEFAULT_PY=/home/chcai/dlrmv4_venv/bin/python
+PYTHON=${PYTHON:-$([ -x "$DEFAULT_PY" ] && echo "$DEFAULT_PY" || echo python3)}
+
+# ---- dataset / data path ----------------------------------------------------
+DATASET=${DATASET:-yambda-5b}
+MODE=${MODE:-streaming-train-eval}
+# Mirrors the yambda_5b.gin default ("/apps/chcai/dlrm_data"); point at wherever
+# preprocess_public_data.py wrote processed_5b/ + shared_metadata/.
+export DLRM_DATA_PATH=${DLRM_DATA_PATH:-/home/chcai/dlrm_data}
+
+LOG=${LOG:-$REPO_ROOT/yambda_local.$(date +%Y%m%d_%H%M%S).log}
+
+# ---- single-node distributed topology --------------------------------------
+# train_ranker reads these from the env (see train_ranker.main): it spawns
+# GPUS_PER_NODE ranks via torch.multiprocessing on THIS host. localhost
+# rendezvous; empty MASTER_PORT => train_ranker picks a free port.
+export NNODES=${NNODES:-1}
+export NODE_RANK=${NODE_RANK:-0}
+export MASTER_ADDR=${MASTER_ADDR:-localhost}
+export MASTER_PORT=${MASTER_PORT:-}
+# GPUS_PER_NODE: 0/unset => train_ranker auto-detects torch.cuda.device_count().
+export GPUS_PER_NODE=${GPUS_PER_NODE:-0}
+
+# ---- runtime env (matches launch_slurm.sh worker defaults) ------------------
+export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-TRITON}
+export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
+export PYTHONPATH="$REPO_ROOT:${PYTHONPATH:-}"
+# Single-node RCCL bootstrap: all ranks rendezvous over localhost, so pin the
+# loopback NIC. Left to auto-detect, RCCL can grab a non-routable per-GPU RoCE
+# NIC and hang/"No route to host" at init (same failure launch_slurm.sh pins
+# fenic0 to avoid). Override NCCL_SOCKET_IFNAME for a routable multi-host setup.
+export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-lo}
+export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
+
+# ---- smoke caps -------------------------------------------------------------
+# SMOKE=1 (default): apply small per-window batch caps so a launch finishes in
+# minutes (validates the path end-to-end). SMOKE=0: leave the gin defaults
+# untouched (consume full windows — the real workload).
+SMOKE=${SMOKE:-1}
+if [ "$SMOKE" = "1" ]; then
+  export START_TS=${START_TS:-150}
+  export NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
+  export NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
+  export NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
+  export EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
+  export METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
+  # Smaller per-sample shape keeps the smoke run light; drop these to use the
+  # gin defaults (4086/4096). Reuse an existing hstu_cache_L<N>/ if present.
+  export BATCH_SIZE=${BATCH_SIZE:-32}
+fi
+
+mkdir -p "$(dirname "$LOG")"
+{
+  echo "[$(date)] launch_local: dataset=$DATASET mode=$MODE smoke=$SMOKE"
+  echo "[$(date)] PYTHON=$PYTHON"
+  echo "[$(date)] DLRM_DATA_PATH=$DLRM_DATA_PATH"
+  echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node(req)=$GPUS_PER_NODE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}"
+} | tee -a "$LOG"
+
+# ---- preflight: data present? ----------------------------------------------
+SUFFIX=${DATASET#yambda-}
+PROCESSED="$DLRM_DATA_PATH/processed_${SUFFIX}/train_sessions.parquet"
+META="$DLRM_DATA_PATH/shared_metadata/artist_item_mapping.parquet"
+if [ "$DATASET" = "yambda-5b" ] && { [ ! -f "$PROCESSED" ] || [ ! -f "$META" ]; }; then
+  echo "[$(date)] ERROR: prepared data not found." | tee -a "$LOG"
+  echo "  expected: $PROCESSED" | tee -a "$LOG"
+  echo "        and: $META" | tee -a "$LOG"
+  echo "  run preprocessing first:" | tee -a "$LOG"
+  echo "    $PYTHON generative_recommenders/dlrm_v3/preprocess_public_data.py --dataset $DATASET --data-path $DLRM_DATA_PATH" | tee -a "$LOG"
+  exit 1
+fi
+
+# ---- preflight: GPU stack importable + GPUs visible? ------------------------
+echo "[$(date)] preflight: checking torch / fbgemm_gpu / torchrec + GPU count" | tee -a "$LOG"
+"$PYTHON" - <<'PY' 2>&1 | tee -a "$LOG"
+import sys
+missing = []
+for m in ("torch", "fbgemm_gpu", "torchrec", "polars", "gin", "xxhash"):
+    try:
+        __import__(m)
+    except Exception as e:
+        missing.append(f"{m} ({e.__class__.__name__})")
+if missing:
+    print("PREFLIGHT FAIL: missing/broken imports: " + ", ".join(missing))
+    print("Install the train_recipe GPU stack (see docs/training_recipe.md).")
+    sys.exit(3)
+import torch
+n = torch.cuda.device_count()
+print(f"imports OK, torch {torch.__version__}, cuda/hip available={torch.cuda.is_available()}, {n} GPU(s)")
+if n == 0:
+    print("PREFLIGHT FAIL: no GPUs visible — the HSTU trainer shards embeddings "
+          "onto GPU HBM and cannot run CPU-only. Launch on a GPU host.")
+    sys.exit(4)
+PY
+pf=${PIPESTATUS[0]}
+if [ "$pf" -ne 0 ]; then
+  echo "[$(date)] preflight failed (rc=$pf) — not launching trainer." | tee -a "$LOG"
+  exit "$pf"
+fi
+
+# ---- launch -----------------------------------------------------------------
+echo "[$(date)] launching train_ranker ($DATASET, mode=$MODE)" | tee -a "$LOG"
+"$PYTHON" -m generative_recommenders.dlrm_v3.train.train_ranker \
+    --dataset "$DATASET" --mode "$MODE" 2>&1 | tee -a "$LOG"
+rc=${PIPESTATUS[0]}
+echo "[$(date)] launch_local finished rc=$rc" | tee -a "$LOG"
+exit "$rc"
diff --git a/recommendation_v4/scripts/run_streaming_e2e_local.sh b/recommendation_v4/scripts/run_streaming_e2e_local.sh
new file mode 100755
index 000000000..85a817c8c
--- /dev/null
+++ b/recommendation_v4/scripts/run_streaming_e2e_local.sh
@@ -0,0 +1,200 @@
+#!/bin/bash
+# =============================================================================
+# run_streaming_e2e_local.sh — self-healing supervisor for a SINGLE-HOST
+#   (NON-SLURM) yambda-5b streaming train+eval run. Local analog of
+#   scripts/run_streaming_e2e.sh (the SLURM/sbatch supervisor).
+#
+# WHAT IT SUPERVISES
+#   The "job" is one foreground run of --submit-script (default
+#   scripts/launch_e2e_local.sh), which `docker exec`s the trainer in the
+#   container. The supervisor runs that submit-script in the BACKGROUND so:
+#     * its host PID == liveness  (kill -0 / hang watchdog), and
+#     * `wait $PID` == the trainer's EXIT CODE (success vs. failure).
+#   On crash / nonzero-exit / hang it RELAUNCHES the same submit-script (same
+#   $CKPT_PATH/$LOG → resumes from the latest checkpoint), bounded by
+#   --max-relaunch. This is the SLURM supervisor's sacct/squeue/scancel control
+#   plane re-expressed with a local process + `docker exec` lifecycle.
+#
+# WHAT IT DETECTS (poll every --poll-s)
+#   * submit-script process exits -> read its exit code:
+#       0      => run finished cleanly (success)
+#       != 0   => crash/OOM/die_at_step(42)/etc. => relaunch from latest ckpt
+#   * hang watchdog: process alive but $LOG frozen >= --stall-s AND no trainer
+#       process alive in the container (pgrep via docker exec) => kill + pkill in
+#       container + relaunch. A long eval / blocking ckpt save keeps the trainer
+#       process up, so it is NOT counted as a stall.
+#   * disk guard before each (re)launch: require --min-free-gib on the ckpt vol.
+#
+# USAGE
+#   nohup bash scripts/run_streaming_e2e_local.sh \
+#       --submit-script scripts/launch_e2e_local.sh \
+#       --log       /home/chcai/yambda_5b_e2e/<run>/<run>.log \
+#       --ckpt-path /home/chcai/yambda_5b_e2e/<run>/ckpts \
+#       --run-name  <run> \
+#       > /home/chcai/yambda_5b_e2e/<run>/<run>.supervisor.console.log 2>&1 &
+#
+#   Per-run hyperparameters live in the --submit-script's env defaults (or are
+#   exported before invoking this supervisor), not here.
+#
+# EXIT CODES
+#   0  run completed cleanly
+#   1  exhausted --max-relaunch without completion (or launch failed)
+#   3  disk guard tripped
+# =============================================================================
+set -uo pipefail
+
+SUBMIT_SCRIPT="scripts/launch_e2e_local.sh"
+LOG=""
+CKPT_PATH=""
+RUN_NAME="yambda_5b_e2e_local"
+CONTAINER=${CONTAINER:-yambda_local}
+DOCKER=${DOCKER:-sudo docker}
+MAX_RELAUNCH=50
+MIN_FREE_GIB=700        # one full DMP ckpt (~600 GB) + headroom for the atomic
+                        # .tmp written beside the retained one during a save.
+STALL_S=2400            # 40 min frozen-log + no-trainer-proc => hung.
+POLL_S=30
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --submit-script) SUBMIT_SCRIPT="$2"; shift 2;;
+        --log) LOG="$2"; shift 2;;
+        --ckpt-path) CKPT_PATH="$2"; shift 2;;
+        --run-name) RUN_NAME="$2"; shift 2;;
+        --container) CONTAINER="$2"; shift 2;;
+        --docker) DOCKER="$2"; shift 2;;
+        --max-relaunch) MAX_RELAUNCH="$2"; shift 2;;
+        --min-free-gib) MIN_FREE_GIB="$2"; shift 2;;
+        --stall-s) STALL_S="$2"; shift 2;;
+        --poll-s) POLL_S="$2"; shift 2;;
+        *) echo "Unknown arg: $1"; exit 1;;
+    esac
+done
+
+[[ -n "$SUBMIT_SCRIPT" && -f "$SUBMIT_SCRIPT" ]] || { echo "FATAL: --submit-script required and must exist ($SUBMIT_SCRIPT)"; exit 1; }
+[[ -n "$LOG" ]] || { echo "FATAL: --log required"; exit 1; }
+
+SUP_LOG="${LOG%.log}.supervisor.log"
+# Create the log + ckpt dirs up front so the disk guard's df has a real path to
+# stat (df on a nonexistent dir returns 0 avail -> false "disk full" abort).
+mkdir -p "$(dirname "$SUP_LOG")"
+[[ -n "$CKPT_PATH" ]] && mkdir -p "$CKPT_PATH"
+sup() { echo "[$(date '+%F %T')] [supervisor] $*" | tee -a "$SUP_LOG"; }
+
+# Any trainer process alive in the container? [g]enerative self-match guard
+# avoids pgrep matching its own command line.
+trainer_alive() {
+    local n
+    n=$($DOCKER exec "$CONTAINER" bash -lc 'set -f; pgrep -f "[g]enerative_recommenders" | wc -l' 2>/dev/null | tr -dc '0-9')
+    [[ "${n:-0}" -gt 0 ]]
+}
+
+# Hard-kill any trainer processes left in the container AND wait for GPU HBM to
+# actually drain before returning. A rank stuck in a HIP/RCCL collective sits in
+# uninterruptible D-state and keeps its multi-hundred-GB embedding shard resident
+# for many seconds after SIGKILL; relaunching before that frees makes the next
+# attempt OOM on dirty GPUs (an OOM-crash -> dirty-GPU -> OOM cascade). So kill,
+# then poll rocm-smi until every GPU is <5 GB (or give up after ~120s).
+cleanup_container() {
+    $DOCKER exec "$CONTAINER" bash -lc \
+        'pkill -9 -f generative_recommenders 2>/dev/null; pkill -9 -f spawn_main 2>/dev/null; pkill -9 -f resource_tracker 2>/dev/null; true' \
+        2>/dev/null || true
+    local k busy
+    for k in $(seq 1 24); do                       # up to ~120s
+        busy=$($DOCKER exec "$CONTAINER" bash -lc \
+            "rocm-smi --showmeminfo vram 2>/dev/null | awk '/Used/{if (\$NF+0 > 5e9) c++} END{print c+0}'" \
+            2>/dev/null | tr -dc '0-9')
+        busy=${busy:-0}
+        [[ "$busy" == "0" ]] && return 0
+        sup "waiting for GPU HBM to drain ($busy GPU(s) still >5GB)…"
+        $DOCKER exec "$CONTAINER" bash -lc 'pkill -9 -f spawn_main 2>/dev/null; true' 2>/dev/null || true
+        sleep 5
+    done
+    sup "WARNING: GPUs still show residual HBM after 120s — launching anyway."
+    return 0
+}
+
+disk_free_gib() { df -BG --output=avail "$CKPT_PATH" 2>/dev/null | tail -1 | tr -dc '0-9'; }
+
+disk_guard() {
+    [[ -z "$CKPT_PATH" ]] && return 0
+    local free; free=$(disk_free_gib); free=${free:-0}
+    sup "disk guard: ${free} GiB free on $CKPT_PATH (min ${MIN_FREE_GIB})"
+    if (( free < MIN_FREE_GIB )); then
+        sup "FATAL: insufficient free space (${free} < ${MIN_FREE_GIB} GiB). Aborting."
+        return 1
+    fi
+    return 0
+}
+
+# Run the submit-script in the FOREGROUND (its exit status == the trainer's).
+# APPEND_LOG=1 preserves the metrics log across relaunches. This is invoked as
+# `launch & PID=$!` from the main loop so the backgrounded copy is a DIRECT child
+# of this shell — otherwise `wait $PID` can't reap it and always returns 127,
+# making every clean completion look like a failure (infinite relaunch loop).
+launch() {
+    APPEND_LOG=1 CONTAINER="$CONTAINER" DOCKER="$DOCKER" \
+        RUN_NAME="$RUN_NAME" LOG="$LOG" CKPT_PATH="$CKPT_PATH" \
+        bash "$SUBMIT_SCRIPT" >>"$SUP_LOG" 2>&1
+}
+
+sup "=== streaming e2e LOCAL supervisor start ==="
+sup "run=$RUN_NAME submit=$SUBMIT_SCRIPT log=$LOG ckpt=$CKPT_PATH container=$CONTAINER"
+sup "max_relaunch=$MAX_RELAUNCH min_free_gib=$MIN_FREE_GIB stall_s=$STALL_S poll_s=$POLL_S"
+
+attempt=1
+while (( attempt <= MAX_RELAUNCH )); do
+    if ! disk_guard; then exit 3; fi
+    sup "launching attempt $attempt/$MAX_RELAUNCH"
+    cleanup_container                    # ensure no stragglers from a prior attempt
+    launch & PID=$!                      # direct child => wait $PID reaps the real rc
+    sup "submit-script running as host pid=$PID"
+
+    # --- monitor loop ---
+    last_size=0; stall_accum=0; hb=0; hung=0
+    while kill -0 "$PID" 2>/dev/null; do
+        cur_size=$(stat -c %s "$LOG" 2>/dev/null || echo 0)
+        if [[ "$cur_size" == "$last_size" ]]; then
+            hb=$((hb + 1))
+            # Re-check liveness only every 4 polls (cheap docker exec amortized).
+            if (( hb % 4 == 0 )); then
+                if trainer_alive; then
+                    stall_accum=0
+                else
+                    stall_accum=$((stall_accum + POLL_S * 4))
+                    sup "log frozen + no trainer alive (${stall_accum}s/${STALL_S}s)"
+                    if (( stall_accum >= STALL_S )); then
+                        sup "STALL: hung run — killing pid=$PID + container trainer procs, will relaunch."
+                        hung=1
+                        kill -9 "$PID" 2>/dev/null || true
+                        cleanup_container
+                        break
+                    fi
+                fi
+            fi
+        else
+            stall_accum=0; last_size=$cur_size
+        fi
+        sleep "$POLL_S"
+    done
+
+    # --- the submit-script has exited (or we killed it): decide ---
+    wait "$PID" 2>/dev/null; rc=$?
+    if (( hung )); then
+        sup "attempt $attempt ended via STALL recovery (rc=$rc) — relaunching from latest checkpoint."
+    elif (( rc == 0 )); then
+        sup "RUN COMPLETED CLEANLY on attempt $attempt."
+        sup "=== supervisor done (success) ==="
+        exit 0
+    else
+        sup "attempt $attempt exited rc=$rc (crash/OOM/die_at_step) — relaunching from latest checkpoint."
+    fi
+
+    if (( attempt >= MAX_RELAUNCH )); then break; fi
+    sleep $(( attempt < 5 ? 20 : 60 ))   # small backoff
+    attempt=$((attempt + 1))
+done
+
+sup "FATAL: exhausted MAX_RELAUNCH=$MAX_RELAUNCH without completion."
+sup "=== supervisor done (failure) ==="
+exit 1

From 7f73ac7b0dffe0c01c100ac7083fe4c26b0081c1 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 19 Jun 2026 01:42:02 -0500
Subject: [PATCH 071/113] dlrmv4: set default-PG timeout to TIMEOUT (1800s) to
 survive checkpoint skew

The checkpoint DCP collectives run on the default process group created by
init_process_group, which had no explicit timeout and thus used NCCL's stock
600s watchdog. The 560GB sparse-embedding checkpoint is written to shared NFS
with a badly imbalanced sharding plan (per-rank shards ~37GB..~95GB), so the
fastest rank can wait >600s in the post-write allgather/barrier for the slowest
rank, tripping the watchdog and SIGABRTing an otherwise-healthy job (observed
on 3 nodes, always rank 7). Pass timeout=timedelta(seconds=TIMEOUT) so the
default PG tolerates the skew, matching the secondary new_group.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../generative_recommenders/dlrm_v3/train/utils.py  | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index aaed8c2a0..113841a3c 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -123,9 +123,20 @@ def setup(
     # early to be gin-configurable and is redundant with that call.
 
     # initialize the process group
+    #
+    # The default PG timeout must match TIMEOUT (not the 600s NCCL default):
+    # checkpoint saves go through DCP collectives on *this* default PG, and the
+    # 560GB sparse-embedding write is both slow on shared NFS and badly skewed
+    # across ranks (shards range ~37GB..~95GB), so the fastest rank can sit in
+    # the post-write allgather/barrier well past 600s waiting for the slowest
+    # rank. The stock 600s watchdog then SIGABRTs an otherwise-healthy job.
     if not dist.is_initialized():
         dist.init_process_group(
-            "nccl", rank=rank, world_size=world_size, device_id=device
+            "nccl",
+            rank=rank,
+            world_size=world_size,
+            device_id=device,
+            timeout=timedelta(seconds=TIMEOUT),
         )
 
     pg = dist.new_group(

From 09b4204589fc5f13e8944256ec0d5bde1b5ac8dc Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 22 Jun 2026 00:57:40 -0500
Subject: [PATCH 072/113] dlrmv4: optional gradient clipping for the streaming
 path ($GRAD_CLIP_NORM)

Add env-configurable global-norm gradient clipping to streaming_train_eval_loop,
applied to dense params after backward() and before optimizer.step() (sparse
tables use a fused optimizer and are unaffected, matching the non-streaming
path's clip_grad_norm_). Wired via gin to $GRAD_CLIP_NORM and forwarded into the
container by launch_slurm.sh. Default 0.0 = OFF, so existing streaming runs are
unchanged. Eliminates the window-131 eval-AUC dip at LR=1e-5.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           |  7 +++++++
 .../dlrm_v3/train/utils.py                    | 19 +++++++++++++++++++
 recommendation_v4/scripts/launch_slurm.sh     |  1 +
 3 files changed, 27 insertions(+)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index f18c866a2..49870a6b4 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -64,6 +64,13 @@ sparse_optimizer_factory_and_class.weight_decay = 0
 sparse_optimizer_factory_and_class.eps = 1e-8
 sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
 
+# Gradient clipping for the STREAMING path (clips dense params; sparse tables
+# use a fused optimizer and are unaffected). Env-overridable via $GRAD_CLIP_NORM.
+# Default 0.0 = OFF so existing streaming runs are unchanged; set >0 to enable.
+streaming_train_eval_loop.grad_clip_norm = @gcn/env_float()
+gcn/env_float.key     = "GRAD_CLIP_NORM"
+gcn/env_float.default = 0.0
+
 # Data root: resolved at runtime from $DLRM_DATA_PATH if set, else the literal
 # below. Used by both make_train_test_dataloaders and get_dataset.
 # Scoped (`data/env_path`) so this binding doesn't collide with the RUN_NAME
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 113841a3c..c6c299998 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1460,6 +1460,9 @@ def streaming_train_eval_loop(
     # --- global step / wall-clock checkpoint cadences ---
     checkpoint_step_frequency: int = 0,
     checkpoint_time_interval_s: float = 0.0,
+    # --- gradient clipping (streaming path, dense params). 0.0 = OFF, which
+    #     preserves legacy streaming behavior. Wired to $GRAD_CLIP_NORM via gin. ---
+    grad_clip_norm: float = 0.0,
     # --- diagnostic: log per-batch unique/total embedding-id counts ---
     streaming_diag_unique_emb: bool = False,
     # --- test-only failure injection knob ---
@@ -1627,6 +1630,13 @@ def streaming_train_eval_loop(
                 original_end_ts, start_ts,
             )
 
+    if rank == 0:
+        logger.info(
+            "[grad-clip] streaming path gradient clipping %s (max_norm=%.4g via $GRAD_CLIP_NORM)",
+            "ENABLED" if (grad_clip_norm and grad_clip_norm > 0) else "OFF",
+            grad_clip_norm,
+        )
+
     def _window_iter(ts: int, skip_samples: int = 0):
         # TRAIN-only iterator: both branches exclude held-out eval users via
         # train_window_indices / set_ts(train_only=True). (Eval uses the fixed
@@ -1743,6 +1753,15 @@ def _run_train_window(
             )
             # pyre-ignore
             sum(aux_losses.values()).backward()
+            # Gradient clipping for the streaming path. Clips dense params (the
+            # sparse embedding tables use a fused optimizer and are unaffected,
+            # same as the non-streaming path's clip_grad_norm_). OFF by default
+            # (grad_clip_norm=0.0 via $GRAD_CLIP_NORM) so legacy streaming runs
+            # are byte-for-byte unchanged; set >0 to enable.
+            if grad_clip_norm and grad_clip_norm > 0:
+                torch.nn.utils.clip_grad_norm_(
+                    model.parameters(), max_norm=grad_clip_norm
+                )
             optimizer.step()
             metric_logger.update(
                 mode="train",
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index f20934285..9b0c1987d 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -290,6 +290,7 @@ orchestrate() {
       ${SEED:+-e SEED=$SEED} \
       ${DENSE_LR:+-e DENSE_LR=$DENSE_LR} \
       ${SPARSE_LR:+-e SPARSE_LR=$SPARSE_LR} \
+      ${GRAD_CLIP_NORM:+-e GRAD_CLIP_NORM=$GRAD_CLIP_NORM} \
       ${HSTU_NUM_LAYERS:+-e HSTU_NUM_LAYERS=$HSTU_NUM_LAYERS} \
       ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN=$MAX_SEQ_LEN} \
       ${HISTORY_LENGTH:+-e HISTORY_LENGTH=$HISTORY_LENGTH} \

From 3cb9eb64a858abc386776365c2af784690ec7108 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 22 Jun 2026 15:09:57 -0500
Subject: [PATCH 073/113] dlrmv4: data-fraction eval cadence +
 lr1e-7/grad-clip-on defaults

Add a data-percentage-based eval cadence for the streaming loop, an
alternative to the per-window cadence. EVAL_EVERY_DATA_PCT>0 runs the
full-holdout eval every fixed FRACTION of the run's total training data,
so eval points are evenly spaced by data volume regardless of per-window
sample counts. The fraction is converted once into a global train-step
interval (round(pct * total_train_anchors / (batch_size*world_size)))
over the original requested window range, and eval fires on
global_step % interval -- mid-window and resume-stable, mirroring
checkpoint_step_frequency. Each eval label carries @step=<global_step>
for plotting against data volume.

- yambda: total_train_anchors(start_ts, num_ts) one-time O(N) count.
- streaming_train_eval_loop: eval_every_data_pct param + interval calc;
  the two cadences are mutually exclusive (ValueError if both >0).
- launch_slurm.sh: forward EVAL_EVERY_DATA_PCT into the container.

Also flip the yambda-5b gin defaults to the validated config: dense and
sparse LR 1e-5 -> 1e-7, and gradient clipping ON by default
(GRAD_CLIP_NORM 0.0 -> 1.0). HSTU depth stays 3.

Verified e2e on an open-pool node: interval computed correctly, two
mid-window evals fired on the step grid + final eval (rc=0), and the
both-enabled config raises the expected ValueError.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/datasets/yambda.py                |  35 +++++
 .../dlrm_v3/train/gin/yambda_5b.gin           |  48 +++++-
 .../dlrm_v3/train/utils.py                    | 137 +++++++++++++++++-
 recommendation_v4/scripts/launch_slurm.sh     |   1 +
 4 files changed, 214 insertions(+), 7 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
index ed98a1013..1ced622e5 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
@@ -734,6 +734,41 @@ def eval_holdout_indices(self, start_ts: int, num_windows: int = 1) -> np.ndarra
         self._eval_holdout_cache_key = key
         return holdout
 
+    def total_train_anchors(self, start_ts: int, num_ts: int) -> int:
+        """Total TRAIN anchors across windows ``[start_ts, start_ts + num_ts)``.
+
+        A single O(N) pass over the cached ``_anchor_ts`` array (NOT per-window
+        ``train_window_indices`` scans). Used to convert a "fraction of training
+        data" eval cadence into a global train-step interval. With a user holdout
+        (``train_split_percentage`` < 1.0) the held-out eval users are excluded
+        via the SAME uid hash as ``train_window_indices``, so the count matches
+        what is actually trained.
+
+        NOTE: this is an UPPER BOUND on the realized train STEP count — the
+        per-window samplers truncate each window to a multiple of ``world_size``
+        and drop the last partial per-rank batch (``drop_last=True``). The small
+        overcount is acceptable for a cadence knob (it only shifts the eval grid
+        by a fraction of a window).
+        """
+        self._ensure_streaming_index()
+        assert self._anchor_ts is not None and self._t_min is not None
+        if num_ts <= 0:
+            return 0
+        w = self._streaming_window_seconds
+        lo = self._t_min + start_ts * w
+        hi = self._t_min + (start_ts + num_ts) * w
+        in_range = (self._anchor_ts >= lo) & (self._anchor_ts < hi)
+        if self._train_split_percentage >= 1.0:
+            total = int(np.count_nonzero(in_range))
+        else:
+            sel = np.where(in_range)[0]
+            total = int(np.count_nonzero(~self._eval_anchor_mask(sel)))
+        logger.warning(
+            f"total_train_anchors(start_ts={start_ts}, num_ts={num_ts}): "
+            f"{total:,} train anchors (tsp={self._train_split_percentage})"
+        )
+        return total
+
     def set_ts(self, ts: int, train_only: bool = False) -> None:
         """Restrict the active sample set to anchors in window ``ts`` (used by
         the per-window-DataLoader path, where ``iloc``/``get_item_count`` index
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 49870a6b4..d14412196 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -43,10 +43,10 @@ seed/env_int.key     = "SEED"
 seed/env_int.default = 1
 
 # dense model optimizer
-# Learning rate is env-overridable via $DENSE_LR (default 1e-5).
+# Learning rate is env-overridable via $DENSE_LR (default 1e-7).
 dense_optimizer_factory_and_class.learning_rate = @dlr/env_float()
 dlr/env_float.key     = "DENSE_LR"
-dlr/env_float.default = 0.00001
+dlr/env_float.default = 0.0000001
 dense_optimizer_factory_and_class.optimizer_name = "Adam"
 dense_optimizer_factory_and_class.momentum = 0
 dense_optimizer_factory_and_class.weight_decay = 0
@@ -54,10 +54,10 @@ dense_optimizer_factory_and_class.eps = 1e-8
 dense_optimizer_factory_and_class.betas = (0.95, 0.999)
 
 # sparse model optimizer
-# Learning rate is env-overridable via $SPARSE_LR (default 1e-5).
+# Learning rate is env-overridable via $SPARSE_LR (default 1e-7).
 sparse_optimizer_factory_and_class.learning_rate = @slr/env_float()
 slr/env_float.key     = "SPARSE_LR"
-slr/env_float.default = 0.00001
+slr/env_float.default = 0.0000001
 sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
 sparse_optimizer_factory_and_class.momentum = 0
 sparse_optimizer_factory_and_class.weight_decay = 0
@@ -66,10 +66,10 @@ sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
 
 # Gradient clipping for the STREAMING path (clips dense params; sparse tables
 # use a fused optimizer and are unaffected). Env-overridable via $GRAD_CLIP_NORM.
-# Default 0.0 = OFF so existing streaming runs are unchanged; set >0 to enable.
+# Default 1.0 = ON (max_norm=1.0); set 0.0 via $GRAD_CLIP_NORM to disable.
 streaming_train_eval_loop.grad_clip_norm = @gcn/env_float()
 gcn/env_float.key     = "GRAD_CLIP_NORM"
-gcn/env_float.default = 0.0
+gcn/env_float.default = 1.0
 
 # Data root: resolved at runtime from $DLRM_DATA_PATH if set, else the literal
 # below. Used by both make_train_test_dataloaders and get_dataset.
@@ -257,6 +257,13 @@ ot/env_int.default = 0
 streaming_train_eval_loop.persistent_loader = @pl/env_int()
 pl/env_int.key = "PERSISTENT_LOADER"
 pl/env_int.default = 1
+# ---- Eval cadence: choose EXACTLY ONE of the two knobs below ----------------
+# The streaming loop can decide WHEN to run the full-holdout eval pass in one of
+# two ways. They are MUTUALLY EXCLUSIVE — enabling the data-fraction cadence
+# (EVAL_EVERY_DATA_PCT>0) requires EVAL_EVERY_N_WINDOWS=0; setting both >0 raises
+# a ValueError at startup. The final end-of-run eval always runs in either mode.
+#
+# (1) PER-WINDOW cadence (EVAL_EVERY_N_WINDOWS, the default).
 # Full-holdout eval cadence (single knob; replaces the old EVAL_EACH_WINDOW
 # on/off switch). 0 = eval disabled (train-only, e.g. perf benchmarking or the
 # resume test; the eval dataloader isn't even built). 1 (default) = eval after
@@ -264,9 +271,38 @@ pl/env_int.default = 1
 # (and always the final one) to amortize the cost of consuming the full next-day
 # eval window. The cadence is anchored to the absolute ts grid so eval points
 # stay stable across a mid-run resume.
+# NOTE: each daily window has a DIFFERENT number of training samples, so a
+# per-window cadence produces eval points that are UNEVENLY spaced in terms of
+# how much data was trained between them. Use the data-fraction cadence below if
+# you want evenly-spaced-by-data eval points instead.
 streaming_train_eval_loop.eval_every_n_windows = @evn/env_int()
 evn/env_int.key = "EVAL_EVERY_N_WINDOWS"
 evn/env_int.default = 1
+#
+# (2) DATA-FRACTION cadence (EVAL_EVERY_DATA_PCT).
+# Run the full-holdout eval every time the run has trained this FRACTION of the
+# run's TOTAL training data, so eval points are EVENLY spaced by data volume
+# (compute), independent of how many samples each daily window happens to hold.
+# This is the fix for the per-window cadence's uneven spacing noted above.
+#   value semantics (it is a fraction in (0, 1], NOT a percent number):
+#     0.0  (default) = OFF  -> fall back to the per-window EVAL_EVERY_N_WINDOWS.
+#     0.01           = eval every 1%  of the data  -> ~100 eval points total.
+#     0.05           = eval every 5%  of the data  -> ~20  eval points total.
+#     0.10           = eval every 10% of the data  -> ~10  eval points total.
+# How "fraction of data" becomes an eval trigger: at startup the fraction is
+# converted ONCE into a global train-step interval
+#     eval_interval_steps = round(pct * total_train_anchors / (batch_size * world_size))
+# where total_train_anchors is counted over the ORIGINAL requested window range
+# [start_ts, start_ts+num_train_ts). Eval then fires whenever the monotonic
+# train global_step crosses a multiple of that interval (it can fire MID-WINDOW,
+# i.e. partway through a daily window, and across window boundaries). Because the
+# interval is computed over the original range and global_step is
+# checkpoint-restored, the eval grid is identical on a cold start and on every
+# resume. Each eval record's label carries "@step=<global_step>" so the
+# trajectory can be plotted against data volume. Override via $EVAL_EVERY_DATA_PCT.
+streaming_train_eval_loop.eval_every_data_pct = @edp/env_float()
+edp/env_float.key     = "EVAL_EVERY_DATA_PCT"
+edp/env_float.default = 0.0
 # Double-buffer windows: prepare the next window (index mask + first-batch
 # prefetch) in a background thread during the current window's compute, hiding
 # the per-window reset. Needs persistent_loader=1. Override via env.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index c6c299998..0e9e456e9 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1439,6 +1439,11 @@ def streaming_train_eval_loop(
     start_ts: int = 0,
     persistent_loader: bool = False,
     eval_every_n_windows: int = 1,
+    # Data-fraction eval cadence (mutually exclusive with eval_every_n_windows).
+    # >0 = eval every this FRACTION of the run's total training data, converted
+    # once to a global train-step interval. 0.0 = OFF (use the per-window
+    # cadence). Wired to $EVAL_EVERY_DATA_PCT via gin.
+    eval_every_data_pct: float = 0.0,
     double_buffer: bool = False,
     # --- fixed user-holdout eval set ---
     # Window range the fixed eval set is drawn from. None -> default to
@@ -1507,6 +1512,18 @@ def streaming_train_eval_loop(
     save fires. Used by the failure-injection test to crash at a deterministic
     boundary and then resume.
     """
+    # Exactly one eval cadence may be active. eval_every_n_windows defaults to 1
+    # (eval every window), so enabling the data-fraction cadence REQUIRES
+    # explicitly disabling the per-window one (EVAL_EVERY_N_WINDOWS=0). Fail fast
+    # on a contradictory config rather than silently picking one.
+    if (eval_every_data_pct and eval_every_data_pct > 0) and eval_every_n_windows > 0:
+        raise ValueError(
+            "Conflicting eval cadences: eval_every_data_pct="
+            f"{eval_every_data_pct} (>0) AND eval_every_n_windows="
+            f"{eval_every_n_windows} (>0). They are mutually exclusive. To use "
+            "the data-fraction cadence set EVAL_EVERY_N_WINDOWS=0; to use the "
+            "per-window cadence set EVAL_EVERY_DATA_PCT=0."
+        )
     profiler = Profiler(rank) if output_trace else None
     # Normalize the per-window caps: <=0 (the env-binding default) means "no cap
     # = consume the full window". The eval-break check below is `is not None and
@@ -1563,6 +1580,53 @@ def streaming_train_eval_loop(
         else requested_end_ts
     )
 
+    # Data-fraction eval cadence: convert eval_every_data_pct into a global
+    # train-step interval ONCE, over the ORIGINAL requested window range
+    # [eval_anchor_ts, requested_end_ts). Keying the later trigger off
+    # `global_step % eval_interval_steps` (global_step is monotonic and
+    # checkpoint-restored) makes the eval grid identical on cold start and on
+    # every resume, exactly like checkpoint_step_frequency. 0 => disabled.
+    eval_interval_steps = 0
+    if eval_every_data_pct and eval_every_data_pct > 0:
+        # Per-rank batch size: the persistent loader carries it directly; the
+        # per-window path uses the same gin %batch_size (env BATCH_SIZE,
+        # default 1024 — matches make_streaming_dataloader.batch_size).
+        bs = (
+            persistent_dl.batch_size
+            if persistent_dl is not None
+            else int(os.environ.get("BATCH_SIZE", "1024"))
+        )
+        if hasattr(dataset.dataset, "total_train_anchors"):
+            total_train_anchors = dataset.dataset.total_train_anchors(  # pyre-ignore[16]
+                eval_anchor_ts, requested_end_ts - eval_anchor_ts
+            )
+            total_train_steps = total_train_anchors // max(1, bs * world_size)
+            eval_interval_steps = max(
+                1, round(eval_every_data_pct * total_train_steps)
+            )
+            if rank == 0:
+                logger.info(
+                    "[data-pct-eval] eval_every_data_pct=%.6g -> "
+                    "eval_interval_steps=%d (total_train_anchors=%d bs=%d "
+                    "world_size=%d total_train_steps=%d over windows [%d, %d))",
+                    eval_every_data_pct,
+                    eval_interval_steps,
+                    total_train_anchors,
+                    bs,
+                    world_size,
+                    total_train_steps,
+                    eval_anchor_ts,
+                    requested_end_ts,
+                )
+        elif rank == 0:
+            logger.warning(
+                "[data-pct-eval] dataset %s has no total_train_anchors(); "
+                "data-fraction eval is DISABLED (no per-window eval either, "
+                "since EVAL_EVERY_N_WINDOWS must be 0 to reach here) — only the "
+                "final eval will run.",
+                type(dataset.dataset).__name__,
+            )
+
     # The split is an immutable run contract: a silent change across resume
     # would both desync the mid-window skip offset AND turn held-out eval users
     # into trained users (leakage). Build the live contract and validate the
@@ -1711,6 +1775,7 @@ def _run_train_window(
         train_ts: int,
         start_batch_idx: int = 0,
         label: Optional[str] = None,
+        do_eval: Optional[Callable[[int, int], None]] = None,
     ) -> None:
         # `start_batch_idx` is set when we're re-entering a window that was
         # interrupted mid-way (in_window resume); the dataloader iterator was
@@ -1817,6 +1882,30 @@ def _run_train_window(
                 # Reset the wall-clock anchor on ANY save so the next time
                 # trigger is measured from the most recent checkpoint.
                 last_ckpt_time[0] = time.time()
+            # Data-fraction eval cadence: run the full-holdout eval whenever the
+            # monotonic global step crosses a multiple of eval_interval_steps
+            # (i.e. every eval_every_data_pct of the training data). Keyed off
+            # global_step (checkpoint-restored) so the eval grid is identical
+            # across resume. Mid-window-safe: eval sets model.eval(), so restore
+            # train mode + dataset.is_eval afterward. do_eval is None unless the
+            # data-pct cadence is enabled.
+            if (
+                do_eval is not None
+                and eval_interval_steps > 0
+                and gstep > 0
+                and gstep % eval_interval_steps == 0
+            ):
+                if rank == 0:
+                    logger.info(
+                        "[data-pct-eval] trigger eval train_ts=%d global_step=%d "
+                        "(interval=%d)",
+                        train_ts,
+                        gstep,
+                        eval_interval_steps,
+                    )
+                do_eval(train_ts, gstep)
+                model.train()
+                dataset.dataset.is_eval = False  # pyre-ignore [16]
             # Test-only: deterministic crash for the failure-injection test.
             # Triggered AFTER the save above, so on resume we re-enter at
             # batch_idx_in_window=train_batch_idx and emit batches [K+1, end).
@@ -2035,7 +2124,13 @@ def _should_eval(i: int) -> bool:
         # sample content depends only on the sampler window, not is_eval, so
         # prefetching during train is safe.
         eval_iter: Optional[Iterator] = None
-        if eval_every_n_windows > 0 and len(train_ts_list) > 0:
+        # Build/fork the eval pool when EITHER cadence needs it: the per-window
+        # cadence (eval_every_n_windows>0) or the data-fraction cadence
+        # (eval_interval_steps>0). Both are never simultaneously on (validated
+        # at entry), so this is "eval is enabled at all".
+        if (eval_every_n_windows > 0 or eval_interval_steps > 0) and len(
+            train_ts_list
+        ) > 0:
             eval_sampler = StreamingWindowSampler(rank, world_size)
             eval_dl = make_persistent_streaming_dataloader(
                 dataset=dataset, sampler=eval_sampler
@@ -2053,6 +2148,22 @@ def _should_eval(i: int) -> bool:
             # iter() again to replay the identical set (no set_window churn).
             eval_sampler.set_window(eval_global_indices)
             eval_iter = iter(eval_dl)
+
+        # Data-fraction eval callback (double-buffer path). Fired mid-window by
+        # _run_train_window on the global-step cadence. Reuses the already-forked
+        # persistent eval pool: iter(eval_dl) here runs on the MAIN thread (a
+        # reset, not a fork — the only fork was the up-front iter() above), so it
+        # stays safe alongside the background window-prefetch thread.
+        def _do_eval_db(train_ts: int, gstep: int) -> None:
+            dataset.dataset.is_eval = True  # pyre-ignore [16]
+            assert eval_dl is not None
+            _run_eval_window(
+                iter(eval_dl),
+                label=f"eval_holdout@train_ts={train_ts}@step={gstep}",
+            )
+
+        _db_do_eval = _do_eval_db if eval_interval_steps > 0 else None
+
         for i, (train_ts, train_data_iterator) in enumerate(
             # Only the FIRST window after a mid-window resume needs the skip
             # (handed via prefetcher.stream's first_skip_samples). The skip is
@@ -2074,6 +2185,7 @@ def _should_eval(i: int) -> bool:
                 train_ts=train_ts,
                 start_batch_idx=start_batch,
                 label=f"train_ts={train_ts}",
+                do_eval=_db_do_eval,
             )
             if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
@@ -2090,6 +2202,28 @@ def _should_eval(i: int) -> bool:
                     eval_iter = iter(eval_dl)
             _maybe_checkpoint(train_ts)
     else:
+        # Data-fraction eval callback (non-double-buffer path). Builds a fresh
+        # eval dataloader per call over the FIXED holdout set (or the legacy
+        # next-window eval when the dataset has no holdout support).
+        def _do_eval_nb(train_ts: int, gstep: int) -> None:
+            dataset.dataset.is_eval = True  # pyre-ignore [16]
+            if eval_global_indices is not None:
+                _run_eval_window(
+                    iter(
+                        make_streaming_dataloader(
+                            dataset=dataset, indices=eval_global_indices
+                        )
+                    ),
+                    label=f"eval_holdout@train_ts={train_ts}@step={gstep}",
+                )
+            else:
+                _run_eval_window(
+                    iter(make_streaming_dataloader(dataset=dataset, ts=train_ts + 1)),
+                    label=f"eval@train_ts={train_ts}@step={gstep}",
+                )
+
+        _nb_do_eval = _do_eval_nb if eval_interval_steps > 0 else None
+
         for i, train_ts in enumerate(train_ts_list):
             dataset.dataset.is_eval = False  # pyre-ignore [16]
             skip = first_skip_samples if i == 0 else 0
@@ -2102,6 +2236,7 @@ def _should_eval(i: int) -> bool:
                 _window_iter(train_ts, skip_samples=skip),
                 train_ts=train_ts,
                 start_batch_idx=start_batch,
+                do_eval=_nb_do_eval,
             )
             if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 9b0c1987d..579d4427a 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -276,6 +276,7 @@ orchestrate() {
       -e NUM_TRAIN_TS=$NUM_TRAIN_TS \
       -e EVAL_EACH_WINDOW=$EVAL_EACH_WINDOW \
       -e EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS \
+      ${EVAL_EVERY_DATA_PCT:+-e EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT} \
       -e NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
       -e NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
       -e METRIC_LOG_FREQ=$METRIC_LOG_FREQ \

From c2342c5d992680c8ae1b458e6470bf02b448e584 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 22 Jun 2026 20:18:39 +0000
Subject: [PATCH 074/113] dlrmv4: seed embedding init + reproducibility
 checksum ($SEED)

Make embedding-table init a deterministic function of $SEED (dense init
already was), so runs are a clean init-matched A/B when data order
($STREAMING_SHUFFLE_SEED) and the holdout split ($SPLIT_SALT) are fixed.

- configs.py: attach a per-table seeded uniform init_fn (per-table seed =
  sha256($SEED, table_name)); meta-safe (skips the meta device DMP builds
  the unsharded module on). Init bounds mirror stock (+/-1/sqrt(N) or a
  table's explicit weight_init_min/max), so the distribution -- and thus
  model quality -- is unchanged; only determinism/seeding differs.
- utils.py (make_optimizer_and_shard): re-seed torch/torch.cuda from $SEED
  right before DistributedModelParallel(...) so the fused FBGEMM TBE
  on-device embedding init is reproducible for a FIXED sharding plan
  (Tier 1). Dense params are already built in make_model, so untouched.
- utils.py: log a post-DMP init checksum (per-table count/sum/sumsq +
  a one-line digest; sharded stats are all-reduced so the fingerprint
  covers the whole table regardless of shard layout). Gate via
  INIT_CHECKSUM (default on). Verified on idle nodes: same seed -> same
  digest across reruns and node types; different seed -> different digest.
- yambda_5b.gin: document precisely what $SEED controls and what it does
  not (streaming data order, holdout split).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/configs.py                        | 80 ++++++++++++++++++-
 .../dlrm_v3/train/gin/yambda_5b.gin           | 50 ++++++++++--
 .../dlrm_v3/train/utils.py                    | 64 +++++++++++++++
 3 files changed, 187 insertions(+), 7 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/configs.py b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
index 1b6ecf62f..387fb4900 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/configs.py
@@ -19,9 +19,13 @@
 This module provides configuration functions for the HSTU model architecture and embedding table configurations.
 """
 
-from typing import Dict, Optional
+import hashlib
+import math
+import os
+from typing import Callable, Dict, Optional, Tuple
 
 import gin
+import torch
 
 from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
 from generative_recommenders.modules.multitask_module import (
@@ -509,10 +513,63 @@ def get_hstu_configs(
     return hstu_config
 
 
+def _stable_table_seed(init_seed: int, table_name: str) -> int:
+    """Deterministic 63-bit seed from (init_seed, table_name).
+
+    Uses sha256 (not Python's salted built-in ``hash()``) so the per-table seed
+    is identical across processes/ranks/runs for a given ``$SEED`` + table name.
+    """
+    digest = hashlib.sha256(f"{init_seed}:{table_name}".encode("utf-8")).digest()
+    return int.from_bytes(digest[:8], "big") & 0x7FFF_FFFF_FFFF_FFFF
+
+
+def _uniform_init_bounds(cfg: EmbeddingConfig) -> Tuple[float, float]:
+    """Mirror TorchREC's default per-table init bounds.
+
+    TorchREC falls back to ``uniform_(-1/sqrt(N), +1/sqrt(N))`` when a table does
+    not set ``weight_init_min/max``; honor any explicit bounds the config carries.
+    """
+    bound = math.sqrt(1.0 / cfg.num_embeddings)
+    lo = -bound if cfg.weight_init_min is None else cfg.weight_init_min
+    hi = bound if cfg.weight_init_max is None else cfg.weight_init_max
+    return lo, hi
+
+
+def _make_seeded_uniform_init(
+    table_seed: int, lo: float, hi: float
+) -> Callable[[torch.Tensor], torch.Tensor]:
+    """Build a seeded in-place uniform initializer for one table's weight.
+
+    TorchREC/FBGEMM calls ``init_fn`` with the (per-rank) local shard tensor on
+    its compute device, so we seed a generator on that same device. For a fixed
+    sharding plan (world size + plan unchanged) this makes embedding init
+    byte-reproducible run-to-run.
+    """
+
+    def _init(weight: torch.Tensor) -> torch.Tensor:
+        # TorchREC builds the unsharded EmbeddingCollection on the META device
+        # first (DMP materializes real storage on the compute device later).
+        # Meta tensors have no storage and torch.Generator(device="meta") is
+        # invalid ("META device type not an accelerator"), so skip them: the
+        # seeded init for the sharded/fused TBE path is provided by the RNG
+        # re-seed right before DMP in make_optimizer_and_shard. On a real
+        # device (eager/non-meta path) we still apply the per-table seeded fill.
+        if weight.device.type == "meta":
+            return weight
+        gen = torch.Generator(device=weight.device)
+        gen.manual_seed(table_seed)
+        with torch.no_grad():
+            weight.uniform_(lo, hi, generator=gen)
+        return weight
+
+    return _init
+
+
 @gin.configurable
 def get_embedding_table_config(
     dataset: str = "debug",
     embedding_dim: Optional[int] = None,
+    init_seed: Optional[int] = None,
 ) -> Dict[str, EmbeddingConfig]:
     """
     Create and return embedding table configurations.
@@ -527,10 +584,31 @@ def get_embedding_table_config(
             `HSTU_EMBEDDING_DIM`. Keep in sync with the matching gin override on
             `get_hstu_configs.hstu_embedding_table_dim` — the model and the
             tables must agree on dim or sharding will reject the plan.
+        init_seed: Base seed for the per-table seeded `init_fn` (Tier 1
+            reproducible embedding init). When None, falls back to `$SEED`
+            (default 1), matching `seed_everything`. Each table draws from a
+            generator seeded by `sha256(init_seed, table_name)` so init is
+            reproducible run-to-run for a fixed sharding plan.
 
     Returns:
         Dict mapping table names to their EmbeddingConfig objects.
     """
+    tables = _build_embedding_table_config(dataset=dataset, embedding_dim=embedding_dim)
+
+    if init_seed is None:
+        init_seed = int(os.environ.get("SEED", "1"))
+    for name, cfg in tables.items():
+        lo, hi = _uniform_init_bounds(cfg)
+        cfg.init_fn = _make_seeded_uniform_init(
+            _stable_table_seed(init_seed, name), lo, hi
+        )
+    return tables
+
+
+def _build_embedding_table_config(
+    dataset: str = "debug",
+    embedding_dim: Optional[int] = None,
+) -> Dict[str, EmbeddingConfig]:
     DIM = embedding_dim if embedding_dim is not None else HSTU_EMBEDDING_DIM
     if "movielens" in dataset:
         assert dataset in [
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index d14412196..227233101 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -32,12 +32,50 @@ make_model.hammer_kernel = "TRITON"
 # pinned constants in ops/triton/_autotune_pinning.py call sites.
 apply_env_bootstrap.TRITON_FULL_AUTOTUNE = False
 
-# Global RNG seed for reproducible weight init (dense params + embedding tables)
-# and any seeded RNG consumers. Same seed on every rank => identical dense init;
-# fixing it makes runs an init-matched A/B (data order is already deterministic
-# via the sampler). seed_everything() is called right before make_model() in
-# train_ranker (after the full gin parse), so this binding is resolved in the
-# second parse where env_int is registered. Override per-run via $SEED.
+# =============================================================================
+# $SEED — global RNG seed for reproducible MODEL INITIALIZATION.
+#
+# WHAT IT CONTROLS (all weight init is a deterministic function of $SEED):
+#   1. Dense parameters (HSTU transformer blocks, MLPs, action embeddings, the
+#      postprocessor). seed_everything() seeds python/numpy/torch/torch.cuda
+#      with $SEED right before make_model(). The SAME seed is set on every rank,
+#      so dense weights are initialized identically across ranks AND reproducibly
+#      run-to-run.
+#   2. Sparse embedding tables (item_id, artist_id, album_id, uid, the cross
+#      tables). These are materialized on-device while DMP shards the model, NOT
+#      in make_model(), so seed_everything() alone does not pin them. Two
+#      mechanisms tie them to $SEED:
+#        (a) make_optimizer_and_shard() RE-SEEDS torch/torch.cuda from $SEED
+#            immediately before DistributedModelParallel(...), so the fused
+#            FBGEMM TBE init (which draws from the global RNG on-device) is
+#            reproducible. This is the path that actually applies here.
+#        (b) get_embedding_table_config() also attaches a per-table seeded
+#            init_fn (seed derived from sha256($SEED, table_name)) for the
+#            eager/non-meta code path. It no-ops on the meta device that DMP
+#            uses, so (a) is the effective guarantee for this setup.
+#   3. Any other seeded RNG consumers (e.g. dropout's init-time draws).
+#
+#   SCOPE ("Tier 1"): reproducible for a FIXED sharding plan — same GPU/world
+#   size AND same planner output. It is NOT invariant to changing the GPU count
+#   or sharding (the per-shard draw boundaries move). The init DISTRIBUTION is
+#   unchanged from stock (uniform +/-1/sqrt(num_embeddings), or a table's
+#   explicit weight_init_min/max), so $SEED affects determinism, not quality.
+#
+#   VERIFY: INIT_CHECKSUM=1 (default) logs a per-table fingerprint + a one-line
+#   "[init-checksum] SEED=.. digest=.." right after DMP. Two builds with the same
+#   $SEED + plan print the same digest; different seeds differ. (INIT_CHECKSUM=0
+#   to skip.)
+#
+# WHAT IT DOES *NOT* CONTROL (separate, independent knobs):
+#   - Streaming data order / shuffle permutation -> $STREAMING_SHUFFLE_SEED
+#     (get_dataset.streaming_shuffle_seed, below).
+#   - Train/eval holdout user split               -> $SPLIT_SALT (below).
+#   So holding $STREAMING_SHUFFLE_SEED + $SPLIT_SALT fixed and varying ONLY $SEED
+#   isolates the effect of model initialization (a clean init-seed A/B / sweep).
+#
+# PARSE NOTE: seed_everything() runs right before make_model() in train_ranker
+# (after the full gin parse), so this binding resolves in the second parse where
+# env_int is registered. Override per-run via $SEED.
 seed_everything.seed = @seed/env_int()
 seed/env_int.key     = "SEED"
 seed/env_int.default = 1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 0e9e456e9..85acd7711 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -504,6 +504,18 @@ def make_optimizer_and_shard(
 
     plan = planner.collective_plan(model, sharders, pg)
 
+    # Re-seed right before DMP materializes/inits the sharded embedding tables.
+    # The per-table seeded init_fn (configs.get_embedding_table_config) handles
+    # the eager path, but the fused FBGEMM TBE path inits weights on-device and
+    # may bypass init_fn, drawing from the global RNG instead. Re-seeding here
+    # (same value on every rank) makes embedding init reproducible run-to-run for
+    # a fixed sharding plan (Tier 1). Dense params are already initialized in
+    # make_model, so this does not perturb them.
+    _emb_seed = int(os.environ.get("SEED", "1"))
+    torch.manual_seed(_emb_seed)
+    torch.cuda.manual_seed_all(_emb_seed)
+    logger.info(f"[emb-init] re-seeded RNGs before DMP with SEED={_emb_seed}")
+
     # Shard model
     model = DistributedModelParallel(
         module=model,
@@ -511,6 +523,58 @@ def make_optimizer_and_shard(
         plan=plan,
         sharders=sharders,
     )
+
+    # --- startup init checksum (reproducibility probe) -------------------------
+    # Right after DMP materializes real weights, log a cheap deterministic
+    # fingerprint of every parameter so two builds with the same $SEED + sharding
+    # plan can be diffed for byte-level init reproducibility. For sharded
+    # embeddings we all-reduce the per-shard (count, sum, sumsq) so the
+    # fingerprint covers the WHOLE table independent of how rows split across
+    # ranks; replicated dense params use rank 0's local copy. Stats are computed
+    # as in-place reductions with fp64 accumulation (no full-size temporaries —
+    # the embedding shards are tens of GB), so this stays light. Disable with
+    # INIT_CHECKSUM=0.
+    if os.environ.get("INIT_CHECKSUM", "1") == "1":
+        import hashlib
+
+        _rank = dist.get_rank() if dist.is_initialized() else 0
+        _fps: List[str] = []
+        for _name, _p in sorted(model.named_parameters(), key=lambda kv: kv[0]):
+            _sharded = isinstance(_p, ShardedTensor)
+            if _sharded:
+                _shards = _p.local_shards()
+                _loc = _shards[0].tensor if _shards else None
+            else:
+                _loc = _p
+            if _loc is None or _loc.numel() == 0:
+                _cnt, _sm, _sq = 0.0, 0.0, 0.0
+            else:
+                _det = _loc.detach()
+                _cnt = float(_det.numel())
+                _sm = _det.sum(dtype=torch.float64).item()
+                _nrm = torch.linalg.vector_norm(
+                    _det, ord=2, dtype=torch.float64
+                ).item()
+                _sq = _nrm * _nrm
+            if _sharded and dist.is_initialized():
+                _stat = torch.tensor(
+                    [_cnt, _sm, _sq], dtype=torch.float64, device=device
+                )
+                dist.all_reduce(_stat, op=dist.ReduceOp.SUM)
+                _cnt, _sm, _sq = _stat.tolist()
+            _fps.append(f"{_name}|{int(_cnt)}|{_sm:.6f}|{_sq:.6f}")
+            if _rank == 0:
+                logger.info(
+                    f"[init-checksum] {'sharded' if _sharded else 'dense'} "
+                    f"{_name} n={int(_cnt)} sum={_sm:.6f} sumsq={_sq:.6f}"
+                )
+        if _rank == 0:
+            _digest = hashlib.sha256("\n".join(_fps).encode()).hexdigest()[:16]
+            logger.info(
+                f"[init-checksum] SEED={os.environ.get('SEED', '?')} "
+                f"params={len(_fps)} digest={_digest}"
+            )
+
     # Create keyed optimizer
     all_optimizers = []
     all_params = {}

From eef33047ed7715c33fd95d7ebd43d3e5b5c48509 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Mon, 22 Jun 2026 18:06:33 -0500
Subject: [PATCH 075/113] dlrmv4: default INIT_CHECKSUM off (fp64 shard copy
 OOMs the build)

The startup reproducibility checksum computes per-shard sum/norm with
dtype=float64, which materializes a full fp64 copy of each local embedding
shard (>150 GiB for the big tables). After sharding leaves ~95 GiB of fp32
tables resident, that temporary leaves almost no HBM headroom and OOMs the
build during make_optimizer_and_shard on any node with residual memory.

Flip INIT_CHECKSUM to default 0 (opt-in) so normal launches and supervisor
resubmits never run it. Correct the utils.py + yambda_5b.gin comments that
claimed it had no full-size temporaries.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 12 ++++++----
 .../dlrm_v3/train/utils.py                    | 24 +++++++++++--------
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 227233101..e838b346f 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -61,10 +61,14 @@ apply_env_bootstrap.TRITON_FULL_AUTOTUNE = False
 #   unchanged from stock (uniform +/-1/sqrt(num_embeddings), or a table's
 #   explicit weight_init_min/max), so $SEED affects determinism, not quality.
 #
-#   VERIFY: INIT_CHECKSUM=1 (default) logs a per-table fingerprint + a one-line
-#   "[init-checksum] SEED=.. digest=.." right after DMP. Two builds with the same
-#   $SEED + plan print the same digest; different seeds differ. (INIT_CHECKSUM=0
-#   to skip.)
+#   VERIFY: INIT_CHECKSUM=1 (OFF by default) logs a per-table fingerprint + a
+#   one-line "[init-checksum] SEED=.. digest=.." right after DMP. Two builds with
+#   the same $SEED + plan print the same digest; different seeds differ. It is OFF
+#   by default because the fp64 per-shard reductions materialize a full fp64 copy
+#   of each local embedding shard (>150 GiB for the big tables), leaving almost no
+#   HBM headroom after sharding and OOMing the build on any node with residual
+#   memory. Enable only for an explicit reproducibility check (ideally a clean
+#   node / smaller batch).
 #
 # WHAT IT DOES *NOT* CONTROL (separate, independent knobs):
 #   - Streaming data order / shuffle permutation -> $STREAMING_SHUFFLE_SEED
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 85acd7711..785320020 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -525,16 +525,20 @@ def make_optimizer_and_shard(
     )
 
     # --- startup init checksum (reproducibility probe) -------------------------
-    # Right after DMP materializes real weights, log a cheap deterministic
-    # fingerprint of every parameter so two builds with the same $SEED + sharding
-    # plan can be diffed for byte-level init reproducibility. For sharded
-    # embeddings we all-reduce the per-shard (count, sum, sumsq) so the
-    # fingerprint covers the WHOLE table independent of how rows split across
-    # ranks; replicated dense params use rank 0's local copy. Stats are computed
-    # as in-place reductions with fp64 accumulation (no full-size temporaries —
-    # the embedding shards are tens of GB), so this stays light. Disable with
-    # INIT_CHECKSUM=0.
-    if os.environ.get("INIT_CHECKSUM", "1") == "1":
+    # Right after DMP materializes real weights, log a deterministic fingerprint
+    # of every parameter so two builds with the same $SEED + sharding plan can be
+    # diffed for byte-level init reproducibility. For sharded embeddings we
+    # all-reduce the per-shard (count, sum, sumsq) so the fingerprint covers the
+    # WHOLE table independent of how rows split across ranks; replicated dense
+    # params use rank 0's local copy.
+    # OFF BY DEFAULT: the fp64 reductions below (.sum(dtype=float64) /
+    # vector_norm(dtype=float64)) materialize a full fp64 copy of each local
+    # embedding shard (~2x the fp32 shard, i.e. >150 GiB for the big tables),
+    # which leaves almost no HBM headroom after sharding and will OOM the build on
+    # any node with residual memory. Only enable for explicit reproducibility
+    # checks, ideally with a smaller batch / on a clean node. Enable with
+    # INIT_CHECKSUM=1.
+    if os.environ.get("INIT_CHECKSUM", "0") == "1":
         import hashlib
 
         _rank = dist.get_rank() if dist.is_initialized() else 0

From 38a9175a9ebd9d85c6af33b3f8776fab3f4eed4b Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Tue, 23 Jun 2026 02:59:51 +0000
Subject: [PATCH 076/113] dlrmv4: add last_n UIH history strategy
 ($HISTORY_STRATEGY)

Adds a configurable Yambda UIH construction strategy alongside the existing
per-pool interleaved scheme. "last_n" takes the last HISTORY_LENGTH events of
any pool (listen+/like/skip) with no per-pool split, raising effective sequence
length (~2.7k -> ~4.1k) and letting the like share fall to its natural rate.
Default stays "interleaved" (no behavior change); strategy is resolved at
sample-build time so it reuses the existing on-disk cache (no rebuild).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/datasets/yambda.py                | 137 ++++++++++++++----
 .../dlrm_v3/train/gin/yambda_5b.gin           |  37 +++++
 .../generative_recommenders/dlrm_v3/utils.py  |   8 +
 3 files changed, 156 insertions(+), 26 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
index 1ced622e5..00b22cff9 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py
@@ -190,8 +190,12 @@ class DLRMv3YambdaDataset(DLRMv3RandomDataset):
         hstu_config: DlrmHSTUConfig (must come from `get_hstu_configs("yambda-5b")`).
         processed_dir: directory with `train_sessions.parquet` + `item_popularity.npy`.
         metadata_dir: directory with `{artist,album}_item_mapping.parquet`.
-        history_length: per-pool truncation cap (total interleaved ≤ 3 * this).
+        history_length: UIH cap. Under "interleaved" it is the per-pool cap
+            (total ≤ 3 * history_length // 3); under "last_n" it is the literal
+            total number of pooled events kept.
         scan_window: how far back to scan when filling each pool.
+        history_strategy: "interleaved" (equal per-pool L//3 cap, re-interleaved)
+            or "last_n" (last history_length pooled events, no per-pool split).
         cross_specs: list of (name, keys, num_embeddings, salt). Source of truth
             in `dlrm_v3/configs.py:YAMBDA_5B_CROSS_SPECS`.
         is_inference: passed through to base class.
@@ -205,6 +209,7 @@ def __init__(
         history_length: int = 2048,
         scan_window: int = 20000,
         min_history: Optional[int] = None,
+        history_strategy: str = "interleaved",
         cross_specs: Optional[Sequence[Tuple[str, Sequence[str], int, int]]] = None,
         cache_dir: Optional[str] = None,
         is_inference: bool = False,
@@ -222,6 +227,21 @@ def __init__(
         self._metadata_dir: str = metadata_dir
         self._history_length: int = history_length
         self._scan_window: int = scan_window
+        # UIH construction strategy:
+        #   "interleaved" (default) — equal history_length//3 cap per behavior
+        #     pool (listen+/like/skip), re-interleaved chronologically. Likes are
+        #     ~1.9% of the corpus so the like pool over-fills relative to its
+        #     natural frequency while the sequence under-fills overall.
+        #   "last_n" — take the last history_length events of ANY pool type with
+        #     no per-pool split. Fills the sequence to ~history_length (higher
+        #     effective length) and lets the like share fall to its natural rate.
+        # Both exclude dislike/unlike/undislike (no action-weight bit).
+        if history_strategy not in ("interleaved", "last_n"):
+            raise ValueError(
+                f"history_strategy must be 'interleaved' or 'last_n', got "
+                f"{history_strategy!r}"
+            )
+        self._history_strategy: str = history_strategy
         # Minimum prior-event count for a LISTEN event to qualify as an anchor.
         # Decoupled from history_length (which is only the gather/truncation cap):
         # jagged attention handles short UIH, so we no longer require a full
@@ -814,6 +834,64 @@ def get_sample(self, idx: int) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
         flat_pos = self.iloc(idx)
         return self._build_sample(flat_pos, max_num_candidates)
 
+    @staticmethod
+    def _empty_history() -> Tuple[
+        np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray
+    ]:
+        empty = np.empty(0, dtype=np.int64)
+        return empty, empty, empty, empty, empty
+
+    def _read_scan_window(
+        self, flat_pos: int, user_start: int
+    ) -> Optional[
+        Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]
+    ]:
+        """Read the causal scan window [scan_start, flat_pos) for an anchor.
+        Returns (item_ids, timestamps, is_lp, is_like, is_skip) views, or None
+        if the window is empty."""
+        scan_start = max(int(user_start), int(flat_pos) - self._scan_window)
+        scan_end = int(flat_pos)
+        if scan_end <= scan_start:
+            return None
+        return (
+            self.store.flat_item_ids[scan_start:scan_end],
+            self.store.flat_timestamps[scan_start:scan_end],
+            self.store.flat_is_listen_plus[scan_start:scan_end],
+            self.store.flat_is_like[scan_start:scan_end],
+            self.store.flat_is_skip[scan_start:scan_end],
+        )
+
+    def _materialize_history(
+        self,
+        keep_local: np.ndarray,
+        item_ids: np.ndarray,
+        timestamps: np.ndarray,
+        is_lp: np.ndarray,
+        is_like: np.ndarray,
+        is_skip: np.ndarray,
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Gather item/artist/album/ts + pool-bitmask `weight` for the kept
+        (chronologically-ordered) local indices."""
+        items = item_ids[keep_local]
+        ts = timestamps[keep_local]
+        artists = self.item_to_artist[np.clip(items, 0, self.item_to_artist.shape[0] - 1)]
+        albums = self.item_to_album[np.clip(items, 0, self.item_to_album.shape[0] - 1)]
+        # Pool bitmask per kept event (LP/LIKE/SKIP are mutually exclusive in
+        # the source data, but OR is safe and forward-compatible).
+        weight = np.zeros(keep_local.shape[0], dtype=np.int64)
+        weight[is_lp[keep_local]] |= LP_BIT
+        weight[is_like[keep_local]] |= LIKE_BIT
+        weight[is_skip[keep_local]] |= SKIP_BIT
+        return items, artists, albums, ts, weight
+
+    def _gather_history(
+        self, flat_pos: int, user_start: int
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Dispatch UIH construction to the configured strategy."""
+        if self._history_strategy == "last_n":
+            return self._gather_last_n_history(flat_pos, user_start)
+        return self._gather_interleaved_history(flat_pos, user_start)
+
     def _gather_interleaved_history(
         self, flat_pos: int, user_start: int
     ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
@@ -822,17 +900,10 @@ def _gather_interleaved_history(
         (LP_BIT/LIKE_BIT/SKIP_BIT). Per-pool cap = history_length // 3."""
         L = self._history_length
         per_pool = max(1, L // 3)
-        scan_start = max(int(user_start), int(flat_pos) - self._scan_window)
-        scan_end = int(flat_pos)
-        if scan_end <= scan_start:
-            empty = np.empty(0, dtype=np.int64)
-            return empty, empty, empty, empty, empty
-
-        item_ids = self.store.flat_item_ids[scan_start:scan_end]
-        timestamps = self.store.flat_timestamps[scan_start:scan_end]
-        is_lp = self.store.flat_is_listen_plus[scan_start:scan_end]
-        is_like = self.store.flat_is_like[scan_start:scan_end]
-        is_skip = self.store.flat_is_skip[scan_start:scan_end]
+        scan = self._read_scan_window(flat_pos, user_start)
+        if scan is None:
+            return self._empty_history()
+        item_ids, timestamps, is_lp, is_like, is_skip = scan
 
         # Local indices into the scan window — preserves chronological order
         # within each pool and lets us interleave by re-sorting.
@@ -843,25 +914,39 @@ def _gather_interleaved_history(
 
         keep_local = np.concatenate([lp_idx, like_idx, skip_idx])
         if keep_local.size == 0:
-            empty = np.empty(0, dtype=np.int64)
-            return empty, empty, empty, empty, empty
+            return self._empty_history()
 
         order = np.argsort(keep_local, kind="stable")
         keep_local = keep_local[order]
 
-        items = item_ids[keep_local]
-        ts = timestamps[keep_local]
-        artists = self.item_to_artist[np.clip(items, 0, self.item_to_artist.shape[0] - 1)]
-        albums = self.item_to_album[np.clip(items, 0, self.item_to_album.shape[0] - 1)]
+        return self._materialize_history(
+            keep_local, item_ids, timestamps, is_lp, is_like, is_skip
+        )
 
-        # Pool bitmask per kept event (LP/LIKE/SKIP are mutually exclusive in
-        # the source data, but OR is safe and forward-compatible).
-        weight = np.zeros(keep_local.shape[0], dtype=np.int64)
-        weight[is_lp[keep_local]] |= LP_BIT
-        weight[is_like[keep_local]] |= LIKE_BIT
-        weight[is_skip[keep_local]] |= SKIP_BIT
+    def _gather_last_n_history(
+        self, flat_pos: int, user_start: int
+    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+        """Build the UIH from the last `history_length` events of ANY pool type
+        (listen+/like/skip) with no per-pool split. Vs the interleaved strategy
+        this fills the sequence to ~history_length (higher effective length) and
+        lets the like share fall to its natural corpus rate (~1.9%). Events
+        outside the 3 pools (dislike/unlike/undislike) are excluded as before."""
+        L = self._history_length
+        scan = self._read_scan_window(flat_pos, user_start)
+        if scan is None:
+            return self._empty_history()
+        item_ids, timestamps, is_lp, is_like, is_skip = scan
+
+        member = is_lp | is_like | is_skip
+        # Last L pooled events, in chronological order (already position-sorted
+        # within the scan window, so no re-sort is needed).
+        keep_local = np.arange(item_ids.shape[0], dtype=np.int64)[member][-L:]
+        if keep_local.size == 0:
+            return self._empty_history()
 
-        return items, artists, albums, ts, weight
+        return self._materialize_history(
+            keep_local, item_ids, timestamps, is_lp, is_like, is_skip
+        )
 
     def _build_sample(
         self, flat_pos: int, max_num_candidates: int
@@ -869,7 +954,7 @@ def _build_sample(
         uid = int(self.store.flat_uid[flat_pos])
         user_start = int(self.store.user_start[uid])
 
-        items, artists, albums, ts, weight = self._gather_interleaved_history(
+        items, artists, albums, ts, weight = self._gather_history(
             flat_pos, user_start
         )
 
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index e838b346f..41a2b6234 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -188,6 +188,43 @@ get_dataset.history_length = @hl/env_int()
 hl/env_int.key     = "HISTORY_LENGTH"
 hl/env_int.default = 4086
 
+# UIH construction strategy — how a user's prior events become the sequence the
+# model attends over. Override via $HISTORY_STRATEGY. Both strategies scan the
+# last `scan_window` (20000) events before the anchor and consider ONLY the 3
+# behavior pools (listen+ / like / skip); dislike/unlike/undislike are excluded
+# (no action-weight bit), so the model's action_weights=[1,2,4] are unchanged.
+# The two differ only in HOW they pick events out of that scan window:
+#
+#   "interleaved" (default):
+#     - Budget = equal per-pool quota of HISTORY_LENGTH//3 (=1362) events; take
+#       the last 1362 of EACH pool independently, then merge + re-sort by time.
+#     - HISTORY_LENGTH is thus a PER-POOL cap (nominal total = 3 * L//3).
+#     - Consequence: likes are only ~1.9% of the corpus, so the like pool
+#       under-fills (~105 of its 1362 slots) and that budget is NOT reallocated
+#       — the sequence comes up short. Measured: ~2663 events effective, ~4.0%
+#       of them likes (i.e. likes OVER-represented vs their natural rate).
+#     - Use when you want guaranteed like exposure per sample (class-balanced UIH).
+#
+#   "last_n":
+#     - Budget = a single pool-agnostic cap: take the last HISTORY_LENGTH events
+#       of ANY pool type, already chronological (no per-pool split, no re-sort).
+#     - HISTORY_LENGTH is thus the LITERAL TOTAL UIH cap (not a per-pool L//3).
+#     - Consequence: the sequence fills to ~HISTORY_LENGTH (the only limit is how
+#       many pooled events exist in the scan window), so effective length is
+#       higher and the like share falls to its natural rate. Measured: ~4085
+#       events effective (~1.5x interleaved), ~1.2% of them likes.
+#     - Trade-off: more recent listen+/skip context (longer effective sequence)
+#       at the cost of fewer likes; ~1.4x eval/step compute (more non-skipped
+#       jagged-attention positions). Keep HISTORY_LENGTH=4086 to fill the 4k
+#       model budget.
+#
+# Both strategies are strategy-INDEPENDENT on disk: the gather runs at
+# sample-build time, so switching reuses the SAME hstu_cache_L<HISTORY_LENGTH>/
+# and positions cache — no rebuild needed.
+get_dataset.history_strategy = @hs/env_str()
+hs/env_str.key     = "HISTORY_STRATEGY"
+hs/env_str.default = "interleaved"
+
 # Anchor-eligibility floor: a LISTEN event becomes a trainable/eval anchor once
 # the user has >= MIN_HISTORY prior events. Decoupled from history_length (which
 # is only the gather/truncation cap) — jagged attention handles short UIH, so we
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 10c107bdc..7a805c195 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1447,6 +1447,7 @@ def get_dataset(
     new_path_prefix: str = "",
     history_length: Optional[int] = None,
     min_history: Optional[int] = None,
+    history_strategy: str = "interleaved",
     streaming_window_seconds: int = 86400,
     streaming_sort_within_window: bool = False,
     streaming_shuffle_fraction: float = 0.0,
@@ -1584,6 +1585,13 @@ def get_dataset(
                 # short UIH. None = legacy (require a full history_length).
                 # Override via `get_dataset.min_history = N` / $MIN_HISTORY.
                 "min_history": min_history,
+                # UIH construction: "interleaved" (per-pool L//3 cap) or
+                # "last_n" (last history_length pooled events, no per-pool
+                # split). Strategy-independent on disk — both reuse the same
+                # hstu_cache_L<history_length>/ and positions file (the gather
+                # runs at sample-construction time), so switching needs no
+                # rebuild. Override via $HISTORY_STRATEGY.
+                "history_strategy": history_strategy,
                 "cross_specs": YAMBDA_5B_CROSS_SPECS,
                 # Temporal-streaming knobs (only used under --mode
                 # streaming-train-eval; ignored by the default train-eval path).

From f14485a5cbaca45f2c057c0f7c3a68d4d5b770dd Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 00:49:37 +0000
Subject: [PATCH 077/113] =?UTF-8?q?dlrmv4:=20logging-freeze=20prep=20?=
 =?UTF-8?q?=E2=80=94=20MIN=5FHISTORY=3D4086=20default=20+=20HISTORY=5FSTRA?=
 =?UTF-8?q?TEGY=20passthrough?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- gin: MIN_HISTORY default 0 -> 4086 (power-users floor at the full history
  budget; maps to the existing positions_L4086.npy cache, no rebuild/no
  shared-dir write). AUC_THRESHOLD left unchanged (0.80275) pending finalization.
- launch_slurm.sh: forward $HISTORY_STRATEGY through the worker docker exec -e
  block (was silently dropped, so the knob never reached the worker); fix the
  stale lr-override echo (gin default is 1e-7, not 0.001).
- README: document MIN_HISTORY default as 4086.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/README.MD                                  | 4 ++--
 .../generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin  | 5 ++++-
 recommendation_v4/scripts/launch_slurm.sh                    | 3 ++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index acd59cc31..d09a1567f 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -100,7 +100,7 @@ The `like` pool is roughly **30× rarer** than `lp` — important context for th
 
 ## 4. How data is fed to HSTU
 
-For every training anchor (a LISTEN event with ≥ `min_history` prior events — default `1`, i.e. ~all users; set `$MIN_HISTORY=4086` for the legacy "full `history_length` of context required" filter that dropped ~60% of users), the dataset builds a `(uih_kjt, candidate_kjt)` pair:
+For every training anchor (a LISTEN event with ≥ `min_history` prior events — frozen default `4086`, the "full `history_length` of context required" power-users filter; set `$MIN_HISTORY=0` to include ~all users plus their cold-start first event), the dataset builds a `(uih_kjt, candidate_kjt)` pair:
 
 ```
 UIH (User Interaction History):
@@ -203,7 +203,7 @@ with env overrides:
 | `PERSISTENT_LOADER` | `streaming_train_eval_loop.persistent_loader` | 1 | reuse one worker pool across windows (no per-window respawn) |
 | `DOUBLE_BUFFER` | `streaming_train_eval_loop.double_buffer` | 1 | prepare the next window in a background thread during compute |
 | `EVAL_EACH_WINDOW` | `streaming_train_eval_loop.eval_each_window` | 1 | eval window T+1 after training window T |
-| `MIN_HISTORY` | `get_dataset.min_history` | 1 | anchor-eligibility floor: min prior events for a LISTEN to be a sample (1 = ~all users; 4086 = legacy full-history filter) |
+| `MIN_HISTORY` | `get_dataset.min_history` | 4086 | anchor-eligibility floor: min prior events for a LISTEN to be a sample (frozen default 4086 = full-history power-users filter; 0 = ~all users incl. cold-start) |
 | — | `streaming_train_eval_loop.num_train_batches` / `num_eval_batches` | unset | cap per-window steps (unset = consume full window) |
 
 ### 5.3 Hiding the window-reset overhead
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index b4a675de8..7b7959c2b 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -239,7 +239,10 @@ hs/env_str.default = "interleaved"
 # keyed by (L, MIN_HISTORY) so floors don't collide. Override via $MIN_HISTORY.
 get_dataset.min_history = @mh/env_int()
 mh/env_int.key     = "MIN_HISTORY"
-mh/env_int.default = 0
+# Freeze default: power-users-only floor at the full history budget. With
+# HISTORY_LENGTH=4086 this maps to the existing positions_L4086.npy cache (no
+# rebuild, no shared-dir write — see yambda.py _positions_filename legacy path).
+mh/env_int.default = 4086
 
 # Model-side attention budget. Dataset truncates UIH to fit this value if
 # `history_length + contextual + candidate` would overflow. Override via
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 7db407624..d543aacba 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -174,7 +174,7 @@ orchestrate() {
   echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
   echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
   echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ" | tee -a "$LOG"
-  echo "[$(date)] lr-override: DENSE_LR=${DENSE_LR:-<unset:gin default 0.001>} SPARSE_LR=${SPARSE_LR:-<unset:gin default 0.001>}" | tee -a "$LOG"
+  echo "[$(date)] lr-override: DENSE_LR=${DENSE_LR:-<unset:gin default 1e-7>} SPARSE_LR=${SPARSE_LR:-<unset:gin default 1e-7>}" | tee -a "$LOG"
 
   # Rendezvous resolved on the HOST (the container image has no SLURM client).
   MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
@@ -314,6 +314,7 @@ orchestrate() {
       ${DIAG_EMB_STEPS:+-e DIAG_EMB_STEPS=$DIAG_EMB_STEPS} \
       ${OUTPUT_TRACE:+-e OUTPUT_TRACE=$OUTPUT_TRACE} \
       ${MIN_HISTORY:+-e MIN_HISTORY=$MIN_HISTORY} \
+      ${HISTORY_STRATEGY:+-e HISTORY_STRATEGY=$HISTORY_STRATEGY} \
       ${SEED:+-e SEED=$SEED} \
       ${DENSE_LR:+-e DENSE_LR=$DENSE_LR} \
       ${SPARSE_LR:+-e SPARSE_LR=$SPARSE_LR} \

From 53f517628bd8c0339da677980f5f3b9d543601a2 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 00:54:47 +0000
Subject: [PATCH 078/113] dlrmv4: untrack docs/v4_vs_v2_and_hstu_walkthrough.md

Stop tracking the local walkthrough doc (kept on disk, no longer in the repo).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../docs/v4_vs_v2_and_hstu_walkthrough.md     | 534 ------------------
 1 file changed, 534 deletions(-)
 delete mode 100644 recommendation_v4/docs/v4_vs_v2_and_hstu_walkthrough.md

diff --git a/recommendation_v4/docs/v4_vs_v2_and_hstu_walkthrough.md b/recommendation_v4/docs/v4_vs_v2_and_hstu_walkthrough.md
deleted file mode 100644
index 4ef46c069..000000000
--- a/recommendation_v4/docs/v4_vs_v2_and_hstu_walkthrough.md
+++ /dev/null
@@ -1,534 +0,0 @@
-# recommendation_v4 (HSTU + Yambda-5b) — reference
-
-A walkthrough of what the proposed `recommendation_v4` MLPerf-training benchmark
-is, how it differs from `recommendation_v2`, what the HSTU model is composed of,
-and how to download the dataset and run training as-is.
-
-All claims below are grounded in code/config paths inside this tree. Every
-numeric constant cites a `file:line` source. Where doc and source disagree, the
-source wins and the discrepancy is called out.
-
----
-
-## 0. Sources of truth
-
-The following files were read to assemble this document. If you change any of
-them, audit this doc against the change.
-
-- `training/recommendation_v2/torchrec_dlrm/README.MD` (v2 reference)
-- `training/recommendation_v4/README.MD` (v4 fork overview)
-- `training/recommendation_v4/docs/training_recipe.md` (v4 stacks/configs)
-- `training/recommendation_v4/generative_recommenders/modules/stu.py` (HSTU layer)
-- `training/recommendation_v4/generative_recommenders/modules/dlrm_hstu.py` (top-level `DlrmHSTU` + config dataclass)
-- `training/recommendation_v4/generative_recommenders/modules/hstu_transducer.py` (preprocessor → STU stack → postprocessor)
-- `training/recommendation_v4/generative_recommenders/ops/pytorch/pt_hstu_attention.py` (reference HSTU attention math in plain PyTorch)
-- `training/recommendation_v4/generative_recommenders/ops/hstu_attention.py` (kernel dispatcher: PYTORCH / TRITON / TRITON_CC)
-- `training/recommendation_v4/generative_recommenders/dlrm_v3/configs.py` (per-dataset HSTU config + embedding tables)
-- `training/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin` (run config)
-- `training/recommendation_v4/generative_recommenders/dlrm_v3/preprocess_public_data.py` (Yambda HuggingFace downloader/preprocessor)
-- `training/recommendation_v4/generative_recommenders/dlrm_v3/datasets/yambda.py` (dataset feeding HSTU)
-- `training/recommendation_v4/generative_recommenders/dlrm_v3/utils.py` (metrics logger / `auc_threshold` consumer)
-- `training/recommendation_v4/scripts/launch_smoke_8gpu.sh` (run wrapper)
-
----
-
-## 1. `recommendation_v4` vs `recommendation_v2`
-
-v4 is **not** an evolution of v2 — it replaces a tabular CTR benchmark
-(DLRMv2 + DCN on Criteo 1 TB) with a **sequential generative-recommender
-benchmark** (HSTU on Yandex Yambda-5b). Codebase, dataset, task, loss
-labeling, hyperparameters, and software stack are all different. They share
-basically nothing except the "recommendation" label.
-
-### 1.1 Upstream codebase / repo origin
-
-| | v2 | v4 |
-|---|---|---|
-| Upstream repo | `pytorch/torchrec` examples (DLRM) | fork of `meta-recsys/generative-recommenders` (`README.MD:3`) |
-| Layout | single dir: `torchrec_dlrm/` with `dlrm_main.py` | full repo tree: `generative_recommenders/`, `configs/`, `scripts/`, `main.py`, `setup.py`, gin-driven |
-| Config style | argparse CLI flags | gin-config files under `generative_recommenders/dlrm_v3/train/gin/` (e.g. `yambda_5b.gin`) |
-
-### 1.2 Model architecture
-
-| | v2 | v4 |
-|---|---|---|
-| Model | **DLRM v2** — dense MLP + sparse embeddings + feature interaction (paper: Naumov et al. 1906.00091) | **HSTU** — Hierarchical Sequential Transducer Units (ICML'24 *Actions Speak Louder than Words*) (`README.MD:3`) |
-| Interaction arch | DCN v2: `--interaction_type=dcn --dcn_num_layers=3 --dcn_low_rank_dim=512` (`recommendation_v2/torchrec_dlrm/README.MD:167-169`) | Transformer-style sequential self-attention over a User Interaction History (UIH) of length 2048, jagged-attention TRITON kernel (`README.MD:114, 132`; `training_recipe.md:57, 71`) |
-| Embedding dim | 128 (`recommendation_v2/torchrec_dlrm/README.MD:157`) | 512 (`dlrm_v3/configs.py:33, 353`) |
-| Pipeline | TorchRec model-parallel embeddings + data-parallel MLP, overlapped (`recommendation_v2/torchrec_dlrm/README.MD:3`) | TorchRec sharded embeddings + HSTU ranker; per-GPU HBM cap 260 GiB MI350X / 150 GiB B200 (`training_recipe.md:59, 176`) |
-
-### 1.3 Dataset
-
-| | v2 | v4 |
-|---|---|---|
-| Dataset | **Criteo 1 TB click logs** → multi-hot preprocessed variant (~3.8 TB materialized) (`recommendation_v2/torchrec_dlrm/README.MD:142-146`) | **Yambda-5b** (Yandex music, HuggingFace `yandex/yambda`, 5b variant) (`README.MD:3, 28`) |
-| Domain | CTR prediction on tabular ads features (26 categorical + 13 dense) | Sequential music-recommendation events (listen / like / skip / dislike / unlike / undislike) per-user timelines |
-| Size | `TOTAL_TRAINING_SAMPLES=4,195,197,692` rows (`recommendation_v2/torchrec_dlrm/README.MD:153`) | 4.76 B events, 1.00 M users, 9.39 M items; 3.23 B usable training anchors (`README.MD:62-69`) |
-| Storage layout | numpy contiguous shuffled `.npy` (or preprocessed multi-hot bin) | parquet: `train_sessions.parquet` 47 GB, `test_events.parquet` 152 MB, etc. (`README.MD:40-52`) |
-| Preprocessing | `process_Criteo_1TB_Click_Logs_dataset.sh` — 700 GB RAM, 1–2 days, then `materialize_synthetic_multihot_dataset.py` | `generative_recommenders.dlrm_v3.preprocess_public_data --dataset yambda-5b` — ~53 min end-to-end for 5b (`README.MD:54`) |
-| Embedding cardinalities | `num_embeddings_per_feature` 26-vec, top entries 40 M (`recommendation_v2/torchrec_dlrm/README.MD:161`) | item 9.39 M, artist 1.29 M, album 3.37 M, uid 1.00 M, + 7 cross-features up to 100 M (`dlrm_v3/configs.py:40-48, 686-722`) |
-| Required pre-processor pkg | none unusual | **`polars-u64-idx`** because yambda-5b exceeds polars' 32-bit row index (`training_recipe.md:44, 102-103`) |
-
-### 1.4 Task formulation / supervision
-
-| | v2 | v4 |
-|---|---|---|
-| Task | binary CTR (click / no-click) | sequential next-action ranking: given UIH, predict whether the candidate LISTEN event will be a "listen_plus" (`played_ratio ≥ 50%`) (`README.MD:103`) |
-| Label | Criteo click label | `action_weight` bitmask on the candidate; supervision masked to `(supervision_bitmask & task_weight) > 0` with `task_weight = 1` (LP bit) → only `listen_plus` candidates supervise (`README.MD:103`) |
-| Loss | BCE | BCE on `listen_plus` task |
-
-### 1.5 Target metric
-
-| | v2 | v4 |
-|---|---|---|
-| Target | **AUROC ≥ 0.80275** within 1 epoch on Criteo (`recommendation_v2/torchrec_dlrm/README.MD:173`) | `MetricsLogger.auc_threshold = 0.80275` (`yambda_5b.gin:107`). Same numeric value as v2 — likely inherited from the upstream DLRM-DCNv2 reporting convention rather than independently chosen for HSTU. Consumed in `dlrm_v3/utils.py:587-608` to log `time_to_auc_0.80275_sec` as soon as the `listen_plus` task's AUC crosses the threshold. Confirm with the proposing team whether this is the intended final benchmark target or a placeholder. |
-
-### 1.6 Training hyperparameters
-
-| | v2 (MLPerf example, 8 GPU) | v4 (`yambda_5b.gin`, 8 GPU) |
-|---|---|---|
-| Global batch | 65,536 (`recommendation_v2/torchrec_dlrm/README.MD:154`) | **8,192** (`batch_size=1024 × world_size=8`) (`yambda_5b.gin:1, 44`). Note `docs/training_recipe.md:65, 182` shows `32 × 8 = 256` — that doc has drifted; the gin file is the launch source of truth. |
-| Epochs | 1 (`recommendation_v2/torchrec_dlrm/README.MD:163`) | 1 (`yambda_5b.gin:81`) |
-| Dense optimizer | Adagrad, lr 0.005 (`recommendation_v2/torchrec_dlrm/README.MD:170-171`) | **Adam**, lr 1e-3, betas (0.95, 0.999), eps 1e-8 (`yambda_5b.gin:19-24`) |
-| Sparse optimizer | (Adagrad on embeddings via TorchRec) | **RowWiseAdagrad**, lr 1e-3, betas (0.95, 0.999), eps 1e-8 (`yambda_5b.gin:27-32`) |
-| Precision | fp32 (no bf16 flag in v2 example) | **bf16** mixed precision, gated on the TRITON HSTU kernel (`yambda_5b.gin:8`; `training_recipe.md:58, 109-111`) |
-| Sequence length | n/a (non-sequential model) | `history_length=2039`, `max_seq_len=2048` (`yambda_5b.gin:74, 78`) |
-
-### 1.7 Software stack
-
-| | v2 | v4 (MI350X) | v4 (B200) |
-|---|---|---|---|
-| Container | none specified (bare AWS p4d, CUDA 11.0, NCCL 2.10.3) (`recommendation_v2/torchrec_dlrm/README.MD:37`) | `rocm/primus:v26.3` (`training_recipe.md:24`) | `nvcr.io/nvidia/pytorch:26.04-py3` (`training_recipe.md:132`) |
-| GPU target | A100 40 GB | **MI350X** (`gfx950`, ROCm 7.2.1, 288 GiB HBM3e) | **B200** (`sm_100`, ~183 GiB HBM) |
-| torch | TorchRec example era; CUDA 11.0 | `2.12.0+rocm7.2` (`training_recipe.md:38`) | `2.12.0a0` native NGC (CUDA 13.2) (`training_recipe.md:149`) |
-| triton | not central | `3.6.0` (image native; required for HSTU TRITON backend) (`training_recipe.md:41`) | `3.6.0` (`training_recipe.md:150`) |
-| fbgemm_gpu | TorchRec default | `fbgemm_gpu_nightly_rocm-2026.6.2` built from FBGEMM `10b77573` for `gfx950` (`training_recipe.md:42`) | same SHA, built for `sm_100` (`training_recipe.md:151`) |
-| torchrec | (whatever TorchRec was current) | `1.7.0a0+bf55480` (`v2026.06.01.00`) (`training_recipe.md:43`) | `1.7.0.dev20260601+cu130` (`training_recipe.md:152`) |
-| Launcher | `torchx … dist.ddp` | `scripts/launch_smoke_8gpu.sh` | `scripts/launch_smoke_8gpu.sh` |
-| Key kernel | TorchRec EmbeddingBag + DCN | **HSTU TRITON jagged-attention** (`HSTU_HAMMER_KERNEL=TRITON`) (`training_recipe.md:71`) | same (`training_recipe.md:188`) |
-
----
-
-## 2. HSTU model walkthrough
-
-### 2.1 What HSTU is, in one paragraph
-
-**HSTU = Hierarchical Sequential Transducer Units**, from the Meta paper
-*Actions Speak Louder than Words* (ICML'24). It is a **decoder-only Transformer
-variant**, redesigned for *recommendation* sequences (very long, very ragged,
-heavy on categorical features). The block looks like a standard transformer
-block superficially — attention + MLP — but two things are different from
-GPT/SASRec attention:
-
-1. **Pointwise SiLU instead of softmax** in the attention non-linearity (no
-   log-sum-exp normalization).
-2. **Gated output**: an extra projected stream `U` multiplies the attention
-   output before the residual.
-
-Everything else (residual connections, layer-norm, multi-head, positional
-encoding, causal masking, KV-cache) is conventional transformer. The "S" in
-STU = "Sequential Transducer Unit" = one HSTU block.
-
-### 2.2 The composition: top-level model (DLRM-v3 / `DlrmHSTU`)
-
-The full thing in `dlrm_hstu.py` is a small pipeline. Top-down:
-
-```
-KeyedJaggedTensor of raw ids
-         │
-         ▼
-[1] TorchRec EmbeddingCollection           (≈150 G sparse params, sharded across GPUs)
-         │  emits per-feature jagged embedding lookups
-         ▼
-[2] ContextualPreprocessor                 (interleaves UIH + appends candidate, adds
-                                            positional / action / timestamp encodings)
-         │  output: jagged sequence of length L per user, dim = transducer_embedding_dim
-         ▼
-[3] HSTUTransducer  ── STUStack of N HSTULayers (the "HSTU" attention blocks)
-         │  output: contextualized per-position embedding
-         ▼
-[4] DefaultMultitaskModule                 (linear → BCE on listen_plus bit)
-         │
-         ▼
-Per-anchor logit  →  BCE loss
-```
-
-For yambda-5b the per-dataset overrides in `dlrm_v3/configs.py:78-90, 346-425`
-give:
-
-| component | value | source |
-|---|---|---|
-| embedding tables | `item_id` 9.39 M × 512, `artist_id` 1.29 M × 512, `album_id` 3.37 M × 512, `uid` 1.00 M × 512, + 7 cross-features (e.g. `user_x_artist` 100 M × 512) | `dlrm_v3/configs.py:686-722` |
-| embedding dim | 512 (`HSTU_EMBEDDING_DIM`) | `dlrm_v3/configs.py:33, 353` |
-| HSTU layers | **5** (`hstu_attn_num_layers=5`) | `dlrm_v3/configs.py:82` |
-| attention heads | 4 | `dlrm_v3/configs.py:79` |
-| Q/K dim per head | 128 | `dlrm_v3/configs.py:81` |
-| V/U (linear) dim per head | 128 | `dlrm_v3/configs.py:80` |
-| transducer embedding dim | 512 | `dlrm_v3/configs.py:85, 354` |
-| dropout | input 0.2, linear 0.1 | `dlrm_v3/configs.py:87-88` |
-| max attention budget (model) | 8192 (yambda default; gin further caps to 2048 via `get_hstu_configs.max_seq_len = 2048` in `yambda_5b.gin:78`) | `dlrm_v3/configs.py:355` |
-| task | `listen_plus`, BINARY_CLASSIFICATION, BCE | `dlrm_v3/configs.py:419-424` |
-
-**Sparse-side parameter count, by table** (just the explicit ones; cross-features
-add 282 M more rows × 512 dim ≈ 144 G params, which dominate):
-
-```
-item_id   :  9_390_624 × 512 ≈   4.81 B
-artist_id :  1_293_395 × 512 ≈     662 M
-album_id  :  3_367_692 × 512 ≈    1.72 B
-uid       :  1_000_001 × 512 ≈     512 M
-crosses   :       ~282 M × 512 ≈ 144.4 B   ← dominant
-```
-
-This is overwhelmingly an embedding-bound model — the dense HSTU stack (5
-layers × ~1 M parameters each) is a rounding error next to the embedding
-tables, which is why `make_optimizer_and_shard.hbm_cap_gb = 260` and why
-TorchRec sharding is central.
-
-### 2.3 Inside one STU (HSTU) layer
-
-From `modules/stu.py:182-246, 292-355`. A single STU layer holds **four**
-weight matrices, not the usual two (QKV + out):
-
-```
-_uvqk_weight   : (E, (hidden_dim·2 + attn_dim·2) · num_heads)
-_uvqk_beta     : (...,)                bias for the above
-_input_norm    : LayerNorm(E)
-_output_weight : (hidden_dim · num_heads · 3, E)
-_output_norm   : LayerNorm
-```
-
-Forward pass on input `x` of shape `[L, E]` (jagged):
-
-#### 2.3.1 Fused U/V/Q/K projection
-
-```
-normed = LayerNorm(x)
-[U | V | Q | K] = normed @ _uvqk_weight + _uvqk_beta     # one GEMM, then split
-                                                          # U, V ∈ R^{H·hidden_dim}
-                                                          # Q, K ∈ R^{H·attn_dim}
-```
-
-Compared to a regular transformer, you get an **extra projected stream `U`**.
-`U` will gate the attention output later.
-
-#### 2.3.2 HSTU attention (the core difference vs softmax attention)
-
-Reference math, exactly as written in
-`ops/pytorch/pt_hstu_attention.py:151, 167, 179, 182`:
-
-```python
-qk_attn = einsum("bhxa,bhya->bhxy", Q, K) * alpha   # alpha = 1 / sqrt(attn_dim)
-qk_attn = F.silu(qk_attn) / max_seq_len             # ← pointwise SiLU, scalar divide
-qk_attn = qk_attn * valid_attn_mask                 # mask (see 2.3.3)
-attn    = einsum("bhxd,bhdv->bhxv", qk_attn, V)
-```
-
-Contrast with a vanilla transformer:
-
-```python
-qk   = (Q @ K.T) / sqrt(d)
-qk   = softmax(qk + mask, dim=-1)                   # ← softmax normalises rows
-attn = qk @ V
-```
-
-Two consequences of dropping softmax:
-
-- **No row-wise normalization** → the per-key contribution is decoupled across
-  positions. The paper argues this is *better* for recommendation, because a
-  5-year-old "like" event shouldn't have its weight diluted just because the
-  user has a longer history (which softmax would do).
-- **Numerically more delicate**: the recipe warns *"`pt_hstu_attention`'s QK
-  einsum backward overflows in bf16 at N > 1k and produces NaN at step 1; bf16
-  is only safe with TRITON"* (`docs/training_recipe.md:109-111`). The TRITON
-  kernel handles bf16 accumulation carefully; the reference PyTorch path
-  doesn't.
-
-#### 2.3.3 Custom attention mask (`_get_valid_attn_mask`, `pt_hstu_attention.py:32-84`)
-
-HSTU supports four mask-combination knobs simultaneously:
-
-- **causal**: lower triangle only (standard).
-- **target-aware** (`num_targets`): the last `num_targets` positions are the
-  candidate targets; their "row index" is clamped so all targets see the same
-  prefix (the user's UIH) but cannot peek at each other.
-- **max_attn_len** (sliding window): each position attends only to the previous
-  `max_attn_len` events — bounds the receptive field for very long histories.
-- **contextual_seq_len**: the first `contextual_seq_len` positions are
-  *contextual* tokens (uid + cross-features). They are allowed to attend to
-  everything (and everything attends back to them), regardless of causal order.
-  This is how `uid` / `user_x_artist` etc. get full visibility despite living
-  at the head of the sequence.
-
-#### 2.3.4 Output: gated MLP
-
-From `stu.py:336-354` → `hstu_compute_output`:
-
-```
-y = SwishLayerNorm(attn)             # SiLU(x · sigmoid(x)) then LN
-y = concat([y, U]) @ _output_weight  # gating with the U stream
-y = y · x + dropout                  # residual back to original x
-```
-
-The `U · y` gating is the second non-standard piece. It is reminiscent of
-GLU / SwiGLU but applied to the *attention output*, not just an MLP.
-
-#### 2.3.5 Stack
-
-`STUStack` (`stu.py:426`) is just `nn.ModuleList` of N `STULayer`s applied
-sequentially with the same jagged-tensor convention. No cross-layer fanciness.
-
-### 2.4 "Transformer-style sequential attention over a UIH" — what the inputs actually look like
-
-UIH = **User Interaction History**. For yambda, the input to one training
-sample is one **anchor LISTEN event** plus that user's history. From
-`README.MD:88-101` and `dlrm_v3/configs.py:399-418`:
-
-```
-sequence position:    0 .. 7   |   8 .. (L-2)                |  L-1
-                      ─────────┼─────────────────────────────┼──────────
-content:              contextual│  UIH (interleaved 3 pools)  │  candidate
-                                │                              │
-features per position:           uid, 7 cross-features (length-1 each)
-                                  item_id, artist_id, album_id,
-                                  action_weight (LP/LIKE/SKIP bitmask),
-                                  action_timestamp, dummy_watch_time
-                                                              candidate's:
-                                                                item_candidate_id,
-                                                                item_candidate_artist_id,
-                                                                item_candidate_album_id,
-                                                                item_query_time,
-                                                                item_action_weight,
-                                                                item_dummy_watchtime
-```
-
-The HSTU stack runs causal attention over this `L = 2048` sequence. The label
-is the candidate's `listen_plus` bit (1 if `played_ratio ≥ 50%`, else 0), and
-BCE is taken on the logit emitted at position `L-1`. So "transformer-style
-sequential attention over UIH" literally means: the user's last ~2 k actions
-are tokens, the candidate song is the last token, and a 5-layer HSTU
-transformer predicts whether that candidate will be a `listen_plus`.
-
-This is the conceptual jump from DLRMv2:
-
-| | DLRMv2 (Criteo, v2) | HSTU (Yambda, v4) |
-|---|---|---|
-| Input shape | flat: 26 categorical + 13 dense features per ad impression | sequence of ~2 k past events per user, each a structured tuple |
-| Mixing op | DCN: cross-products of feature vectors, then MLP | self-attention across positions (SiLU-gated, multi-head, causal) |
-| Temporal modelling | none (each ad impression is i.i.d.) | central — masks, timestamps, action types are first-class |
-| Depth | 1-shot (interaction arch + over-arch MLP) | 5 stacked HSTU blocks |
-| "Why is the candidate good?" | low-rank cross of user/ad embeddings | attention over user's relevant past songs/artists/albums |
-
-DLRMv2 is *wide-and-shallow* over tabular features. HSTU is *narrow-and-deep*
-over a temporal sequence. Different paradigm.
-
-### 2.5 Jagged attention — what it is and why it's used
-
-A user's history length varies — yambda median is 2,695 events, max is 27,738
-(`README.MD:65`). For a single training step you have a batch of B users with
-very different sequence lengths. Two ways to lay this out on the GPU:
-
-**Padded layout (standard transformer):**
-
-```
-input shape: [B, N_max, D]           e.g. [1024, 2048, 512]
-```
-
-This wastes compute proportional to `(N_max − N_user) / N_max` per row. On
-yambda the average fill is ~1402/2037 ≈ 69%, so ~30% of every kernel is
-multiplying zeros.
-
-**Jagged layout (what HSTU uses):**
-
-```
-flat values  : [L_total, D]          L_total = Σ user_lengths   (≤ B · N_max)
-offsets      : [B + 1]               cumulative starts, so user i occupies
-                                     values[offsets[i] : offsets[i+1]]
-```
-
-`pt_hstu_attention.py:148, 183` shows the round-trip:
-
-- `torch.ops.fbgemm.jagged_to_padded_dense(...)` only when calling into a dense
-  einsum
-- `torch.ops.fbgemm.dense_to_jagged(...)` on the way out
-
-That's the reference path. The **TRITON jagged-attention kernel**
-(`ops/triton/triton_hstu_attention.py`, dispatched in `ops/hstu_attention.py:27,
-71`) skips the padded intermediate entirely: each Triton program handles one
-user's `[N_user, N_user]` attention block directly, so:
-
-- **No wasted FLOPs.** Empty positions never enter a GEMM.
-- **No wasted memory.** No padded `[B, H, N_max, N_max]` attention scores buffer
-  — that buffer alone would be `1024 · 4 · 2048 · 2048 · 2 bytes ≈ 34 GB` per
-  step (at global batch 1024 × bf16).
-- **Variable-length backward is correct without masking tricks.** The kernel
-  iterates `[offsets[i], offsets[i+1])` per program; the gradient never touches
-  non-existent positions.
-
-This is *the* enabling optimization for the under-filled `like` pool to be
-cheap. The README notes (`README.MD:132`): *"With the TRITON jagged-attention
-backend the GPU only does work for the actual events, so the under-fill costs
-sequence budget but not GPU compute"*. With a padded kernel, the unused 31% of
-every sequence would cost real FLOPs.
-
-Practically: jagged attention is a generic technique (it shows up in
-FlashAttention's varlen variants too); HSTU's TRITON kernel is its
-specialization with SiLU + gated output + the four-way mask.
-
----
-
-## 3. Yambda-5b — size, contents, download, run
-
-### 3.1 What's in it
-
-[`yandex/yambda`](https://huggingface.co/datasets/yandex/yambda) on HuggingFace.
-From `dlrm_v3/preprocess_public_data.py:233-245` + `README.MD:56-81`:
-
-| field | value |
-|---|---|
-| Provider | Yandex Music recommendation logs |
-| Sizes | yambda-50m, yambda-500m, **yambda-5b** (v4 uses 5b) |
-| Events | 4.76 B interactions across 300 days |
-| Users | 1.00 M unique |
-| Items | 9.39 M songs (+ 1.29 M artists, 3.37 M albums) |
-| Event types | `listen` / `like` / `dislike` / `unlike` / `undislike` (encoded as uint8 0–4) |
-| Listen events also carry | `played_ratio` (used to derive the `listen_plus` label at 50% threshold) |
-| Train / test split | Global Temporal Split: 300 days train, 30-min gap, 1 day test |
-
-### 3.2 On-disk footprint after preprocessing (`README.MD:39-52`)
-
-```
-<DLRM_DATA_PATH>/
-├── raw/5b/multi_event.parquet                    50 GB   (downloaded)
-├── shared_metadata/
-│   ├── artist_item_mapping.parquet               60 MB
-│   ├── album_item_mapping.parquet                76 MB
-│   └── embeddings.parquet                        18 GB   (unused by HSTU training)
-└── processed_5b/
-    ├── train_sessions.parquet                    47 GB   ← main training input
-    ├── test_events.parquet                       152 MB
-    ├── session_index.parquet                     600 MB
-    ├── item_popularity.npy                        75 MB
-    └── split_meta.json                            anchor + boundary stats
-```
-
-Plan for **~115 GB free disk** to do everything end-to-end (raw + shared +
-processed). If you skip the unused `embeddings.parquet` (which the script
-downloads anyway), you still need ~97 GB.
-
-### 3.3 Download + preprocess
-
-Both happen in one command. Download is via the `datasets` library
-(HuggingFace), so you need internet and `pip install datasets`. From
-`dlrm_v3/preprocess_public_data.py:276-317`:
-
-```bash
-pip install datasets polars-u64-idx pyarrow xxhash gin-config absl-py pandas
-
-export DLRM_DATA_PATH=/your/big/disk/dlrm_data
-mkdir -p "$DLRM_DATA_PATH"
-
-cd /home/suachong/training/recommendation_v4
-python3 -m generative_recommenders.dlrm_v3.preprocess_public_data \
-  --dataset yambda-5b \
-  --data-path "$DLRM_DATA_PATH"
-```
-
-Per `README.MD:54`: **~53 minutes end-to-end** for the 5b variant on a
-reasonable box. For a quick smoke test substitute `--dataset yambda-50m`
-(~2 min, ~1 GB on disk).
-
-Critical: **install `polars-u64-idx`, not stock `polars`.** yambda-5b has
->4.29 B rows and overflows polars' default 32-bit row index silently
-(`training_recipe.md:102-103`, `scripts/launch_smoke_8gpu.sh:13-20`).
-
-### 3.4 Run training (8-GPU smoke)
-
-From `scripts/launch_smoke_8gpu.sh` and `README.MD:9-22`.
-
-**Inside the validated container** (recommended; everything's pre-staged):
-
-```bash
-docker exec yambda_8gpu bash -c \
-  'cd /workspace/recommendation_v4 && bash scripts/launch_smoke_8gpu.sh'
-```
-
-Override data path / run name without editing the gin:
-
-```bash
-DLRM_DATA_PATH=/your/big/disk/dlrm_data \
-RUN_NAME=my_experiment \
-bash scripts/launch_smoke_8gpu.sh
-```
-
-**From scratch on a bare host**, you need to assemble the stack per
-`docs/training_recipe.md`. The hard requirements are:
-
-- **ROCm path**: `rocm/primus:v26.3`, torch `2.12.0+rocm7.2`, triton `3.6.0`,
-  fbgemm_gpu built from commit `10b77573` for `gfx950`, torchrec
-  `1.7.0a0+bf55480`. See `training_recipe.md:30-45`.
-- **CUDA path**: `nvcr.io/nvidia/pytorch:26.04-py3`, native torch (do NOT
-  reinstall), fbgemm_gpu built from same commit for `sm_100`, torchrec
-  `1.7.0.dev20260601+cu130`. See `training_recipe.md:147-155`.
-
-In both cases the actual launch is just:
-
-```bash
-python -m generative_recommenders.dlrm_v3.train.train_ranker \
-    --dataset yambda-5b --mode train-eval
-```
-
-(plus `HSTU_HAMMER_KERNEL=TRITON` for CUDA; `=PYTORCH` is forced on ROCm in
-the smoke script because the Triton kernel hits PassManager errors on some
-shapes there — see `scripts/launch_smoke_8gpu.sh:31-33`. The PYTORCH fallback
-gives ~190 ms/step baseline, not the ~52 ms primus-pinned number.)
-
-### 3.5 What you'll see
-
-Per `training_recipe.md:84-91`, on 8× MI350X in the optimal config:
-
-- ~52 ms/step at global batch 256 (per the doc; gin says 8,192 — see §1.6 note)
-- ~4,970 samples/sec
-- ~7.6 days for one epoch over 3.23 B training anchors
-
-If `auc_threshold = 0.80275` is the real benchmark target (still TBD),
-`time_to_auc_0.80275_sec` will be logged as soon as the eval AUC on
-`listen_plus` crosses it (`dlrm_v3/utils.py:587-608`).
-
----
-
-## 4. TL;DR
-
-- **HSTU ≈ decoder-only Transformer** with two tweaks: SiLU/N replaces softmax
-  in attention, and a `U`-gated output replaces the standard MLP block.
-- **DLRMv3 (yambda) = TorchRec embeddings → contextual preprocessor → 5
-  stacked HSTU layers → BCE head on `listen_plus`.** Sparse tables (≈150 G
-  params) dominate the model; the dense HSTU stack is tiny by comparison.
-- **UIH = user interaction history.** Each sample is one anchor LISTEN event
-  plus that user's last ~2 k events (LISTEN_PLUS / LIKE / SKIP, interleaved
-  chronologically, gathered with a `L//3`-per-pool cap), and HSTU does causal
-  self-attention across them.
-- **Jagged attention** packs variable-length per-user sequences as
-  `(flat_values, offsets)` instead of padding to `N_max`, so the Triton kernel
-  never spends FLOPs on empty positions — essential because the average
-  sequence is only 69% full on yambda.
-- **Yambda-5b** is a 4.76 B-event / 1 M-user Yandex Music dataset on
-  HuggingFace (`yandex/yambda`); downloading + preprocessing takes ~53 min and
-  ~115 GB disk; run via
-  `python -m generative_recommenders.dlrm_v3.train.train_ranker --dataset yambda-5b --mode train-eval`
-  (or `scripts/launch_smoke_8gpu.sh`).
-
----
-
-## 5. Open questions to bring back to the proposing team
-
-1. **Target metric.** `yambda_5b.gin:107` reuses DLRMv2's `0.80275` AUC
-   threshold. Is this the intended final v4 benchmark target, or a placeholder
-   inherited from upstream? An HSTU model on a different dataset would
-   normally need its own threshold chosen from a reference run.
-2. **Batch size canonicalization.** `yambda_5b.gin:1` = `1024` per rank
-   (global 8,192); `docs/training_recipe.md:65, 182` says `32` per rank
-   (global 256). Which is the submission config?
-3. **Convergence reference runs.** No `reference_results.md`-style table exists
-   yet under `training/recommendation_v4`. Submission-quality v4 will need
-   reference epochs-to-target numbers per dataset variant.

From 0f7a6ebb89ab1e4bd9595d3f8ede0bb69d17276e Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 00:58:59 +0000
Subject: [PATCH 079/113] dlrmv4: scrub hardcoded username from reference
 comments

Replace the example username in launch_slurm.sh / streaming_resume_test.sh
comments with a generic <user> placeholder. Runtime defaults already derive
the container name + mounts from $USER, so only doc/example strings changed.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/tests/streaming_resume_test.sh              | 2 +-
 recommendation_v4/scripts/launch_slurm.sh                     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
index a497959a0..c7bb5bab9 100755
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -17,7 +17,7 @@
 #
 # Usage:
 #   bash generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh --jobid <slurm-jobid>
-#       [--container yambda_suachong]
+#       [--container yambda_<user>]
 #       [--num-train-batches 200]
 #       [--die-at-step 350]
 #       [--keep]    # retain LOG_DIR + CKPT after run for inspection
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index d543aacba..f5e07ad0a 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -62,9 +62,9 @@
 #
 #  B) Filesystems (must be shared/NFS across ALL nodes — this script re-invokes
 #     itself and reads the overlay + data from these paths cluster-wide)
-#     - REPO_MOUNT (repo + this script, e.g. /home/suachong) is bind-mounted rw;
+#     - REPO_MOUNT (repo + this script, e.g. /home/<user>) is bind-mounted rw;
 #       DATA_MOUNT (e.g. /apps/chcai) holds the read-only dataset + overlay +
-#       baked tar + pip tarball; SCRATCH (e.g. /home/suachong/yambda_runs) is the
+#       baked tar + pip tarball; SCRATCH (e.g. /home/<user>/yambda_runs) is the
 #       writable log/output root. Override any via env — nothing is user-hardwired.
 #
 #  C) Container image / GPU software stack (tied to the GPU arch + ROCm version)

From 31a38facc5e8a5a586863586dbed5b5eea3a8732 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 01:06:18 +0000
Subject: [PATCH 080/113] =?UTF-8?q?dlrmv4:=20README=20=E2=80=94=20add=20fu?=
 =?UTF-8?q?ll=20single/multi-node=20reference=20run=20example?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Document a complete sbatch launch (run-shape + data-fraction eval cadence) for
1-node and 2-node, noting the launchers differ only in --nodes.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/README.MD | 45 +++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index d09a1567f..ed8a9ace4 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -39,6 +39,51 @@ bash scripts/launch_slurm.sh
 
 Data path resolves at runtime via `env_path` gin macros (see [yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)). Traces and any per-run outputs land in `results/<RUN_NAME>/`.
 
+### 1.1 Full reference run (single + multi-node)
+
+For a complete benchmark run you typically wrap the `sbatch` call in a small
+submit script that pins the run-shape + eval cadence (most other
+hyperparameters already match the [yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)
+defaults, so only the knobs below need to be set). The single- and multi-node
+launchers are identical except for `--nodes` (the trainer auto-derives
+`NNODES`/`NODE_RANK`/`MASTER_ADDR`/`WORLD_SIZE` from SLURM):
+
+**Single node (1×8 GPU):**
+
+```bash
+cd <repo-root>
+MODE=streaming-train-eval \
+  START_TS=0 NUM_TRAIN_TS=299 \
+  TRAIN_SPLIT_PERCENTAGE=1.0 \
+  EVAL_HOLDOUT_TS=299 EVAL_HOLDOUT_NUM_WINDOWS=1 \
+  EVAL_EVERY_N_WINDOWS=0 EVAL_EVERY_DATA_PCT=0.005 \
+  AUC_THRESHOLD=0.885 \
+  BATCH_SIZE=1024 \
+  METRIC_LOG_FREQ=20 \
+  RUN_NAME=yc-full-epoch \
+  LOG="$HOME/yambda_runs/yc-full-epoch/run.log" \
+  sbatch --nodes=1 --time=4-00:00:00 --job-name=yambda-1node scripts/launch_slurm.sh
+```
+
+**Multi-node (2×8 GPU):** same env, only the `sbatch` line changes:
+
+```bash
+  ... same env knobs as above ...
+  RUN_NAME=yc-full-epoch-2node \
+  LOG="$HOME/yambda_runs/yc-full-epoch-2node/run.log" \
+  sbatch --nodes=2 --ntasks-per-node=1 --time=4-00:00:00 --job-name=yambda-2node scripts/launch_slurm.sh
+```
+
+Notes:
+- `EVAL_EVERY_DATA_PCT=0.005` + `EVAL_EVERY_N_WINDOWS=0` selects the
+  data-fraction eval cadence (eval every 0.5% of the training stream — a fixed
+  number of samples between evals, independent of node count). The two eval
+  cadences are mutually exclusive.
+- `AUC_THRESHOLD` is the convergence target: a SUCCESS `RUN_STOP` fires when the
+  per-pass `window_auc` first reaches it.
+- Keep all run outputs (`LOG`, checkpoints, mllog, TensorBoard) under a writable
+  scratch path you own; the dataset mount is read-only.
+
 ## 2. Data preparation
 
 ```bash

From 3cd9f6d623694e803354ad0611282bd94cd83fee Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Wed, 24 Jun 2026 01:12:10 +0000
Subject: [PATCH 081/113] dlrmv4: exclude eval/checkpoint overhead from step_ms
 timing

MetricsLogger now brackets eval and checkpoint phases with pause/resume
perf timers so the reported step_ms reflects pure train-step latency.
Adds wall_step_ms (inclusive), eval_ms, and ckpt_ms to the perf log line
and TensorBoard scalars (appended for parser backward-compat). Checkpoint
saves and eval windows are wrapped with categorized pause/resume calls.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/checkpoint.py                     | 18 +++++-
 .../dlrm_v3/train/utils.py                    | 10 ++++
 .../generative_recommenders/dlrm_v3/utils.py  | 59 ++++++++++++++++++-
 3 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
index 0ef223b23..bd113959a 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
@@ -25,6 +25,7 @@
 import os
 import random
 import shutil
+import time
 from datetime import datetime
 from typing import Any, Dict, Optional, Set, Tuple
 
@@ -345,6 +346,15 @@ def save_dmp_checkpoint(
     """
     if path == "":
         return
+    # Exclude checkpoint wall-time from the train step-time window so step_ms
+    # reports canonical compute latency; the duration is surfaced separately
+    # (window_ckpt_time_ms + the per-save log below). pause/resume are no-ops if
+    # metric_logger is None. Not wrapped in try/finally: a save that raises
+    # crashes the process (supervisor restarts fresh), so a dangling pause on
+    # the soon-dead logger is irrelevant.
+    _t_ckpt_start = time.perf_counter()
+    if metric_logger is not None:
+        metric_logger.pause_perf("ckpt")
     base_path = path
     # Atomic-save layout: write to .tmp, rename to final, prune older.
     tmp_subdir = f"{base_path}/{batch_idx}.tmp"
@@ -460,8 +470,14 @@ def save_dmp_checkpoint(
         else:
             os.replace(tmp_subdir, final_subdir)
         _prune_old_checkpoints(base_path, keep_last_n, final_subdir)
-        logger.info("checkpoint successfully saved → %s", final_subdir)
+        logger.info(
+            "checkpoint successfully saved → %s (wall-time %.2fs)",
+            final_subdir,
+            time.perf_counter() - _t_ckpt_start,
+        )
     torch.distributed.barrier()
+    if metric_logger is not None:
+        metric_logger.resume_perf("ckpt")
 
 
 @gin.configurable
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 785320020..d460c9d07 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1102,6 +1102,8 @@ def eval_loop(
     # lr_scheduler: to-do: Add a scheduler
 ) -> None:
     model.eval()
+    # Exclude eval wall-time from the train step-time window (see _run_eval_window).
+    metric_logger.pause_perf("eval")
     batch_idx: int = 0
     profiler = Profiler(rank) if output_trace else None
     metric_logger.reset(mode="eval")
@@ -1139,6 +1141,7 @@ def eval_loop(
     metric_logger.compute_and_log(mode="eval")
     for k, v in metric_logger.compute(mode="eval").items():
         print(f"{k}: {v}")
+    metric_logger.resume_perf("eval")
 
 
 class _PipelineModelWrapper(torch.nn.Module):
@@ -2012,6 +2015,12 @@ def _run_eval_window(eval_data_iterator, label: Optional[str] = None) -> None:
         # only fires after a completed eval window or mid-train-window, so any
         # restored state always sits on a completed-eval boundary -- which is
         # also why the eval reset below is safe across resume.
+        #
+        # Exclude this eval pass's wall-time from the train step-time window so
+        # step_ms stays canonical even when eval coincides with a train interval;
+        # the duration is reported separately (window_eval_time_ms + total_eval
+        # below). Resumed unconditionally at the end of this function.
+        metric_logger.pause_perf("eval")
         model.eval()
         # Reset eval metrics so each pass reports a clean number over the FIXED
         # holdout set. Without this, lifetime/window eval metrics would keep
@@ -2105,6 +2114,7 @@ def _run_eval_window(eval_data_iterator, label: Optional[str] = None) -> None:
                     _f.write(json.dumps(_rec) + "\n")
             except OSError as _e:
                 logger.warning("failed to write metrics sink %s: %s", _metrics_path, _e)
+        metric_logger.resume_perf("eval")
 
     def _maybe_checkpoint(train_ts: int) -> None:
         if (
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 7a805c195..506790d0b 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1053,6 +1053,32 @@ def _make_reg(ws: int) -> List[RecMetricComputation]:
         self._perf_samples_counter: torch.Tensor = torch.zeros(
             1, dtype=torch.long, device=device
         )
+        # Non-train wall-time to exclude from the train step-time window so
+        # `step_ms` reports the canonical per-step compute latency even when an
+        # interval coincides with eval or checkpointing. Categorized so the
+        # excluded time is also reportable (eval_ms / ckpt_ms) rather than just
+        # discarded. The trainer brackets eval/ckpt regions via
+        # pause_perf(cat)/resume_perf(cat); accumulators reset each train-perf log.
+        self._perf_excluded: Dict[str, float] = {"eval": 0.0, "ckpt": 0.0}
+        self._perf_pause: Dict[str, Optional[float]] = {}
+
+    def pause_perf(self, category: str) -> None:
+        """Start excluding wall-time under `category` (e.g. "eval"/"ckpt") from
+        the train step-time window. Idempotent: a second pause without an
+        intervening resume is a no-op (keeps the earliest start)."""
+        if self._perf_pause.get(category) is None:
+            self._perf_pause[category] = time.perf_counter()
+
+    def resume_perf(self, category: str) -> None:
+        """Stop excluding `category` and fold the elapsed interval into the
+        per-category accumulator. No-op if not currently paused."""
+        t0 = self._perf_pause.get(category)
+        if t0 is not None:
+            self._perf_excluded[category] = (
+                self._perf_excluded.get(category, 0.0)
+                + (time.perf_counter() - t0)
+            )
+            self._perf_pause[category] = None
 
     @property
     def all_metrics(self) -> Dict[str, List[RecMetricComputation]]:
@@ -1210,12 +1236,22 @@ def compute_and_log(
         # Throughput metrics (train only). One GPU->CPU sync per call.
         if mode == "train" and self._perf_steps_in_window > 0:
             now = time.perf_counter()
-            dt = max(now - self._perf_t_window, 1e-6)
+            wall_dt = max(now - self._perf_t_window, 1e-6)
+            # Subtract bracketed eval/checkpoint wall-time so step_ms / sps /
+            # MFU reflect canonical train-step compute, not eval+ckpt stalls
+            # that happened to land in this window. The excluded time is also
+            # surfaced separately below (eval_ms / ckpt_ms) rather than discarded.
+            eval_s = self._perf_excluded.get("eval", 0.0)
+            ckpt_s = self._perf_excluded.get("ckpt", 0.0)
+            dt = max(wall_dt - eval_s - ckpt_s, 1e-6)
             n_samples = int(self._perf_samples_counter.item())
             self._perf_total_samples += n_samples
             local_sps = n_samples / dt
             global_sps = local_sps * self._world_size
             step_ms = dt * 1000.0 / self._perf_steps_in_window
+            wall_step_ms = wall_dt * 1000.0 / self._perf_steps_in_window
+            eval_ms = eval_s * 1000.0
+            ckpt_ms = ckpt_s * 1000.0
             elapsed = now - self._perf_t_start
             step = self.global_step["train"]
             self.tb_logger.add_scalar(
@@ -1227,6 +1263,17 @@ def compute_and_log(
             self.tb_logger.add_scalar(
                 "perf/train_step_time_ms", step_ms, global_step=step
             )
+            # Inclusive (old-semantics) per-step wall time and the eval/ckpt
+            # breakdown that was excluded from step_ms above.
+            self.tb_logger.add_scalar(
+                "perf/train_wall_step_time_ms", wall_step_ms, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/window_eval_time_ms", eval_ms, global_step=step
+            )
+            self.tb_logger.add_scalar(
+                "perf/window_ckpt_time_ms", ckpt_ms, global_step=step
+            )
             self.tb_logger.add_scalar(
                 "perf/train_total_samples", self._perf_total_samples, global_step=step
             )
@@ -1269,12 +1316,20 @@ def compute_and_log(
             logger.info(
                 f"train - Step {step} perf: local_sps={local_sps:.1f} "
                 f"global_sps={global_sps:.1f} step_ms={step_ms:.2f} "
-                f"elapsed_sec={elapsed:.1f} total_samples={self._perf_total_samples}"
+                f"elapsed_sec={elapsed:.1f} total_samples={self._perf_total_samples} "
+                f"wall_step_ms={wall_step_ms:.2f} eval_ms={eval_ms:.1f} "
+                f"ckpt_ms={ckpt_ms:.1f}"
                 + tflops_str
             )
             self._perf_t_window = now
             self._perf_steps_in_window = 0
             self._perf_samples_counter.zero_()
+            # Reset the excluded-time accumulators for the next window. Any
+            # still-open pause (eval/ckpt straddling this log) is cleared so its
+            # remaining time is not double-counted; in practice perf logs fire
+            # only after a train step, never mid eval/ckpt.
+            self._perf_excluded = {"eval": 0.0, "ckpt": 0.0}
+            self._perf_pause = {}
 
         # Time-to-target: latch wall-clock once any task's AUC crosses threshold.
         # Matches MLPerf DLRM-DCNv2 reporting style (default upstream target 0.80275).

From dc412ffd6d852d0ac91943f237801e5607e96095 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 01:13:14 +0000
Subject: [PATCH 082/113] =?UTF-8?q?dlrmv4:=20README=20=E2=80=94=20use=20AU?=
 =?UTF-8?q?C=5FTHRESHOLD=3D0.80275=20in=20example=20for=20gin-default=20co?=
 =?UTF-8?q?nsistency?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/README.MD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index ed8a9ace4..c8df7b272 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -57,7 +57,7 @@ MODE=streaming-train-eval \
   TRAIN_SPLIT_PERCENTAGE=1.0 \
   EVAL_HOLDOUT_TS=299 EVAL_HOLDOUT_NUM_WINDOWS=1 \
   EVAL_EVERY_N_WINDOWS=0 EVAL_EVERY_DATA_PCT=0.005 \
-  AUC_THRESHOLD=0.885 \
+  AUC_THRESHOLD=0.80275 \
   BATCH_SIZE=1024 \
   METRIC_LOG_FREQ=20 \
   RUN_NAME=yc-full-epoch \

From fad177f2fa768de976851b36fef0f5e9385ae13e Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 01:27:44 +0000
Subject: [PATCH 083/113] dlrmv4: make a bare sbatch reproduce the frozen
 reference run

orchestrate() now defaults to the reference run-shape (START_TS=0,
NUM_TRAIN_TS=299, full windows) + the data-fraction eval cadence
(EVAL_EVERY_DATA_PCT=0.005, per-window off), so `sbatch scripts/launch_slurm.sh`
needs no env knobs. SMOKE=1 restores the previous fast functional defaults
(short window, capped batches, per-window eval). The two eval cadences are
auto-deconflicted (explicit EVAL_EVERY_N_WINDOWS>0 disables data-pct). gin
library defaults + the resume/local smoke paths are unchanged. README updated to
the bare single/multi-node commands.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/README.MD               | 80 ++++++++++-------------
 recommendation_v4/scripts/launch_slurm.sh | 46 ++++++++++---
 2 files changed, 69 insertions(+), 57 deletions(-)

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index c8df7b272..6476e7729 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -11,17 +11,27 @@ auto-detects its context: run inside the container it takes the single-node
 worker path; submitted via `sbatch` it orchestrates the multi-node run
 (provision + per-node launch). N=1 is byte-for-byte the legacy single-node path.
 
-**Single node (8-GPU), inside the container:**
+A bare submit reproduces the **frozen reference run** (full 299-window sweep +
+data-fraction eval cadence) — all run-shape/cadence defaults are baked in, so no
+env knobs are required:
+
+**Single node (8-GPU):**
 
 ```bash
-docker exec yambda_8gpu bash -c \
-  'cd /workspace/recommendation_v4 && bash scripts/launch_slurm.sh'
+sbatch --nodes=1 scripts/launch_slurm.sh
 ```
 
-**Multi-node (N×8-GPU) via SLURM:**
+**Multi-node (N×8-GPU):**
 
 ```bash
-sbatch --nodes=2 --partition=meta64 scripts/launch_slurm.sh
+sbatch --nodes=2 scripts/launch_slurm.sh
+```
+
+For a fast functional check instead of a full run, prepend `SMOKE=1` (short
+window, capped batches, per-window eval):
+
+```bash
+SMOKE=1 sbatch --nodes=1 scripts/launch_slurm.sh
 ```
 
 Multi-node uses real RDMA (RoCEv2). The fabric/NCCL setup and every
@@ -39,50 +49,26 @@ bash scripts/launch_slurm.sh
 
 Data path resolves at runtime via `env_path` gin macros (see [yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)). Traces and any per-run outputs land in `results/<RUN_NAME>/`.
 
-### 1.1 Full reference run (single + multi-node)
-
-For a complete benchmark run you typically wrap the `sbatch` call in a small
-submit script that pins the run-shape + eval cadence (most other
-hyperparameters already match the [yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)
-defaults, so only the knobs below need to be set). The single- and multi-node
-launchers are identical except for `--nodes` (the trainer auto-derives
-`NNODES`/`NODE_RANK`/`MASTER_ADDR`/`WORLD_SIZE` from SLURM):
-
-**Single node (1×8 GPU):**
+### 1.1 The frozen reference shape
 
-```bash
-cd <repo-root>
-MODE=streaming-train-eval \
-  START_TS=0 NUM_TRAIN_TS=299 \
-  TRAIN_SPLIT_PERCENTAGE=1.0 \
-  EVAL_HOLDOUT_TS=299 EVAL_HOLDOUT_NUM_WINDOWS=1 \
-  EVAL_EVERY_N_WINDOWS=0 EVAL_EVERY_DATA_PCT=0.005 \
-  AUC_THRESHOLD=0.80275 \
-  BATCH_SIZE=1024 \
-  METRIC_LOG_FREQ=20 \
-  RUN_NAME=yc-full-epoch \
-  LOG="$HOME/yambda_runs/yc-full-epoch/run.log" \
-  sbatch --nodes=1 --time=4-00:00:00 --job-name=yambda-1node scripts/launch_slurm.sh
-```
+The reference run-shape and eval cadence are the built-in defaults (set in the
+orchestrate phase of `scripts/launch_slurm.sh`), so the bare `sbatch` commands
+above ARE the reference run. The single- and multi-node launchers are identical
+except for `--nodes`; the trainer auto-derives `NNODES`/`NODE_RANK`/`MASTER_ADDR`/
+`WORLD_SIZE` from SLURM. The baked-in shape:
 
-**Multi-node (2×8 GPU):** same env, only the `sbatch` line changes:
-
-```bash
-  ... same env knobs as above ...
-  RUN_NAME=yc-full-epoch-2node \
-  LOG="$HOME/yambda_runs/yc-full-epoch-2node/run.log" \
-  sbatch --nodes=2 --ntasks-per-node=1 --time=4-00:00:00 --job-name=yambda-2node scripts/launch_slurm.sh
-```
-
-Notes:
-- `EVAL_EVERY_DATA_PCT=0.005` + `EVAL_EVERY_N_WINDOWS=0` selects the
-  data-fraction eval cadence (eval every 0.5% of the training stream — a fixed
-  number of samples between evals, independent of node count). The two eval
-  cadences are mutually exclusive.
-- `AUC_THRESHOLD` is the convergence target: a SUCCESS `RUN_STOP` fires when the
-  per-pass `window_auc` first reaches it.
-- Keep all run outputs (`LOG`, checkpoints, mllog, TensorBoard) under a writable
-  scratch path you own; the dataset mount is read-only.
+| knob | reference default |
+|---|---|
+| `START_TS` / `NUM_TRAIN_TS` | 0 / 299 (full sweep) |
+| eval cadence | `EVAL_EVERY_DATA_PCT=0.005` (every 0.5% of the training stream — a fixed number of samples between evals, independent of node count), per-window cadence off |
+| `NUM_TRAIN_BATCHES` / `NUM_EVAL_BATCHES` | 0 / 0 (consume full windows) |
+
+To customize, override any knob via env (e.g. `RUN_NAME=...`, `LOG=...`,
+`AUC_THRESHOLD=...`). Selecting the per-window eval cadence
+(`EVAL_EVERY_N_WINDOWS>0`) automatically disables the data-fraction one (they are
+mutually exclusive). Keep all run outputs (`LOG`, checkpoints, mllog,
+TensorBoard) under a writable scratch path you own — the dataset mount is
+read-only.
 
 ## 2. Data preparation
 
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index f5e07ad0a..e81510434 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -34,9 +34,14 @@
 #                supervisor's direct `bash scripts/launch_slurm.sh` is unchanged.
 #
 # USAGE
-#   Multi-node (N>=1):  sbatch --nodes=2 scripts/launch_slurm.sh
+#   Reference run (1 node): sbatch --nodes=1 scripts/launch_slurm.sh
+#   Reference run (N node):  sbatch --nodes=N scripts/launch_slurm.sh
+#     ^ a bare submit reproduces the FROZEN REFERENCE shape (full 299-window
+#       sweep + data-fraction eval cadence). Prepend SMOKE=1 for a fast
+#       functional check (short window, capped batches).
 #   Single-node direct: bash scripts/launch_slurm.sh   (already inside container;
-#                       what run_streaming_e2e.sh invokes per relaunch)
+#                       what run_streaming_e2e.sh invokes per relaunch — uses the
+#                       gin defaults, NOT the orchestrate reference shape)
 #   Perf pair:
 #     LOG=/apps/chcai/perf_1node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
 #       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
@@ -146,15 +151,36 @@ orchestrate() {
   mkdir -p "$SCRATCH" 2>/dev/null || true
   LOG=${LOG:-$SCRATCH/yambda_slurm.${SLURM_JOB_ID:-manual}.log}
 
-  # Smoke defaults — override via env for a perf run (see header USAGE).
+  # Run-shape defaults. By DEFAULT a bare `sbatch scripts/launch_slurm.sh`
+  # reproduces the FROZEN REFERENCE run: full 299-window sweep (START_TS=0) with
+  # the data-fraction eval cadence (eval every 0.5% of the training stream). Set
+  # SMOKE=1 for a fast functional check (short dense window, capped batches,
+  # per-window eval). Any individual knob below stays env-overridable.
   MODE=${MODE:-streaming-train-eval}
-  START_TS=${START_TS:-150}
-  NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
-  NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
-  NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
+  if [ "${SMOKE:-0}" = "1" ]; then
+    START_TS=${START_TS:-150}
+    NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
+    NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
+    NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
+    EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
+    METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
+  fi
+  START_TS=${START_TS:-0}
+  NUM_TRAIN_TS=${NUM_TRAIN_TS:-299}
+  NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-0}
+  NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-0}
   EVAL_EACH_WINDOW=${EVAL_EACH_WINDOW:-1}
-  EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
-  METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
+  METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-20}
+  # Eval cadence — the two knobs are mutually exclusive (the worker raises if both
+  # are >0). Data-fraction is the reference default; if the caller explicitly
+  # selected the per-window cadence (EVAL_EVERY_N_WINDOWS>0) leave data-pct off,
+  # otherwise default to the reference 0.5%-of-data cadence (per-window disabled).
+  if [ "${EVAL_EVERY_N_WINDOWS:-0}" -gt 0 ] 2>/dev/null; then
+    EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0}
+  else
+    EVAL_EVERY_N_WINDOWS=0
+    EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0.005}
+  fi
   FORCE_PROVISION=${FORCE_PROVISION:-0}
 
   # Truncate the metrics log on a FRESH run; APPEND on a supervised relaunch
@@ -173,7 +199,7 @@ orchestrate() {
   chmod 666 "$LOG" 2>/dev/null || true
   echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
   echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
-  echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ" | tee -a "$LOG"
+  echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ SMOKE=${SMOKE:-0} EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT" | tee -a "$LOG"
   echo "[$(date)] lr-override: DENSE_LR=${DENSE_LR:-<unset:gin default 1e-7>} SPARSE_LR=${SPARSE_LR:-<unset:gin default 1e-7>}" | tee -a "$LOG"
 
   # Rendezvous resolved on the HOST (the container image has no SLURM client).

From b50fe5876fce2d90b3678ab95cd1a12f929f9f6d Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 16:32:20 +0000
Subject: [PATCH 084/113] =?UTF-8?q?dlrmv4:=20address=20PR=20review=20?=
 =?UTF-8?q?=E2=80=94=20mlperf=5Flogging=20install/pin=20+=20logging-utils?=
 =?UTF-8?q?=20robustness?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Dockerfiles (AMD+NVIDIA): install mlperf_logging (--no-deps) so compliance
  logging is not silently disabled at runtime.
- Pin mlcommons/logging to 6.0.0-rc6 in requirements.txt + both Dockerfiles
  for reproducibility.
- mlperf_logging_utils: guard empty os.path.dirname(log_path); attach file
  handler on rank 0 only; get_mlperf_logger() returns None when the dep is
  unavailable so callers' `is not None` guards disable logging cleanly.
- launch_slurm.sh: chmod 622 (not 666) on the job log — tee -a needs write
  only, avoids world-readable logs on shared NFS.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/Dockerfile                  |  6 +++++
 recommendation_v4/Dockerfile.nvidia           |  6 +++++
 .../dlrm_v3/train/mlperf_logging_utils.py     | 22 +++++++++++++++----
 recommendation_v4/requirements.txt            |  2 +-
 recommendation_v4/scripts/launch_slurm.sh     | 10 +++++----
 5 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/recommendation_v4/Dockerfile b/recommendation_v4/Dockerfile
index 112d605df..450a5ab55 100644
--- a/recommendation_v4/Dockerfile
+++ b/recommendation_v4/Dockerfile
@@ -66,6 +66,12 @@ RUN pip install \
         torchmetrics==1.0.3 \
         tensordict
 
+# mlperf_logging — required by train/mlperf_logging_utils.py for MLPerf
+# compliance logs. Pinned to the Training 6.0 tag for reproducibility; --no-deps
+# so pip does not resolve requirements.txt's torch/fbgemm_gpu/torchrec pins and
+# clobber the +rocm7.2 wheels above.
+RUN pip install --no-deps "git+https://github.com/mlcommons/logging.git@6.0.0-rc6"
+
 # Smoke-test the 6 imports the launch script checks at
 # scripts/launch_smoke_8gpu.sh:26.
 RUN python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; \
diff --git a/recommendation_v4/Dockerfile.nvidia b/recommendation_v4/Dockerfile.nvidia
index 388ab8e5e..a1a9a3319 100644
--- a/recommendation_v4/Dockerfile.nvidia
+++ b/recommendation_v4/Dockerfile.nvidia
@@ -68,6 +68,12 @@ RUN pip install \
         torchmetrics==1.0.3 \
         tensordict
 
+# mlperf_logging — required by train/mlperf_logging_utils.py for MLPerf
+# compliance logs. Pinned to the Training 6.0 tag for reproducibility; --no-deps
+# so pip does not resolve requirements.txt's torch/fbgemm_gpu/torchrec pins and
+# clobber the image's native NGC torch.
+RUN pip install --no-deps "git+https://github.com/mlcommons/logging.git@6.0.0-rc6"
+
 # Smoke-test that packages are installed at the right versions. Cannot dlopen
 # fbgemm_gpu / torchrec here because their SONAME deps (libnvidia-ml.so.1, etc.)
 # only resolve when the container runs with `--gpus all` — which docker build
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
index 51e7971b5..74eac4ede 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
@@ -93,8 +93,16 @@ def __init__(
         self._logger = None
         if not self.enabled:
             return
-        if log_path:
-            os.makedirs(os.path.dirname(log_path), exist_ok=True)
+        # Only rank 0 emits events, so only rank 0 needs the file handler:
+        # attaching it on every rank wastes file handles and risks contention on
+        # a shared log path. Non-zero ranks configure mllog without a filename
+        # (their event methods no-op anyway).
+        if log_path and self.rank == 0:
+            log_dir = os.path.dirname(log_path)
+            # dirname is "" when log_path has no directory component (e.g.
+            # MLPERF_LOG_PATH=mlperf.log); os.makedirs("") raises, so guard it.
+            if log_dir:
+                os.makedirs(log_dir, exist_ok=True)
             mllog.config(filename=log_path, default_stack_offset=default_stack_offset)
         else:
             mllog.config(default_stack_offset=default_stack_offset)
@@ -158,14 +166,20 @@ def get_mlperf_logger(
     log_path: str = "",
     benchmark_name: str = "hstu",
     submitter_name: str = "reference_implementation",
-) -> MLPerfLogger:
-    """Build a configured :class:`MLPerfLogger`.
+) -> Optional[MLPerfLogger]:
+    """Build a configured :class:`MLPerfLogger`, or ``None`` if unavailable.
 
     ``benchmark_name`` / ``submitter_name`` are gin-configurable (and the path is
     env-overridable via ``$MLPERF_LOG_PATH``) so a submission can stamp its own
     benchmark string without code changes. The log path defaults to
     ``$MLPERF_LOG_PATH`` when set, else ``""`` (mllog logs to stdout).
+
+    Returns ``None`` when ``mlperf_logging`` is not installed so callers' existing
+    ``mlperf_logger is not None`` guards cleanly disable logging -- otherwise they
+    would pass the guard and then hit ``logger.constants`` (which is ``None``).
     """
+    if not _MLLOG_AVAILABLE:
+        return None
     resolved_path = os.environ.get("MLPERF_LOG_PATH", log_path)
     return MLPerfLogger(
         rank=rank,
diff --git a/recommendation_v4/requirements.txt b/recommendation_v4/requirements.txt
index a8637bf5e..d1aba1e95 100644
--- a/recommendation_v4/requirements.txt
+++ b/recommendation_v4/requirements.txt
@@ -5,4 +5,4 @@ gin_config>=0.5.0
 pandas>=2.2.0
 tensorboard>=2.19.0
 pybind11
-git+https://github.com/mlcommons/logging.git
+git+https://github.com/mlcommons/logging.git@6.0.0-rc6
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index e81510434..829c5ce44 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -192,11 +192,13 @@ orchestrate() {
   else
     : > "$LOG"
   fi
-  # World-writable so the in-container worker (running as root, squashed to
-  # `nobody` over root-squashed NFS) can append via `tee -a $LOG`. Without this
-  # the worker's tee opens the file read-only-denied and exits non-zero, which
+  # Group/other write (but NOT read) so the in-container worker (running as root,
+  # squashed to `nobody` over root-squashed NFS) can append via `tee -a $LOG`.
+  # `tee -a` opens write-only, so 622 is sufficient -- avoid 666, which would let
+  # other users on the shared filesystem read (and tamper with) the job log.
+  # Without the write bit the worker's tee is denied and exits non-zero, which
   # pipefail turns into a spurious rc=1 even when training succeeds.
-  chmod 666 "$LOG" 2>/dev/null || true
+  chmod 622 "$LOG" 2>/dev/null || true
   echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
   echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
   echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ SMOKE=${SMOKE:-0} EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT" | tee -a "$LOG"

From 9d1dbf8a70945f2afcb8edb741529afa8e3c4ad7 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 17:19:00 +0000
Subject: [PATCH 085/113] dlrmv4: decorrelate per-rank runtime RNG for HSTU
 dropout

Add decorrelate_runtime_rng(rank): after make_model + DMP init, re-seed
torch/cuda with $SEED + rank so HSTU dropout draws different masks per
data-parallel rank instead of the identical masks implied by the shared
init seed. Runs strictly after init so replicated dense weights and
sharded embeddings stay init-identical across ranks. Toggle via
$DECORRELATE_DROPOUT (default 1; 0 = legacy identical-mask behavior).
Offset is a pure fn of resolved $SEED + rank and per-rank RNG state is
checkpointed, so reproducibility/resume are preserved.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 13 ++++++
 .../dlrm_v3/train/train_ranker.py             |  6 +++
 .../dlrm_v3/train/utils.py                    | 42 +++++++++++++++++++
 3 files changed, 61 insertions(+)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 7b7959c2b..864b45351 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -88,6 +88,19 @@ seed_everything.seed = @seed/env_int()
 seed/env_int.key     = "SEED"
 seed/env_int.default = -1
 
+# $DECORRELATE_DROPOUT — per-rank decorrelation of RUNTIME stochastic ops.
+# After init (make_model + DMP), re-seed torch/cuda with $SEED + rank so HSTU
+# dropout (input_dropout=0.2, linear_dropout_rate=0.1) draws DIFFERENT masks on
+# every data-parallel rank instead of the identical masks implied by the shared
+# init seed — the standard data-parallel RNG track. Init (dense weights +
+# sharded embeddings) is untouched: this runs strictly after it. 1 = on
+# (default), 0 = legacy identical-mask-on-every-rank behavior. Reproducible: the
+# offset is a pure fn of resolved $SEED + rank, and per-rank RNG state is
+# checkpointed across resume.
+decorrelate_runtime_rng.enabled = @drr/env_int()
+drr/env_int.key     = "DECORRELATE_DROPOUT"
+drr/env_int.default = 1
+
 # dense model optimizer
 # Learning rate is env-overridable via $DENSE_LR (default 1e-7).
 dense_optimizer_factory_and_class.learning_rate = @dlr/env_float()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 7ee39c2ed..25711bd02 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -108,6 +108,7 @@ def _main_func(
     from generative_recommenders.dlrm_v3.checkpoint import load_dmp_checkpoint
     from generative_recommenders.dlrm_v3.train.utils import (
         cleanup,
+        decorrelate_runtime_rng,
         eval_loop,
         make_model,
         make_optimizer_and_shard,
@@ -149,6 +150,11 @@ def _main_func(
         world_size=world_size,
         local_world_size=gpus_per_node,
     )
+    # Decorrelate forward-time stochasticity (HSTU dropout) per data-parallel
+    # rank. MUST run after make_model() + make_optimizer_and_shard() so the
+    # replicated dense weights and sharded embeddings stay init-identical across
+    # ranks; this only offsets the global RNG by rank so dropout masks differ.
+    decorrelate_runtime_rng(rank=rank)
     train_dataloader, test_dataloader = make_train_test_dataloaders(
         hstu_config=model_configs,
         embedding_table_configs=embedding_table_configs,
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index d0c4a8538..1e537b32d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -127,6 +127,48 @@ def seed_everything(seed: int = -1, rank: int = 0) -> None:
     torch.cuda.manual_seed_all(seed)
 
 
+@gin.configurable
+def decorrelate_runtime_rng(rank: int = 0, enabled: bool = True) -> None:
+    """Offset the global RNG by ``rank`` so RUNTIME stochastic ops draw
+    decorrelated draws per data-parallel rank.
+
+    The only such op here is HSTU dropout (input_dropout=0.2,
+    linear_dropout_rate=0.1; see configs.get_hstu_configs). seed_everything()
+    sets an IDENTICAL seed on every rank — required so replicated dense weights
+    init identically — which also makes every rank draw the SAME dropout masks
+    in the forward. Gradients still differ (each rank sees different data), so
+    that is not incorrect, but identical masks waste the extra mask diversity
+    that decorrelated replicas give per global batch. This re-seeds torch/cuda
+    with $SEED + rank to recover it (the standard data-parallel RNG track, cf.
+    Megatron's tensor/data-parallel RNG separation).
+
+    ORDERING IS LOAD-BEARING: call this AFTER everything that must be identical
+    across ranks — make_model() (dense weight init) AND make_optimizer_and_shard()
+    (the pre-DMP re-seed + sharded embedding init). It deliberately perturbs only
+    forward-time stochasticity, never init.
+
+    Reproducibility is preserved: the offset is a pure function of the resolved
+    $SEED (exported by seed_everything) and rank, and per-rank RNG state is
+    snapshotted/restored on checkpoint resume (see checkpoint.py). Set
+    enabled=False (gin: decorrelate_runtime_rng.enabled) to restore the legacy
+    identical-mask-on-every-rank behavior.
+    """
+    if not enabled:
+        logger.info(
+            f"[rank {rank}] decorrelate_runtime_rng disabled; dropout masks "
+            f"identical across ranks"
+        )
+        return
+    base = int(os.environ.get("SEED", "1"))
+    offset_seed = base + int(rank)
+    torch.manual_seed(offset_seed)
+    torch.cuda.manual_seed_all(offset_seed)
+    logger.info(
+        f"[rank {rank}] decorrelated runtime RNG: SEED={base} + rank={rank} "
+        f"=> {offset_seed} (per-rank dropout masks)"
+    )
+
+
 def setup(
     rank: int,
     world_size: int,

From 784516aeb8947840bc0e6fea993f27e195bb381e Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 18:41:14 +0000
Subject: [PATCH 086/113] dlrmv4: drop non-portable build/run helpers from the
 baseline
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the B200 Dockerfile.nvidia and the local scripts/run_docker.sh
helper — neither is wired into launch_slurm.sh (which uses rocm/primus
directly) and both are environment-specific. Gitignore them plus the
local ad-hoc analysis artifacts so they are not re-added.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/.gitignore            | 17 +++++
 recommendation_v4/Dockerfile.nvidia     | 98 -------------------------
 recommendation_v4/scripts/run_docker.sh | 77 -------------------
 3 files changed, 17 insertions(+), 175 deletions(-)
 delete mode 100644 recommendation_v4/Dockerfile.nvidia
 delete mode 100755 recommendation_v4/scripts/run_docker.sh

diff --git a/recommendation_v4/.gitignore b/recommendation_v4/.gitignore
index 5a0329448..4bfbaa0ca 100644
--- a/recommendation_v4/.gitignore
+++ b/recommendation_v4/.gitignore
@@ -165,3 +165,20 @@ cython_debug/
 yambda_slurm.*.out
 yambda_slurm.*.log
 compliance_checker.log
+
+# Local container build/run helpers — environment-specific, not committed.
+/Dockerfile.nvidia
+/scripts/run_docker.sh
+
+# Local dlrmv4 analysis artifacts (per-job plots/scripts/dumps); kept on disk
+# for ad-hoc analysis but never committed.
+/analyze_*.py
+/dump_eval_*.py
+/perday_*.py
+/parse_lr_sweep.py
+/gen_canvas.py
+/plot_*.py
+/*.png
+/*.csv
+/scripts/bench_collectives.py
+/docs/v4_vs_v2_and_hstu_walkthrough.md
diff --git a/recommendation_v4/Dockerfile.nvidia b/recommendation_v4/Dockerfile.nvidia
deleted file mode 100644
index a1a9a3319..000000000
--- a/recommendation_v4/Dockerfile.nvidia
+++ /dev/null
@@ -1,98 +0,0 @@
-# B200 path — implements docs/training_recipe.md §"B200".
-
-FROM nvcr.io/nvidia/pytorch:26.04-py3
-
-ENV PYTHONUNBUFFERED=1 \
-    PIP_NO_CACHE_DIR=1 \
-    PIP_DISABLE_PIP_VERSION_CHECK=1
-
-WORKDIR /workspace/recommendation_v4
-
-# torch / triton — training_recipe.md:137-138, 149-150. Native to the image
-# and must NOT be reinstalled (CUPTI / sm_100 support depends on it).
-
-# torchrec — training_recipe.md:152. Nightly cu130 wheel, --no-deps.
-RUN pip install --force-reinstall --no-deps \
-        --index-url https://download.pytorch.org/whl/nightly/cu130 \
-        torchrec==1.7.0.dev20260601+cu130
-
-# fbgemm_gpu — training_recipe.md:151. Build from FBGEMM commit 10b77573 for
-# sm_100 against the image's native torch. ~55 min (sm_100 TBE-forward via ptxas).
-# NOTE: --nvml_lib_path diverges from training_recipe.md:151. The recipe points
-# at /usr/lib/x86_64-linux-gnu/libnvidia-ml.so, which is mounted only at
-# `docker run --gpus all` time. During `docker build` no GPU runtime is
-# attached, so we link against the NVML stub that ships inside the CUDA SDK in
-# the NGC image; the real driver-side libnvidia-ml.so is used at runtime.
-RUN apt-get update && apt-get install -y --no-install-recommends git build-essential && \
-    rm -rf /var/lib/apt/lists/* && \
-    git clone --recursive https://github.com/pytorch/FBGEMM.git /tmp/FBGEMM && \
-    cd /tmp/FBGEMM && \
-    git checkout 10b775730212923f65f7b78f79b6a01d80cf3c29 && \
-    git submodule update --init --recursive && \
-    cd fbgemm_gpu && \
-    # Filter `fairscale` and the torch family from fbgemm's requirements.txt:
-    # fairscale pulls a CPU torch that would clobber the image's native torch.
-    # fairscale is a distributed-training lib used by fbgemm tests, not by the
-    # build itself.
-    grep -v -E '^(fairscale|torch|torchvision|torchaudio)([<>=!]|$)' requirements.txt > /tmp/req.txt && \
-    pip install -r /tmp/req.txt && \
-    TORCH_CUDA_ARCH_LIST=10.0 python setup.py bdist_wheel \
-        --build-target default \
-        --build-variant cuda \
-        --package_channel nightly \
-        --nvml_lib_path /usr/local/cuda/lib64/stubs/libnvidia-ml.so && \
-    pip install --force-reinstall --no-deps dist/fbgemm_gpu_nightly-*.whl && \
-    cd / && rm -rf /tmp/FBGEMM
-
-# polars-u64-idx — training_recipe.md:153 (mandatory; yambda-5b > 4.29 B rows).
-# Remaining packages — training_recipe.md:156-159 ("Additional Python deps") plus
-# `datasets` + `huggingface_hub`, which the recipe does not list but
-# preprocess_public_data.py:278 imports to download yambda from HuggingFace.
-RUN pip install \
-        polars-u64-idx==1.33.1 \
-        gin-config \
-        absl-py \
-        datasets \
-        huggingface_hub \
-        pyre-extensions \
-        iopath \
-        typing-inspect \
-        psutil \
-        tqdm \
-        pyyaml \
-        lightning-utilities && \
-    # torchmetrics and tensordict declare `torch` as a dep; without --no-deps
-    # pip resolves and reinstalls torch, clobbering the image's native NGC
-    # torch (which would break CUPTI + sm_100 support per training_recipe.md:199).
-    pip install --no-deps \
-        torchmetrics==1.0.3 \
-        tensordict
-
-# mlperf_logging — required by train/mlperf_logging_utils.py for MLPerf
-# compliance logs. Pinned to the Training 6.0 tag for reproducibility; --no-deps
-# so pip does not resolve requirements.txt's torch/fbgemm_gpu/torchrec pins and
-# clobber the image's native NGC torch.
-RUN pip install --no-deps "git+https://github.com/mlcommons/logging.git@6.0.0-rc6"
-
-# Smoke-test that packages are installed at the right versions. Cannot dlopen
-# fbgemm_gpu / torchrec here because their SONAME deps (libnvidia-ml.so.1, etc.)
-# only resolve when the container runs with `--gpus all` — which docker build
-# can't do. The real 6-import check at scripts/launch_smoke_8gpu.sh:26 runs at
-# `docker run` time when the driver is mounted in.
-RUN python -c "import torch, polars, xxhash, gin; \
-print('torch', torch.__version__, '| cuda', getattr(torch.version, 'cuda', None)); \
-import importlib.metadata as m; \
-print('fbgemm_gpu installed:', m.version('fbgemm_gpu_nightly')); \
-print('torchrec installed:  ', m.version('torchrec'))"
-
-COPY . /workspace/recommendation_v4
-
-# B200 runtime env — training_recipe.md:184-195.
-ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
-    HSTU_HAMMER_KERNEL=TRITON \
-    TORCH_CUDA_ARCH_LIST=10.0 \
-    HBM_CAP_GB=150 \
-    TRITON_CACHE_DIR=/workspace/recommendation_v4/.triton_cache \
-    DLRM_DATA_PATH=/data/mlperf_dlrm_v4
-
-CMD ["bash"]
diff --git a/recommendation_v4/scripts/run_docker.sh b/recommendation_v4/scripts/run_docker.sh
deleted file mode 100755
index 864746550..000000000
--- a/recommendation_v4/scripts/run_docker.sh
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/bin/bash
-# Launch a yambda_8gpu container from rocm/mlperf:dlrm_v3_mi355 with the repo
-# and data directories bind-mounted at matching host/container paths.
-#
-# Usage:
-#   bash scripts/run_docker.sh                                      # interactive shell
-#   bash scripts/run_docker.sh -- bash scripts/launch_slurm.sh      # one-shot single-node train
-#
-# Inside the container /.dockerenv exists, so launch_slurm.sh auto-selects its
-# SLURM-free `worker` phase (NNODES=1) — identical to the old launch_smoke_8gpu.sh.
-#
-# Overrides (export before invoking):
-#   IMAGE          docker image                 (default: rocm/mlperf:dlrm_v3_mi355)
-#   CONTAINER_NAME container name               (default: mlperf-recommendation-v4)
-#   REPO_HOST      host path to repo            (default: this script's parent)
-#   DATA_HOST      host path to dataset root    (default: /data/mlperf_dlrm_v4)
-#   LOG            in-container train log path  (default: /workspace/recommendation_v4/mlperf_dlrm_v4.log)
-#   MODE           launch_slurm.sh mode         (default: launcher default = streaming-train-eval; set train-eval for classic)
-#   MAX_SEQ_LEN / HISTORY_LENGTH                seq shape; set 2048 / 2039 for the previous 2k shape
-#   NCCL_SOCKET_IFNAME  NCCL bootstrap NIC      (default: launch_slurm picks lo single-node / fenic0 multi-node; override per host)
-
-set -euo pipefail
-
-IMAGE=${IMAGE:-rocm/mlperf:dlrm_v3_mi355}
-CONTAINER_NAME=${CONTAINER_NAME:-mlperf-recommendation-v4}
-REPO_HOST=${REPO_HOST:-$(cd "$(dirname "$0")/.." && pwd)}
-DATA_HOST=${DATA_HOST:-/data/mlperf_dlrm_v4}
-
-# Mount host paths at the same string inside the container so DLRM_DATA_PATH
-# can be set from either side and resolve identically (env_path() in
-# dlrm_v3/utils.py:641-653 does a literal os.environ.get).
-REPO_CONT=/workspace/recommendation_v4
-DATA_CONT=${DATA_HOST}
-
-if [ ! -d "${DATA_HOST}" ]; then
-  echo "warning: ${DATA_HOST} does not exist on host. Run preprocess_public_data first or override DATA_HOST." >&2
-fi
-
-# Drop an optional `--` separating this script's invocation from the in-container
-# command (the documented `run_docker.sh -- bash scripts/launch_slurm.sh` form).
-# Without this, `--` is forwarded verbatim to `docker run` as the command and
-# fails with: exec: "--": executable file not found.
-if [ "${1:-}" = "--" ]; then shift; fi
-
-# If a container with this name is already running, exec into it instead of
-# starting a new one. Matches the `docker exec yambda_8gpu ...` pattern in
-# README.MD:9-12.
-if docker ps --format '{{.Names}}' | grep -qx "${CONTAINER_NAME}"; then
-  echo "container ${CONTAINER_NAME} already running; exec'ing in" >&2
-  exec docker exec -it "${CONTAINER_NAME}" "${@:-bash}"
-fi
-
-# Remove a stopped container with the same name so --name doesn't collide.
-if docker ps -a --format '{{.Names}}' | grep -qx "${CONTAINER_NAME}"; then
-  docker rm "${CONTAINER_NAME}" >/dev/null
-fi
-
-exec docker run --rm -it \
-  --name "${CONTAINER_NAME}" \
-  --device=/dev/kfd --device=/dev/dri \
-  --group-add video --group-add render \
-  --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
-  --ipc=host --network=host \
-  --shm-size=64g --ulimit memlock=-1 --ulimit stack=67108864 \
-  -v "${REPO_HOST}:${REPO_CONT}" \
-  -v "${DATA_HOST}:${DATA_CONT}" \
-  -e DLRM_DATA_PATH="${DATA_CONT}" \
-  -e HSTU_HAMMER_KERNEL="${HSTU_HAMMER_KERNEL:-TRITON}" \
-  -e RUN_NAME="${RUN_NAME:-default}" \
-  -e LOG="${LOG:-/workspace/recommendation_v4/mlperf_dlrm_v4.log}" \
-  ${MODE:+-e MODE="${MODE}"} \
-  ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN="${MAX_SEQ_LEN}"} \
-  ${HISTORY_LENGTH:+-e HISTORY_LENGTH="${HISTORY_LENGTH}"} \
-  ${NCCL_SOCKET_IFNAME:+-e NCCL_SOCKET_IFNAME="${NCCL_SOCKET_IFNAME}"} \
-  -w "${REPO_CONT}" \
-  "${IMAGE}" \
-  "${@:-bash}"

From 3f36f61abde50a8e01de87b4ebd109f3f7b54d57 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 19:19:08 +0000
Subject: [PATCH 087/113] dlrmv4: trim PR comments, default DECORRELATE_DROPOUT
 off, extend eval metric

- Collapse verbose comment blocks across gin/launch_slurm/resume-test to 1-2
  lines; revert container name to yambda_primus and drop #SBATCH --time.
- Default DECORRELATE_DROPOUT=0 (identical dropout masks across ranks).
- Generalize EVAL_ACCURACY_AUC_MODE to any {window,lifetime}_{auc,gauc,
  accuracy,ne} metric, with NE handled as lower-is-better for early-stop.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 38 ++------
 .../train/tests/streaming_resume_test.sh      |  7 +-
 .../dlrm_v3/train/utils.py                    | 85 ++++++++---------
 recommendation_v4/scripts/launch_slurm.sh     | 91 +++----------------
 4 files changed, 59 insertions(+), 162 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 864b45351..dd4dd4715 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -80,26 +80,16 @@ apply_env_bootstrap.TRITON_FULL_AUTOTUNE = False
 # PARSE NOTE: seed_everything() runs right before make_model() in train_ranker
 # (after the full gin parse), so this binding resolves in the second parse where
 # env_int is registered. Override per-run via $SEED.
-#
-# DEFAULT -1 = draw a FRESH RANDOM seed every run (rank 0 draws + broadcasts so
-# all ranks agree; the chosen value is exported to $SEED and logged), so each
-# launch explores a different init. Pin $SEED >= 0 to reproduce a specific run.
+# Default -1 draws a fresh random seed each run; pin $SEED >= 0 to reproduce.
 seed_everything.seed = @seed/env_int()
 seed/env_int.key     = "SEED"
 seed/env_int.default = -1
 
-# $DECORRELATE_DROPOUT — per-rank decorrelation of RUNTIME stochastic ops.
-# After init (make_model + DMP), re-seed torch/cuda with $SEED + rank so HSTU
-# dropout (input_dropout=0.2, linear_dropout_rate=0.1) draws DIFFERENT masks on
-# every data-parallel rank instead of the identical masks implied by the shared
-# init seed — the standard data-parallel RNG track. Init (dense weights +
-# sharded embeddings) is untouched: this runs strictly after it. 1 = on
-# (default), 0 = legacy identical-mask-on-every-rank behavior. Reproducible: the
-# offset is a pure fn of resolved $SEED + rank, and per-rank RNG state is
-# checkpointed across resume.
+# $DECORRELATE_DROPOUT — re-seed torch/cuda with $SEED + rank after init so HSTU
+# dropout masks differ per data-parallel rank. 1 = on, 0 = identical masks (default).
 decorrelate_runtime_rng.enabled = @drr/env_int()
 drr/env_int.key     = "DECORRELATE_DROPOUT"
-drr/env_int.default = 1
+drr/env_int.default = 0
 
 # dense model optimizer
 # Learning rate is env-overridable via $DENSE_LR (default 1e-7).
@@ -252,9 +242,6 @@ hs/env_str.default = "interleaved"
 # keyed by (L, MIN_HISTORY) so floors don't collide. Override via $MIN_HISTORY.
 get_dataset.min_history = @mh/env_int()
 mh/env_int.key     = "MIN_HISTORY"
-# Freeze default: power-users-only floor at the full history budget. With
-# HISTORY_LENGTH=4086 this maps to the existing positions_L4086.npy cache (no
-# rebuild, no shared-dir write — see yambda.py _positions_filename legacy path).
 mh/env_int.default = 4086
 
 # Model-side attention budget. Dataset truncates UIH to fit this value if
@@ -461,23 +448,16 @@ MetricsLogger.tensorboard_log_path = @tbp/env_path()
 tbp/env_path.key     = "TENSORBOARD_LOG_PATH"
 tbp/env_path.default = ""
 MetricsLogger.world_size = 8
-# Time-to-target AUC threshold. Doubles as the MLPerf convergence target: when
-# the selected listen_plus eval AUC (see eval_accuracy_auc_mode below; default
-# the per-pass "window_" AUC) first reaches this value the streaming-train-eval
-# run emits a SUCCESS RUN_STOP and terminates gracefully. Override via
-# $AUC_THRESHOLD (e.g. 0.5 to smoke-test the early-stop path on a short run).
-# MLPerf's DLRM-DCNv2 reference uses 0.80275.
+# MLPerf convergence target: run stops when the selected eval AUC reaches it.
+# Override via $AUC_THRESHOLD.
 MetricsLogger.auc_threshold = @at/env_float()
 at/env_float.key     = "AUC_THRESHOLD"
 at/env_float.default = 0.80275
-# Which eval AUC is reported as EVAL_ACCURACY and drives the convergence /
-# SUCCESS RUN_STOP decision: "window" (per-pass full-holdout AUC, reset each eval
-# pass; the default) or "lifetime" (cumulative across all eval passes). Both AUCs
-# are still computed and logged to TensorBoard regardless; this only selects the
-# one used for MLPerf EVAL_ACCURACY + early-stop. Override via $EVAL_ACCURACY_AUC_MODE.
+# Eval metric driving EVAL_ACCURACY + early-stop: "{window|lifetime}_{auc|gauc|
+# accuracy|ne}" (default window_auc). Override via $EVAL_ACCURACY_AUC_MODE.
 streaming_train_eval_loop.eval_accuracy_auc_mode = @eaam/env_str()
 eaam/env_str.key     = "EVAL_ACCURACY_AUC_MODE"
-eaam/env_str.default = "window"
+eaam/env_str.default = "window_auc"
 # Lifetime-AUC backend, selectable independently for the train cumulative AUC and
 # the eval cumulative ("lifetime_*") AUC. Both default to "binned":
 #   "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an O(bins) score
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
index c7bb5bab9..47c451696 100755
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -17,7 +17,7 @@
 #
 # Usage:
 #   bash generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh --jobid <slurm-jobid>
-#       [--container yambda_<user>]
+#       [--container yambda_primus]
 #       [--num-train-batches 200]
 #       [--die-at-step 350]
 #       [--keep]    # retain LOG_DIR + CKPT after run for inspection
@@ -25,7 +25,7 @@
 set -uo pipefail
 
 JOBID=""
-CONTAINER="yambda_${USER:-$(id -un)}"
+CONTAINER="yambda_primus"
 NUM_TRAIN_BATCHES=200
 DIE_AT_STEP=350
 IN_WINDOW_FREQ=50
@@ -40,9 +40,6 @@ KEEP=0
 # correctness gates are the functional-invariant checks below (RNG restored,
 # resumed-at-correct-step, atomic/keep_last_n), not this number.
 ATOL=0.15
-# Writable scratch ($HOME-derived) + repo root (this file is at
-# <repo>/generative_recommenders/dlrm_v3/train/tests/, i.e. 4 levels deep).
-# Both env-overridable; nothing is hardwired to a specific user/site.
 SCRATCH=${SCRATCH:-$HOME/yambda_runs}
 CKPT_ROOT=${CKPT_ROOT:-$SCRATCH/ckpts_resume_test}
 LOG_DIR=${LOG_DIR:-$SCRATCH/streaming_resume_test}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 1e537b32d..641e8e67c 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1620,12 +1620,10 @@ def streaming_train_eval_loop(
     # event emission; the loop is otherwise unchanged. Supplied by train_ranker
     # for the streaming-train-eval benchmark path.
     mlperf_logger: Optional[Any] = None,
-    # Which eval AUC drives the reported EVAL_ACCURACY and the convergence
-    # decision (early SUCCESS RUN_STOP + end-of-run finalize): "window" = the
-    # per-pass full-holdout AUC (reset each eval pass; the default), or
-    # "lifetime" = the cumulative AUC across all eval passes. Override via
-    # $EVAL_ACCURACY_AUC_MODE.
-    eval_accuracy_auc_mode: str = "window",
+    # Which eval metric drives EVAL_ACCURACY + the convergence decision. Format
+    # "{window|lifetime}_{auc|gauc|accuracy|ne}" (bare "window"/"lifetime" => auc).
+    # Override via $EVAL_ACCURACY_AUC_MODE.
+    eval_accuracy_auc_mode: str = "window_auc",
 ) -> None:
     """Streaming train+eval loop with per-window (and optionally mid-window)
     checkpoints.
@@ -2321,32 +2319,33 @@ def _mlperf_progress() -> Dict[str, Any]:
             mlperf_logger.constants.EPOCH_NUM: epoch_num,
         }
 
-    # Convergence/EVAL_ACCURACY metric short name, selected by
-    # eval_accuracy_auc_mode: "window_auc" (per-pass full-holdout AUC, default)
-    # or "lifetime_auc" (cumulative across eval passes).
-    _eval_auc_short = (
-        "lifetime_auc"
-        if str(eval_accuracy_auc_mode).strip().lower() == "lifetime"
-        else "window_auc"
-    )
+    # Convergence/EVAL_ACCURACY metric short name selected by
+    # eval_accuracy_auc_mode. Bare "window"/"lifetime" defaults the metric to auc.
+    _eval_metric_short = str(eval_accuracy_auc_mode).strip().lower()
+    if _eval_metric_short in ("window", "lifetime"):
+        _eval_metric_short = f"{_eval_metric_short}_auc"
+    # NE is lower-is-better; auc/gauc/accuracy are higher-is-better.
+    _eval_lower_is_better = _eval_metric_short.endswith("_ne")
     if rank == 0 and mlperf_logger is not None:
         logger.info(
-            f"[mlperf] EVAL_ACCURACY / convergence metric = {_eval_auc_short} "
-            f"(eval_accuracy_auc_mode={eval_accuracy_auc_mode!r})"
+            f"[mlperf] EVAL_ACCURACY / convergence metric = {_eval_metric_short} "
+            f"(stop when {'<=' if _eval_lower_is_better else '>='} threshold)"
         )
 
-    def _eval_target_auc(metrics: Dict[str, float]) -> Optional[float]:
-        # Convergence metric: the listen_plus eval AUC selected by
-        # eval_accuracy_auc_mode (window vs lifetime). Key format is
-        # `metric/{prefix}{name}/{task}` (see MetricsLogger.compute), e.g.
-        # `metric/window_auc/listen_plus`. Match the selected short name;
-        # ignore GAUC.
+    def _eval_target_metric(metrics: Dict[str, float]) -> Optional[float]:
+        # listen_plus eval metric selected by eval_accuracy_auc_mode. Key format
+        # is `metric/{prefix}_{name}/{task}` (see MetricsLogger.compute).
         for key, val in metrics.items():
             short = key.split("/")[-2] if "/" in key else key
-            if short == _eval_auc_short:
+            if short == _eval_metric_short:
                 return float(val)
         return None
 
+    def _meets_target(value: Optional[float], thr: Optional[float]) -> bool:
+        if value is None or thr is None:
+            return False
+        return value <= thr if _eval_lower_is_better else value >= thr
+
     def _mlperf_block_start() -> None:
         if mlperf_logger is not None:
             mlperf_logger.start(
@@ -2378,25 +2377,21 @@ def _mlperf_run_stop(status: object) -> None:
         mlperf_run_stopped[0] = True
 
     def _mlperf_eval_stop(eval_metrics: Dict[str, float]) -> bool:
-        # Emit EVAL_ACCURACY (the selected eval listen_plus AUC) + EVAL_STOP, and
-        # drive an early SUCCESS RUN_STOP when the target threshold is reached.
-        # Returns True iff the run should stop now -- the SAME value on every rank.
+        # Emit EVAL_ACCURACY (the selected eval listen_plus metric) + EVAL_STOP,
+        # and drive an early SUCCESS RUN_STOP when the target is reached. Returns
+        # True iff the run should stop now -- the SAME value on every rank.
         #
-        # CRITICAL (deadlock avoidance): the eval AUC is produced by a reduce
-        # that is only guaranteed valid on global rank 0, so a per-rank
-        # `eval_auc >= thr` test could diverge (only rank 0 sees the value) and
-        # the ranks that "stop" hit the RUN_STOP barrier while the rest march
-        # into the next window's embedding all-to-all -> NCCL collective-timeout
-        # hang (observed: 600s ALLTOALL_BASE watchdog abort). So rank 0 decides
-        # and BROADCASTS the boolean; all ranks then break (or continue) in
-        # lockstep.
+        # CRITICAL (deadlock avoidance): the eval metric is only valid on global
+        # rank 0, so rank 0 decides and BROADCASTS the boolean; all ranks then
+        # break (or continue) in lockstep. A per-rank test could diverge and hang
+        # the next window's embedding all-to-all (600s ALLTOALL_BASE watchdog).
         if mlperf_logger is None:
             return False
-        eval_auc = _eval_target_auc(eval_metrics)
-        if eval_auc is not None:
+        eval_value = _eval_target_metric(eval_metrics)
+        if eval_value is not None:
             mlperf_logger.event(
                 key=mlperf_logger.constants.EVAL_ACCURACY,
-                value=eval_auc,
+                value=eval_value,
                 metadata=_mlperf_progress(),
             )
         mlperf_logger.end(
@@ -2404,13 +2399,7 @@ def _mlperf_eval_stop(eval_metrics: Dict[str, float]) -> bool:
         )
         thr = metric_logger.auc_threshold
         decision = torch.zeros(1, device=device)
-        if (
-            rank == 0
-            and not mlperf_run_stopped[0]
-            and eval_auc is not None
-            and thr is not None
-            and eval_auc >= thr
-        ):
+        if rank == 0 and not mlperf_run_stopped[0] and _meets_target(eval_value, thr):
             decision[0] = 1.0
         if torch.distributed.is_initialized():
             torch.distributed.broadcast(decision, src=0)
@@ -2421,13 +2410,11 @@ def _mlperf_eval_stop(eval_metrics: Dict[str, float]) -> bool:
         return should_stop
 
     def _mlperf_finalize(final_metrics: Dict[str, float]) -> None:
-        # End-of-run RUN_STOP when the threshold was never crossed: SUCCESS iff
-        # the final eval AUC meets the target, else ABORTED.
+        # End-of-run RUN_STOP when the target was never crossed: SUCCESS iff the
+        # final eval metric meets the target, else ABORTED.
         if mlperf_logger is None or mlperf_run_stopped[0]:
             return
-        eval_auc = _eval_target_auc(final_metrics)
-        thr = metric_logger.auc_threshold
-        success = eval_auc is not None and thr is not None and eval_auc >= thr
+        success = _meets_target(_eval_target_metric(final_metrics), metric_logger.auc_threshold)
         _mlperf_run_stop(
             mlperf_logger.constants.SUCCESS
             if success
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 829c5ce44..5ab1e950a 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -4,11 +4,7 @@
 #SBATCH --ntasks-per-node=1
 #SBATCH --exclusive
 #SBATCH --partition=meta64          # [CLUSTER-SPECIFIC] partition name
-#SBATCH --time=01:10:00
 #SBATCH --output=yambda_slurm.%j.out
-# ^ relative to the submit dir (SLURM parses #SBATCH before any shell runs, so it
-#   cannot expand env vars). The real consolidated run log is $LOG (see below),
-#   which defaults under $SCRATCH; this file just captures the batch stdout.
 # =============================================================================
 # launch_slurm.sh — single entry point for the yambda-5b trainer on N>=1 nodes.
 #
@@ -34,14 +30,9 @@
 #                supervisor's direct `bash scripts/launch_slurm.sh` is unchanged.
 #
 # USAGE
-#   Reference run (1 node): sbatch --nodes=1 scripts/launch_slurm.sh
-#   Reference run (N node):  sbatch --nodes=N scripts/launch_slurm.sh
-#     ^ a bare submit reproduces the FROZEN REFERENCE shape (full 299-window
-#       sweep + data-fraction eval cadence). Prepend SMOKE=1 for a fast
-#       functional check (short window, capped batches).
+#   Multi-node (N>=1):  sbatch --nodes=2 scripts/launch_slurm.sh
 #   Single-node direct: bash scripts/launch_slurm.sh   (already inside container;
-#                       what run_streaming_e2e.sh invokes per relaunch — uses the
-#                       gin defaults, NOT the orchestrate reference shape)
+#                       what run_streaming_e2e.sh invokes per relaunch)
 #   Perf pair:
 #     LOG=/apps/chcai/perf_1node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
 #       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
@@ -115,7 +106,7 @@ if [ -z "$PHASE" ]; then
 fi
 
 # ---- shared config (env-overridable) ----------------------------------------
-CONTAINER=${CONTAINER:-yambda_${USER:-$(id -un)}}   # per-user container name (do NOT reuse another user's container — its bind mounts differ)
+CONTAINER=${CONTAINER:-yambda_primus}
 REPO=${REPO:-$REPO_ROOT}                       # repo path inside the container
 IMAGE=${IMAGE:-rocm/primus:v26.3}              # [CLUSTER-SPECIFIC] ROCm/arch base image
 BAKED_IMAGE=${BAKED_IMAGE:-yambda_primus_baked:latest}
@@ -123,13 +114,6 @@ BAKED_TAR=${BAKED_TAR:-/apps/chcai/yambda_primus_baked.tar}   # [CLUSTER-SPECIFI
 USE_BAKED=${USE_BAKED:-1}
 OVERLAY=${RDMA_OVERLAY:-/apps/chcai/rdma_host_el9_new}        # [CLUSTER-SPECIFIC] shared-NFS RDMA overlay (read-only, already staged)
 
-# Bind mounts + scratch — all on shared NFS, identical path on every node.
-#   REPO_MOUNT : NFS home root that contains THIS repo (bind-mounted rw).
-#   DATA_MOUNT : NFS root with the (shared, read-only) dataset + RDMA overlay +
-#                pip/fbgemm build assets. Kept as-is so the dataset is NOT
-#                duplicated. You only need read access here.
-#   SCRATCH    : this run's WRITABLE output root (logs / tb / traces).
-# All env-overridable, so nothing is hardwired to one user's home.
 REPO_MOUNT=${REPO_MOUNT:-$HOME}              # NFS home holding the repo (must contain $REPO); override if your repo lives elsewhere
 DATA_MOUNT=${DATA_MOUNT:-/apps/chcai}        # shared dataset + RDMA overlay + pip/fbgemm assets (read-only)
 SCRATCH=${SCRATCH:-$HOME/yambda_runs}        # writable output root (logs / tb / traces)
@@ -151,11 +135,6 @@ orchestrate() {
   mkdir -p "$SCRATCH" 2>/dev/null || true
   LOG=${LOG:-$SCRATCH/yambda_slurm.${SLURM_JOB_ID:-manual}.log}
 
-  # Run-shape defaults. By DEFAULT a bare `sbatch scripts/launch_slurm.sh`
-  # reproduces the FROZEN REFERENCE run: full 299-window sweep (START_TS=0) with
-  # the data-fraction eval cadence (eval every 0.5% of the training stream). Set
-  # SMOKE=1 for a fast functional check (short dense window, capped batches,
-  # per-window eval). Any individual knob below stays env-overridable.
   MODE=${MODE:-streaming-train-eval}
   if [ "${SMOKE:-0}" = "1" ]; then
     START_TS=${START_TS:-150}
@@ -171,10 +150,6 @@ orchestrate() {
   NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-0}
   EVAL_EACH_WINDOW=${EVAL_EACH_WINDOW:-1}
   METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-20}
-  # Eval cadence — the two knobs are mutually exclusive (the worker raises if both
-  # are >0). Data-fraction is the reference default; if the caller explicitly
-  # selected the per-window cadence (EVAL_EVERY_N_WINDOWS>0) leave data-pct off,
-  # otherwise default to the reference 0.5%-of-data cadence (per-window disabled).
   if [ "${EVAL_EVERY_N_WINDOWS:-0}" -gt 0 ] 2>/dev/null; then
     EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0}
   else
@@ -192,12 +167,6 @@ orchestrate() {
   else
     : > "$LOG"
   fi
-  # Group/other write (but NOT read) so the in-container worker (running as root,
-  # squashed to `nobody` over root-squashed NFS) can append via `tee -a $LOG`.
-  # `tee -a` opens write-only, so 622 is sufficient -- avoid 666, which would let
-  # other users on the shared filesystem read (and tamper with) the job log.
-  # Without the write bit the worker's tee is denied and exits non-zero, which
-  # pipefail turns into a spurious rc=1 even when training succeeds.
   chmod 622 "$LOG" 2>/dev/null || true
   echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
   echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
@@ -523,28 +492,12 @@ worker() {
   cd "$REPO_ROOT"
   mkdir -p "$SCRATCH" 2>/dev/null || true
   LOG=${LOG:-$SCRATCH/yambda_5b_8gpu.log}
-  # Avoid double-logging. When launched by the orchestrate phase, our stdout is
-  # ALREADY captured into the real $LOG by orchestrate's `tee` (and, multi-node,
-  # funneled through one srun pipe). Re-`tee`ing $LOG here would write every line
-  # twice. Orchestrate sets WORKER_TEE=0 to point our own file sink at /dev/null:
-  # we still echo to stdout (captured upstream) but don't duplicate the file.
-  # Direct single-node invocation (the streaming-e2e supervisor) leaves
-  # WORKER_TEE unset, so the worker keeps writing $LOG itself.
+  # WORKER_TEE=0 (set by orchestrate) sends our file sink to /dev/null to avoid
+  # double-logging, since orchestrate already tees stdout into the real $LOG.
   [ "${WORKER_TEE:-1}" = "0" ] && LOG=/dev/null
-  # TensorBoard under the writable scratch root unless the caller (e.g. the e2e
-  # supervisor) pinned a per-run path. Keeps the gin default from ever being used.
   export TENSORBOARD_LOG_PATH=${TENSORBOARD_LOG_PATH:-$SCRATCH/tb/yambda_5b}
-  # MLPerf Training compliance log (streaming-train-eval path). Lands beside the
-  # other run outputs under scratch unless the caller pins it. Rank 0 writes it;
-  # check it post-run with:
-  #   python -m mlperf_logging.compliance_checker --usage training \
-  #     --ruleset 5.0.0 "$MLPERF_LOG_PATH"
-  # Default to a PER-JOB filename so each standalone `sbatch` gets a clean
-  # compliance log: mllog opens the file in APPEND mode, so a fixed name would
-  # accumulate events across runs and fail the compliance_checker (duplicate
-  # INIT_START/RUN_START). The streaming-e2e supervisor pins MLPERF_LOG_PATH
-  # explicitly (and inits it once at run start), so its relaunch-into-same-file
-  # append semantics are preserved untouched.
+  # MLPerf compliance log (rank 0 writes it). Per-job filename so each standalone
+  # sbatch gets a clean log; the e2e supervisor pins MLPERF_LOG_PATH itself.
   export MLPERF_LOG_PATH=${MLPERF_LOG_PATH:-$SCRATCH/mlperf/yambda_5b_mlperf.${SLURM_JOB_ID:-manual}.log}
   echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee -a "$LOG"
 
@@ -594,20 +547,9 @@ worker() {
   export WORLD_SIZE=$(( NNODES * GPUS_PER_NODE ))
   echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node=$GPUS_PER_NODE world_size=$WORLD_SIZE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}" | tee -a "$LOG"
 
-  # NCCL bootstrap NIC. The container is --network=host so RCCL sees ALL host
-  # interfaces; if left to auto-detect, NCCL can pick a non-routable per-GPU RoCE
-  # /31 (benic* 192.168.x) link and fail bootstrap with "No route to host" (this
-  # is node-dependent: it worked on some nodes and not others, causing repetitive
-  # single-node init failures). Pin it explicitly to avoid that.
-  #   * Single-node (NNODES==1): all ranks are on THIS host, so only the bootstrap
-  #     control-plane crosses the socket NIC (data plane is intra-node XGMI/PCIe,
-  #     see below). Loopback is reachable by every local rank on ANY host and is
-  #     node-independent — same rationale as MASTER_ADDR=localhost above — so it
-  #     "just works" on dev boxes that have no fenic0 (e.g. a single MI355 node).
-  #   * Multi-node (NNODES>1): needs a routable host NIC shared across nodes for
-  #     the cross-node TCP rendezvous; default to the meta64 fenic0.
-  # Both remain ${NCCL_SOCKET_IFNAME:-...}-overridable for other fabrics.
-  # [CLUSTER-SPECIFIC] multi-node routable host NIC for TCP bootstrap (find via `ip -br addr`).
+  # NCCL bootstrap NIC: loopback single-node, routable host NIC multi-node (pin
+  # to avoid auto-detect picking a non-routable per-GPU RoCE link). Override via
+  # $NCCL_SOCKET_IFNAME. [CLUSTER-SPECIFIC] multi-node fenic0 (find via `ip -br addr`).
   if [ "$NNODES" -gt 1 ]; then
     export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
   else
@@ -646,17 +588,8 @@ worker() {
       export NCCL_IB_TIMEOUT=${NCCL_IB_TIMEOUT:-14}
       export NCCL_IGNORE_CPU_AFFINITY=${NCCL_IGNORE_CPU_AFFINITY:-1}
       export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
-      # GPU-Direct RDMA: ENABLED by default. The brcmrdma host kernel ships the
-      # inbox peer-memory client (`ib_register_peer_memory_client` in
-      # /proc/kallsyms), so RCCL does true GPU<->NIC DMA over bnxt_re instead of
-      # bouncing through host memory. Measured ~+22% throughput at 2 nodes
-      # (65.7%->79.8% weak-scaling efficiency) vs the old host-staged path.
-      # GDR_LEVEL=5 (most permissive) is required so GDR is used even when the GPU
-      # and NIC cross the CPU root complex. NCCL_DMABUF_ENABLE=1 is a harmless
-      # no-op here (kernel lacks CONFIG_DMABUF_MOVE_NOTIFY/CONFIG_PCI_P2PDMA, so
-      # peermem carries it). Enabling is non-fatal: if peermem is ever absent RCCL
-      # just logs "GDR 0" and falls back to host staging. Override with
-      # NCCL_NET_GDR_LEVEL=0 to force the legacy host-staged path.
+      # GPU-Direct RDMA on by default (~+22% throughput at 2 nodes via peermem).
+      # Set NCCL_NET_GDR_LEVEL=0 to force the legacy host-staged path.
       export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-5}
       export NCCL_DMABUF_ENABLE=${NCCL_DMABUF_ENABLE:-1}
       echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}, DMABUF=${NCCL_DMABUF_ENABLE}; meta64 bnxt_re config, validated)" | tee -a "$LOG"

From a9f8efdec4769c5c97590e6d9dda9076842f0290 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 19:40:09 +0000
Subject: [PATCH 088/113] dlrmv4: trim verbose MLPerf-wiring comments to 1-2
 lines

Collapse the large explanatory comment/docstring blocks in the MLPerf
logging wiring (logger, train_ranker boundaries, streaming loop hooks,
MetricsLogger counters) to keep the PR reviewable. No logic changes.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/mlperf_logging_utils.py     | 46 ++++-------------
 .../dlrm_v3/train/train_ranker.py             | 51 +++++--------------
 .../dlrm_v3/train/utils.py                    | 44 ++++------------
 .../generative_recommenders/dlrm_v3/utils.py  | 40 ++++-----------
 4 files changed, 46 insertions(+), 135 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
index 74eac4ede..47d544ad7 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
@@ -15,13 +15,8 @@
 # pyre-unsafe
 """MLPerf Training compliance logging for the DLRMv3 streaming-train-eval path.
 
-Thin, rank-0-gated wrapper around ``mlperf_logging.mllog`` so the streaming
-loop can emit the MLPerf event stream (INIT/RUN/BLOCK/EVAL/RUN_STOP) without
-every call site re-checking the rank or guarding against a missing dependency.
-
-Modeled on recommendation_v2/torchrec_dlrm's inline ``submission_info`` but
-extended with rank-0 gating + optional distributed barriers (the NeMo / unet3d
-``sync`` pattern), so a multi-rank run produces exactly one valid log.
+Rank-0-gated wrapper around ``mlperf_logging.mllog`` so the streaming loop emits
+the MLPerf event stream without every call site re-checking rank or the dep.
 """
 
 import logging
@@ -63,11 +58,9 @@ def _barrier() -> None:
 class MLPerfLogger:
     """Rank-0-gated facade over ``mllog``.
 
-    All event methods no-op on non-zero ranks and when ``mlperf_logging`` is not
-    installed, so callers never need to guard. ``sync=True`` inserts an
-    all-rank ``dist.barrier()`` before the (rank-0-only) emission so the logged
-    timestamp reflects the slowest rank reaching the boundary -- required for
-    INIT_STOP/RUN_START/RUN_STOP per the MLPerf rules.
+    Event methods no-op on non-zero ranks and when mlperf_logging is absent.
+    ``sync=True`` barriers before emit so the timestamp reflects the slowest rank
+    (required for INIT_STOP/RUN_START/RUN_STOP).
     """
 
     def __init__(
@@ -79,29 +72,18 @@ def __init__(
         submitter_name: str = "reference_implementation",
     ):
         self.enabled: bool = _MLLOG_AVAILABLE
-        # CRITICAL: use the EXPLICIT global rank passed by the caller, not a
-        # dist.get_rank() lookup. This logger is constructed BEFORE
-        # dist.init_process_group (so the init phase can be timed), at which
-        # point torch.distributed.get_rank() is unavailable and would return 0
-        # for every process -> all 16 ranks would log everything. The caller
-        # (train_ranker) already knows the true global rank
-        # (node_rank * gpus_per_node + local_rank), so trust it. Fall back to a
-        # best-effort dist/zero lookup only when not provided.
+        # Use the EXPLICIT caller rank: this is built before init_process_group,
+        # when dist.get_rank() would return 0 on every rank (all would log).
         self.rank: int = rank if rank is not None else _rank()
         self.benchmark_name: str = benchmark_name
         self.submitter_name: str = submitter_name
         self._logger = None
         if not self.enabled:
             return
-        # Only rank 0 emits events, so only rank 0 needs the file handler:
-        # attaching it on every rank wastes file handles and risks contention on
-        # a shared log path. Non-zero ranks configure mllog without a filename
-        # (their event methods no-op anyway).
+        # Only rank 0 emits, so only rank 0 needs the file handler.
         if log_path and self.rank == 0:
             log_dir = os.path.dirname(log_path)
-            # dirname is "" when log_path has no directory component (e.g.
-            # MLPERF_LOG_PATH=mlperf.log); os.makedirs("") raises, so guard it.
-            if log_dir:
+            if log_dir:  # guard: os.makedirs("") raises for a bare filename
                 os.makedirs(log_dir, exist_ok=True)
             mllog.config(filename=log_path, default_stack_offset=default_stack_offset)
         else:
@@ -169,14 +151,8 @@ def get_mlperf_logger(
 ) -> Optional[MLPerfLogger]:
     """Build a configured :class:`MLPerfLogger`, or ``None`` if unavailable.
 
-    ``benchmark_name`` / ``submitter_name`` are gin-configurable (and the path is
-    env-overridable via ``$MLPERF_LOG_PATH``) so a submission can stamp its own
-    benchmark string without code changes. The log path defaults to
-    ``$MLPERF_LOG_PATH`` when set, else ``""`` (mllog logs to stdout).
-
-    Returns ``None`` when ``mlperf_logging`` is not installed so callers' existing
-    ``mlperf_logger is not None`` guards cleanly disable logging -- otherwise they
-    would pass the guard and then hit ``logger.constants`` (which is ``None``).
+    Path defaults to ``$MLPERF_LOG_PATH``. Returns ``None`` (not a disabled
+    logger) so callers' ``is not None`` guards cleanly skip logging.
     """
     if not _MLLOG_AVAILABLE:
         return None
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 25711bd02..30b081946 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -83,20 +83,12 @@ def _main_func(
     gin.parse_config_file(gin_file, skip_unknown=True)
     apply_env_bootstrap()
 
-    # MLPerf compliance logging is only wired for the streaming-train-eval
-    # (yambda-5b) benchmark path. Build the rank-0-gated logger now. No-ops on
-    # non-zero ranks and when mlperf_logging is not installed.
+    # Rank-0-gated MLPerf logger, only for the streaming-train-eval path.
     mlperf_logger = (
         get_mlperf_logger(rank=rank) if mode == "streaming-train-eval" else None
     )
-    # Emit the init-start boundary before setup so the init phase is measured,
-    # but ONLY when this is guaranteed to be a cold start. CKPT_PATH unset means
-    # checkpoints are disabled (the default + submission config) -> always cold
-    # start. When CKPT_PATH is set (resumable / e2e-supervisor runs) we defer the
-    # decision to resume_cold_start below and skip the pre-setup markers, so a
-    # resume relaunch never emits an orphaned INIT_START/RUN_START. The whole
-    # downstream sequence (INIT_STOP/RUN_START/blocks/eval/RUN_STOP) is gated on
-    # this flag so the log is always balanced.
+    # Emit INIT_START before setup only on a guaranteed cold start (CKPT_PATH
+    # unset); resume relaunches skip it so the log stays balanced.
     mlperf_init_logged = False
     if mlperf_logger is not None and not os.environ.get("CKPT_PATH", ""):
         mlperf_logger.event(key=mlperf_logger.constants.CACHE_CLEAR, value=True)
@@ -208,11 +200,8 @@ def _main_func(
         )
     )
 
-    # MLPerf: submission info + hyperparameters, then the init/run boundary.
-    # Gated on (init markers emitted AND genuine cold start) so the e2e
-    # supervisor's resume relaunches don't restart the INIT/RUN markers
-    # mid-stream (which would invalidate the single-run log), and so the log is
-    # never left with an INIT_STOP that has no matching INIT_START.
+    # MLPerf submission info + hyperparameters + INIT_STOP/RUN_START, only on a
+    # genuine cold start so resume relaunches don't reopen the run markers.
     mlperf_run_active = (
         mlperf_logger is not None and mlperf_init_logged and resume_cold_start
     )
@@ -228,11 +217,8 @@ def _gin_param(name: str, default: object) -> object:
                 value = gin.query_parameter(name)
             except (ValueError, KeyError):
                 return default
-            # When a binding is a gin macro/configurable reference (e.g.
-            # `@dlr/env_float()`), query_parameter returns the unevaluated
-            # reference object, which the MLPerf logger cannot encode. Resolve
-            # it to its actual value so env-overridden LRs are logged as real
-            # numbers. Plain literals pass through unchanged.
+            # Resolve gin macro refs (e.g. @dlr/env_float()) to real values so
+            # env-overridden LRs log as numbers, not unencodable objects.
             if hasattr(value, "scoped_configurable_fn"):
                 try:
                     return value.scoped_configurable_fn()
@@ -243,11 +229,9 @@ def _gin_param(name: str, default: object) -> object:
         global_batch_size = world_size * int(train_dataloader.batch_size)
         mlperf_logger.event(key=c.GLOBAL_BATCH_SIZE, value=global_batch_size)
         mlperf_logger.event(key=c.GRADIENT_ACCUMULATION_STEPS, value=1)
-        # Log the ACTUAL seed chosen in setup() (random per-run unless $SEED is
-        # pinned; setup() exports the chosen value to $SEED).
+        # Actual seed chosen in setup() (exported to $SEED).
         mlperf_logger.event(key=c.SEED, value=int(os.environ.get("SEED", "1")))
-        # Dense (Adam) + sparse (RowWiseAdagrad) optimizer hyperparameters,
-        # read from the active gin bindings.
+        # Dense (Adam) + sparse (RowWiseAdagrad) optimizer hyperparameters from gin.
         mlperf_logger.event(
             key=c.OPT_NAME,
             value=_gin_param(
@@ -327,24 +311,17 @@ def _gin_param(name: str, default: object) -> object:
                 resume_batch_idx_in_window=resume_batch_idx_in_window,
                 resume_split_contract=resume_split_contract,
                 resume_cold_start=resume_cold_start,
-                # Only pass the logger when the run boundaries were emitted, so
-                # the loop never produces orphan block/eval events.
+                # Only pass the logger when run boundaries were emitted, so the
+                # loop never produces orphan block/eval events.
                 mlperf_logger=mlperf_logger if mlperf_run_active else None,
             )
     except Exception as e:
         logger.info(traceback.format_exc())
         raise Exception(e)
     finally:
-        # Graceful distributed teardown (runs on BOTH success and failure).
-        # Previously cleanup() ran only in the except branch, so a clean finish
-        # returned without destroying the process group: ranks that returned
-        # first let rank 0's TCPStore close while peers' ProcessGroupNCCL
-        # heartbeat-monitor threads were still polling it, emitting the noisy
-        # (but harmless) "Failed to check the 'should dump' flag on TCPStore /
-        # Broken pipe" warnings + C++ stack traces at exit. Barrier first so all
-        # ranks reach the end in lockstep, then destroy_process_group() stops
-        # each rank's monitor thread and closes NCCL/the store in order. Both
-        # steps are guarded/best-effort so teardown never masks a real error.
+        # Graceful distributed teardown on both success and failure: barrier so
+        # all ranks finish in lockstep, then destroy the process group (best-
+        # effort) to avoid noisy TCPStore/NCCL shutdown warnings at exit.
         if torch.distributed.is_initialized():
             try:
                 torch.distributed.barrier()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 641e8e67c..f3f66e443 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1616,9 +1616,7 @@ def streaming_train_eval_loop(
     streaming_diag_unique_emb: bool = False,
     # --- test-only failure injection knob ---
     die_at_step: int = -1,
-    # MLPerf compliance logger (rank-0-gated facade). None disables all MLPerf
-    # event emission; the loop is otherwise unchanged. Supplied by train_ranker
-    # for the streaming-train-eval benchmark path.
+    # MLPerf logger (rank-0-gated); None disables all MLPerf event emission.
     mlperf_logger: Optional[Any] = None,
     # Which eval metric drives EVAL_ACCURACY + the convergence decision. Format
     # "{window|lifetime}_{auc|gauc|accuracy|ne}" (bare "window"/"lifetime" => auc).
@@ -2198,9 +2196,7 @@ def _run_eval_window(
             except OSError as _e:
                 logger.warning("failed to write metrics sink %s: %s", _metrics_path, _e)
         metric_logger.resume_perf("eval")
-        # Return metrics to the streaming loop so the MLPerf EVAL_STOP /
-        # EVAL_ACCURACY hooks (and the stop-on-target check) can consume them.
-        # Unconditional (outside the rank-0 logging block) so every rank returns.
+        # Return metrics (on every rank) so the MLPerf eval hooks can consume them.
         return _eval_metrics
 
     def _maybe_checkpoint(train_ts: int) -> None:
@@ -2271,12 +2267,8 @@ def _should_eval(i: int) -> bool:
             )
 
     # --- MLPerf progress accounting + event helpers ---------------------------
-    # `total_train_samples` is the denominator for the MLPerf samples_count /
-    # epoch_num progress unit: the total GLOBAL trainable samples over the
-    # configured window range. Computed once up-front (one O(N) anchor mask per
-    # window -- the same call the loop makes per window) and logged as
-    # TRAIN_SAMPLES. Skipped entirely when no MLPerf logger is attached, since
-    # the masks are not free. `mlperf_run_stopped` guards single RUN_STOP.
+    # total_train_samples = epoch_num denominator (global trainable samples over
+    # the window range), computed once and logged as TRAIN_SAMPLES.
     total_train_samples = 0
     mlperf_run_stopped = [False]
     if mlperf_logger is not None:
@@ -2300,10 +2292,7 @@ def _should_eval(i: int) -> bool:
                 key=mlperf_logger.constants.EVAL_SAMPLES,
                 value=int(eval_global_indices.size),
             )
-        # Let MetricsLogger.compute_and_log emit the per-step MLPerf `train_loss`
-        # event (rank-0 gated) at the metric-logging cadence, stamped with the
-        # current base LR. Read param_groups[0] defensively (KeyedOptimizer
-        # exposes it; guard against any optimizer that does not).
+        # Wire the logger + LR getter so compute_and_log emits the train_loss event.
         metric_logger.mlperf_logger = mlperf_logger
 
         def _current_lr() -> float:
@@ -2377,14 +2366,9 @@ def _mlperf_run_stop(status: object) -> None:
         mlperf_run_stopped[0] = True
 
     def _mlperf_eval_stop(eval_metrics: Dict[str, float]) -> bool:
-        # Emit EVAL_ACCURACY (the selected eval listen_plus metric) + EVAL_STOP,
-        # and drive an early SUCCESS RUN_STOP when the target is reached. Returns
-        # True iff the run should stop now -- the SAME value on every rank.
-        #
-        # CRITICAL (deadlock avoidance): the eval metric is only valid on global
-        # rank 0, so rank 0 decides and BROADCASTS the boolean; all ranks then
-        # break (or continue) in lockstep. A per-rank test could diverge and hang
-        # the next window's embedding all-to-all (600s ALLTOALL_BASE watchdog).
+        # Emit EVAL_ACCURACY + EVAL_STOP, early SUCCESS RUN_STOP on target.
+        # Rank 0 decides + broadcasts the stop bool so all ranks break in lockstep
+        # (a per-rank test could diverge and hang the next all-to-all).
         if mlperf_logger is None:
             return False
         eval_value = _eval_target_metric(eval_metrics)
@@ -2590,11 +2574,8 @@ def _do_eval_nb(train_ts: int, gstep: int) -> None:
                 # MLPerf target reached: RUN_STOP already emitted; stop training.
                 break
 
-    # Final eval over the SAME fixed user-holdout set (consistent with the
-    # per-window evals above). Reuses _run_eval_window so metrics are reset and
-    # reported the same way. Falls back to the legacy final-window eval for
-    # datasets without user holdout. Skipped if the MLPerf target was already
-    # reached mid-run (RUN_STOP already emitted, run is over).
+    # Final eval over the fixed user-holdout set (legacy final-window eval
+    # otherwise). Skipped if the MLPerf target already stopped the run mid-run.
     if not mlperf_run_stopped[0]:
         dataset.dataset.is_eval = True  # pyre-ignore [16]
         _mlperf_eval_start()
@@ -2612,12 +2593,9 @@ def _do_eval_nb(train_ts: int, gstep: int) -> None:
                 iter(make_streaming_dataloader(dataset=dataset, ts=num_train_ts)),
                 label="eval@final",
             )
-        # EVAL_ACCURACY/EVAL_STOP for the final pass (may emit SUCCESS RUN_STOP
-        # if the target was met exactly at the end).
         _mlperf_eval_stop(final_metrics)
         if rank == 0:
             for k, v in final_metrics.items():
                 print(f"{k}: {v}")
-        # End-of-run RUN_STOP: SUCCESS iff the final lifetime AUC met the target,
-        # else ABORTED. No-op if a SUCCESS RUN_STOP already fired above.
+        # End-of-run RUN_STOP: SUCCESS if final metric met target, else ABORTED.
         _mlperf_finalize(final_metrics)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 390c9bfbb..879c93b97 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1031,19 +1031,12 @@ def _make_reg(ws: int) -> List[RecMetricComputation]:
                 self.regression_metrics["eval"] = _make_reg(window_size)
 
         self.global_step: Dict[str, int] = {"train": 0, "eval": 0}
-        # Monotonic, resume-safe count of GLOBAL trained samples (summed across
-        # ranks), used as the MLPerf `samples_count` progress unit. Distinct from
-        # the perf-only `_perf_total_samples` below (per-rank, not checkpointed):
-        # this one is persisted/restored alongside `global_step` so a resumed
-        # streaming run continues the convergence-progress count.
+        # MLPerf `samples_count` progress unit: global trained samples, persisted
+        # alongside global_step so a resumed run continues the count.
         self.cumulative_train_samples: int = 0
         self._rank: int = int(rank)
-        # Optional MLPerf logger + learning-rate accessor, wired by the streaming
-        # loop (kept duck-typed -- expects `.event(key, value, metadata)` and
-        # `.constants` -- to avoid a train-module import cycle). When set,
-        # compute_and_log emits a POINT_IN_TIME `train_loss` event (rank-0 gated
-        # inside the logger) at the metric-logging cadence, mirroring the per-step
-        # MLPerf train_loss readout other benchmarks log.
+        # Optional MLPerf logger + LR accessor wired by the streaming loop (duck-
+        # typed to avoid a train-module import cycle); drives the train_loss event.
         self.mlperf_logger: Optional[Any] = None
         self.lr_getter: Optional[Callable[[], float]] = None
         self.tb_logger: Optional[SummaryWriter] = None
@@ -1097,9 +1090,7 @@ def resume_perf(self, category: str) -> None:
 
     @property
     def auc_threshold(self) -> Optional[float]:
-        """Configured time-to-target AUC threshold (None if unset). Exposed so
-        the streaming loop can drive the MLPerf SUCCESS RUN_STOP off the same
-        target without reaching into the private attribute."""
+        """Configured time-to-target AUC threshold (None if unset)."""
         return self._auc_threshold
 
     @property
@@ -1165,11 +1156,8 @@ def update(
                 self._perf_samples_counter.dtype
             )
             self._perf_steps_in_window += 1
-            # MLPerf progress counter: global trained samples this step. Local
-            # batch sample count (num_candidates is per-rank) scaled by world
-            # size approximates the global count without an extra collective;
-            # accumulated on CPU as a plain int so it serializes trivially into
-            # the checkpoint (see save/load_nonsparse_checkpoint).
+            # MLPerf progress counter: per-rank sample count scaled by world size
+            # approximates global trained samples without an extra collective.
             self.cumulative_train_samples += int(num_candidates.numel()) * self._world_size
 
     def compute(self, mode: str = "train") -> Dict[str, float]:
@@ -1261,14 +1249,9 @@ def compute_and_log(
                         global_step=self.global_step[mode],
                     )
 
-        # Train-loss readout: surface a single GLOBAL (cross-rank mean) training
-        # loss on the regular console logger every `metric_log_frequency` batches,
-        # so progress is visible from step 0 instead of only at the first
-        # end-of-window eval. The per-loss-term breakdown already goes to
-        # TensorBoard above (losses/train_*); here we add the combined scalar.
-        # The all-reduce is a cheap 1-element collective run by EVERY rank at the
-        # same deterministic steps (this method is called in lockstep), so it
-        # cannot desync. Set METRIC_LOG_FREQ low (e.g. 1-5) to see it per step.
+        # Global (cross-rank mean) train loss to console/TB + the MLPerf
+        # train_loss event, at the metric-logging cadence. The 1-element
+        # all-reduce runs on every rank in lockstep, so it cannot desync.
         if mode == "train" and additional_logs is not None and "losses" in additional_logs:
             loss_terms = additional_logs["losses"]
             if loss_terms:
@@ -1289,9 +1272,6 @@ def compute_and_log(
                         f"train - Step {self.global_step['train']} "
                         f"train_loss={train_loss:.5f}"
                     )
-                # MLPerf POINT_IN_TIME train_loss (rank-0 gated in the logger).
-                # samples_count = cumulative GLOBAL trained samples (the same
-                # progress unit as block/eval events); lr = current base LR.
                 if self.mlperf_logger is not None:
                     c = self.mlperf_logger.constants
                     md: Dict[str, Any] = {

From 024e64b56e7e46376ace1d706115eb08f98e5f12 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 19:56:36 +0000
Subject: [PATCH 089/113] dlrmv4: trim seed_everything /
 decorrelate_runtime_rng docstrings

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/utils.py                    | 54 +++++--------------
 1 file changed, 13 insertions(+), 41 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index f3f66e443..bf5fd5512 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -78,24 +78,12 @@
 
 @gin.configurable
 def seed_everything(seed: int = -1, rank: int = 0) -> None:
-    """Seed all RNGs so weight init (make_model) is reproducible across runs.
-
-    Same seed on every rank => dense params are initialized identically across
-    ranks; sharded embeddings are init'd from the meta device by DMP. Fixing the
-    seed makes runs an init-matched A/B (data order is already deterministic via
-    the sampler). gin-configurable via $SEED (yambda_5b.gin: seed_everything.seed);
-    call this right before make_model(), AFTER setup() (the process group must be
-    initialized for the cross-rank broadcast below) and after the full gin parse.
-
-    Default seed < 0 (the gin default, i.e. $SEED unset) => draw a FRESH RANDOM
-    seed every run so each launch explores a different dense weight init. rank 0
-    draws the seed and broadcasts it so all ranks share one value; the chosen
-    seed is exported to $SEED and logged so any run can be reproduced after the
-    fact by re-pinning $SEED. seed >= 0 (i.e. $SEED pinned) reproduces a specific
-    run exactly. NOTE (streaming-train-eval): data ORDER and the train/holdout
-    split do NOT depend on this seed — order is time-deterministic
-    (StreamingWindowSampler) and the split is governed by $SPLIT_SALT. This seed
-    governs dense weight init + global-RNG stochastic ops.
+    """Seed all RNGs (same value on every rank) for reproducible dense weight init.
+
+    Call right before make_model(), after setup() (process group needed for the
+    broadcast) and the gin parse. seed < 0 ($SEED unset) draws a fresh random seed
+    per run (rank 0 broadcasts; exported to $SEED); seed >= 0 reproduces a run.
+    Data order/split are independent of this seed (StreamingWindowSampler/$SPLIT_SALT).
     """
     import random
 
@@ -129,29 +117,13 @@ def seed_everything(seed: int = -1, rank: int = 0) -> None:
 
 @gin.configurable
 def decorrelate_runtime_rng(rank: int = 0, enabled: bool = True) -> None:
-    """Offset the global RNG by ``rank`` so RUNTIME stochastic ops draw
-    decorrelated draws per data-parallel rank.
-
-    The only such op here is HSTU dropout (input_dropout=0.2,
-    linear_dropout_rate=0.1; see configs.get_hstu_configs). seed_everything()
-    sets an IDENTICAL seed on every rank — required so replicated dense weights
-    init identically — which also makes every rank draw the SAME dropout masks
-    in the forward. Gradients still differ (each rank sees different data), so
-    that is not incorrect, but identical masks waste the extra mask diversity
-    that decorrelated replicas give per global batch. This re-seeds torch/cuda
-    with $SEED + rank to recover it (the standard data-parallel RNG track, cf.
-    Megatron's tensor/data-parallel RNG separation).
-
-    ORDERING IS LOAD-BEARING: call this AFTER everything that must be identical
-    across ranks — make_model() (dense weight init) AND make_optimizer_and_shard()
-    (the pre-DMP re-seed + sharded embedding init). It deliberately perturbs only
-    forward-time stochasticity, never init.
-
-    Reproducibility is preserved: the offset is a pure function of the resolved
-    $SEED (exported by seed_everything) and rank, and per-rank RNG state is
-    snapshotted/restored on checkpoint resume (see checkpoint.py). Set
-    enabled=False (gin: decorrelate_runtime_rng.enabled) to restore the legacy
-    identical-mask-on-every-rank behavior.
+    """Re-seed torch/cuda with $SEED + rank so HSTU dropout draws different masks
+    per data-parallel rank (seed_everything's identical seed would draw the same).
+
+    MUST run after make_model() + make_optimizer_and_shard() so init stays
+    identical across ranks; it perturbs only forward-time stochasticity.
+    Reproducible (pure fn of $SEED + rank; RNG state checkpointed). enabled=False
+    keeps the legacy identical-mask behavior.
     """
     if not enabled:
         logger.info(

From f63e5d4c5cd3f0d3b85c0c8f28e88c04b208ddc6 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 19:56:37 +0000
Subject: [PATCH 090/113] dlrmv4: drop .gitignore changes from the PR

Revert recommendation_v4/.gitignore to base. Local run artifacts and ad-hoc
analysis files are kept out of the repo via .git/info/exclude (local,
uncommitted) instead, removing one file from the PR diff.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/.gitignore | 25 -------------------------
 1 file changed, 25 deletions(-)

diff --git a/recommendation_v4/.gitignore b/recommendation_v4/.gitignore
index 4bfbaa0ca..5edddc5b3 100644
--- a/recommendation_v4/.gitignore
+++ b/recommendation_v4/.gitignore
@@ -157,28 +157,3 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-
-# SLURM batch stdout + local run artifacts (run logs are never committed;
-# the real run logs / MLPerf compliance logs live under $SCRATCH, outside the
-# repo). The trainer's #SBATCH --output lands in the submit dir as
-# yambda_slurm.<jobid>.out.
-yambda_slurm.*.out
-yambda_slurm.*.log
-compliance_checker.log
-
-# Local container build/run helpers — environment-specific, not committed.
-/Dockerfile.nvidia
-/scripts/run_docker.sh
-
-# Local dlrmv4 analysis artifacts (per-job plots/scripts/dumps); kept on disk
-# for ad-hoc analysis but never committed.
-/analyze_*.py
-/dump_eval_*.py
-/perday_*.py
-/parse_lr_sweep.py
-/gen_canvas.py
-/plot_*.py
-/*.png
-/*.csv
-/scripts/bench_collectives.py
-/docs/v4_vs_v2_and_hstu_walkthrough.md

From 02f2d2b10db6f2ecbd5893f04821f541789d7e7e Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 22:11:21 +0000
Subject: [PATCH 091/113] dlrmv4: centralize MLPerf emission + fix submission
 identity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the MLPerf event stream into mlperf_logging_utils.py: a MLPerfRunTracker
state machine owns the block/eval/run markers, progress metadata, and the
convergence decision (replacing ~145 lines of closures in
streaming_train_eval_loop), and MLPerfLogger.log_run_start emits submission
info + hyperparameters + INIT_STOP/RUN_START (collapsing the inline block in
train_ranker).

Convergence/EVAL_ACCURACY is fixed to per-window AUC: drop the
eval_accuracy_auc_mode knob (gin + loop param + launch_slurm passthrough).

Submission identity: SUBMISSION_ORG defaults to AMD, SUBMISSION_PLATFORM to
MI355X (was the org name — a bug), both overridable via
$MLPERF_SUBMISSION_PLATFORM.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           |  15 +-
 .../dlrm_v3/train/mlperf_logging_utils.py     | 225 ++++++++++++++-
 .../dlrm_v3/train/train_ranker.py             |  58 +---
 .../dlrm_v3/train/utils.py                    | 260 ++++++++----------
 .../generative_recommenders/dlrm_v3/utils.py  |  79 ++++--
 recommendation_v4/scripts/launch_slurm.sh     |   8 +-
 6 files changed, 416 insertions(+), 229 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index dd4dd4715..8a3e5b595 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -325,6 +325,14 @@ sts/env_int.default = 150
 streaming_train_eval_loop.metric_log_frequency = @mlf/env_int()
 mlf/env_int.key     = "METRIC_LOG_FREQ"
 mlf/env_int.default = 50
+# MLPerf train_loss event cadence (global train steps), INDEPENDENT of
+# METRIC_LOG_FREQ above. 0 (default) = fall back to METRIC_LOG_FREQ, preserving
+# the prior coupled behavior. Set $MLPERF_TRAIN_LOSS_LOG_FREQ>0 to log the MLPerf
+# train_loss event at a different rate than the console/TB metrics. Disable the
+# whole MLPerf stream with $MLPERF_LOGGING=0.
+streaming_train_eval_loop.mlperf_train_loss_log_frequency = @mltlf/env_int()
+mltlf/env_int.key     = "MLPERF_TRAIN_LOSS_LOG_FREQ"
+mltlf/env_int.default = 0
 # Diagnostic: log per-batch unique/total embedding-id counts on logged steps
 # (rank 0). Quantifies the user-major batching redundancy and the realized
 # diversity from get_dataset.streaming_shuffle_fraction. Off; set $DIAG_UNIQUE_EMB=1.
@@ -453,11 +461,8 @@ MetricsLogger.world_size = 8
 MetricsLogger.auc_threshold = @at/env_float()
 at/env_float.key     = "AUC_THRESHOLD"
 at/env_float.default = 0.80275
-# Eval metric driving EVAL_ACCURACY + early-stop: "{window|lifetime}_{auc|gauc|
-# accuracy|ne}" (default window_auc). Override via $EVAL_ACCURACY_AUC_MODE.
-streaming_train_eval_loop.eval_accuracy_auc_mode = @eaam/env_str()
-eaam/env_str.key     = "EVAL_ACCURACY_AUC_MODE"
-eaam/env_str.default = "window_auc"
+# EVAL_ACCURACY + early-stop are driven by per-window AUC (window_auc) vs the
+# threshold above.
 # Lifetime-AUC backend, selectable independently for the train cumulative AUC and
 # the eval cumulative ("lifetime_*") AUC. Both default to "binned":
 #   "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an O(bins) score
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
index 47d544ad7..e190f0325 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
@@ -69,7 +69,8 @@ def __init__(
         log_path: Optional[str] = None,
         default_stack_offset: int = 2,
         benchmark_name: str = "hstu",
-        submitter_name: str = "reference_implementation",
+        submitter_name: str = "AMD",
+        submission_platform: str = "MI355X",
     ):
         self.enabled: bool = _MLLOG_AVAILABLE
         # Use the EXPLICIT caller rank: this is built before init_process_group,
@@ -77,6 +78,7 @@ def __init__(
         self.rank: int = rank if rank is not None else _rank()
         self.benchmark_name: str = benchmark_name
         self.submitter_name: str = submitter_name
+        self.submission_platform: str = submission_platform
         self._logger = None
         if not self.enabled:
             return
@@ -130,16 +132,211 @@ def end(
         if self.enabled and self.rank == 0:
             self._logger.end(key=key, value=value, metadata=metadata or {})
 
-    def submission_info(self, benchmark_name: str, submitter_name: str) -> None:
+    def submission_info(self) -> None:
         """Emit the five SUBMISSION_* events required for a valid submission."""
         if not (self.enabled and self.rank == 0):
             return
         c = mllog_constants
-        self.event(key=c.SUBMISSION_BENCHMARK, value=benchmark_name)
-        self.event(key=c.SUBMISSION_ORG, value=submitter_name)
+        self.event(key=c.SUBMISSION_BENCHMARK, value=self.benchmark_name)
+        self.event(key=c.SUBMISSION_ORG, value=self.submitter_name)
         self.event(key=c.SUBMISSION_DIVISION, value=c.CLOSED)
         self.event(key=c.SUBMISSION_STATUS, value=c.ONPREM)
-        self.event(key=c.SUBMISSION_PLATFORM, value=submitter_name)
+        self.event(key=c.SUBMISSION_PLATFORM, value=self.submission_platform)
+
+    def log_run_start(
+        self,
+        global_batch_size: int,
+        seed: int,
+        gradient_accumulation_steps: int = 1,
+    ) -> None:
+        """Emit submission info + core hyperparameters, then INIT_STOP + RUN_START.
+
+        Optimizer names/LRs are read from gin (dense Adam + sparse RowWiseAdagrad),
+        resolving env-macro refs to concrete values. Call once on a genuine cold
+        start, after the model is built. INIT_STOP/RUN_START barrier so the
+        timestamp reflects the slowest rank, so ALL ranks must call this together
+        (non-rank-0 / disabled calls no-op the emit but still hit the barrier).
+        """
+        c = self.constants
+        self.submission_info()
+        self.event(key=c.GLOBAL_BATCH_SIZE, value=int(global_batch_size))
+        self.event(
+            key=c.GRADIENT_ACCUMULATION_STEPS, value=int(gradient_accumulation_steps)
+        )
+        self.event(key=c.SEED, value=int(seed))
+        self.event(
+            key=c.OPT_NAME,
+            value=_gin_param("dense_optimizer_factory_and_class.optimizer_name", "Adam"),
+        )
+        self.event(
+            key=c.OPT_BASE_LR,
+            value=_gin_param("dense_optimizer_factory_and_class.learning_rate", None),
+        )
+        self.event(
+            key="opt_sparse_name",
+            value=_gin_param(
+                "sparse_optimizer_factory_and_class.optimizer_name", "RowWiseAdagrad"
+            ),
+        )
+        self.event(
+            key="opt_sparse_base_learning_rate",
+            value=_gin_param(
+                "sparse_optimizer_factory_and_class.learning_rate", None
+            ),
+        )
+        self.end(key=c.INIT_STOP, sync=True)
+        self.start(key=c.RUN_START, sync=True)
+
+
+def _gin_param(name: str, default: Any) -> Any:
+    """Read a gin-bound parameter, resolving env-macro refs to concrete values.
+
+    Returns ``default`` if the parameter is unbound or a macro ref cannot be
+    resolved (so env-overridden LRs log as numbers, not unencodable objects).
+    """
+    try:
+        value = gin.query_parameter(name)
+    except (ValueError, KeyError):
+        return default
+    if hasattr(value, "scoped_configurable_fn"):
+        try:
+            return value.scoped_configurable_fn()
+        except Exception:
+            return default
+    return value
+
+
+class MLPerfRunTracker:
+    """Centralized MLPerf run-boundary state machine for the streaming loop.
+
+    Owns the block/eval/run markers, the SAMPLES_COUNT/EPOCH_NUM progress
+    metadata, and the convergence decision (per-window AUC vs the configured
+    ``auc_threshold``). Every method no-ops when ``logger`` is None, so the
+    streaming loop can call them unconditionally. The convergence metric is
+    fixed to per-window AUC (higher-is-better).
+    """
+
+    # MetricsLogger.compute key short name for per-window AUC.
+    _EVAL_METRIC_SHORT = "window_auc"
+
+    def __init__(
+        self,
+        logger: Optional[MLPerfLogger],
+        metric_logger: Any,
+        total_train_samples: int,
+        rank: int,
+        device: Any,
+    ):
+        self.logger = logger
+        self.metric_logger = metric_logger
+        self.total_train_samples = int(total_train_samples)
+        self.rank = int(rank)
+        self.device = device
+        self.run_stopped: bool = False
+        # Idempotency flag so the boundary helpers and the outer loop can both
+        # call start/stop without risking a double BLOCK_START/STOP.
+        self._block_open: bool = False
+
+    @property
+    def enabled(self) -> bool:
+        return self.logger is not None
+
+    def _progress(self) -> Dict[str, Any]:
+        c = self.logger.constants
+        samples = self.metric_logger.cumulative_train_samples
+        epoch = (
+            samples / self.total_train_samples if self.total_train_samples > 0 else 0.0
+        )
+        return {c.SAMPLES_COUNT: samples, c.EPOCH_NUM: epoch}
+
+    def log_dataset_sizes(self, eval_samples: Optional[int] = None) -> None:
+        if not self.enabled:
+            return
+        c = self.logger.constants
+        self.logger.event(key=c.TRAIN_SAMPLES, value=self.total_train_samples)
+        if eval_samples is not None:
+            self.logger.event(key=c.EVAL_SAMPLES, value=int(eval_samples))
+
+    def block_start(self) -> None:
+        if self.enabled and not self._block_open:
+            self.logger.start(
+                key=self.logger.constants.BLOCK_START, metadata=self._progress()
+            )
+            self._block_open = True
+
+    def block_stop(self) -> None:
+        if self.enabled and self._block_open:
+            self.logger.end(
+                key=self.logger.constants.BLOCK_STOP, metadata=self._progress()
+            )
+            self._block_open = False
+
+    def eval_start(self) -> None:
+        if self.enabled:
+            self.logger.start(
+                key=self.logger.constants.EVAL_START, metadata=self._progress()
+            )
+
+    def _target_metric(self, metrics: Dict[str, float]) -> Optional[float]:
+        # Key format `metric/{prefix}_{name}/{task}` (see MetricsLogger.compute);
+        # match the per-window AUC short name.
+        for key, val in metrics.items():
+            short = key.split("/")[-2] if "/" in key else key
+            if short == self._EVAL_METRIC_SHORT:
+                return float(val)
+        return None
+
+    def _meets_target(self, value: Optional[float]) -> bool:
+        thr = self.metric_logger.auc_threshold
+        if value is None or thr is None:
+            return False
+        return value >= thr
+
+    def run_stop(self, status: object) -> None:
+        # Emit RUN_STOP exactly once, with an all-rank barrier so the timestamp
+        # reflects the slowest rank (MLPerf requirement).
+        if not self.enabled or self.run_stopped:
+            return
+        c = self.logger.constants
+        self.logger.end(
+            key=c.RUN_STOP,
+            metadata={c.STATUS: status, **self._progress()},
+            sync=True,
+        )
+        self.run_stopped = True
+
+    def eval_stop(self, eval_metrics: Dict[str, float]) -> bool:
+        # Emit EVAL_ACCURACY + EVAL_STOP, early SUCCESS RUN_STOP on target.
+        # Rank 0 decides + broadcasts the stop bool so all ranks break in lockstep
+        # (a per-rank test could diverge and hang the next all-to-all).
+        if not self.enabled:
+            return False
+        c = self.logger.constants
+        eval_value = self._target_metric(eval_metrics)
+        if eval_value is not None:
+            self.logger.event(
+                key=c.EVAL_ACCURACY, value=eval_value, metadata=self._progress()
+            )
+        self.logger.end(key=c.EVAL_STOP, metadata=self._progress())
+        decision = torch.zeros(1, device=self.device)
+        if self.rank == 0 and not self.run_stopped and self._meets_target(eval_value):
+            decision[0] = 1.0
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            torch.distributed.broadcast(decision, src=0)
+        should_stop = bool(decision.item() > 0.5)
+        if should_stop:
+            # All ranks agree -> all reach the RUN_STOP barrier together.
+            self.run_stop(c.SUCCESS)
+        return should_stop
+
+    def finalize(self, final_metrics: Dict[str, float]) -> None:
+        # End-of-run RUN_STOP when the target was never crossed: SUCCESS iff the
+        # final eval metric meets the target, else ABORTED.
+        if not self.enabled or self.run_stopped:
+            return
+        c = self.logger.constants
+        success = self._meets_target(self._target_metric(final_metrics))
+        self.run_stop(c.SUCCESS if success else c.ABORTED)
 
 
 @gin.configurable
@@ -147,19 +344,35 @@ def get_mlperf_logger(
     rank: int = 0,
     log_path: str = "",
     benchmark_name: str = "hstu",
-    submitter_name: str = "reference_implementation",
+    submitter_name: str = "AMD",
+    submission_platform: str = "MI355X",
 ) -> Optional[MLPerfLogger]:
     """Build a configured :class:`MLPerfLogger`, or ``None`` if unavailable.
 
     Path defaults to ``$MLPERF_LOG_PATH``. Returns ``None`` (not a disabled
     logger) so callers' ``is not None`` guards cleanly skip logging.
+
+    Disable knob: set ``$MLPERF_LOGGING=0`` (or false/no/off) to turn the whole
+    MLPerf event stream off — returns ``None`` on EVERY rank, so the train loop's
+    ``is not None`` guards skip emission AND the cross-rank train-loss all-reduce
+    in lockstep. Default (unset / "1") = enabled, preserving prior behavior.
     """
     if not _MLLOG_AVAILABLE:
         return None
+    if os.environ.get("MLPERF_LOGGING", "1").strip().lower() in (
+        "0", "false", "no", "off",
+    ):
+        logger.info("MLPerf logging disabled via $MLPERF_LOGGING=0")
+        return None
     resolved_path = os.environ.get("MLPERF_LOG_PATH", log_path)
+    # SUBMISSION_PLATFORM defaults to "MI355X"; override per-submitter via env.
+    resolved_platform = os.environ.get(
+        "MLPERF_SUBMISSION_PLATFORM", submission_platform
+    )
     return MLPerfLogger(
         rank=rank,
         log_path=resolved_path,
         benchmark_name=benchmark_name,
         submitter_name=submitter_name,
+        submission_platform=resolved_platform,
     )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 30b081946..50d359ef6 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -206,60 +206,14 @@ def _main_func(
         mlperf_logger is not None and mlperf_init_logged and resume_cold_start
     )
     if mlperf_run_active:
-        c = mlperf_logger.constants
-        mlperf_logger.submission_info(
-            benchmark_name=mlperf_logger.benchmark_name,
-            submitter_name=mlperf_logger.submitter_name,
+        # Submission info + hyperparameters + INIT_STOP/RUN_START, all emitted by
+        # the logger (optimizer names/LRs read from gin internally). Seed is the
+        # value setup() resolved and exported to $SEED.
+        mlperf_logger.log_run_start(
+            global_batch_size=world_size * int(train_dataloader.batch_size),
+            seed=int(os.environ.get("SEED", "1")),
         )
 
-        def _gin_param(name: str, default: object) -> object:
-            try:
-                value = gin.query_parameter(name)
-            except (ValueError, KeyError):
-                return default
-            # Resolve gin macro refs (e.g. @dlr/env_float()) to real values so
-            # env-overridden LRs log as numbers, not unencodable objects.
-            if hasattr(value, "scoped_configurable_fn"):
-                try:
-                    return value.scoped_configurable_fn()
-                except Exception:
-                    return default
-            return value
-
-        global_batch_size = world_size * int(train_dataloader.batch_size)
-        mlperf_logger.event(key=c.GLOBAL_BATCH_SIZE, value=global_batch_size)
-        mlperf_logger.event(key=c.GRADIENT_ACCUMULATION_STEPS, value=1)
-        # Actual seed chosen in setup() (exported to $SEED).
-        mlperf_logger.event(key=c.SEED, value=int(os.environ.get("SEED", "1")))
-        # Dense (Adam) + sparse (RowWiseAdagrad) optimizer hyperparameters from gin.
-        mlperf_logger.event(
-            key=c.OPT_NAME,
-            value=_gin_param(
-                "dense_optimizer_factory_and_class.optimizer_name", "Adam"
-            ),
-        )
-        mlperf_logger.event(
-            key=c.OPT_BASE_LR,
-            value=_gin_param(
-                "dense_optimizer_factory_and_class.learning_rate", None
-            ),
-        )
-        mlperf_logger.event(
-            key="opt_sparse_name",
-            value=_gin_param(
-                "sparse_optimizer_factory_and_class.optimizer_name",
-                "RowWiseAdagrad",
-            ),
-        )
-        mlperf_logger.event(
-            key="opt_sparse_base_learning_rate",
-            value=_gin_param(
-                "sparse_optimizer_factory_and_class.learning_rate", None
-            ),
-        )
-        mlperf_logger.end(key=c.INIT_STOP, sync=True)
-        mlperf_logger.start(key=c.RUN_START, sync=True)
-
     # train loop
     try:
         if mode == "train":
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index bf5fd5512..526a6f89d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -1551,6 +1551,11 @@ def streaming_train_eval_loop(
     num_eval_batches: Optional[int] = None,
     output_trace: bool = False,
     metric_log_frequency: int = 1,
+    # MLPerf `train_loss` event cadence, in global train steps, INDEPENDENT of
+    # metric_log_frequency (the console/TB cadence). 0 = fall back to
+    # metric_log_frequency (preserves prior coupled behavior). Wired to
+    # $MLPERF_TRAIN_LOSS_LOG_FREQ via gin.
+    mlperf_train_loss_log_frequency: int = 0,
     checkpoint_frequency: int = 100,
     start_ts: int = 0,
     persistent_loader: bool = False,
@@ -1590,10 +1595,6 @@ def streaming_train_eval_loop(
     die_at_step: int = -1,
     # MLPerf logger (rank-0-gated); None disables all MLPerf event emission.
     mlperf_logger: Optional[Any] = None,
-    # Which eval metric drives EVAL_ACCURACY + the convergence decision. Format
-    # "{window|lifetime}_{auc|gauc|accuracy|ne}" (bare "window"/"lifetime" => auc).
-    # Override via $EVAL_ACCURACY_AUC_MODE.
-    eval_accuracy_auc_mode: str = "window_auc",
 ) -> None:
     """Streaming train+eval loop with per-window (and optionally mid-window)
     checkpoints.
@@ -1646,6 +1647,14 @@ def streaming_train_eval_loop(
             "the data-fraction cadence set EVAL_EVERY_N_WINDOWS=0; to use the "
             "per-window cadence set EVAL_EVERY_DATA_PCT=0."
         )
+    # MLPerf train_loss cadence: independent of metric_log_frequency. 0 (the
+    # env-binding default) falls back to metric_log_frequency so unset behavior
+    # matches the prior coupled implementation.
+    mlperf_loss_every = (
+        mlperf_train_loss_log_frequency
+        if mlperf_train_loss_log_frequency and mlperf_train_loss_log_frequency > 0
+        else metric_log_frequency
+    )
     profiler = Profiler(rank) if output_trace else None
     # Normalize the per-window caps: <=0 (the env-binding default) means "no cap
     # = consume the full window". The eval-break check below is `is not None and
@@ -1959,6 +1968,11 @@ def _run_train_window(
                     len(sample.candidates_features_kjt.keys()), -1
                 )[0],
             )
+            # MLPerf train_loss event on its own cadence (decoupled from the
+            # console/TB metric cadence below). Called every step; the cross-rank
+            # all-reduce only fires on the cadence, gated by the rank-identical
+            # global_step inside the method, so it stays in lockstep.
+            metric_logger.maybe_log_mlperf_train_loss(aux_losses, every=mlperf_loss_every)
             if train_batch_idx % metric_log_frequency == 0:
                 metric_logger.compute_and_log(
                     mode="train",
@@ -2028,6 +2042,12 @@ def _run_train_window(
                 do_eval(train_ts, gstep)
                 model.train()
                 dataset.dataset.is_eval = False  # pyre-ignore [16]
+                # Data-fraction eval may hit the MLPerf target and emit RUN_STOP
+                # (via _do_eval_*). Stop the window immediately so we don't train
+                # past the convergence point; the outer window loop checks the
+                # same flag and breaks too.
+                if mlt.run_stopped:
+                    break
             # Test-only: deterministic crash for the failure-injection test.
             # Triggered AFTER the save above, so on resume we re-enter at
             # batch_idx_in_window=train_batch_idx and emit batches [K+1, end).
@@ -2238,15 +2258,14 @@ def _should_eval(i: int) -> bool:
                 dataset.dataset._train_split_percentage,  # pyre-ignore[16]
             )
 
-    # --- MLPerf progress accounting + event helpers ---------------------------
+    # --- MLPerf run tracking --------------------------------------------------
     # total_train_samples = epoch_num denominator (global trainable samples over
     # the window range), computed once and logged as TRAIN_SAMPLES.
     total_train_samples = 0
-    mlperf_run_stopped = [False]
     if mlperf_logger is not None:
-        _twi = getattr(dataset.dataset, "train_window_indices", None)
-        _wi = getattr(dataset.dataset, "window_indices", None)
-        _idx_fn = _twi or _wi
+        _idx_fn = getattr(
+            dataset.dataset, "train_window_indices", None
+        ) or getattr(dataset.dataset, "window_indices", None)
         if _idx_fn is not None:
             for _ts in train_ts_list:
                 total_train_samples += int(_idx_fn(_ts).size)
@@ -2256,15 +2275,7 @@ def _should_eval(i: int) -> bool:
                 total_train_samples,
                 n_train,
             )
-        mlperf_logger.event(
-            key=mlperf_logger.constants.TRAIN_SAMPLES, value=total_train_samples
-        )
-        if eval_global_indices is not None:
-            mlperf_logger.event(
-                key=mlperf_logger.constants.EVAL_SAMPLES,
-                value=int(eval_global_indices.size),
-            )
-        # Wire the logger + LR getter so compute_and_log emits the train_loss event.
+        # Wire the logger + LR getter so MetricsLogger.compute emits train_loss.
         metric_logger.mlperf_logger = mlperf_logger
 
         def _current_lr() -> float:
@@ -2272,110 +2283,26 @@ def _current_lr() -> float:
 
         metric_logger.lr_getter = _current_lr
 
-    def _mlperf_progress() -> Dict[str, Any]:
-        samples = metric_logger.cumulative_train_samples
-        epoch_num = (samples / total_train_samples) if total_train_samples > 0 else 0.0
-        return {
-            mlperf_logger.constants.SAMPLES_COUNT: samples,
-            mlperf_logger.constants.EPOCH_NUM: epoch_num,
-        }
-
-    # Convergence/EVAL_ACCURACY metric short name selected by
-    # eval_accuracy_auc_mode. Bare "window"/"lifetime" defaults the metric to auc.
-    _eval_metric_short = str(eval_accuracy_auc_mode).strip().lower()
-    if _eval_metric_short in ("window", "lifetime"):
-        _eval_metric_short = f"{_eval_metric_short}_auc"
-    # NE is lower-is-better; auc/gauc/accuracy are higher-is-better.
-    _eval_lower_is_better = _eval_metric_short.endswith("_ne")
-    if rank == 0 and mlperf_logger is not None:
-        logger.info(
-            f"[mlperf] EVAL_ACCURACY / convergence metric = {_eval_metric_short} "
-            f"(stop when {'<=' if _eval_lower_is_better else '>='} threshold)"
-        )
-
-    def _eval_target_metric(metrics: Dict[str, float]) -> Optional[float]:
-        # listen_plus eval metric selected by eval_accuracy_auc_mode. Key format
-        # is `metric/{prefix}_{name}/{task}` (see MetricsLogger.compute).
-        for key, val in metrics.items():
-            short = key.split("/")[-2] if "/" in key else key
-            if short == _eval_metric_short:
-                return float(val)
-        return None
-
-    def _meets_target(value: Optional[float], thr: Optional[float]) -> bool:
-        if value is None or thr is None:
-            return False
-        return value <= thr if _eval_lower_is_better else value >= thr
-
-    def _mlperf_block_start() -> None:
-        if mlperf_logger is not None:
-            mlperf_logger.start(
-                key=mlperf_logger.constants.BLOCK_START, metadata=_mlperf_progress()
-            )
-
-    def _mlperf_block_stop() -> None:
-        if mlperf_logger is not None:
-            mlperf_logger.end(
-                key=mlperf_logger.constants.BLOCK_STOP, metadata=_mlperf_progress()
-            )
-
-    def _mlperf_eval_start() -> None:
-        if mlperf_logger is not None:
-            mlperf_logger.start(
-                key=mlperf_logger.constants.EVAL_START, metadata=_mlperf_progress()
-            )
-
-    def _mlperf_run_stop(status: object) -> None:
-        # Emit RUN_STOP exactly once, with an all-rank barrier so the timestamp
-        # reflects the slowest rank (MLPerf requirement).
-        if mlperf_logger is None or mlperf_run_stopped[0]:
-            return
-        mlperf_logger.end(
-            key=mlperf_logger.constants.RUN_STOP,
-            metadata={mlperf_logger.constants.STATUS: status, **_mlperf_progress()},
-            sync=True,
-        )
-        mlperf_run_stopped[0] = True
+    # Centralized MLPerf run-boundary state machine: owns block/eval/run markers,
+    # SAMPLES_COUNT/EPOCH_NUM progress metadata, and the per-window-AUC vs
+    # auc_threshold convergence decision. Every method no-ops when mlperf_logger
+    # is None, so the loop below calls them unconditionally.
+    from generative_recommenders.dlrm_v3.train.mlperf_logging_utils import (
+        MLPerfRunTracker,
+    )
 
-    def _mlperf_eval_stop(eval_metrics: Dict[str, float]) -> bool:
-        # Emit EVAL_ACCURACY + EVAL_STOP, early SUCCESS RUN_STOP on target.
-        # Rank 0 decides + broadcasts the stop bool so all ranks break in lockstep
-        # (a per-rank test could diverge and hang the next all-to-all).
-        if mlperf_logger is None:
-            return False
-        eval_value = _eval_target_metric(eval_metrics)
-        if eval_value is not None:
-            mlperf_logger.event(
-                key=mlperf_logger.constants.EVAL_ACCURACY,
-                value=eval_value,
-                metadata=_mlperf_progress(),
-            )
-        mlperf_logger.end(
-            key=mlperf_logger.constants.EVAL_STOP, metadata=_mlperf_progress()
-        )
-        thr = metric_logger.auc_threshold
-        decision = torch.zeros(1, device=device)
-        if rank == 0 and not mlperf_run_stopped[0] and _meets_target(eval_value, thr):
-            decision[0] = 1.0
-        if torch.distributed.is_initialized():
-            torch.distributed.broadcast(decision, src=0)
-        should_stop = bool(decision.item() > 0.5)
-        if should_stop:
-            # All ranks agree -> all reach the RUN_STOP barrier together.
-            _mlperf_run_stop(mlperf_logger.constants.SUCCESS)
-        return should_stop
-
-    def _mlperf_finalize(final_metrics: Dict[str, float]) -> None:
-        # End-of-run RUN_STOP when the target was never crossed: SUCCESS iff the
-        # final eval metric meets the target, else ABORTED.
-        if mlperf_logger is None or mlperf_run_stopped[0]:
-            return
-        success = _meets_target(_eval_target_metric(final_metrics), metric_logger.auc_threshold)
-        _mlperf_run_stop(
-            mlperf_logger.constants.SUCCESS
-            if success
-            else mlperf_logger.constants.ABORTED
-        )
+    mlt = MLPerfRunTracker(
+        logger=mlperf_logger,
+        metric_logger=metric_logger,
+        total_train_samples=total_train_samples,
+        rank=rank,
+        device=device,
+    )
+    mlt.log_dataset_sizes(
+        eval_samples=eval_global_indices.size
+        if eval_global_indices is not None
+        else None
+    )
 
     if persistent_loader and double_buffer:
         # Double-buffered: next window prepared in the background during the
@@ -2427,15 +2354,37 @@ def _mlperf_finalize(final_metrics: Dict[str, float]) -> None:
         # reset, not a fork — the only fork was the up-front iter() above), so it
         # stays safe alongside the background window-prefetch thread.
         def _do_eval_db(train_ts: int, gstep: int) -> None:
+            # Data-fraction eval boundary: this closes the current MLPerf block,
+            # runs the holdout eval with full EVAL_START/EVAL_STOP + EVAL_ACCURACY
+            # + convergence, then opens the next block. The block thus brackets
+            # exactly one eval_interval_steps of training (MLPerf block == work
+            # between two evals), instead of one timestamp window.
             dataset.dataset.is_eval = True  # pyre-ignore [16]
             assert eval_dl is not None
-            _run_eval_window(
+            mlt.block_stop()
+            mlt.eval_start()
+            eval_metrics = _run_eval_window(
                 iter(eval_dl),
                 label=f"eval_holdout@train_ts={train_ts}@step={gstep}",
             )
+            # Emits RUN_STOP (sets mlt.run_stopped) if the target is met;
+            # _run_train_window / the window loop break on that flag.
+            mlt.eval_stop(eval_metrics)
+            if not mlt.run_stopped:
+                mlt.block_start()
 
         _db_do_eval = _do_eval_db if eval_interval_steps > 0 else None
 
+        # Block placement depends on the eval cadence. Per-window cadence
+        # (eval_every_n_windows>0): one block per timestamp window. Otherwise
+        # (data-fraction cadence, or no eval): a single block spans the whole
+        # run, split at each data-fraction eval boundary by _do_eval_db. Open
+        # the first block here for the latter; the boundary helper + the
+        # post-loop stop handle the rest.
+        _per_window_blocks = eval_every_n_windows > 0
+        if not _per_window_blocks:
+            mlt.block_start()
+
         for i, (train_ts, train_data_iterator) in enumerate(
             # Only the FIRST window after a mid-window resume needs the skip
             # (handed via prefetcher.stream's first_skip_samples). The skip is
@@ -2452,7 +2401,8 @@ def _do_eval_db(train_ts: int, gstep: int) -> None:
                 if i == 0 and resume_batch_idx_in_window > 0
                 else 0
             )
-            _mlperf_block_start()
+            if _per_window_blocks:
+                mlt.block_start()
             _run_train_window(
                 train_data_iterator,
                 train_ts=train_ts,
@@ -2460,16 +2410,17 @@ def _do_eval_db(train_ts: int, gstep: int) -> None:
                 label=f"train_ts={train_ts}",
                 do_eval=_db_do_eval,
             )
-            _mlperf_block_stop()
+            if _per_window_blocks:
+                mlt.block_stop()
             should_stop = False
             if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
                 assert eval_sampler is not None and eval_dl is not None
-                _mlperf_eval_start()
+                mlt.eval_start()
                 eval_metrics = _run_eval_window(
                     eval_iter, label=f"eval_holdout@train_ts={train_ts}"
                 )
-                should_stop = _mlperf_eval_stop(eval_metrics)
+                should_stop = mlt.eval_stop(eval_metrics)
                 # Re-arm the (already-forked) eval pool for the NEXT eval. The
                 # holdout set is fixed, so the sampler window is unchanged; we
                 # only need a fresh iter() to replay it. iter() reuses the
@@ -2480,17 +2431,31 @@ def _do_eval_db(train_ts: int, gstep: int) -> None:
                 if next_eval_i is not None:
                     eval_iter = iter(eval_dl)
             _maybe_checkpoint(train_ts)
-            if should_stop:
+            # should_stop: per-window convergence. mlt.run_stopped:
+            # data-fraction convergence (RUN_STOP fired mid-window by _do_eval_db).
+            if should_stop or mlt.run_stopped:
                 # MLPerf target reached: RUN_STOP already emitted; stop training.
                 break
+
+        # Close the run-spanning block for the data-fraction / no-eval case.
+        # Idempotent: a no-op if the last eval boundary already closed it (i.e.
+        # convergence stopped the run) or if per-window blocks were used.
+        if not _per_window_blocks:
+            mlt.block_stop()
     else:
         # Data-fraction eval callback (non-double-buffer path). Builds a fresh
         # eval dataloader per call over the FIXED holdout set (or the legacy
         # next-window eval when the dataset has no holdout support).
         def _do_eval_nb(train_ts: int, gstep: int) -> None:
+            # Data-fraction eval boundary (non-double-buffer path). See _do_eval_db:
+            # close the current MLPerf block, run the eval with full markers +
+            # convergence, then open the next block so a block brackets one
+            # eval_interval_steps of training rather than a timestamp window.
             dataset.dataset.is_eval = True  # pyre-ignore [16]
+            mlt.block_stop()
+            mlt.eval_start()
             if eval_global_indices is not None:
-                _run_eval_window(
+                eval_metrics = _run_eval_window(
                     iter(
                         make_streaming_dataloader(
                             dataset=dataset, indices=eval_global_indices
@@ -2499,13 +2464,23 @@ def _do_eval_nb(train_ts: int, gstep: int) -> None:
                     label=f"eval_holdout@train_ts={train_ts}@step={gstep}",
                 )
             else:
-                _run_eval_window(
+                eval_metrics = _run_eval_window(
                     iter(make_streaming_dataloader(dataset=dataset, ts=train_ts + 1)),
                     label=f"eval@train_ts={train_ts}@step={gstep}",
                 )
+            mlt.eval_stop(eval_metrics)
+            if not mlt.run_stopped:
+                mlt.block_start()
 
         _nb_do_eval = _do_eval_nb if eval_interval_steps > 0 else None
 
+        # See the double-buffer branch: per-window blocks for the per-window
+        # cadence, else a single run-spanning block split at data-fraction eval
+        # boundaries by _do_eval_nb.
+        _per_window_blocks = eval_every_n_windows > 0
+        if not _per_window_blocks:
+            mlt.block_start()
+
         for i, train_ts in enumerate(train_ts_list):
             dataset.dataset.is_eval = False  # pyre-ignore [16]
             skip = first_skip_samples if i == 0 else 0
@@ -2514,18 +2489,20 @@ def _do_eval_nb(train_ts: int, gstep: int) -> None:
                 if i == 0 and resume_batch_idx_in_window > 0
                 else 0
             )
-            _mlperf_block_start()
+            if _per_window_blocks:
+                mlt.block_start()
             _run_train_window(
                 _window_iter(train_ts, skip_samples=skip),
                 train_ts=train_ts,
                 start_batch_idx=start_batch,
                 do_eval=_nb_do_eval,
             )
-            _mlperf_block_stop()
+            if _per_window_blocks:
+                mlt.block_stop()
             should_stop = False
             if _should_eval(i):
                 dataset.dataset.is_eval = True  # pyre-ignore [16]
-                _mlperf_eval_start()
+                mlt.eval_start()
                 if eval_global_indices is not None:
                     eval_metrics = _run_eval_window(
                         iter(
@@ -2540,17 +2517,24 @@ def _do_eval_nb(train_ts: int, gstep: int) -> None:
                     eval_metrics = _run_eval_window(
                         iter(make_streaming_dataloader(dataset=dataset, ts=train_ts + 1))
                     )
-                should_stop = _mlperf_eval_stop(eval_metrics)
+                should_stop = mlt.eval_stop(eval_metrics)
             _maybe_checkpoint(train_ts)
-            if should_stop:
+            # should_stop: per-window convergence. mlt.run_stopped:
+            # data-fraction convergence (RUN_STOP fired mid-window by _do_eval_nb).
+            if should_stop or mlt.run_stopped:
                 # MLPerf target reached: RUN_STOP already emitted; stop training.
                 break
 
+        # Close the run-spanning block for the data-fraction / no-eval case
+        # (idempotent; no-op under per-window blocks or after a convergence stop).
+        if not _per_window_blocks:
+            mlt.block_stop()
+
     # Final eval over the fixed user-holdout set (legacy final-window eval
     # otherwise). Skipped if the MLPerf target already stopped the run mid-run.
-    if not mlperf_run_stopped[0]:
+    if not mlt.run_stopped:
         dataset.dataset.is_eval = True  # pyre-ignore [16]
-        _mlperf_eval_start()
+        mlt.eval_start()
         if eval_global_indices is not None:
             final_metrics = _run_eval_window(
                 iter(
@@ -2565,9 +2549,9 @@ def _do_eval_nb(train_ts: int, gstep: int) -> None:
                 iter(make_streaming_dataloader(dataset=dataset, ts=num_train_ts)),
                 label="eval@final",
             )
-        _mlperf_eval_stop(final_metrics)
+        mlt.eval_stop(final_metrics)
         if rank == 0:
             for k, v in final_metrics.items():
                 print(f"{k}: {v}")
         # End-of-run RUN_STOP: SUCCESS if final metric met target, else ABORTED.
-        _mlperf_finalize(final_metrics)
+        mlt.finalize(final_metrics)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 879c93b97..281a37b5d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1213,6 +1213,52 @@ def _emit(
         )
         return all_computed_metrics
 
+    def _global_mean_loss(self, loss_terms: Dict[str, torch.Tensor]) -> float:
+        """Cross-rank mean of the summed per-task losses.
+
+        The 1-element all-reduce MUST run on every rank in lockstep; callers gate
+        it on a rank-identical counter (global_step / a deterministic frequency)
+        so it cannot desync.
+        """
+        loss_t = torch.stack(
+            [v.detach().float().sum() for v in loss_terms.values()]
+        ).sum()
+        if torch.distributed.is_available() and torch.distributed.is_initialized():
+            torch.distributed.all_reduce(loss_t, op=torch.distributed.ReduceOp.SUM)
+            loss_t = loss_t / self._world_size
+        return float(loss_t)
+
+    def maybe_log_mlperf_train_loss(
+        self, aux_losses: Dict[str, torch.Tensor], every: int
+    ) -> None:
+        """Emit the MLPerf ``train_loss`` event on its OWN cadence.
+
+        Decoupled from the console/TB metric cadence (``compute_and_log``): call
+        this every train step with the just-computed ``aux_losses`` and the
+        desired interval ``every`` (in global train steps). The cross-rank loss
+        all-reduce only fires on the cadence, gated by ``global_step["train"]``
+        which is incremented identically on all ranks in ``update()`` — so the
+        collective stays in lockstep. No-op when MLPerf logging is disabled
+        (``mlperf_logger is None`` on every rank) or ``every <= 0``.
+        """
+        if self.mlperf_logger is None or every <= 0 or not aux_losses:
+            return
+        if self.global_step["train"] % every != 0:
+            return
+        train_loss = self._global_mean_loss(aux_losses)
+        c = self.mlperf_logger.constants
+        md: Dict[str, Any] = {c.SAMPLES_COUNT: self.cumulative_train_samples}
+        if self.lr_getter is not None:
+            try:
+                md["lr"] = float(self.lr_getter())
+            except Exception:
+                pass
+        self.mlperf_logger.event(
+            key=getattr(c, "TRAIN_LOSS", "train_loss"),
+            value=train_loss,
+            metadata=md,
+        )
+
     def compute_and_log(
         self,
         mode: str = "train",
@@ -1249,21 +1295,15 @@ def compute_and_log(
                         global_step=self.global_step[mode],
                     )
 
-        # Global (cross-rank mean) train loss to console/TB + the MLPerf
-        # train_loss event, at the metric-logging cadence. The 1-element
-        # all-reduce runs on every rank in lockstep, so it cannot desync.
+        # Global (cross-rank mean) train loss to console/TB at the metric-logging
+        # cadence. The 1-element all-reduce runs on every rank in lockstep, so it
+        # cannot desync. The MLPerf `train_loss` EVENT is emitted separately via
+        # ``maybe_log_mlperf_train_loss`` so its cadence can be tuned independently
+        # of this console/TB cadence (see that method).
         if mode == "train" and additional_logs is not None and "losses" in additional_logs:
             loss_terms = additional_logs["losses"]
             if loss_terms:
-                loss_t = torch.stack(
-                    [v.detach().float().sum() for v in loss_terms.values()]
-                ).sum()
-                if torch.distributed.is_available() and torch.distributed.is_initialized():
-                    torch.distributed.all_reduce(
-                        loss_t, op=torch.distributed.ReduceOp.SUM
-                    )
-                    loss_t = loss_t / self._world_size
-                train_loss = float(loss_t)
+                train_loss = self._global_mean_loss(loss_terms)
                 self.tb_logger.add_scalar(
                     "train_loss", train_loss, global_step=self.global_step["train"]
                 )
@@ -1272,21 +1312,6 @@ def compute_and_log(
                         f"train - Step {self.global_step['train']} "
                         f"train_loss={train_loss:.5f}"
                     )
-                if self.mlperf_logger is not None:
-                    c = self.mlperf_logger.constants
-                    md: Dict[str, Any] = {
-                        c.SAMPLES_COUNT: self.cumulative_train_samples,
-                    }
-                    if self.lr_getter is not None:
-                        try:
-                            md["lr"] = float(self.lr_getter())
-                        except Exception:
-                            pass
-                    self.mlperf_logger.event(
-                        key=getattr(c, "TRAIN_LOSS", "train_loss"),
-                        value=train_loss,
-                        metadata=md,
-                    )
 
         # Throughput metrics (train only). One GPU->CPU sync per call.
         if mode == "train" and self._perf_steps_in_window > 0:
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 5ab1e950a..9d8e90921 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -303,6 +303,8 @@ orchestrate() {
       -e NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
       -e NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
       -e METRIC_LOG_FREQ=$METRIC_LOG_FREQ \
+      ${MLPERF_LOGGING:+-e MLPERF_LOGGING=$MLPERF_LOGGING} \
+      ${MLPERF_TRAIN_LOSS_LOG_FREQ:+-e MLPERF_TRAIN_LOSS_LOG_FREQ=$MLPERF_TRAIN_LOSS_LOG_FREQ} \
       ${STREAMING_SHUFFLE_FRACTION:+-e STREAMING_SHUFFLE_FRACTION=$STREAMING_SHUFFLE_FRACTION} \
       ${STREAMING_SHUFFLE_SEED:+-e STREAMING_SHUFFLE_SEED=$STREAMING_SHUFFLE_SEED} \
       ${NUM_WORKERS:+-e NUM_WORKERS=$NUM_WORKERS} \
@@ -326,7 +328,11 @@ orchestrate() {
       ${CKPT_STEP_FREQ:+-e CKPT_STEP_FREQ=$CKPT_STEP_FREQ} \
       -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
       -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
-      ${EVAL_ACCURACY_AUC_MODE:+-e EVAL_ACCURACY_AUC_MODE=$EVAL_ACCURACY_AUC_MODE} \
+      ${MLPERF_SUBMISSION_PLATFORM:+-e MLPERF_SUBMISSION_PLATFORM=$MLPERF_SUBMISSION_PLATFORM} \
+      ${TRAIN_LIFETIME_AUC_MODE:+-e TRAIN_LIFETIME_AUC_MODE=$TRAIN_LIFETIME_AUC_MODE} \
+      ${EVAL_LIFETIME_AUC_MODE:+-e EVAL_LIFETIME_AUC_MODE=$EVAL_LIFETIME_AUC_MODE} \
+      ${CUMULATIVE_AUC_BINS:+-e CUMULATIVE_AUC_BINS=$CUMULATIVE_AUC_BINS} \
+      ${LIFETIME_AUC_WINDOW:+-e LIFETIME_AUC_WINDOW=$LIFETIME_AUC_WINDOW} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
       -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \

From ae99293c463e376477d4ac4604e63646df449dd9 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 22:34:29 +0000
Subject: [PATCH 092/113] dlrmv4: hardcode lifetime-AUC backend to binned, drop
 the override

The lifetime cumulative AUC always uses the exact binned backend now. Remove
the TRAIN_LIFETIME_AUC_MODE / EVAL_LIFETIME_AUC_MODE env overrides (and the
capped-only LIFETIME_AUC_WINDOW knob, now dead) from gin and launch_slurm.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 25 ++++++-------------
 .../dlrm_v3/train/train_ranker.py             |  8 +++---
 .../generative_recommenders/dlrm_v3/utils.py  |  8 +++---
 recommendation_v4/scripts/launch_slurm.sh     |  3 ---
 4 files changed, 15 insertions(+), 29 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 8a3e5b595..3a938da36 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -463,29 +463,18 @@ at/env_float.key     = "AUC_THRESHOLD"
 at/env_float.default = 0.80275
 # EVAL_ACCURACY + early-stop are driven by per-window AUC (window_auc) vs the
 # threshold above.
-# Lifetime-AUC backend, selectable independently for the train cumulative AUC and
-# the eval cumulative ("lifetime_*") AUC. Both default to "binned":
-#   "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an O(bins) score
-#     histogram (additive all-reduce, memory independent of #samples/#windows).
-#   "capped" = LifetimeAUCMetricComputation: AUC over a trailing buffer of
-#     `lifetime_auc_window` samples/rank (legacy; per-rank buffer all-gathered).
-# Override per-run via $TRAIN_LIFETIME_AUC_MODE / $EVAL_LIFETIME_AUC_MODE.
-MetricsLogger.train_lifetime_auc_mode = @tlam/env_str()
-tlam/env_str.key     = "TRAIN_LIFETIME_AUC_MODE"
-tlam/env_str.default = "binned"
-MetricsLogger.eval_lifetime_auc_mode = @elam/env_str()
-elam/env_str.key     = "EVAL_LIFETIME_AUC_MODE"
-elam/env_str.default = "binned"
+# Lifetime-AUC backend for the train + eval cumulative ("lifetime_*") AUC.
+# Hardcoded to "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an
+# O(bins) score histogram (additive all-reduce, memory independent of
+# #samples/#windows). The legacy "capped" trailing-buffer backend is no longer
+# selectable.
+MetricsLogger.train_lifetime_auc_mode = "binned"
+MetricsLogger.eval_lifetime_auc_mode = "binned"
 # Score-histogram resolution for the "binned" backend. Higher = finer AUC
 # resolution at O(bins) memory. Override via $CUMULATIVE_AUC_BINS.
 MetricsLogger.cumulative_auc_bins = @cab/env_int()
 cab/env_int.key     = "CUMULATIVE_AUC_BINS"
 cab/env_int.default = 100000
-# Trailing-buffer size (samples/rank) for the "capped" backend. Override via
-# $LIFETIME_AUC_WINDOW. Ignored when the backend is "binned".
-MetricsLogger.lifetime_auc_window = @law/env_int()
-law/env_int.key     = "LIFETIME_AUC_WINDOW"
-law/env_int.default = 10000000
 # Checkpointing disabled by default — a full DMP checkpoint is ~100s of GB and
 # the streaming loop always saves on the final window. save_dmp_checkpoint
 # no-ops on the empty path. Set $CKPT_PATH to a directory to re-enable; the
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 50d359ef6..81428a603 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -181,10 +181,10 @@ def _main_func(
         gpu_peak_flops=gpu_peak_flops,
         model=model,
         eval_cumulative=(mode == "streaming-train-eval"),
-        # Lifetime-AUC backend + bins/window come from gin (see yambda_5b.gin:
-        # MetricsLogger.{train,eval}_lifetime_auc_mode / cumulative_auc_bins /
-        # lifetime_auc_window), env-overridable. eval_cumulative stays explicit
-        # because it is runtime-mode dependent, not a config knob.
+        # Lifetime-AUC backend ("binned") + bins come from gin (see yambda_5b.gin:
+        # MetricsLogger.{train,eval}_lifetime_auc_mode / cumulative_auc_bins).
+        # eval_cumulative stays explicit because it is runtime-mode dependent,
+        # not a config knob.
     )
     # Capture streaming resume hint (None for cold start / non-streaming
     # checkpoints). For the streaming-train-eval mode, we forward this into
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 281a37b5d..7c11b4b5a 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1484,11 +1484,11 @@ def env_str(key: str = "", default: str = "") -> str:
     """Resolve a string from os.environ[key], falling back to `default`.
 
     Companion to `env_int`/`env_float` for categorical/string overrides (e.g. a
-    metric backend selector). Example gin usage:
+    strategy selector). Example gin usage:
 
-        MetricsLogger.train_lifetime_auc_mode = @tlam/env_str()
-        tlam/env_str.key     = "TRAIN_LIFETIME_AUC_MODE"
-        tlam/env_str.default = "binned"
+        get_dataset.history_strategy = @hs/env_str()
+        hs/env_str.key     = "HISTORY_STRATEGY"
+        hs/env_str.default = "interleaved"
     """
     raw = os.environ.get(key) if key else None
     return raw if raw else default
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 9d8e90921..dc03962a9 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -329,10 +329,7 @@ orchestrate() {
       -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
       -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
       ${MLPERF_SUBMISSION_PLATFORM:+-e MLPERF_SUBMISSION_PLATFORM=$MLPERF_SUBMISSION_PLATFORM} \
-      ${TRAIN_LIFETIME_AUC_MODE:+-e TRAIN_LIFETIME_AUC_MODE=$TRAIN_LIFETIME_AUC_MODE} \
-      ${EVAL_LIFETIME_AUC_MODE:+-e EVAL_LIFETIME_AUC_MODE=$EVAL_LIFETIME_AUC_MODE} \
       ${CUMULATIVE_AUC_BINS:+-e CUMULATIVE_AUC_BINS=$CUMULATIVE_AUC_BINS} \
-      ${LIFETIME_AUC_WINDOW:+-e LIFETIME_AUC_WINDOW=$LIFETIME_AUC_WINDOW} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
       -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \

From da748838616bdc3f68fcfd8f677720eaf0505eff Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 22:38:12 +0000
Subject: [PATCH 093/113] Revert "dlrmv4: hardcode lifetime-AUC backend to
 binned, drop the override"

This reverts commit 900d4f1311b93e281587e224ba9c3065a0f68cf1.
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 25 +++++++++++++------
 .../dlrm_v3/train/train_ranker.py             |  8 +++---
 .../generative_recommenders/dlrm_v3/utils.py  |  8 +++---
 recommendation_v4/scripts/launch_slurm.sh     |  3 +++
 4 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 3a938da36..8a3e5b595 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -463,18 +463,29 @@ at/env_float.key     = "AUC_THRESHOLD"
 at/env_float.default = 0.80275
 # EVAL_ACCURACY + early-stop are driven by per-window AUC (window_auc) vs the
 # threshold above.
-# Lifetime-AUC backend for the train + eval cumulative ("lifetime_*") AUC.
-# Hardcoded to "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an
-# O(bins) score histogram (additive all-reduce, memory independent of
-# #samples/#windows). The legacy "capped" trailing-buffer backend is no longer
-# selectable.
-MetricsLogger.train_lifetime_auc_mode = "binned"
-MetricsLogger.eval_lifetime_auc_mode = "binned"
+# Lifetime-AUC backend, selectable independently for the train cumulative AUC and
+# the eval cumulative ("lifetime_*") AUC. Both default to "binned":
+#   "binned" = BinnedCumulativeAUC: exact-cumulative AUC from an O(bins) score
+#     histogram (additive all-reduce, memory independent of #samples/#windows).
+#   "capped" = LifetimeAUCMetricComputation: AUC over a trailing buffer of
+#     `lifetime_auc_window` samples/rank (legacy; per-rank buffer all-gathered).
+# Override per-run via $TRAIN_LIFETIME_AUC_MODE / $EVAL_LIFETIME_AUC_MODE.
+MetricsLogger.train_lifetime_auc_mode = @tlam/env_str()
+tlam/env_str.key     = "TRAIN_LIFETIME_AUC_MODE"
+tlam/env_str.default = "binned"
+MetricsLogger.eval_lifetime_auc_mode = @elam/env_str()
+elam/env_str.key     = "EVAL_LIFETIME_AUC_MODE"
+elam/env_str.default = "binned"
 # Score-histogram resolution for the "binned" backend. Higher = finer AUC
 # resolution at O(bins) memory. Override via $CUMULATIVE_AUC_BINS.
 MetricsLogger.cumulative_auc_bins = @cab/env_int()
 cab/env_int.key     = "CUMULATIVE_AUC_BINS"
 cab/env_int.default = 100000
+# Trailing-buffer size (samples/rank) for the "capped" backend. Override via
+# $LIFETIME_AUC_WINDOW. Ignored when the backend is "binned".
+MetricsLogger.lifetime_auc_window = @law/env_int()
+law/env_int.key     = "LIFETIME_AUC_WINDOW"
+law/env_int.default = 10000000
 # Checkpointing disabled by default — a full DMP checkpoint is ~100s of GB and
 # the streaming loop always saves on the final window. save_dmp_checkpoint
 # no-ops on the empty path. Set $CKPT_PATH to a directory to re-enable; the
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 81428a603..50d359ef6 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -181,10 +181,10 @@ def _main_func(
         gpu_peak_flops=gpu_peak_flops,
         model=model,
         eval_cumulative=(mode == "streaming-train-eval"),
-        # Lifetime-AUC backend ("binned") + bins come from gin (see yambda_5b.gin:
-        # MetricsLogger.{train,eval}_lifetime_auc_mode / cumulative_auc_bins).
-        # eval_cumulative stays explicit because it is runtime-mode dependent,
-        # not a config knob.
+        # Lifetime-AUC backend + bins/window come from gin (see yambda_5b.gin:
+        # MetricsLogger.{train,eval}_lifetime_auc_mode / cumulative_auc_bins /
+        # lifetime_auc_window), env-overridable. eval_cumulative stays explicit
+        # because it is runtime-mode dependent, not a config knob.
     )
     # Capture streaming resume hint (None for cold start / non-streaming
     # checkpoints). For the streaming-train-eval mode, we forward this into
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 7c11b4b5a..281a37b5d 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1484,11 +1484,11 @@ def env_str(key: str = "", default: str = "") -> str:
     """Resolve a string from os.environ[key], falling back to `default`.
 
     Companion to `env_int`/`env_float` for categorical/string overrides (e.g. a
-    strategy selector). Example gin usage:
+    metric backend selector). Example gin usage:
 
-        get_dataset.history_strategy = @hs/env_str()
-        hs/env_str.key     = "HISTORY_STRATEGY"
-        hs/env_str.default = "interleaved"
+        MetricsLogger.train_lifetime_auc_mode = @tlam/env_str()
+        tlam/env_str.key     = "TRAIN_LIFETIME_AUC_MODE"
+        tlam/env_str.default = "binned"
     """
     raw = os.environ.get(key) if key else None
     return raw if raw else default
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index dc03962a9..9d8e90921 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -329,7 +329,10 @@ orchestrate() {
       -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
       -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
       ${MLPERF_SUBMISSION_PLATFORM:+-e MLPERF_SUBMISSION_PLATFORM=$MLPERF_SUBMISSION_PLATFORM} \
+      ${TRAIN_LIFETIME_AUC_MODE:+-e TRAIN_LIFETIME_AUC_MODE=$TRAIN_LIFETIME_AUC_MODE} \
+      ${EVAL_LIFETIME_AUC_MODE:+-e EVAL_LIFETIME_AUC_MODE=$EVAL_LIFETIME_AUC_MODE} \
       ${CUMULATIVE_AUC_BINS:+-e CUMULATIVE_AUC_BINS=$CUMULATIVE_AUC_BINS} \
+      ${LIFETIME_AUC_WINDOW:+-e LIFETIME_AUC_WINDOW=$LIFETIME_AUC_WINDOW} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
       -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \

From 1ada43f01d479315eb13edc61e3be9d14aa87de5 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 22:47:37 +0000
Subject: [PATCH 094/113] dlrmv4: slim launch_slurm.sh to MLPerf wiring + path
 portability

Reduce the upstream launch_slurm.sh diff to just what MLPerf logging needs:
SCRATCH/REPO_MOUNT/DATA_MOUNT path portability (so outputs/log land off the
hardcoded /home/chcai,/apps/chcai) and the MLPerf env wiring (MLPERF_LOG_PATH,
AUC_THRESHOLD, MLPERF_LOGGING, MLPERF_SUBMISSION_PLATFORM,
MLPERF_TRAIN_LOSS_LOG_FREQ). Reverted the unrelated baseline changes (NCCL
GDR/IFNAME defaults, SMOKE/frozen run-shape, chmod/WORKER_TEE, HISTORY_STRATEGY,
lifetime-AUC passthroughs) to Chris' base.

Preserve the full kitchen-sink launcher as launch_slurm_suachong.sh for
personal multi-node use (self-reinvoke paths repointed to itself).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/scripts/launch_slurm.sh     |  74 +-
 .../scripts/launch_slurm_suachong.sh          | 664 ++++++++++++++++++
 2 files changed, 689 insertions(+), 49 deletions(-)
 create mode 100755 recommendation_v4/scripts/launch_slurm_suachong.sh

diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 9d8e90921..fb71bdcb7 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -4,7 +4,7 @@
 #SBATCH --ntasks-per-node=1
 #SBATCH --exclusive
 #SBATCH --partition=meta64          # [CLUSTER-SPECIFIC] partition name
-#SBATCH --output=yambda_slurm.%j.out
+#SBATCH --output=/apps/chcai/yambda_slurm.%j.out
 # =============================================================================
 # launch_slurm.sh — single entry point for the yambda-5b trainer on N>=1 nodes.
 #
@@ -110,9 +110,9 @@ CONTAINER=${CONTAINER:-yambda_primus}
 REPO=${REPO:-$REPO_ROOT}                       # repo path inside the container
 IMAGE=${IMAGE:-rocm/primus:v26.3}              # [CLUSTER-SPECIFIC] ROCm/arch base image
 BAKED_IMAGE=${BAKED_IMAGE:-yambda_primus_baked:latest}
-BAKED_TAR=${BAKED_TAR:-/apps/chcai/yambda_primus_baked.tar}   # [CLUSTER-SPECIFIC] shared-NFS path (read-only build asset)
+BAKED_TAR=${BAKED_TAR:-/apps/chcai/yambda_primus_baked.tar}   # [CLUSTER-SPECIFIC] shared-NFS path
 USE_BAKED=${USE_BAKED:-1}
-OVERLAY=${RDMA_OVERLAY:-/apps/chcai/rdma_host_el9_new}        # [CLUSTER-SPECIFIC] shared-NFS RDMA overlay (read-only, already staged)
+OVERLAY=${RDMA_OVERLAY:-/apps/chcai/rdma_host_el9_new}        # [CLUSTER-SPECIFIC] shared-NFS RDMA overlay
 
 REPO_MOUNT=${REPO_MOUNT:-$HOME}              # NFS home holding the repo (must contain $REPO); override if your repo lives elsewhere
 DATA_MOUNT=${DATA_MOUNT:-/apps/chcai}        # shared dataset + RDMA overlay + pip/fbgemm assets (read-only)
@@ -135,27 +135,15 @@ orchestrate() {
   mkdir -p "$SCRATCH" 2>/dev/null || true
   LOG=${LOG:-$SCRATCH/yambda_slurm.${SLURM_JOB_ID:-manual}.log}
 
+  # Smoke defaults — override via env for a perf run (see header USAGE).
   MODE=${MODE:-streaming-train-eval}
-  if [ "${SMOKE:-0}" = "1" ]; then
-    START_TS=${START_TS:-150}
-    NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
-    NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
-    NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
-    EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
-    METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
-  fi
-  START_TS=${START_TS:-0}
-  NUM_TRAIN_TS=${NUM_TRAIN_TS:-299}
-  NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-0}
-  NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-0}
+  START_TS=${START_TS:-150}
+  NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
+  NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
+  NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
   EVAL_EACH_WINDOW=${EVAL_EACH_WINDOW:-1}
-  METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-20}
-  if [ "${EVAL_EVERY_N_WINDOWS:-0}" -gt 0 ] 2>/dev/null; then
-    EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0}
-  else
-    EVAL_EVERY_N_WINDOWS=0
-    EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0.005}
-  fi
+  EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
+  METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
   FORCE_PROVISION=${FORCE_PROVISION:-0}
 
   # Truncate the metrics log on a FRESH run; APPEND on a supervised relaunch
@@ -167,11 +155,9 @@ orchestrate() {
   else
     : > "$LOG"
   fi
-  chmod 622 "$LOG" 2>/dev/null || true
   echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
   echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
-  echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ SMOKE=${SMOKE:-0} EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT" | tee -a "$LOG"
-  echo "[$(date)] lr-override: DENSE_LR=${DENSE_LR:-<unset:gin default 1e-7>} SPARSE_LR=${SPARSE_LR:-<unset:gin default 1e-7>}" | tee -a "$LOG"
+  echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ" | tee -a "$LOG"
 
   # Rendezvous resolved on the HOST (the container image has no SLURM client).
   MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
@@ -284,7 +270,6 @@ orchestrate() {
     _wattempt=\$((_wattempt+1))
     docker exec \
       -e LAUNCH_SLURM_PHASE=worker \
-      -e WORKER_TEE=0 \
       -e SCRATCH=$SCRATCH \
       -e SLURM_NNODES=\$SLURM_NNODES \
       -e SLURM_NODEID=\$SLURM_NODEID \
@@ -313,7 +298,6 @@ orchestrate() {
       ${DIAG_EMB_STEPS:+-e DIAG_EMB_STEPS=$DIAG_EMB_STEPS} \
       ${OUTPUT_TRACE:+-e OUTPUT_TRACE=$OUTPUT_TRACE} \
       ${MIN_HISTORY:+-e MIN_HISTORY=$MIN_HISTORY} \
-      ${HISTORY_STRATEGY:+-e HISTORY_STRATEGY=$HISTORY_STRATEGY} \
       ${SEED:+-e SEED=$SEED} \
       ${DENSE_LR:+-e DENSE_LR=$DENSE_LR} \
       ${SPARSE_LR:+-e SPARSE_LR=$SPARSE_LR} \
@@ -329,10 +313,6 @@ orchestrate() {
       -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
       -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
       ${MLPERF_SUBMISSION_PLATFORM:+-e MLPERF_SUBMISSION_PLATFORM=$MLPERF_SUBMISSION_PLATFORM} \
-      ${TRAIN_LIFETIME_AUC_MODE:+-e TRAIN_LIFETIME_AUC_MODE=$TRAIN_LIFETIME_AUC_MODE} \
-      ${EVAL_LIFETIME_AUC_MODE:+-e EVAL_LIFETIME_AUC_MODE=$EVAL_LIFETIME_AUC_MODE} \
-      ${CUMULATIVE_AUC_BINS:+-e CUMULATIVE_AUC_BINS=$CUMULATIVE_AUC_BINS} \
-      ${LIFETIME_AUC_WINDOW:+-e LIFETIME_AUC_WINDOW=$LIFETIME_AUC_WINDOW} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
       -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \
@@ -498,10 +478,8 @@ worker() {
   cd "$REPO_ROOT"
   mkdir -p "$SCRATCH" 2>/dev/null || true
   LOG=${LOG:-$SCRATCH/yambda_5b_8gpu.log}
-  # WORKER_TEE=0 (set by orchestrate) sends our file sink to /dev/null to avoid
-  # double-logging, since orchestrate already tees stdout into the real $LOG.
-  [ "${WORKER_TEE:-1}" = "0" ] && LOG=/dev/null
-  export TENSORBOARD_LOG_PATH=${TENSORBOARD_LOG_PATH:-$SCRATCH/tb/yambda_5b}
+  # Append (not truncate): under the streaming-e2e supervisor a run may relaunch
+  # many times into the SAME $LOG; the supervisor initializes it once at run start.
   # MLPerf compliance log (rank 0 writes it). Per-job filename so each standalone
   # sbatch gets a clean log; the e2e supervisor pins MLPERF_LOG_PATH itself.
   export MLPERF_LOG_PATH=${MLPERF_LOG_PATH:-$SCRATCH/mlperf/yambda_5b_mlperf.${SLURM_JOB_ID:-manual}.log}
@@ -553,15 +531,14 @@ worker() {
   export WORLD_SIZE=$(( NNODES * GPUS_PER_NODE ))
   echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node=$GPUS_PER_NODE world_size=$WORLD_SIZE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}" | tee -a "$LOG"
 
-  # NCCL bootstrap NIC: loopback single-node, routable host NIC multi-node (pin
-  # to avoid auto-detect picking a non-routable per-GPU RoCE link). Override via
-  # $NCCL_SOCKET_IFNAME. [CLUSTER-SPECIFIC] multi-node fenic0 (find via `ip -br addr`).
-  if [ "$NNODES" -gt 1 ]; then
-    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
-  else
-    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-lo}
-  fi
-  echo "[$(date)] NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME (nnodes=$NNODES)" | tee -a "$LOG"
+  # NCCL bootstrap NIC — pin for BOTH single- and multi-node. The container is
+  # --network=host so RCCL sees ALL host interfaces; if left to auto-detect, NCCL
+  # can pick a non-routable per-GPU RoCE /31 (benic* 192.168.x) link and fail
+  # bootstrap with "No route to host" (this is node-dependent: it happened to
+  # work on some nodes and not others, causing repetitive single-node init
+  # failures). Pinning the routable host NIC fixes it everywhere.
+  # [CLUSTER-SPECIFIC] routable host NIC for TCP bootstrap (find via `ip -br addr`).
+  export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
 
   # Multi-node additionally needs the RDMA data-plane (bnxt_re HCAs) configured;
   # single-node uses intra-node P2P (XGMI/PCIe) so only the bootstrap NIC matters.
@@ -594,11 +571,10 @@ worker() {
       export NCCL_IB_TIMEOUT=${NCCL_IB_TIMEOUT:-14}
       export NCCL_IGNORE_CPU_AFFINITY=${NCCL_IGNORE_CPU_AFFINITY:-1}
       export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
-      # GPU-Direct RDMA on by default (~+22% throughput at 2 nodes via peermem).
-      # Set NCCL_NET_GDR_LEVEL=0 to force the legacy host-staged path.
-      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-5}
-      export NCCL_DMABUF_ENABLE=${NCCL_DMABUF_ENABLE:-1}
-      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}, DMABUF=${NCCL_DMABUF_ENABLE}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
+      # GPU-Direct RDMA needs DMABUF/peermem (neither in-container here) — leave
+      # GDR off so RCCL stages through host memory (still real RDMA over bnxt_re).
+      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-0}
+      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
     fi
   fi
   export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
diff --git a/recommendation_v4/scripts/launch_slurm_suachong.sh b/recommendation_v4/scripts/launch_slurm_suachong.sh
new file mode 100755
index 000000000..579076d47
--- /dev/null
+++ b/recommendation_v4/scripts/launch_slurm_suachong.sh
@@ -0,0 +1,664 @@
+#!/bin/bash
+#SBATCH --job-name=yambda_slurm
+#SBATCH --nodes=1
+#SBATCH --ntasks-per-node=1
+#SBATCH --exclusive
+#SBATCH --partition=meta64          # [CLUSTER-SPECIFIC] partition name
+#SBATCH --output=yambda_slurm.%j.out
+# =============================================================================
+# launch_slurm.sh — single entry point for the yambda-5b trainer on N>=1 nodes.
+#
+# Consolidates what used to be separate scripts so multi-node enablement is
+# ONE committable script (plus the train_ranker.py / utils.py python changes):
+#   * orchestrate phase (host SLURM glue) — formerly sbatch_smoke_multinode.sh
+#   * provision   phase (container + RDMA) — formerly _provision_yambda_primus.sh
+#   * worker      phase (in-container train) — now inlined below
+#
+# PHASES (auto-detected from context; force with LAUNCH_SLURM_PHASE=<phase>):
+#   orchestrate  Runs on the SLURM batch host (no /.dockerenv). Resolves the
+#                rendezvous (MASTER_ADDR/PORT), ensures the container on every
+#                node (provision phase), then `docker exec`s the worker phase on
+#                every node, one task per node.
+#   provision    Runs on a compute-node host. Ensures the `yambda_primus`
+#                container is up (loads the pre-baked image if present — no
+#                internet/pip — else builds from the base image) and stages the
+#                host RDMA userspace overlay on shared NFS.
+#   worker       Runs INSIDE the container. Sets the distributed topology +
+#                NCCL/RDMA env and spawns this node's GPU ranks via train_ranker.
+#                N==1 transparently uses the legacy single-node path (localhost,
+#                node_rank 0), byte-for-byte as before, so the streaming-e2e
+#                supervisor's direct `bash scripts/launch_slurm.sh` is unchanged.
+#
+# USAGE
+#   Multi-node (N>=1):  sbatch --nodes=2 scripts/launch_slurm.sh
+#   Single-node direct: bash scripts/launch_slurm.sh   (already inside container;
+#                       what run_streaming_e2e.sh invokes per relaunch)
+#   Perf pair:
+#     LOG=/apps/chcai/perf_1node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
+#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
+#       sbatch --nodes=1 --job-name=y1 scripts/launch_slurm.sh
+#     LOG=/apps/chcai/perf_2node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
+#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
+#       sbatch --nodes=2 --job-name=y2 scripts/launch_slurm.sh
+#     # then: bash scripts/compare_node_perf.sh /apps/chcai/perf_1node.log /apps/chcai/perf_2node.log
+#
+# ONE-TIME IMAGE BAKE (so fresh nodes skip the multi-GB torch download + pip):
+#   BAKE_IMAGE=1 LAUNCH_SLURM_PHASE=provision bash scripts/launch_slurm.sh
+#   (commits the deps-installed container to $BAKED_IMAGE and `docker save`s it to
+#    $BAKED_TAR on NFS; subsequent provisions `docker load` it offline.)
+#
+# -----------------------------------------------------------------------------
+# PORTABILITY — what to change for a DIFFERENT cluster / network / hardware.
+# Every such knob is also tagged inline with "[CLUSTER-SPECIFIC]" (grep for it).
+# All are env-overridable, so you can adapt without editing this file.
+#
+#  A) SLURM / scheduler
+#     - #SBATCH --partition=meta64  : partition name. CHANGE per cluster.
+#     - #SBATCH --time / --exclusive : policy; adjust to taste.
+#
+#  B) Filesystems (must be shared/NFS across ALL nodes — this script re-invokes
+#     itself and reads the overlay + data from these paths cluster-wide)
+#     - REPO_MOUNT (repo + this script, e.g. /home/<user>) is bind-mounted rw;
+#       DATA_MOUNT (e.g. /apps/chcai) holds the read-only dataset + overlay +
+#       baked tar + pip tarball; SCRATCH (e.g. /home/<user>/yambda_runs) is the
+#       writable log/output root. Override any via env — nothing is user-hardwired.
+#
+#  C) Container image / GPU software stack (tied to the GPU arch + ROCm version)
+#     - IMAGE=rocm/primus:v26.3        : base image. ROCm/AMD-specific.
+#     - docker run --device=/dev/kfd --device=/dev/dri --group-add video : AMD ROCm
+#       device passthrough. For NVIDIA this is --gpus all / nvidia runtime instead.
+#     - --ulimit memlock=-1 : REQUIRED for RDMA QP registration (do not drop).
+#     - TORCH_IDX (rocm7.2), torch/vision/audio ==*+rocm7.2, FBGEMM_WHL (a gfx950
+#       wheel), torchrec pin : the whole deps set is arch/ROCm-version-specific.
+#
+#  D) Network fabric — THE trickiest part; defaults are PROVEN on meta64 cv350
+#     (Broadcom bnxt_re RoCEv2). On a different fabric these almost certainly change
+#     (see the worker-phase block for the full rationale):
+#     - NCCL_SOCKET_IFNAME=fenic0 : the ONE routable host NIC for TCP bootstrap.
+#       Find yours with `ip -br addr`; the per-GPU RDMA NICs are usually NOT
+#       routable for plain TCP, so auto-detect hangs init — you MUST pin this.
+#     - NCCL_IB_HCA=bnxt_re0..7 : the RDMA HCA device names. List with `ibv_devices`.
+#       Different NIC vendor (e.g. mlx5_*, ionic_*) => different names AND a
+#       different userspace provider, which changes the RDMA overlay below.
+#     - NCCL_IB_GID_INDEX=3 : RoCEv2 IPv4 GID index. Check `show_gids`; v1/v2 and
+#       IPv4/IPv6 live at different indices per port.
+#     - NCCL_IB_TC=104 : RoCE lossless (PFC) traffic class. Fabric/switch-specific.
+#     - RDMA overlay (provision phase): only needed when the CONTAINER's rdma-core
+#       is older than the HOST kernel driver's uapi (our bnxt_re v34-vs-v59 case).
+#       Different NIC/host => different /usr/lib64 provider .so to stage, or the
+#       overlay may be unnecessary entirely (set RDMA_OVERLAY= to disable). If RDMA
+#       can't be made to work, NCCL_NET_TRANSPORT=socket falls back to TCP.
+#
+#  E) Not cluster-specific (auto-derived): GPUS_PER_NODE (torch.cuda.device_count),
+#     NNODES/NODE_RANK/MASTER_ADDR (from SLURM), WORLD_SIZE.
+# =============================================================================
+set -uo pipefail
+
+# Absolute path to THIS script so the orchestrate phase can re-invoke it on every
+# node (home is shared NFS, so the same path resolves cluster-wide).
+SELF=$(cd "$(dirname "$0")" && pwd)/$(basename "$0")
+REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
+
+# ---- phase detection --------------------------------------------------------
+PHASE="${LAUNCH_SLURM_PHASE:-}"
+if [ -z "$PHASE" ]; then
+  if [ -f /.dockerenv ]; then PHASE=worker; else PHASE=orchestrate; fi
+fi
+
+# ---- shared config (env-overridable) ----------------------------------------
+CONTAINER=${CONTAINER:-yambda_primus}
+REPO=${REPO:-$REPO_ROOT}                       # repo path inside the container
+IMAGE=${IMAGE:-rocm/primus:v26.3}              # [CLUSTER-SPECIFIC] ROCm/arch base image
+BAKED_IMAGE=${BAKED_IMAGE:-yambda_primus_baked:latest}
+BAKED_TAR=${BAKED_TAR:-/apps/chcai/yambda_primus_baked.tar}   # [CLUSTER-SPECIFIC] shared-NFS path (read-only build asset)
+USE_BAKED=${USE_BAKED:-1}
+OVERLAY=${RDMA_OVERLAY:-/apps/chcai/rdma_host_el9_new}        # [CLUSTER-SPECIFIC] shared-NFS RDMA overlay (read-only, already staged)
+
+REPO_MOUNT=${REPO_MOUNT:-$HOME}              # NFS home holding the repo (must contain $REPO); override if your repo lives elsewhere
+DATA_MOUNT=${DATA_MOUNT:-/apps/chcai}        # shared dataset + RDMA overlay + pip/fbgemm assets (read-only)
+SCRATCH=${SCRATCH:-$HOME/yambda_runs}        # writable output root (logs / tb / traces)
+
+# =============================================================================
+# PHASE: orchestrate  (SLURM batch host)
+# =============================================================================
+orchestrate() {
+  # When run as the SLURM batch script, $0 is the node-local staged copy
+  # (/var/spool/slurmd/job<ID>/slurm_script), so $SELF / $REPO_ROOT are WRONG
+  # here (they don't exist on other nodes). Resolve the REAL shared-NFS script
+  # path + repo root from SLURM so we can re-invoke this script on every node and
+  # `cd` to the right repo inside the container.
+  SCRIPT_PATH=$(scontrol show job "${SLURM_JOB_ID:-0}" 2>/dev/null | grep -oP 'Command=\K\S+')
+  [ -f "${SCRIPT_PATH:-}" ] || SCRIPT_PATH="${SLURM_SUBMIT_DIR:-$REPO_ROOT}/scripts/launch_slurm_suachong.sh"
+  [ -f "$SCRIPT_PATH" ] || SCRIPT_PATH="$SELF"
+  REPO=$(cd "$(dirname "$SCRIPT_PATH")/.." && pwd)
+
+  mkdir -p "$SCRATCH" 2>/dev/null || true
+  LOG=${LOG:-$SCRATCH/yambda_slurm.${SLURM_JOB_ID:-manual}.log}
+
+  MODE=${MODE:-streaming-train-eval}
+  if [ "${SMOKE:-0}" = "1" ]; then
+    START_TS=${START_TS:-150}
+    NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
+    NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
+    NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
+    EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
+    METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
+  fi
+  START_TS=${START_TS:-0}
+  NUM_TRAIN_TS=${NUM_TRAIN_TS:-299}
+  NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-0}
+  NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-0}
+  EVAL_EACH_WINDOW=${EVAL_EACH_WINDOW:-1}
+  METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-20}
+  if [ "${EVAL_EVERY_N_WINDOWS:-0}" -gt 0 ] 2>/dev/null; then
+    EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0}
+  else
+    EVAL_EVERY_N_WINDOWS=0
+    EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0.005}
+  fi
+  FORCE_PROVISION=${FORCE_PROVISION:-0}
+
+  # Truncate the metrics log on a FRESH run; APPEND on a supervised relaunch
+  # (APPEND_LOG=1) so the full-run NE/AUC history survives crash/node-failover
+  # resubmits instead of being wiped on every attempt (mirrors the single-node
+  # supervisor's init-once/append model).
+  if [ "${APPEND_LOG:-0}" = "1" ]; then
+    echo "[$(date)] === resume: appending to existing $LOG (APPEND_LOG=1) ===" >> "$LOG"
+  else
+    : > "$LOG"
+  fi
+  chmod 622 "$LOG" 2>/dev/null || true
+  echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
+  echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
+  echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ SMOKE=${SMOKE:-0} EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT" | tee -a "$LOG"
+  echo "[$(date)] lr-override: DENSE_LR=${DENSE_LR:-<unset:gin default 1e-7>} SPARSE_LR=${SPARSE_LR:-<unset:gin default 1e-7>}" | tee -a "$LOG"
+
+  # Rendezvous resolved on the HOST (the container image has no SLURM client).
+  MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
+  MASTER_ADDR=${MASTER_ADDR:-localhost}
+  MASTER_PORT=$(( 20000 + ${SLURM_JOB_ID:-0} % 20000 ))
+  echo "[$(date)] rendezvous: MASTER_ADDR=$MASTER_ADDR MASTER_PORT=$MASTER_PORT" | tee -a "$LOG"
+
+  # Optional NCCL/RCCL fabric overrides — forwarded into the container only when
+  # set at submit time (docker exec does NOT inherit the srun task env). The
+  # worker phase applies its own validated multi-node bnxt_re defaults when these
+  # are unset. Common: NCCL_NET_TRANSPORT=socket (TCP fallback), NCCL_DEBUG=INFO.
+  NCCL_ENV_ARGS=""
+  for v in NCCL_NET_TRANSPORT NCCL_DEBUG NCCL_SOCKET_IFNAME NCCL_IB_HCA NCCL_IB_GID_INDEX \
+           NCCL_IB_TC NCCL_IB_TIMEOUT NCCL_IGNORE_CPU_AFFINITY RCCL_MSCCL_ENABLE NCCL_NET_GDR_LEVEL \
+           NCCL_IB_PCI_RELAXED_ORDERING NCCL_IB_USE_INLINE NCCL_IB_QPS_PER_CONNECTION \
+           NCCL_IB_ECE_ENABLE NCCL_DMABUF_ENABLE NCCL_GDRCOPY_ENABLE NCCL_GDR_FLUSH_DISABLE \
+           NCCL_PXN_DISABLE NCCL_CHECKS_DISABLE NCCL_CROSS_NIC RDMA_OVERLAY; do
+    eval "val=\${$v:-}"
+    if [ -n "$val" ]; then NCCL_ENV_ARGS="$NCCL_ENV_ARGS -e $v=$val"; fi
+  done
+
+  # TRICKY — variable expansion inside the `srun ... bash -c "..."` blocks below:
+  # the string is double-quoted, so PLAIN $VAR expands NOW on the batch host (e.g.
+  # $MASTER_ADDR, $CONTAINER, $SCRIPT_PATH — values computed above), while
+  # BACKSLASH-escaped \$VAR is passed through literally and expands LATER on each
+  # compute node inside the srun task (e.g. \$SLURM_NODEID, \$(hostname)) where the
+  # per-node SLURM_* env actually lives. Mixing these up sends every rank the
+  # wrong node id or breaks the docker exec — keep the \$ on per-node values.
+
+  # --- step 1: ensure the container is up on every node ----------------------
+  echo "[$(date)] ensuring container '$CONTAINER' on all nodes (force=$FORCE_PROVISION)" | tee -a "$LOG"
+  srun --ntasks-per-node=1 bash -c "
+    # Reap stale/foreign GPU containers from prior jobs BEFORE (re)provisioning.
+    # The node is allocated --exclusive, so any GPU container other than
+    # '$CONTAINER' is an orphan left by a previous job (its container outlives the
+    # SLURM allocation). We remove every such container that has GPU access
+    # (/dev/kfd or /dev/dri) — running OR stopped, whether or not it currently
+    # pins VRAM ('docker ps -aq' includes stopped ones) — since idle orphans can
+    # still hold device handles or wake up; leaked HBM from these has caused both
+    # OOMs and RCCL collective hangs. We deliberately SKIP non-GPU containers
+    # (e.g. 'k8s-node-services-*' and other cluster system services) so we don't
+    # disrupt node infrastructure. docker teardown lets the driver reclaim HBM.
+    for _c in \$(docker ps -aq 2>/dev/null); do
+      _nm=\$(docker inspect -f '{{.Name}}' \"\$_c\" 2>/dev/null | sed 's#^/##')
+      [ \"\$_nm\" = \"$CONTAINER\" ] && continue
+      _dev=\$(docker inspect -f '{{range .HostConfig.Devices}}{{.PathOnHost}} {{end}}' \"\$_c\" 2>/dev/null)
+      case \"\$_dev\" in
+        *kfd*|*dri*)
+          echo \"[\$(hostname)] reaping stale GPU container \$_nm (\$_c)\"
+          docker rm -f \"\$_c\" >/dev/null 2>&1 || true ;;
+        *)
+          echo \"[\$(hostname)] keeping non-GPU/system container \$_nm (\$_c)\" ;;
+      esac
+    done
+    # Reuse a STOPPED '$CONTAINER' (its installed deps persist in the container
+    # fs) instead of destructively re-provisioning from the base image + pip.
+    # Harmless no-op on a fresh node (no such container) -> falls through to
+    # provision below. Repo code is bind-mounted, so live edits are still picked up.
+    docker start $CONTAINER >/dev/null 2>&1 || true
+    if [ \"$FORCE_PROVISION\" = \"1\" ] || ! docker exec $CONTAINER true >/dev/null 2>&1; then
+      echo \"[\$(hostname)] (re)provisioning container\"
+      LAUNCH_SLURM_PHASE=provision CONTAINER=$CONTAINER IMAGE=$IMAGE \
+        BAKED_IMAGE=$BAKED_IMAGE BAKED_TAR=$BAKED_TAR USE_BAKED=$USE_BAKED \
+        BAKE_IMAGE=${BAKE_IMAGE:-0} RDMA_OVERLAY=$OVERLAY REPO=$REPO \
+        REPO_MOUNT=$REPO_MOUNT DATA_MOUNT=$DATA_MOUNT SCRATCH=$SCRATCH bash $SCRIPT_PATH
+    else
+      # Container persists across jobs; the reap above only removes FOREIGN GPU
+      # containers, so our own '$CONTAINER' can still pin HBM via stray trainer
+      # ranks left by a prior OOM/crash (this caused repeated 'CUDA out of memory'
+      # on relaunch onto the same node). Restart it to kill every exec'd proc and
+      # let the driver reclaim HBM — cheap (keeps the installed deps in the
+      # container fs; NFS RDMA overlay also persists), no full re-provision.
+      echo \"[\$(hostname)] container already up — restarting to free any leaked HBM before launch\"
+      docker restart $CONTAINER >/dev/null 2>&1 || true
+      # Readiness gate: a bare 'docker exec true' can pass while the runtime is
+      # still settling, so the SUBSEQUENT (heavier) worker exec races the restart
+      # and dies with 'container is not running' / OCI 'setns' errors (observed on
+      # c07-08 and e08-08 -> the peer never joins rendezvous -> master 600s
+      # TCPStore timeout). Require State.Running=true AND a successful probe, then
+      # a short settle, before considering the container ready.
+      for _w in \$(seq 1 30); do
+        [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
+          && docker exec $CONTAINER true >/dev/null 2>&1 && break
+        sleep 2
+      done
+      sleep 2
+      echo \"[\$(hostname)] container restarted (HBM reclaimed; running=\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null))\"
+    fi
+  " 2>&1 | tee -a "$LOG"
+
+  # --- step 2: launch the worker (trainer) inside the container on every node -
+  echo "[$(date)] launching trainer (worker phase) on all nodes" | tee -a "$LOG"
+  srun --ntasks-per-node=1 bash -c "
+    # Pre-flight readiness gate (per node): step 1 ran in a SEPARATE srun, so the
+    # container can still be settling here. Wait for State.Running=true + a probe
+    # before the worker exec so we don't race a just-restarted container.
+    for _w in \$(seq 1 30); do
+      [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
+        && docker exec $CONTAINER true >/dev/null 2>&1 && break
+      [ \$_w -eq 1 ] && echo \"[\$(hostname)] worker pre-flight: waiting for container to be ready...\"
+      sleep 2
+    done
+    # Retry wrapper: docker exec startup failures (rc 125 daemon 'container is not
+    # running', 126/127 OCI/setns 'exec failed') mean the container wasn't ready,
+    # NOT that the trainer ran and failed. Restart + re-gate + retry a few times.
+    # Any OTHER rc (the trainer actually started and exited) is propagated so the
+    # supervisor's resume-from-checkpoint logic owns real failures.
+    _wattempt=0
+    while : ; do
+    _wattempt=\$((_wattempt+1))
+    docker exec \
+      -e LAUNCH_SLURM_PHASE=worker \
+      -e WORKER_TEE=0 \
+      -e SCRATCH=$SCRATCH \
+      -e SLURM_NNODES=\$SLURM_NNODES \
+      -e SLURM_NODEID=\$SLURM_NODEID \
+      -e SLURM_PROCID=\$SLURM_PROCID \
+      -e SLURM_JOB_NODELIST=\"\$SLURM_JOB_NODELIST\" \
+      -e SLURM_JOB_ID=\$SLURM_JOB_ID \
+      -e MASTER_ADDR=$MASTER_ADDR \
+      -e MASTER_PORT=$MASTER_PORT \
+      -e HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-TRITON} \
+      -e MODE=$MODE \
+      -e START_TS=$START_TS \
+      -e NUM_TRAIN_TS=$NUM_TRAIN_TS \
+      -e EVAL_EACH_WINDOW=$EVAL_EACH_WINDOW \
+      -e EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS \
+      ${EVAL_EVERY_DATA_PCT:+-e EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT} \
+      -e NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
+      -e NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
+      -e METRIC_LOG_FREQ=$METRIC_LOG_FREQ \
+      ${MLPERF_LOGGING:+-e MLPERF_LOGGING=$MLPERF_LOGGING} \
+      ${MLPERF_TRAIN_LOSS_LOG_FREQ:+-e MLPERF_TRAIN_LOSS_LOG_FREQ=$MLPERF_TRAIN_LOSS_LOG_FREQ} \
+      ${STREAMING_SHUFFLE_FRACTION:+-e STREAMING_SHUFFLE_FRACTION=$STREAMING_SHUFFLE_FRACTION} \
+      ${STREAMING_SHUFFLE_SEED:+-e STREAMING_SHUFFLE_SEED=$STREAMING_SHUFFLE_SEED} \
+      ${NUM_WORKERS:+-e NUM_WORKERS=$NUM_WORKERS} \
+      ${PREFETCH_FACTOR:+-e PREFETCH_FACTOR=$PREFETCH_FACTOR} \
+      ${DIAG_UNIQUE_EMB:+-e DIAG_UNIQUE_EMB=$DIAG_UNIQUE_EMB} \
+      ${DIAG_EMB_STEPS:+-e DIAG_EMB_STEPS=$DIAG_EMB_STEPS} \
+      ${OUTPUT_TRACE:+-e OUTPUT_TRACE=$OUTPUT_TRACE} \
+      ${MIN_HISTORY:+-e MIN_HISTORY=$MIN_HISTORY} \
+      ${HISTORY_STRATEGY:+-e HISTORY_STRATEGY=$HISTORY_STRATEGY} \
+      ${SEED:+-e SEED=$SEED} \
+      ${DENSE_LR:+-e DENSE_LR=$DENSE_LR} \
+      ${SPARSE_LR:+-e SPARSE_LR=$SPARSE_LR} \
+      ${GRAD_CLIP_NORM:+-e GRAD_CLIP_NORM=$GRAD_CLIP_NORM} \
+      ${HSTU_NUM_LAYERS:+-e HSTU_NUM_LAYERS=$HSTU_NUM_LAYERS} \
+      ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN=$MAX_SEQ_LEN} \
+      ${HISTORY_LENGTH:+-e HISTORY_LENGTH=$HISTORY_LENGTH} \
+      ${BATCH_SIZE:+-e BATCH_SIZE=$BATCH_SIZE} \
+      ${CKPT_TIME_INTERVAL_S:+-e CKPT_TIME_INTERVAL_S=$CKPT_TIME_INTERVAL_S} \
+      ${KEEP_LAST_N:+-e KEEP_LAST_N=$KEEP_LAST_N} \
+      ${IN_WINDOW_CKPT_FREQ:+-e IN_WINDOW_CKPT_FREQ=$IN_WINDOW_CKPT_FREQ} \
+      ${CKPT_STEP_FREQ:+-e CKPT_STEP_FREQ=$CKPT_STEP_FREQ} \
+      -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
+      -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
+      ${MLPERF_SUBMISSION_PLATFORM:+-e MLPERF_SUBMISSION_PLATFORM=$MLPERF_SUBMISSION_PLATFORM} \
+      ${TRAIN_LIFETIME_AUC_MODE:+-e TRAIN_LIFETIME_AUC_MODE=$TRAIN_LIFETIME_AUC_MODE} \
+      ${EVAL_LIFETIME_AUC_MODE:+-e EVAL_LIFETIME_AUC_MODE=$EVAL_LIFETIME_AUC_MODE} \
+      ${CUMULATIVE_AUC_BINS:+-e CUMULATIVE_AUC_BINS=$CUMULATIVE_AUC_BINS} \
+      ${LIFETIME_AUC_WINDOW:+-e LIFETIME_AUC_WINDOW=$LIFETIME_AUC_WINDOW} \
+      -e SPLIT_SALT=${SPLIT_SALT:-0} \
+      -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
+      -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \
+      ${WORKER_CMD:+-e WORKER_CMD=\"$WORKER_CMD\"} \
+      ${RUN_NAME:+-e RUN_NAME=$RUN_NAME} \
+      ${TENSORBOARD_LOG_PATH:+-e TENSORBOARD_LOG_PATH=$TENSORBOARD_LOG_PATH} \
+      ${MLPERF_LOG_PATH:+-e MLPERF_LOG_PATH=$MLPERF_LOG_PATH} \
+      ${CKPT_PATH:+-e CKPT_PATH=$CKPT_PATH} \
+      ${SPARSE_A2A_FWD:+-e SPARSE_A2A_FWD=$SPARSE_A2A_FWD} \
+      ${SPARSE_A2A_BWD:+-e SPARSE_A2A_BWD=$SPARSE_A2A_BWD} \
+      -e LOG=$LOG \
+      $NCCL_ENV_ARGS \
+      $CONTAINER bash -lc 'cd $REPO && LAUNCH_SLURM_PHASE=worker bash scripts/launch_slurm_suachong.sh'
+    _wrc=\$?
+    if { [ \$_wrc -eq 125 ] || [ \$_wrc -eq 126 ] || [ \$_wrc -eq 127 ]; } && [ \$_wattempt -lt 5 ]; then
+      echo \"[\$(hostname)] worker exec failed to START (rc=\$_wrc, attempt \$_wattempt/5) — container not ready; restarting + retrying\"
+      docker restart $CONTAINER >/dev/null 2>&1 || true
+      for _w in \$(seq 1 30); do
+        [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
+          && docker exec $CONTAINER true >/dev/null 2>&1 && break
+        sleep 2
+      done
+      sleep 3
+      continue
+    fi
+    exit \$_wrc
+    done
+  " 2>&1 | tee -a "$LOG"
+  rc=${PIPESTATUS[0]}
+  echo "[$(date)] launch_slurm/orchestrate finished rc=$rc" | tee -a "$LOG"
+  exit $rc
+}
+
+# =============================================================================
+# PHASE: provision  (compute-node host)
+# =============================================================================
+provision() {
+  export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH:-}"
+  DOCKER=$(command -v docker 2>/dev/null || true); DOCKER=${DOCKER:-/usr/bin/docker}
+  FBGEMM_WHL=${FBGEMM_WHL:-/apps/chcai/FBGEMM/fbgemm_gpu/dist/fbgemm_gpu_nightly_rocm-2026.6.2-cp312-cp312-linux_x86_64.whl}  # [CLUSTER-SPECIFIC] gfx950/ROCm wheel
+  TORCH_IDX=${TORCH_IDX:-https://download.pytorch.org/whl/rocm7.2}  # [CLUSTER-SPECIFIC] ROCm version index
+  echo "[provision] host=$(hostname) container=$CONTAINER docker=$DOCKER"
+
+  # Resolve which image to run + whether deps must be installed. Prefer a pre-baked
+  # image (deps already installed) to skip the multi-GB torch download + pip /
+  # torchrec-from-git build on every fresh node:
+  #   1) baked image in this node's docker -> use it, skip deps
+  #   2) baked image tar on NFS            -> docker load (local, no internet)
+  #   3) neither                           -> base image + pip (slow path, which
+  #                                           can then be baked via BAKE_IMAGE=1)
+  NEED_DEPS=1
+  RUN_IMAGE="$IMAGE"
+  if [ "$USE_BAKED" = "1" ]; then
+    if "$DOCKER" image inspect "$BAKED_IMAGE" >/dev/null 2>&1; then
+      echo "[provision] using baked image $BAKED_IMAGE (deps preinstalled, no download)"
+      RUN_IMAGE="$BAKED_IMAGE"; NEED_DEPS=0
+    elif [ -f "$BAKED_TAR" ]; then
+      echo "[provision] loading baked image from $BAKED_TAR (local, no internet)..."
+      if "$DOCKER" load -i "$BAKED_TAR" >/dev/null 2>&1 && "$DOCKER" image inspect "$BAKED_IMAGE" >/dev/null 2>&1; then
+        RUN_IMAGE="$BAKED_IMAGE"; NEED_DEPS=0; echo "[provision] baked image loaded"
+      else
+        echo "[provision] WARNING: docker load failed; falling back to base-image + pip"
+      fi
+    fi
+  fi
+  if ! "$DOCKER" image inspect "$RUN_IMAGE" >/dev/null 2>&1; then
+    echo "[provision] pulling $RUN_IMAGE (this can take a while)..."; "$DOCKER" pull "$RUN_IMAGE"
+  fi
+
+  echo "[provision] (re)starting container $CONTAINER from $RUN_IMAGE"
+  "$DOCKER" rm -f "$CONTAINER" >/dev/null 2>&1 || true
+  "$DOCKER" run -d --name "$CONTAINER" \
+    --network=host --ipc=host --shm-size=64g \
+    --device=/dev/kfd --device=/dev/dri --group-add video \
+    `# [CLUSTER-SPECIFIC] AMD ROCm device passthrough; NVIDIA uses --gpus all / nvidia runtime` \
+    --cap-add=SYS_PTRACE --cap-add=CAP_SYS_ADMIN --cap-add=IPC_LOCK \
+    --ulimit memlock=-1:-1 --ulimit stack=67108864:67108864 \
+    `# memlock=-1 is REQUIRED for RDMA QP memory registration — do not drop` \
+    --security-opt seccomp=unconfined --privileged \
+    -v "$REPO_MOUNT:$REPO_MOUNT" \
+    -v "$DATA_MOUNT:$DATA_MOUNT" \
+    `# shared-NFS bind mounts: repo home (REPO_MOUNT, rw) + dataset/build assets (DATA_MOUNT)` \
+    -w "$REPO" \
+    "$RUN_IMAGE" sleep infinity
+
+  # --- RDMA userspace overlay for in-container RCCL (bnxt_re) -----------------
+  # The image (rocm/primus, rdma-core 50/libbnxt_re-rdmav34) ships an OLDER RDMA
+  # userspace than the host kernel bnxt_re driver. The stock v34 provider faults
+  # RCCL's deep-queue create_qp (max_send_wr=256) against the newer kernel uapi
+  # -> "ibv_create_qp ... Bad address". Fix: stage the host's matched rdma-core
+  # (libibverbs v61 + libbnxt_re-rdmav59 + libnl) on NFS so the worker phase makes
+  # RCCL load it via LD_PRELOAD + LD_LIBRARY_PATH. The UNVERSIONED libibverbs.so
+  # symlink is essential (import torch pulls the unversioned soname; without it
+  # the lookup falls through to the container v34 lib and the fix regresses).
+  if [ "${FORCE_OVERLAY:-0}" != "1" ] && ls "$OVERLAY/lib/libibverbs/"libbnxt_re-rdmav*.so >/dev/null 2>&1 && [ -L "$OVERLAY/lib/libibverbs.so" ]; then
+    echo "[provision] host RDMA overlay already staged at $OVERLAY (shared NFS) — skipping"
+  else
+    echo "[provision] staging host RDMA userspace overlay -> $OVERLAY"
+    rm -rf "${OVERLAY}.tmp" 2>/dev/null
+    mkdir -p "${OVERLAY}.tmp/lib/libibverbs" "${OVERLAY}.tmp/libibverbs.d"
+    cp -L /usr/lib64/libibverbs.so.1 /usr/lib64/libnl-3.so.200 /usr/lib64/libnl-route-3.so.200 "${OVERLAY}.tmp/lib/" 2>/dev/null || true
+    ln -sf libibverbs.so.1 "${OVERLAY}.tmp/lib/libibverbs.so"
+    cp -L /usr/lib64/libibverbs/*.so "${OVERLAY}.tmp/lib/libibverbs/" 2>/dev/null || true
+    cp /etc/libibverbs.d/*.driver "${OVERLAY}.tmp/libibverbs.d/" 2>/dev/null || true
+    if ls "${OVERLAY}.tmp/lib/libibverbs/"libbnxt_re-rdmav*.so >/dev/null 2>&1; then
+      rm -rf "$OVERLAY" 2>/dev/null
+      mv "${OVERLAY}.tmp" "$OVERLAY" 2>/dev/null || { mkdir -p "$OVERLAY"; cp -a "${OVERLAY}.tmp/." "$OVERLAY/"; }
+      echo "[provision] host RDMA overlay staged: $(ls "$OVERLAY/lib/libibverbs" | wc -l) providers + libibverbs.so symlink"
+    else
+      echo "[provision] WARNING: host bnxt_re provider not found at /usr/lib64/libibverbs — multi-node RDMA will fail 'Bad address'; use NCCL_NET_TRANSPORT=socket"
+    fi
+  fi
+
+  if [ "$NEED_DEPS" = "0" ]; then
+    echo "[provision] baked image — deps preinstalled; verifying imports only"
+    "$DOCKER" exec "$CONTAINER" bash -lc '
+python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print(\"imports OK,\", torch.__version__, torch.version.hip, torch.cuda.device_count(), \"gpus\")"
+' || echo "[provision] WARNING: baked-image import smoke failed"
+  else
+    echo "[provision] installing recipe deps (base image, slow path)"
+    # Install misc deps FIRST, then pin the rocm torch stack + fbgemm + torchrec
+    # LAST with --no-deps so nothing pulls a CUDA torch over the rocm build.
+    "$DOCKER" exec "$CONTAINER" bash -lc '
+set -e
+echo "=== native torch ==="; python -c "import torch;print(torch.__version__)" || true
+echo "=== misc python deps ==="
+pip install --no-cache-dir polars-u64-idx pyarrow pyyaml tqdm psutil numba xxhash gin-config \
+  absl-py pandas tensorboard torchmetrics tensordict pyre-extensions iopath typing-inspect 2>&1 | tail -3 || true
+echo "=== rocm torch stack (force, no-deps, LAST) ==="
+pip install --force-reinstall --no-deps --index-url '"$TORCH_IDX"' \
+  torch==2.12.0+rocm7.2 torchvision==0.27.0+rocm7.2 torchaudio==2.11.0+rocm7.2
+echo "=== fbgemm (local gfx950 wheel) ==="
+pip install --force-reinstall --no-deps '"$FBGEMM_WHL"'
+echo "=== torchrec v2026.06.01.00 (force, no-deps) ==="
+pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"
+echo "=== import smoke ==="
+python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print(\"imports OK,\", torch.__version__, torch.version.hip, torch.cuda.device_count(), \"gpus\")"
+'
+  fi
+
+  # --- one-time bake: snapshot the deps-installed container into a reusable image
+  # and save it to NFS so future nodes skip the download/pip path entirely.
+  if [ "${BAKE_IMAGE:-0}" = "1" ]; then
+    echo "[provision] baking: docker commit $CONTAINER -> $BAKED_IMAGE"
+    if "$DOCKER" commit "$CONTAINER" "$BAKED_IMAGE" >/dev/null; then
+      echo "[provision] saving $BAKED_IMAGE -> $BAKED_TAR (one-time, tens of GB)"
+      if "$DOCKER" save "$BAKED_IMAGE" -o "${BAKED_TAR}.tmp.$$" && mv -f "${BAKED_TAR}.tmp.$$" "$BAKED_TAR"; then
+        echo "[provision] bake done: $(ls -lh "$BAKED_TAR" 2>/dev/null | awk '{print $5}')"
+      else
+        echo "[provision] WARNING: docker save failed"; rm -f "${BAKED_TAR}.tmp.$$" 2>/dev/null
+      fi
+    else
+      echo "[provision] WARNING: docker commit failed"
+    fi
+  fi
+  echo "[provision] DONE"
+}
+
+# =============================================================================
+# PHASE: worker  (inside the container)
+# =============================================================================
+worker() {
+  cd "$REPO_ROOT"
+  mkdir -p "$SCRATCH" 2>/dev/null || true
+  LOG=${LOG:-$SCRATCH/yambda_5b_8gpu.log}
+  # WORKER_TEE=0 (set by orchestrate) sends our file sink to /dev/null to avoid
+  # double-logging, since orchestrate already tees stdout into the real $LOG.
+  [ "${WORKER_TEE:-1}" = "0" ] && LOG=/dev/null
+  export TENSORBOARD_LOG_PATH=${TENSORBOARD_LOG_PATH:-$SCRATCH/tb/yambda_5b}
+  # MLPerf compliance log (rank 0 writes it). Per-job filename so each standalone
+  # sbatch gets a clean log; the e2e supervisor pins MLPERF_LOG_PATH itself.
+  export MLPERF_LOG_PATH=${MLPERF_LOG_PATH:-$SCRATCH/mlperf/yambda_5b_mlperf.${SLURM_JOB_ID:-manual}.log}
+  echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee -a "$LOG"
+
+  # polars-u64-idx (NOT stock polars) — yambda parquet's flat-explode overruns
+  # 32-bit row index. Reserved node has no outbound DNS, so install from a
+  # pre-staged tarball under /apps/chcai/. Override PIP_LOCAL_TGZ for other hosts.
+  PIP_LOCAL_TGZ=${PIP_LOCAL_TGZ:-/apps/chcai/pip_local_yambda.tgz}   # [CLUSTER-SPECIFIC] shared-NFS path
+  PIP_LOCAL_DIR=${PIP_LOCAL_DIR:-/tmp/pip_local}
+  if [ ! -f "$PIP_LOCAL_DIR/lib/python3.12/site-packages/polars/__init__.py" ]; then
+    rm -rf "$PIP_LOCAL_DIR"
+    mkdir -p "$PIP_LOCAL_DIR" && tar xzf "$PIP_LOCAL_TGZ" -C "$(dirname "$PIP_LOCAL_DIR")" 2>&1 | tail -3 | tee -a "$LOG"
+  fi
+
+  export PYTHONPATH="$PIP_LOCAL_DIR/lib/python3.12/site-packages:$REPO_ROOT:${PYTHONPATH:-}"
+  export HOME=${HOME:-/tmp}
+  echo "[$(date)] PYTHONPATH=$PYTHONPATH" | tee -a "$LOG"
+  python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print('imports OK,', torch.__version__, torch.cuda.device_count(),'gpus')" 2>&1 | tee -a "$LOG"
+
+  export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
+  export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
+
+  # --- distributed topology ---------------------------------------------------
+  GPUS_PER_NODE=$(python -c "import torch; print(torch.cuda.device_count())")
+  # Multi-node when launched one-task-per-node under SLURM (SLURM_NNODES>1);
+  # otherwise fall through to legacy single-node defaults (localhost, node_rank 0).
+  if [ "${SLURM_NNODES:-1}" -gt 1 ] && [ -n "${SLURM_JOB_NODELIST:-}" ]; then
+    NNODES=${SLURM_NNODES}
+    NODE_RANK=${SLURM_NODEID:-${SLURM_PROCID:-0}}
+    # PREFER a MASTER_ADDR/PORT forwarded from the orchestrate phase (resolved on
+    # the host, which has scontrol); the container image carries no SLURM client.
+    if [ -z "${MASTER_ADDR:-}" ]; then
+      MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
+      MASTER_ADDR=${MASTER_ADDR:-localhost}
+    fi
+    MASTER_PORT=${MASTER_PORT:-$(( 20000 + ${SLURM_JOB_ID:-0} % 20000 ))}
+  else
+    NNODES=${NNODES:-1}
+    NODE_RANK=${NODE_RANK:-0}
+    # Single-node: all ranks live on THIS host, so rendezvous over loopback and
+    # do NOT use the SLURM hostname. On some nodes the hostname resolves to a
+    # non-routable per-GPU RoCE /31 (benic 192.168.x) address; using it makes the
+    # NCCL bootstrap fail with "No route to host". localhost is node-independent.
+    MASTER_ADDR=localhost
+    MASTER_PORT=${MASTER_PORT:-}     # empty => train_ranker picks a free port
+  fi
+  export NNODES NODE_RANK GPUS_PER_NODE MASTER_ADDR MASTER_PORT
+  export WORLD_SIZE=$(( NNODES * GPUS_PER_NODE ))
+  echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node=$GPUS_PER_NODE world_size=$WORLD_SIZE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}" | tee -a "$LOG"
+
+  # NCCL bootstrap NIC: loopback single-node, routable host NIC multi-node (pin
+  # to avoid auto-detect picking a non-routable per-GPU RoCE link). Override via
+  # $NCCL_SOCKET_IFNAME. [CLUSTER-SPECIFIC] multi-node fenic0 (find via `ip -br addr`).
+  if [ "$NNODES" -gt 1 ]; then
+    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
+  else
+    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-lo}
+  fi
+  echo "[$(date)] NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME (nnodes=$NNODES)" | tee -a "$LOG"
+
+  # Multi-node additionally needs the RDMA data-plane (bnxt_re HCAs) configured;
+  # single-node uses intra-node P2P (XGMI/PCIe) so only the bootstrap NIC matters.
+  if [ "$NNODES" -gt 1 ]; then
+    NCCL_NET_TRANSPORT=${NCCL_NET_TRANSPORT:-ib}
+    if [ "$NCCL_NET_TRANSPORT" = "socket" ]; then
+      export NCCL_IB_DISABLE=1
+      echo "[$(date)] NCCL: IB disabled — allreduce over TCP (fenic0). Functional, not RDMA-fast." | tee -a "$LOG"
+    else
+      # bnxt_re userspace provider ABI overlay (REQUIRED for RCCL). The stock v34
+      # provider faults RCCL's create_qp (256 WRs) against the host kernel uapi
+      # ("Bad address"); the host v61/v59 set staged by the provision phase works.
+      # The libibverbs.so (UNVERSIONED) symlink + LD_PRELOAD are both required so
+      # the torch process maps ONLY the host lib (see provision phase comment).
+      if [ -e "$OVERLAY/lib/libibverbs.so.1" ]; then
+        [ -e "$OVERLAY/lib/libibverbs.so" ] || ln -sf libibverbs.so.1 "$OVERLAY/lib/libibverbs.so" 2>/dev/null || true
+        export LD_LIBRARY_PATH="$OVERLAY/lib:$OVERLAY/lib/libibverbs:${LD_LIBRARY_PATH:-}"
+        export LD_PRELOAD="$OVERLAY/lib/libibverbs.so.1${LD_PRELOAD:+:$LD_PRELOAD}"
+        echo "[$(date)] NCCL: bnxt_re provider overlay -> $OVERLAY (host rdma-core v61/v59; symlink+LD_PRELOAD so RCCL binds the host lib for QP creation)" | tee -a "$LOG"
+      else
+        echo "[$(date)] WARNING: RDMA overlay $OVERLAY missing — RCCL QP creation will fail 'Bad address' on stock v34 provider; set RDMA_OVERLAY or use NCCL_NET_TRANSPORT=socket" | tee -a "$LOG"
+      fi
+      # MINIMAL bnxt_re set PROVEN on these meta64 cv350 nodes (cmcknigh RCCL
+      # benchmarks + confirmed e2e here). NCCL_IB_TC=104 (RoCE lossless PFC class)
+      # is required; do NOT add the ionic-AINIC QPS/ECE/DMABUF block.
+      # [CLUSTER-SPECIFIC] RDMA HCA names (`ibv_devices`); other vendors => mlx5_*/ionic_*
+      export NCCL_IB_HCA=${NCCL_IB_HCA:-bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7}
+      export NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX:-3}    # [CLUSTER-SPECIFIC] RoCEv2 IPv4 GID idx (`show_gids`)
+      export NCCL_IB_TC=${NCCL_IB_TC:-104}                # [CLUSTER-SPECIFIC] RoCE lossless/PFC traffic class
+      export NCCL_IB_TIMEOUT=${NCCL_IB_TIMEOUT:-14}
+      export NCCL_IGNORE_CPU_AFFINITY=${NCCL_IGNORE_CPU_AFFINITY:-1}
+      export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
+      # GPU-Direct RDMA on by default (~+22% throughput at 2 nodes via peermem).
+      # Set NCCL_NET_GDR_LEVEL=0 to force the legacy host-staged path.
+      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-5}
+      export NCCL_DMABUF_ENABLE=${NCCL_DMABUF_ENABLE:-1}
+      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}, DMABUF=${NCCL_DMABUF_ENABLE}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
+    fi
+  fi
+  export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
+  export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-}
+  export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
+
+  # --- GPU clock sanity guard -------------------------------------------------
+  # A leftover perf_determinism cap (half clock) silently slows every kernel ~1.9x.
+  # Log the perf level + a live sclk sample and try to restore boost (non-fatal).
+  if command -v rocm-smi >/dev/null 2>&1; then
+    echo "[$(date)] GPU perf-level check:" | tee -a "$LOG"
+    rocm-smi --showperflevel 2>/dev/null | grep -iE "GPU\[[0-9]+\]" | tee -a "$LOG" || true
+    if rocm-smi --showperflevel 2>/dev/null | grep -iqE "Performance Level: *(perf_determinism|manual|low)"; then
+      echo "[$(date)] WARNING: GPUs not in 'auto' perf level — attempting --setperflevel auto" | tee -a "$LOG"
+      rocm-smi --setperflevel auto 2>/dev/null | grep -iE "set to auto" | tee -a "$LOG" \
+        || echo "[$(date)] WARNING: could not set perf level (no permission?). Run 'rocm-smi --setperflevel auto' on the HOST before benchmarking — clocks may be capped." | tee -a "$LOG"
+    fi
+    echo "[$(date)] sclk sample (GPU0):$(rocm-smi -d 0 --showclocks 2>/dev/null | grep -i 'sclk clock level' | sed -E 's/.*sclk clock level//')" | tee -a "$LOG" || true
+  fi
+
+  # --- stray-trainer / leaked-VRAM guard -------------------------------------
+  # The trainer runs via `docker exec` into a long-lived container, so its procs
+  # live in the container PID namespace, NOT the SLURM job cgroup. If a prior job
+  # OOM'd/crashed, a rank can leak and keep holding ~half of every GPU's VRAM,
+  # which persists across jobs (container survives) and guarantees the next
+  # attempt OOMs. Before launching, reap any pre-existing trainer procs (there
+  # should be none at this point) and wait for VRAM to drain. [g]-guard avoids
+  # self-match. Non-fatal.
+  if pgrep -f '[g]enerative_recommenders' >/dev/null 2>&1; then
+    echo "[$(date)] WARNING: leaked trainer procs found pre-launch — killing." | tee -a "$LOG"
+    pkill -9 -f '[g]enerative_recommenders' 2>/dev/null || true
+    for _i in $(seq 1 15); do
+      pgrep -f '[g]enerative_recommenders' >/dev/null 2>&1 || break
+      sleep 2
+    done
+    sleep 5  # let the driver release VRAM after process exit
+    if command -v rocm-smi >/dev/null 2>&1; then
+      echo "[$(date)] post-cleanup GPU0 used GiB:$(rocm-smi --showmeminfo vram 2>/dev/null | awk -F: '/Used/{printf " %.0f", $3/1073741824; exit}')" | tee -a "$LOG"
+    fi
+  fi
+
+  # WORKER_CMD override: run an arbitrary in-container command (e.g. an a2a/RCCL
+  # micro-benchmark) instead of the trainer, REUSING all the NCCL/RDMA/topology
+  # setup above so it exercises the exact transport the trainer uses. The
+  # supervisor never sets WORKER_CMD, so the training path is unchanged.
+  if [ -n "${WORKER_CMD:-}" ]; then
+    echo "[$(date)] WORKER_CMD override (WORLD_SIZE=$WORLD_SIZE): $WORKER_CMD" | tee -a "$LOG"
+    bash -lc "cd $REPO_ROOT && $WORKER_CMD" 2>&1 | tee -a "$LOG"
+    return
+  fi
+
+  echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
+  python -m generative_recommenders.dlrm_v3.train.train_ranker \
+      --dataset yambda-5b --mode "${MODE:-streaming-train-eval}" 2>&1 | tee -a "$LOG"
+}
+
+# ---- dispatch ---------------------------------------------------------------
+case "$PHASE" in
+  orchestrate) orchestrate ;;
+  provision)   provision ;;
+  worker)      worker ;;
+  *) echo "launch_slurm.sh: unknown LAUNCH_SLURM_PHASE='$PHASE'" >&2; exit 2 ;;
+esac

From 1d257133ceaf48d68147c3d98ab7dadd20180507 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 22:49:07 +0000
Subject: [PATCH 095/113] dlrmv4: drop launch_slurm_suachong.sh from the PR
 (keep local only)

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../scripts/launch_slurm_suachong.sh          | 664 ------------------
 1 file changed, 664 deletions(-)
 delete mode 100755 recommendation_v4/scripts/launch_slurm_suachong.sh

diff --git a/recommendation_v4/scripts/launch_slurm_suachong.sh b/recommendation_v4/scripts/launch_slurm_suachong.sh
deleted file mode 100755
index 579076d47..000000000
--- a/recommendation_v4/scripts/launch_slurm_suachong.sh
+++ /dev/null
@@ -1,664 +0,0 @@
-#!/bin/bash
-#SBATCH --job-name=yambda_slurm
-#SBATCH --nodes=1
-#SBATCH --ntasks-per-node=1
-#SBATCH --exclusive
-#SBATCH --partition=meta64          # [CLUSTER-SPECIFIC] partition name
-#SBATCH --output=yambda_slurm.%j.out
-# =============================================================================
-# launch_slurm.sh — single entry point for the yambda-5b trainer on N>=1 nodes.
-#
-# Consolidates what used to be separate scripts so multi-node enablement is
-# ONE committable script (plus the train_ranker.py / utils.py python changes):
-#   * orchestrate phase (host SLURM glue) — formerly sbatch_smoke_multinode.sh
-#   * provision   phase (container + RDMA) — formerly _provision_yambda_primus.sh
-#   * worker      phase (in-container train) — now inlined below
-#
-# PHASES (auto-detected from context; force with LAUNCH_SLURM_PHASE=<phase>):
-#   orchestrate  Runs on the SLURM batch host (no /.dockerenv). Resolves the
-#                rendezvous (MASTER_ADDR/PORT), ensures the container on every
-#                node (provision phase), then `docker exec`s the worker phase on
-#                every node, one task per node.
-#   provision    Runs on a compute-node host. Ensures the `yambda_primus`
-#                container is up (loads the pre-baked image if present — no
-#                internet/pip — else builds from the base image) and stages the
-#                host RDMA userspace overlay on shared NFS.
-#   worker       Runs INSIDE the container. Sets the distributed topology +
-#                NCCL/RDMA env and spawns this node's GPU ranks via train_ranker.
-#                N==1 transparently uses the legacy single-node path (localhost,
-#                node_rank 0), byte-for-byte as before, so the streaming-e2e
-#                supervisor's direct `bash scripts/launch_slurm.sh` is unchanged.
-#
-# USAGE
-#   Multi-node (N>=1):  sbatch --nodes=2 scripts/launch_slurm.sh
-#   Single-node direct: bash scripts/launch_slurm.sh   (already inside container;
-#                       what run_streaming_e2e.sh invokes per relaunch)
-#   Perf pair:
-#     LOG=/apps/chcai/perf_1node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
-#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
-#       sbatch --nodes=1 --job-name=y1 scripts/launch_slurm.sh
-#     LOG=/apps/chcai/perf_2node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
-#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
-#       sbatch --nodes=2 --job-name=y2 scripts/launch_slurm.sh
-#     # then: bash scripts/compare_node_perf.sh /apps/chcai/perf_1node.log /apps/chcai/perf_2node.log
-#
-# ONE-TIME IMAGE BAKE (so fresh nodes skip the multi-GB torch download + pip):
-#   BAKE_IMAGE=1 LAUNCH_SLURM_PHASE=provision bash scripts/launch_slurm.sh
-#   (commits the deps-installed container to $BAKED_IMAGE and `docker save`s it to
-#    $BAKED_TAR on NFS; subsequent provisions `docker load` it offline.)
-#
-# -----------------------------------------------------------------------------
-# PORTABILITY — what to change for a DIFFERENT cluster / network / hardware.
-# Every such knob is also tagged inline with "[CLUSTER-SPECIFIC]" (grep for it).
-# All are env-overridable, so you can adapt without editing this file.
-#
-#  A) SLURM / scheduler
-#     - #SBATCH --partition=meta64  : partition name. CHANGE per cluster.
-#     - #SBATCH --time / --exclusive : policy; adjust to taste.
-#
-#  B) Filesystems (must be shared/NFS across ALL nodes — this script re-invokes
-#     itself and reads the overlay + data from these paths cluster-wide)
-#     - REPO_MOUNT (repo + this script, e.g. /home/<user>) is bind-mounted rw;
-#       DATA_MOUNT (e.g. /apps/chcai) holds the read-only dataset + overlay +
-#       baked tar + pip tarball; SCRATCH (e.g. /home/<user>/yambda_runs) is the
-#       writable log/output root. Override any via env — nothing is user-hardwired.
-#
-#  C) Container image / GPU software stack (tied to the GPU arch + ROCm version)
-#     - IMAGE=rocm/primus:v26.3        : base image. ROCm/AMD-specific.
-#     - docker run --device=/dev/kfd --device=/dev/dri --group-add video : AMD ROCm
-#       device passthrough. For NVIDIA this is --gpus all / nvidia runtime instead.
-#     - --ulimit memlock=-1 : REQUIRED for RDMA QP registration (do not drop).
-#     - TORCH_IDX (rocm7.2), torch/vision/audio ==*+rocm7.2, FBGEMM_WHL (a gfx950
-#       wheel), torchrec pin : the whole deps set is arch/ROCm-version-specific.
-#
-#  D) Network fabric — THE trickiest part; defaults are PROVEN on meta64 cv350
-#     (Broadcom bnxt_re RoCEv2). On a different fabric these almost certainly change
-#     (see the worker-phase block for the full rationale):
-#     - NCCL_SOCKET_IFNAME=fenic0 : the ONE routable host NIC for TCP bootstrap.
-#       Find yours with `ip -br addr`; the per-GPU RDMA NICs are usually NOT
-#       routable for plain TCP, so auto-detect hangs init — you MUST pin this.
-#     - NCCL_IB_HCA=bnxt_re0..7 : the RDMA HCA device names. List with `ibv_devices`.
-#       Different NIC vendor (e.g. mlx5_*, ionic_*) => different names AND a
-#       different userspace provider, which changes the RDMA overlay below.
-#     - NCCL_IB_GID_INDEX=3 : RoCEv2 IPv4 GID index. Check `show_gids`; v1/v2 and
-#       IPv4/IPv6 live at different indices per port.
-#     - NCCL_IB_TC=104 : RoCE lossless (PFC) traffic class. Fabric/switch-specific.
-#     - RDMA overlay (provision phase): only needed when the CONTAINER's rdma-core
-#       is older than the HOST kernel driver's uapi (our bnxt_re v34-vs-v59 case).
-#       Different NIC/host => different /usr/lib64 provider .so to stage, or the
-#       overlay may be unnecessary entirely (set RDMA_OVERLAY= to disable). If RDMA
-#       can't be made to work, NCCL_NET_TRANSPORT=socket falls back to TCP.
-#
-#  E) Not cluster-specific (auto-derived): GPUS_PER_NODE (torch.cuda.device_count),
-#     NNODES/NODE_RANK/MASTER_ADDR (from SLURM), WORLD_SIZE.
-# =============================================================================
-set -uo pipefail
-
-# Absolute path to THIS script so the orchestrate phase can re-invoke it on every
-# node (home is shared NFS, so the same path resolves cluster-wide).
-SELF=$(cd "$(dirname "$0")" && pwd)/$(basename "$0")
-REPO_ROOT=$(cd "$(dirname "$0")/.." && pwd)
-
-# ---- phase detection --------------------------------------------------------
-PHASE="${LAUNCH_SLURM_PHASE:-}"
-if [ -z "$PHASE" ]; then
-  if [ -f /.dockerenv ]; then PHASE=worker; else PHASE=orchestrate; fi
-fi
-
-# ---- shared config (env-overridable) ----------------------------------------
-CONTAINER=${CONTAINER:-yambda_primus}
-REPO=${REPO:-$REPO_ROOT}                       # repo path inside the container
-IMAGE=${IMAGE:-rocm/primus:v26.3}              # [CLUSTER-SPECIFIC] ROCm/arch base image
-BAKED_IMAGE=${BAKED_IMAGE:-yambda_primus_baked:latest}
-BAKED_TAR=${BAKED_TAR:-/apps/chcai/yambda_primus_baked.tar}   # [CLUSTER-SPECIFIC] shared-NFS path (read-only build asset)
-USE_BAKED=${USE_BAKED:-1}
-OVERLAY=${RDMA_OVERLAY:-/apps/chcai/rdma_host_el9_new}        # [CLUSTER-SPECIFIC] shared-NFS RDMA overlay (read-only, already staged)
-
-REPO_MOUNT=${REPO_MOUNT:-$HOME}              # NFS home holding the repo (must contain $REPO); override if your repo lives elsewhere
-DATA_MOUNT=${DATA_MOUNT:-/apps/chcai}        # shared dataset + RDMA overlay + pip/fbgemm assets (read-only)
-SCRATCH=${SCRATCH:-$HOME/yambda_runs}        # writable output root (logs / tb / traces)
-
-# =============================================================================
-# PHASE: orchestrate  (SLURM batch host)
-# =============================================================================
-orchestrate() {
-  # When run as the SLURM batch script, $0 is the node-local staged copy
-  # (/var/spool/slurmd/job<ID>/slurm_script), so $SELF / $REPO_ROOT are WRONG
-  # here (they don't exist on other nodes). Resolve the REAL shared-NFS script
-  # path + repo root from SLURM so we can re-invoke this script on every node and
-  # `cd` to the right repo inside the container.
-  SCRIPT_PATH=$(scontrol show job "${SLURM_JOB_ID:-0}" 2>/dev/null | grep -oP 'Command=\K\S+')
-  [ -f "${SCRIPT_PATH:-}" ] || SCRIPT_PATH="${SLURM_SUBMIT_DIR:-$REPO_ROOT}/scripts/launch_slurm_suachong.sh"
-  [ -f "$SCRIPT_PATH" ] || SCRIPT_PATH="$SELF"
-  REPO=$(cd "$(dirname "$SCRIPT_PATH")/.." && pwd)
-
-  mkdir -p "$SCRATCH" 2>/dev/null || true
-  LOG=${LOG:-$SCRATCH/yambda_slurm.${SLURM_JOB_ID:-manual}.log}
-
-  MODE=${MODE:-streaming-train-eval}
-  if [ "${SMOKE:-0}" = "1" ]; then
-    START_TS=${START_TS:-150}
-    NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
-    NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
-    NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
-    EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
-    METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
-  fi
-  START_TS=${START_TS:-0}
-  NUM_TRAIN_TS=${NUM_TRAIN_TS:-299}
-  NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-0}
-  NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-0}
-  EVAL_EACH_WINDOW=${EVAL_EACH_WINDOW:-1}
-  METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-20}
-  if [ "${EVAL_EVERY_N_WINDOWS:-0}" -gt 0 ] 2>/dev/null; then
-    EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0}
-  else
-    EVAL_EVERY_N_WINDOWS=0
-    EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0.005}
-  fi
-  FORCE_PROVISION=${FORCE_PROVISION:-0}
-
-  # Truncate the metrics log on a FRESH run; APPEND on a supervised relaunch
-  # (APPEND_LOG=1) so the full-run NE/AUC history survives crash/node-failover
-  # resubmits instead of being wiped on every attempt (mirrors the single-node
-  # supervisor's init-once/append model).
-  if [ "${APPEND_LOG:-0}" = "1" ]; then
-    echo "[$(date)] === resume: appending to existing $LOG (APPEND_LOG=1) ===" >> "$LOG"
-  else
-    : > "$LOG"
-  fi
-  chmod 622 "$LOG" 2>/dev/null || true
-  echo "[$(date)] launch_slurm/orchestrate: job=${SLURM_JOB_ID:-?} nodes=${SLURM_JOB_NODELIST:-?} nnodes=${SLURM_NNODES:-1}" | tee -a "$LOG"
-  echo "[$(date)] resolved SCRIPT_PATH=$SCRIPT_PATH REPO=$REPO" | tee -a "$LOG"
-  echo "[$(date)] config: MODE=$MODE START_TS=$START_TS NUM_TRAIN_TS=$NUM_TRAIN_TS NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES METRIC_LOG_FREQ=$METRIC_LOG_FREQ SMOKE=${SMOKE:-0} EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT" | tee -a "$LOG"
-  echo "[$(date)] lr-override: DENSE_LR=${DENSE_LR:-<unset:gin default 1e-7>} SPARSE_LR=${SPARSE_LR:-<unset:gin default 1e-7>}" | tee -a "$LOG"
-
-  # Rendezvous resolved on the HOST (the container image has no SLURM client).
-  MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
-  MASTER_ADDR=${MASTER_ADDR:-localhost}
-  MASTER_PORT=$(( 20000 + ${SLURM_JOB_ID:-0} % 20000 ))
-  echo "[$(date)] rendezvous: MASTER_ADDR=$MASTER_ADDR MASTER_PORT=$MASTER_PORT" | tee -a "$LOG"
-
-  # Optional NCCL/RCCL fabric overrides — forwarded into the container only when
-  # set at submit time (docker exec does NOT inherit the srun task env). The
-  # worker phase applies its own validated multi-node bnxt_re defaults when these
-  # are unset. Common: NCCL_NET_TRANSPORT=socket (TCP fallback), NCCL_DEBUG=INFO.
-  NCCL_ENV_ARGS=""
-  for v in NCCL_NET_TRANSPORT NCCL_DEBUG NCCL_SOCKET_IFNAME NCCL_IB_HCA NCCL_IB_GID_INDEX \
-           NCCL_IB_TC NCCL_IB_TIMEOUT NCCL_IGNORE_CPU_AFFINITY RCCL_MSCCL_ENABLE NCCL_NET_GDR_LEVEL \
-           NCCL_IB_PCI_RELAXED_ORDERING NCCL_IB_USE_INLINE NCCL_IB_QPS_PER_CONNECTION \
-           NCCL_IB_ECE_ENABLE NCCL_DMABUF_ENABLE NCCL_GDRCOPY_ENABLE NCCL_GDR_FLUSH_DISABLE \
-           NCCL_PXN_DISABLE NCCL_CHECKS_DISABLE NCCL_CROSS_NIC RDMA_OVERLAY; do
-    eval "val=\${$v:-}"
-    if [ -n "$val" ]; then NCCL_ENV_ARGS="$NCCL_ENV_ARGS -e $v=$val"; fi
-  done
-
-  # TRICKY — variable expansion inside the `srun ... bash -c "..."` blocks below:
-  # the string is double-quoted, so PLAIN $VAR expands NOW on the batch host (e.g.
-  # $MASTER_ADDR, $CONTAINER, $SCRIPT_PATH — values computed above), while
-  # BACKSLASH-escaped \$VAR is passed through literally and expands LATER on each
-  # compute node inside the srun task (e.g. \$SLURM_NODEID, \$(hostname)) where the
-  # per-node SLURM_* env actually lives. Mixing these up sends every rank the
-  # wrong node id or breaks the docker exec — keep the \$ on per-node values.
-
-  # --- step 1: ensure the container is up on every node ----------------------
-  echo "[$(date)] ensuring container '$CONTAINER' on all nodes (force=$FORCE_PROVISION)" | tee -a "$LOG"
-  srun --ntasks-per-node=1 bash -c "
-    # Reap stale/foreign GPU containers from prior jobs BEFORE (re)provisioning.
-    # The node is allocated --exclusive, so any GPU container other than
-    # '$CONTAINER' is an orphan left by a previous job (its container outlives the
-    # SLURM allocation). We remove every such container that has GPU access
-    # (/dev/kfd or /dev/dri) — running OR stopped, whether or not it currently
-    # pins VRAM ('docker ps -aq' includes stopped ones) — since idle orphans can
-    # still hold device handles or wake up; leaked HBM from these has caused both
-    # OOMs and RCCL collective hangs. We deliberately SKIP non-GPU containers
-    # (e.g. 'k8s-node-services-*' and other cluster system services) so we don't
-    # disrupt node infrastructure. docker teardown lets the driver reclaim HBM.
-    for _c in \$(docker ps -aq 2>/dev/null); do
-      _nm=\$(docker inspect -f '{{.Name}}' \"\$_c\" 2>/dev/null | sed 's#^/##')
-      [ \"\$_nm\" = \"$CONTAINER\" ] && continue
-      _dev=\$(docker inspect -f '{{range .HostConfig.Devices}}{{.PathOnHost}} {{end}}' \"\$_c\" 2>/dev/null)
-      case \"\$_dev\" in
-        *kfd*|*dri*)
-          echo \"[\$(hostname)] reaping stale GPU container \$_nm (\$_c)\"
-          docker rm -f \"\$_c\" >/dev/null 2>&1 || true ;;
-        *)
-          echo \"[\$(hostname)] keeping non-GPU/system container \$_nm (\$_c)\" ;;
-      esac
-    done
-    # Reuse a STOPPED '$CONTAINER' (its installed deps persist in the container
-    # fs) instead of destructively re-provisioning from the base image + pip.
-    # Harmless no-op on a fresh node (no such container) -> falls through to
-    # provision below. Repo code is bind-mounted, so live edits are still picked up.
-    docker start $CONTAINER >/dev/null 2>&1 || true
-    if [ \"$FORCE_PROVISION\" = \"1\" ] || ! docker exec $CONTAINER true >/dev/null 2>&1; then
-      echo \"[\$(hostname)] (re)provisioning container\"
-      LAUNCH_SLURM_PHASE=provision CONTAINER=$CONTAINER IMAGE=$IMAGE \
-        BAKED_IMAGE=$BAKED_IMAGE BAKED_TAR=$BAKED_TAR USE_BAKED=$USE_BAKED \
-        BAKE_IMAGE=${BAKE_IMAGE:-0} RDMA_OVERLAY=$OVERLAY REPO=$REPO \
-        REPO_MOUNT=$REPO_MOUNT DATA_MOUNT=$DATA_MOUNT SCRATCH=$SCRATCH bash $SCRIPT_PATH
-    else
-      # Container persists across jobs; the reap above only removes FOREIGN GPU
-      # containers, so our own '$CONTAINER' can still pin HBM via stray trainer
-      # ranks left by a prior OOM/crash (this caused repeated 'CUDA out of memory'
-      # on relaunch onto the same node). Restart it to kill every exec'd proc and
-      # let the driver reclaim HBM — cheap (keeps the installed deps in the
-      # container fs; NFS RDMA overlay also persists), no full re-provision.
-      echo \"[\$(hostname)] container already up — restarting to free any leaked HBM before launch\"
-      docker restart $CONTAINER >/dev/null 2>&1 || true
-      # Readiness gate: a bare 'docker exec true' can pass while the runtime is
-      # still settling, so the SUBSEQUENT (heavier) worker exec races the restart
-      # and dies with 'container is not running' / OCI 'setns' errors (observed on
-      # c07-08 and e08-08 -> the peer never joins rendezvous -> master 600s
-      # TCPStore timeout). Require State.Running=true AND a successful probe, then
-      # a short settle, before considering the container ready.
-      for _w in \$(seq 1 30); do
-        [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
-          && docker exec $CONTAINER true >/dev/null 2>&1 && break
-        sleep 2
-      done
-      sleep 2
-      echo \"[\$(hostname)] container restarted (HBM reclaimed; running=\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null))\"
-    fi
-  " 2>&1 | tee -a "$LOG"
-
-  # --- step 2: launch the worker (trainer) inside the container on every node -
-  echo "[$(date)] launching trainer (worker phase) on all nodes" | tee -a "$LOG"
-  srun --ntasks-per-node=1 bash -c "
-    # Pre-flight readiness gate (per node): step 1 ran in a SEPARATE srun, so the
-    # container can still be settling here. Wait for State.Running=true + a probe
-    # before the worker exec so we don't race a just-restarted container.
-    for _w in \$(seq 1 30); do
-      [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
-        && docker exec $CONTAINER true >/dev/null 2>&1 && break
-      [ \$_w -eq 1 ] && echo \"[\$(hostname)] worker pre-flight: waiting for container to be ready...\"
-      sleep 2
-    done
-    # Retry wrapper: docker exec startup failures (rc 125 daemon 'container is not
-    # running', 126/127 OCI/setns 'exec failed') mean the container wasn't ready,
-    # NOT that the trainer ran and failed. Restart + re-gate + retry a few times.
-    # Any OTHER rc (the trainer actually started and exited) is propagated so the
-    # supervisor's resume-from-checkpoint logic owns real failures.
-    _wattempt=0
-    while : ; do
-    _wattempt=\$((_wattempt+1))
-    docker exec \
-      -e LAUNCH_SLURM_PHASE=worker \
-      -e WORKER_TEE=0 \
-      -e SCRATCH=$SCRATCH \
-      -e SLURM_NNODES=\$SLURM_NNODES \
-      -e SLURM_NODEID=\$SLURM_NODEID \
-      -e SLURM_PROCID=\$SLURM_PROCID \
-      -e SLURM_JOB_NODELIST=\"\$SLURM_JOB_NODELIST\" \
-      -e SLURM_JOB_ID=\$SLURM_JOB_ID \
-      -e MASTER_ADDR=$MASTER_ADDR \
-      -e MASTER_PORT=$MASTER_PORT \
-      -e HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-TRITON} \
-      -e MODE=$MODE \
-      -e START_TS=$START_TS \
-      -e NUM_TRAIN_TS=$NUM_TRAIN_TS \
-      -e EVAL_EACH_WINDOW=$EVAL_EACH_WINDOW \
-      -e EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS \
-      ${EVAL_EVERY_DATA_PCT:+-e EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT} \
-      -e NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
-      -e NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
-      -e METRIC_LOG_FREQ=$METRIC_LOG_FREQ \
-      ${MLPERF_LOGGING:+-e MLPERF_LOGGING=$MLPERF_LOGGING} \
-      ${MLPERF_TRAIN_LOSS_LOG_FREQ:+-e MLPERF_TRAIN_LOSS_LOG_FREQ=$MLPERF_TRAIN_LOSS_LOG_FREQ} \
-      ${STREAMING_SHUFFLE_FRACTION:+-e STREAMING_SHUFFLE_FRACTION=$STREAMING_SHUFFLE_FRACTION} \
-      ${STREAMING_SHUFFLE_SEED:+-e STREAMING_SHUFFLE_SEED=$STREAMING_SHUFFLE_SEED} \
-      ${NUM_WORKERS:+-e NUM_WORKERS=$NUM_WORKERS} \
-      ${PREFETCH_FACTOR:+-e PREFETCH_FACTOR=$PREFETCH_FACTOR} \
-      ${DIAG_UNIQUE_EMB:+-e DIAG_UNIQUE_EMB=$DIAG_UNIQUE_EMB} \
-      ${DIAG_EMB_STEPS:+-e DIAG_EMB_STEPS=$DIAG_EMB_STEPS} \
-      ${OUTPUT_TRACE:+-e OUTPUT_TRACE=$OUTPUT_TRACE} \
-      ${MIN_HISTORY:+-e MIN_HISTORY=$MIN_HISTORY} \
-      ${HISTORY_STRATEGY:+-e HISTORY_STRATEGY=$HISTORY_STRATEGY} \
-      ${SEED:+-e SEED=$SEED} \
-      ${DENSE_LR:+-e DENSE_LR=$DENSE_LR} \
-      ${SPARSE_LR:+-e SPARSE_LR=$SPARSE_LR} \
-      ${GRAD_CLIP_NORM:+-e GRAD_CLIP_NORM=$GRAD_CLIP_NORM} \
-      ${HSTU_NUM_LAYERS:+-e HSTU_NUM_LAYERS=$HSTU_NUM_LAYERS} \
-      ${MAX_SEQ_LEN:+-e MAX_SEQ_LEN=$MAX_SEQ_LEN} \
-      ${HISTORY_LENGTH:+-e HISTORY_LENGTH=$HISTORY_LENGTH} \
-      ${BATCH_SIZE:+-e BATCH_SIZE=$BATCH_SIZE} \
-      ${CKPT_TIME_INTERVAL_S:+-e CKPT_TIME_INTERVAL_S=$CKPT_TIME_INTERVAL_S} \
-      ${KEEP_LAST_N:+-e KEEP_LAST_N=$KEEP_LAST_N} \
-      ${IN_WINDOW_CKPT_FREQ:+-e IN_WINDOW_CKPT_FREQ=$IN_WINDOW_CKPT_FREQ} \
-      ${CKPT_STEP_FREQ:+-e CKPT_STEP_FREQ=$CKPT_STEP_FREQ} \
-      -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
-      -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
-      ${MLPERF_SUBMISSION_PLATFORM:+-e MLPERF_SUBMISSION_PLATFORM=$MLPERF_SUBMISSION_PLATFORM} \
-      ${TRAIN_LIFETIME_AUC_MODE:+-e TRAIN_LIFETIME_AUC_MODE=$TRAIN_LIFETIME_AUC_MODE} \
-      ${EVAL_LIFETIME_AUC_MODE:+-e EVAL_LIFETIME_AUC_MODE=$EVAL_LIFETIME_AUC_MODE} \
-      ${CUMULATIVE_AUC_BINS:+-e CUMULATIVE_AUC_BINS=$CUMULATIVE_AUC_BINS} \
-      ${LIFETIME_AUC_WINDOW:+-e LIFETIME_AUC_WINDOW=$LIFETIME_AUC_WINDOW} \
-      -e SPLIT_SALT=${SPLIT_SALT:-0} \
-      -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \
-      -e EVAL_HOLDOUT_NUM_WINDOWS=${EVAL_HOLDOUT_NUM_WINDOWS:-1} \
-      ${WORKER_CMD:+-e WORKER_CMD=\"$WORKER_CMD\"} \
-      ${RUN_NAME:+-e RUN_NAME=$RUN_NAME} \
-      ${TENSORBOARD_LOG_PATH:+-e TENSORBOARD_LOG_PATH=$TENSORBOARD_LOG_PATH} \
-      ${MLPERF_LOG_PATH:+-e MLPERF_LOG_PATH=$MLPERF_LOG_PATH} \
-      ${CKPT_PATH:+-e CKPT_PATH=$CKPT_PATH} \
-      ${SPARSE_A2A_FWD:+-e SPARSE_A2A_FWD=$SPARSE_A2A_FWD} \
-      ${SPARSE_A2A_BWD:+-e SPARSE_A2A_BWD=$SPARSE_A2A_BWD} \
-      -e LOG=$LOG \
-      $NCCL_ENV_ARGS \
-      $CONTAINER bash -lc 'cd $REPO && LAUNCH_SLURM_PHASE=worker bash scripts/launch_slurm_suachong.sh'
-    _wrc=\$?
-    if { [ \$_wrc -eq 125 ] || [ \$_wrc -eq 126 ] || [ \$_wrc -eq 127 ]; } && [ \$_wattempt -lt 5 ]; then
-      echo \"[\$(hostname)] worker exec failed to START (rc=\$_wrc, attempt \$_wattempt/5) — container not ready; restarting + retrying\"
-      docker restart $CONTAINER >/dev/null 2>&1 || true
-      for _w in \$(seq 1 30); do
-        [ \"\$(docker inspect -f '{{.State.Running}}' $CONTAINER 2>/dev/null)\" = \"true\" ] \
-          && docker exec $CONTAINER true >/dev/null 2>&1 && break
-        sleep 2
-      done
-      sleep 3
-      continue
-    fi
-    exit \$_wrc
-    done
-  " 2>&1 | tee -a "$LOG"
-  rc=${PIPESTATUS[0]}
-  echo "[$(date)] launch_slurm/orchestrate finished rc=$rc" | tee -a "$LOG"
-  exit $rc
-}
-
-# =============================================================================
-# PHASE: provision  (compute-node host)
-# =============================================================================
-provision() {
-  export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:${PATH:-}"
-  DOCKER=$(command -v docker 2>/dev/null || true); DOCKER=${DOCKER:-/usr/bin/docker}
-  FBGEMM_WHL=${FBGEMM_WHL:-/apps/chcai/FBGEMM/fbgemm_gpu/dist/fbgemm_gpu_nightly_rocm-2026.6.2-cp312-cp312-linux_x86_64.whl}  # [CLUSTER-SPECIFIC] gfx950/ROCm wheel
-  TORCH_IDX=${TORCH_IDX:-https://download.pytorch.org/whl/rocm7.2}  # [CLUSTER-SPECIFIC] ROCm version index
-  echo "[provision] host=$(hostname) container=$CONTAINER docker=$DOCKER"
-
-  # Resolve which image to run + whether deps must be installed. Prefer a pre-baked
-  # image (deps already installed) to skip the multi-GB torch download + pip /
-  # torchrec-from-git build on every fresh node:
-  #   1) baked image in this node's docker -> use it, skip deps
-  #   2) baked image tar on NFS            -> docker load (local, no internet)
-  #   3) neither                           -> base image + pip (slow path, which
-  #                                           can then be baked via BAKE_IMAGE=1)
-  NEED_DEPS=1
-  RUN_IMAGE="$IMAGE"
-  if [ "$USE_BAKED" = "1" ]; then
-    if "$DOCKER" image inspect "$BAKED_IMAGE" >/dev/null 2>&1; then
-      echo "[provision] using baked image $BAKED_IMAGE (deps preinstalled, no download)"
-      RUN_IMAGE="$BAKED_IMAGE"; NEED_DEPS=0
-    elif [ -f "$BAKED_TAR" ]; then
-      echo "[provision] loading baked image from $BAKED_TAR (local, no internet)..."
-      if "$DOCKER" load -i "$BAKED_TAR" >/dev/null 2>&1 && "$DOCKER" image inspect "$BAKED_IMAGE" >/dev/null 2>&1; then
-        RUN_IMAGE="$BAKED_IMAGE"; NEED_DEPS=0; echo "[provision] baked image loaded"
-      else
-        echo "[provision] WARNING: docker load failed; falling back to base-image + pip"
-      fi
-    fi
-  fi
-  if ! "$DOCKER" image inspect "$RUN_IMAGE" >/dev/null 2>&1; then
-    echo "[provision] pulling $RUN_IMAGE (this can take a while)..."; "$DOCKER" pull "$RUN_IMAGE"
-  fi
-
-  echo "[provision] (re)starting container $CONTAINER from $RUN_IMAGE"
-  "$DOCKER" rm -f "$CONTAINER" >/dev/null 2>&1 || true
-  "$DOCKER" run -d --name "$CONTAINER" \
-    --network=host --ipc=host --shm-size=64g \
-    --device=/dev/kfd --device=/dev/dri --group-add video \
-    `# [CLUSTER-SPECIFIC] AMD ROCm device passthrough; NVIDIA uses --gpus all / nvidia runtime` \
-    --cap-add=SYS_PTRACE --cap-add=CAP_SYS_ADMIN --cap-add=IPC_LOCK \
-    --ulimit memlock=-1:-1 --ulimit stack=67108864:67108864 \
-    `# memlock=-1 is REQUIRED for RDMA QP memory registration — do not drop` \
-    --security-opt seccomp=unconfined --privileged \
-    -v "$REPO_MOUNT:$REPO_MOUNT" \
-    -v "$DATA_MOUNT:$DATA_MOUNT" \
-    `# shared-NFS bind mounts: repo home (REPO_MOUNT, rw) + dataset/build assets (DATA_MOUNT)` \
-    -w "$REPO" \
-    "$RUN_IMAGE" sleep infinity
-
-  # --- RDMA userspace overlay for in-container RCCL (bnxt_re) -----------------
-  # The image (rocm/primus, rdma-core 50/libbnxt_re-rdmav34) ships an OLDER RDMA
-  # userspace than the host kernel bnxt_re driver. The stock v34 provider faults
-  # RCCL's deep-queue create_qp (max_send_wr=256) against the newer kernel uapi
-  # -> "ibv_create_qp ... Bad address". Fix: stage the host's matched rdma-core
-  # (libibverbs v61 + libbnxt_re-rdmav59 + libnl) on NFS so the worker phase makes
-  # RCCL load it via LD_PRELOAD + LD_LIBRARY_PATH. The UNVERSIONED libibverbs.so
-  # symlink is essential (import torch pulls the unversioned soname; without it
-  # the lookup falls through to the container v34 lib and the fix regresses).
-  if [ "${FORCE_OVERLAY:-0}" != "1" ] && ls "$OVERLAY/lib/libibverbs/"libbnxt_re-rdmav*.so >/dev/null 2>&1 && [ -L "$OVERLAY/lib/libibverbs.so" ]; then
-    echo "[provision] host RDMA overlay already staged at $OVERLAY (shared NFS) — skipping"
-  else
-    echo "[provision] staging host RDMA userspace overlay -> $OVERLAY"
-    rm -rf "${OVERLAY}.tmp" 2>/dev/null
-    mkdir -p "${OVERLAY}.tmp/lib/libibverbs" "${OVERLAY}.tmp/libibverbs.d"
-    cp -L /usr/lib64/libibverbs.so.1 /usr/lib64/libnl-3.so.200 /usr/lib64/libnl-route-3.so.200 "${OVERLAY}.tmp/lib/" 2>/dev/null || true
-    ln -sf libibverbs.so.1 "${OVERLAY}.tmp/lib/libibverbs.so"
-    cp -L /usr/lib64/libibverbs/*.so "${OVERLAY}.tmp/lib/libibverbs/" 2>/dev/null || true
-    cp /etc/libibverbs.d/*.driver "${OVERLAY}.tmp/libibverbs.d/" 2>/dev/null || true
-    if ls "${OVERLAY}.tmp/lib/libibverbs/"libbnxt_re-rdmav*.so >/dev/null 2>&1; then
-      rm -rf "$OVERLAY" 2>/dev/null
-      mv "${OVERLAY}.tmp" "$OVERLAY" 2>/dev/null || { mkdir -p "$OVERLAY"; cp -a "${OVERLAY}.tmp/." "$OVERLAY/"; }
-      echo "[provision] host RDMA overlay staged: $(ls "$OVERLAY/lib/libibverbs" | wc -l) providers + libibverbs.so symlink"
-    else
-      echo "[provision] WARNING: host bnxt_re provider not found at /usr/lib64/libibverbs — multi-node RDMA will fail 'Bad address'; use NCCL_NET_TRANSPORT=socket"
-    fi
-  fi
-
-  if [ "$NEED_DEPS" = "0" ]; then
-    echo "[provision] baked image — deps preinstalled; verifying imports only"
-    "$DOCKER" exec "$CONTAINER" bash -lc '
-python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print(\"imports OK,\", torch.__version__, torch.version.hip, torch.cuda.device_count(), \"gpus\")"
-' || echo "[provision] WARNING: baked-image import smoke failed"
-  else
-    echo "[provision] installing recipe deps (base image, slow path)"
-    # Install misc deps FIRST, then pin the rocm torch stack + fbgemm + torchrec
-    # LAST with --no-deps so nothing pulls a CUDA torch over the rocm build.
-    "$DOCKER" exec "$CONTAINER" bash -lc '
-set -e
-echo "=== native torch ==="; python -c "import torch;print(torch.__version__)" || true
-echo "=== misc python deps ==="
-pip install --no-cache-dir polars-u64-idx pyarrow pyyaml tqdm psutil numba xxhash gin-config \
-  absl-py pandas tensorboard torchmetrics tensordict pyre-extensions iopath typing-inspect 2>&1 | tail -3 || true
-echo "=== rocm torch stack (force, no-deps, LAST) ==="
-pip install --force-reinstall --no-deps --index-url '"$TORCH_IDX"' \
-  torch==2.12.0+rocm7.2 torchvision==0.27.0+rocm7.2 torchaudio==2.11.0+rocm7.2
-echo "=== fbgemm (local gfx950 wheel) ==="
-pip install --force-reinstall --no-deps '"$FBGEMM_WHL"'
-echo "=== torchrec v2026.06.01.00 (force, no-deps) ==="
-pip install --force-reinstall --no-deps "git+https://github.com/pytorch/torchrec.git@v2026.06.01.00"
-echo "=== import smoke ==="
-python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print(\"imports OK,\", torch.__version__, torch.version.hip, torch.cuda.device_count(), \"gpus\")"
-'
-  fi
-
-  # --- one-time bake: snapshot the deps-installed container into a reusable image
-  # and save it to NFS so future nodes skip the download/pip path entirely.
-  if [ "${BAKE_IMAGE:-0}" = "1" ]; then
-    echo "[provision] baking: docker commit $CONTAINER -> $BAKED_IMAGE"
-    if "$DOCKER" commit "$CONTAINER" "$BAKED_IMAGE" >/dev/null; then
-      echo "[provision] saving $BAKED_IMAGE -> $BAKED_TAR (one-time, tens of GB)"
-      if "$DOCKER" save "$BAKED_IMAGE" -o "${BAKED_TAR}.tmp.$$" && mv -f "${BAKED_TAR}.tmp.$$" "$BAKED_TAR"; then
-        echo "[provision] bake done: $(ls -lh "$BAKED_TAR" 2>/dev/null | awk '{print $5}')"
-      else
-        echo "[provision] WARNING: docker save failed"; rm -f "${BAKED_TAR}.tmp.$$" 2>/dev/null
-      fi
-    else
-      echo "[provision] WARNING: docker commit failed"
-    fi
-  fi
-  echo "[provision] DONE"
-}
-
-# =============================================================================
-# PHASE: worker  (inside the container)
-# =============================================================================
-worker() {
-  cd "$REPO_ROOT"
-  mkdir -p "$SCRATCH" 2>/dev/null || true
-  LOG=${LOG:-$SCRATCH/yambda_5b_8gpu.log}
-  # WORKER_TEE=0 (set by orchestrate) sends our file sink to /dev/null to avoid
-  # double-logging, since orchestrate already tees stdout into the real $LOG.
-  [ "${WORKER_TEE:-1}" = "0" ] && LOG=/dev/null
-  export TENSORBOARD_LOG_PATH=${TENSORBOARD_LOG_PATH:-$SCRATCH/tb/yambda_5b}
-  # MLPerf compliance log (rank 0 writes it). Per-job filename so each standalone
-  # sbatch gets a clean log; the e2e supervisor pins MLPERF_LOG_PATH itself.
-  export MLPERF_LOG_PATH=${MLPERF_LOG_PATH:-$SCRATCH/mlperf/yambda_5b_mlperf.${SLURM_JOB_ID:-manual}.log}
-  echo "[$(date)] REPO_ROOT=$REPO_ROOT" | tee -a "$LOG"
-
-  # polars-u64-idx (NOT stock polars) — yambda parquet's flat-explode overruns
-  # 32-bit row index. Reserved node has no outbound DNS, so install from a
-  # pre-staged tarball under /apps/chcai/. Override PIP_LOCAL_TGZ for other hosts.
-  PIP_LOCAL_TGZ=${PIP_LOCAL_TGZ:-/apps/chcai/pip_local_yambda.tgz}   # [CLUSTER-SPECIFIC] shared-NFS path
-  PIP_LOCAL_DIR=${PIP_LOCAL_DIR:-/tmp/pip_local}
-  if [ ! -f "$PIP_LOCAL_DIR/lib/python3.12/site-packages/polars/__init__.py" ]; then
-    rm -rf "$PIP_LOCAL_DIR"
-    mkdir -p "$PIP_LOCAL_DIR" && tar xzf "$PIP_LOCAL_TGZ" -C "$(dirname "$PIP_LOCAL_DIR")" 2>&1 | tail -3 | tee -a "$LOG"
-  fi
-
-  export PYTHONPATH="$PIP_LOCAL_DIR/lib/python3.12/site-packages:$REPO_ROOT:${PYTHONPATH:-}"
-  export HOME=${HOME:-/tmp}
-  echo "[$(date)] PYTHONPATH=$PYTHONPATH" | tee -a "$LOG"
-  python -c "import torch, fbgemm_gpu, torchrec, polars, xxhash, gin; print('imports OK,', torch.__version__, torch.cuda.device_count(),'gpus')" 2>&1 | tee -a "$LOG"
-
-  export HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
-  export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}
-
-  # --- distributed topology ---------------------------------------------------
-  GPUS_PER_NODE=$(python -c "import torch; print(torch.cuda.device_count())")
-  # Multi-node when launched one-task-per-node under SLURM (SLURM_NNODES>1);
-  # otherwise fall through to legacy single-node defaults (localhost, node_rank 0).
-  if [ "${SLURM_NNODES:-1}" -gt 1 ] && [ -n "${SLURM_JOB_NODELIST:-}" ]; then
-    NNODES=${SLURM_NNODES}
-    NODE_RANK=${SLURM_NODEID:-${SLURM_PROCID:-0}}
-    # PREFER a MASTER_ADDR/PORT forwarded from the orchestrate phase (resolved on
-    # the host, which has scontrol); the container image carries no SLURM client.
-    if [ -z "${MASTER_ADDR:-}" ]; then
-      MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" 2>/dev/null | head -1)
-      MASTER_ADDR=${MASTER_ADDR:-localhost}
-    fi
-    MASTER_PORT=${MASTER_PORT:-$(( 20000 + ${SLURM_JOB_ID:-0} % 20000 ))}
-  else
-    NNODES=${NNODES:-1}
-    NODE_RANK=${NODE_RANK:-0}
-    # Single-node: all ranks live on THIS host, so rendezvous over loopback and
-    # do NOT use the SLURM hostname. On some nodes the hostname resolves to a
-    # non-routable per-GPU RoCE /31 (benic 192.168.x) address; using it makes the
-    # NCCL bootstrap fail with "No route to host". localhost is node-independent.
-    MASTER_ADDR=localhost
-    MASTER_PORT=${MASTER_PORT:-}     # empty => train_ranker picks a free port
-  fi
-  export NNODES NODE_RANK GPUS_PER_NODE MASTER_ADDR MASTER_PORT
-  export WORLD_SIZE=$(( NNODES * GPUS_PER_NODE ))
-  echo "[$(date)] topology: nnodes=$NNODES node_rank=$NODE_RANK gpus_per_node=$GPUS_PER_NODE world_size=$WORLD_SIZE master=$MASTER_ADDR:${MASTER_PORT:-<auto>}" | tee -a "$LOG"
-
-  # NCCL bootstrap NIC: loopback single-node, routable host NIC multi-node (pin
-  # to avoid auto-detect picking a non-routable per-GPU RoCE link). Override via
-  # $NCCL_SOCKET_IFNAME. [CLUSTER-SPECIFIC] multi-node fenic0 (find via `ip -br addr`).
-  if [ "$NNODES" -gt 1 ]; then
-    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-fenic0}
-  else
-    export NCCL_SOCKET_IFNAME=${NCCL_SOCKET_IFNAME:-lo}
-  fi
-  echo "[$(date)] NCCL_SOCKET_IFNAME=$NCCL_SOCKET_IFNAME (nnodes=$NNODES)" | tee -a "$LOG"
-
-  # Multi-node additionally needs the RDMA data-plane (bnxt_re HCAs) configured;
-  # single-node uses intra-node P2P (XGMI/PCIe) so only the bootstrap NIC matters.
-  if [ "$NNODES" -gt 1 ]; then
-    NCCL_NET_TRANSPORT=${NCCL_NET_TRANSPORT:-ib}
-    if [ "$NCCL_NET_TRANSPORT" = "socket" ]; then
-      export NCCL_IB_DISABLE=1
-      echo "[$(date)] NCCL: IB disabled — allreduce over TCP (fenic0). Functional, not RDMA-fast." | tee -a "$LOG"
-    else
-      # bnxt_re userspace provider ABI overlay (REQUIRED for RCCL). The stock v34
-      # provider faults RCCL's create_qp (256 WRs) against the host kernel uapi
-      # ("Bad address"); the host v61/v59 set staged by the provision phase works.
-      # The libibverbs.so (UNVERSIONED) symlink + LD_PRELOAD are both required so
-      # the torch process maps ONLY the host lib (see provision phase comment).
-      if [ -e "$OVERLAY/lib/libibverbs.so.1" ]; then
-        [ -e "$OVERLAY/lib/libibverbs.so" ] || ln -sf libibverbs.so.1 "$OVERLAY/lib/libibverbs.so" 2>/dev/null || true
-        export LD_LIBRARY_PATH="$OVERLAY/lib:$OVERLAY/lib/libibverbs:${LD_LIBRARY_PATH:-}"
-        export LD_PRELOAD="$OVERLAY/lib/libibverbs.so.1${LD_PRELOAD:+:$LD_PRELOAD}"
-        echo "[$(date)] NCCL: bnxt_re provider overlay -> $OVERLAY (host rdma-core v61/v59; symlink+LD_PRELOAD so RCCL binds the host lib for QP creation)" | tee -a "$LOG"
-      else
-        echo "[$(date)] WARNING: RDMA overlay $OVERLAY missing — RCCL QP creation will fail 'Bad address' on stock v34 provider; set RDMA_OVERLAY or use NCCL_NET_TRANSPORT=socket" | tee -a "$LOG"
-      fi
-      # MINIMAL bnxt_re set PROVEN on these meta64 cv350 nodes (cmcknigh RCCL
-      # benchmarks + confirmed e2e here). NCCL_IB_TC=104 (RoCE lossless PFC class)
-      # is required; do NOT add the ionic-AINIC QPS/ECE/DMABUF block.
-      # [CLUSTER-SPECIFIC] RDMA HCA names (`ibv_devices`); other vendors => mlx5_*/ionic_*
-      export NCCL_IB_HCA=${NCCL_IB_HCA:-bnxt_re0,bnxt_re1,bnxt_re2,bnxt_re3,bnxt_re4,bnxt_re5,bnxt_re6,bnxt_re7}
-      export NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX:-3}    # [CLUSTER-SPECIFIC] RoCEv2 IPv4 GID idx (`show_gids`)
-      export NCCL_IB_TC=${NCCL_IB_TC:-104}                # [CLUSTER-SPECIFIC] RoCE lossless/PFC traffic class
-      export NCCL_IB_TIMEOUT=${NCCL_IB_TIMEOUT:-14}
-      export NCCL_IGNORE_CPU_AFFINITY=${NCCL_IGNORE_CPU_AFFINITY:-1}
-      export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
-      # GPU-Direct RDMA on by default (~+22% throughput at 2 nodes via peermem).
-      # Set NCCL_NET_GDR_LEVEL=0 to force the legacy host-staged path.
-      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-5}
-      export NCCL_DMABUF_ENABLE=${NCCL_DMABUF_ENABLE:-1}
-      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}, DMABUF=${NCCL_DMABUF_ENABLE}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
-    fi
-  fi
-  export NCCL_DEBUG=${NCCL_DEBUG:-WARN}
-  export HSTU_HAMMER_KERNEL=${HSTU_HAMMER_KERNEL:-}
-  export PYTORCH_CUDA_ALLOC_CONF=${PYTORCH_CUDA_ALLOC_CONF:-expandable_segments:True}
-
-  # --- GPU clock sanity guard -------------------------------------------------
-  # A leftover perf_determinism cap (half clock) silently slows every kernel ~1.9x.
-  # Log the perf level + a live sclk sample and try to restore boost (non-fatal).
-  if command -v rocm-smi >/dev/null 2>&1; then
-    echo "[$(date)] GPU perf-level check:" | tee -a "$LOG"
-    rocm-smi --showperflevel 2>/dev/null | grep -iE "GPU\[[0-9]+\]" | tee -a "$LOG" || true
-    if rocm-smi --showperflevel 2>/dev/null | grep -iqE "Performance Level: *(perf_determinism|manual|low)"; then
-      echo "[$(date)] WARNING: GPUs not in 'auto' perf level — attempting --setperflevel auto" | tee -a "$LOG"
-      rocm-smi --setperflevel auto 2>/dev/null | grep -iE "set to auto" | tee -a "$LOG" \
-        || echo "[$(date)] WARNING: could not set perf level (no permission?). Run 'rocm-smi --setperflevel auto' on the HOST before benchmarking — clocks may be capped." | tee -a "$LOG"
-    fi
-    echo "[$(date)] sclk sample (GPU0):$(rocm-smi -d 0 --showclocks 2>/dev/null | grep -i 'sclk clock level' | sed -E 's/.*sclk clock level//')" | tee -a "$LOG" || true
-  fi
-
-  # --- stray-trainer / leaked-VRAM guard -------------------------------------
-  # The trainer runs via `docker exec` into a long-lived container, so its procs
-  # live in the container PID namespace, NOT the SLURM job cgroup. If a prior job
-  # OOM'd/crashed, a rank can leak and keep holding ~half of every GPU's VRAM,
-  # which persists across jobs (container survives) and guarantees the next
-  # attempt OOMs. Before launching, reap any pre-existing trainer procs (there
-  # should be none at this point) and wait for VRAM to drain. [g]-guard avoids
-  # self-match. Non-fatal.
-  if pgrep -f '[g]enerative_recommenders' >/dev/null 2>&1; then
-    echo "[$(date)] WARNING: leaked trainer procs found pre-launch — killing." | tee -a "$LOG"
-    pkill -9 -f '[g]enerative_recommenders' 2>/dev/null || true
-    for _i in $(seq 1 15); do
-      pgrep -f '[g]enerative_recommenders' >/dev/null 2>&1 || break
-      sleep 2
-    done
-    sleep 5  # let the driver release VRAM after process exit
-    if command -v rocm-smi >/dev/null 2>&1; then
-      echo "[$(date)] post-cleanup GPU0 used GiB:$(rocm-smi --showmeminfo vram 2>/dev/null | awk -F: '/Used/{printf " %.0f", $3/1073741824; exit}')" | tee -a "$LOG"
-    fi
-  fi
-
-  # WORKER_CMD override: run an arbitrary in-container command (e.g. an a2a/RCCL
-  # micro-benchmark) instead of the trainer, REUSING all the NCCL/RDMA/topology
-  # setup above so it exercises the exact transport the trainer uses. The
-  # supervisor never sets WORKER_CMD, so the training path is unchanged.
-  if [ -n "${WORKER_CMD:-}" ]; then
-    echo "[$(date)] WORKER_CMD override (WORLD_SIZE=$WORLD_SIZE): $WORKER_CMD" | tee -a "$LOG"
-    bash -lc "cd $REPO_ROOT && $WORKER_CMD" 2>&1 | tee -a "$LOG"
-    return
-  fi
-
-  echo "[$(date)] launching train_ranker with WORLD_SIZE=$WORLD_SIZE" | tee -a "$LOG"
-  python -m generative_recommenders.dlrm_v3.train.train_ranker \
-      --dataset yambda-5b --mode "${MODE:-streaming-train-eval}" 2>&1 | tee -a "$LOG"
-}
-
-# ---- dispatch ---------------------------------------------------------------
-case "$PHASE" in
-  orchestrate) orchestrate ;;
-  provision)   provision ;;
-  worker)      worker ;;
-  *) echo "launch_slurm.sh: unknown LAUNCH_SLURM_PHASE='$PHASE'" >&2; exit 2 ;;
-esac

From 1dae61d687e366b467b1dec38d2db76ac6500242 Mon Sep 17 00:00:00 2001
From: suachong <suachong@amd.com>
Date: Wed, 24 Jun 2026 22:57:52 +0000
Subject: [PATCH 096/113] dlrmv4: revert streaming_resume_test.sh to base (out
 of MLPerf PR scope)

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/tests/streaming_resume_test.sh           | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
index 47c451696..e14e557e8 100755
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -40,10 +40,9 @@ KEEP=0
 # correctness gates are the functional-invariant checks below (RNG restored,
 # resumed-at-correct-step, atomic/keep_last_n), not this number.
 ATOL=0.15
-SCRATCH=${SCRATCH:-$HOME/yambda_runs}
-CKPT_ROOT=${CKPT_ROOT:-$SCRATCH/ckpts_resume_test}
-LOG_DIR=${LOG_DIR:-$SCRATCH/streaming_resume_test}
-REPO=${REPO:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../../../.." && pwd)}
+CKPT_ROOT=/apps/chcai/ckpts_resume_test
+LOG_DIR=/apps/chcai/streaming_resume_test
+REPO=/home/chcai/training/recommendation_v4
 
 while [[ $# -gt 0 ]]; do
     case $1 in

From 7ec6fcc565c7f589f7ccf2e3e7559782dde41a24 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 25 Jun 2026 00:48:23 +0000
Subject: [PATCH 097/113] dlrmv4: make MLPerf run markers resume-aware via
 checkpoint state

Persist mlperf_run_started in the checkpoint so a resume relaunch
continues the SAME MLPerf run instead of re-emitting INIT_START/RUN_START
(compliance requires EXACTLY_ONE). Cold-vs-resume is detected from the
on-disk checkpoint before setup(); the log is truncated on a cold start
and appended on a resume so the single run's event stream accumulates
into one file. Legacy/cold checkpoints default the flag to False.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/checkpoint.py                     |  9 +++++
 .../dlrm_v3/train/mlperf_logging_utils.py     | 35 ++++++++++++++++
 .../dlrm_v3/train/train_ranker.py             | 40 +++++++++++++------
 .../generative_recommenders/dlrm_v3/utils.py  |  6 +++
 4 files changed, 77 insertions(+), 13 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
index 1d7f7f391..46cc10e2e 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/checkpoint.py
@@ -411,6 +411,9 @@ def save_dmp_checkpoint(
                 # load so pre-existing checkpoints restore as 0 and resume the
                 # count from there.
                 "cumulative_train_samples": metric_logger.cumulative_train_samples,
+                # MLPerf run-marker state: lets a resume relaunch continue the
+                # SAME run's event stream without re-emitting INIT_START/RUN_START.
+                "mlperf_run_started": metric_logger.mlperf_run_started,
                 "sparse_tensor_keys": sparse_tensor_keys,
                 # Streaming resume fields. Defaulted on load so old checkpoints
                 # (pre-streaming-resume) still load as a normal restart.
@@ -549,6 +552,12 @@ def load_nonsparse_checkpoint(
         metric_logger.cumulative_train_samples = non_sparse_state_dict.get(
             "cumulative_train_samples", 0
         )
+        # Defaulted False for legacy/cold checkpoints: a resume that loads a
+        # checkpoint where the run was already open continues without re-emitting
+        # the run markers.
+        metric_logger.mlperf_run_started = non_sparse_state_dict.get(
+            "mlperf_run_started", False
+        )
         class_metric_state_dict = non_sparse_state_dict["class_metrics"]
         regression_metric_state_dict = non_sparse_state_dict["reg_metrics"]
         # Length-safe positional restore: if a checkpoint was written with a
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
index e190f0325..8a716f87b 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/mlperf_logging_utils.py
@@ -71,6 +71,7 @@ def __init__(
         benchmark_name: str = "hstu",
         submitter_name: str = "AMD",
         submission_platform: str = "MI355X",
+        fresh: bool = True,
     ):
         self.enabled: bool = _MLLOG_AVAILABLE
         # Use the EXPLICIT caller rank: this is built before init_process_group,
@@ -87,6 +88,14 @@ def __init__(
             log_dir = os.path.dirname(log_path)
             if log_dir:  # guard: os.makedirs("") raises for a bare filename
                 os.makedirs(log_dir, exist_ok=True)
+            # mllog's FileHandler APPENDS (mode "a"), which is what a resume needs
+            # so the single run's event stream accumulates across relaunches into
+            # one file. On a genuine cold start, truncate first so a re-used run
+            # dir / a previous crashed-cold-start's orphaned stream can't leave a
+            # second run_start in the file (the compliance checker requires
+            # EXACTLY_ONE). Resume (fresh=False) appends to continue the stream.
+            if fresh:
+                open(log_path, "w").close()
             mllog.config(filename=log_path, default_stack_offset=default_stack_offset)
         else:
             mllog.config(default_stack_offset=default_stack_offset)
@@ -339,6 +348,30 @@ def finalize(self, final_metrics: Dict[str, float]) -> None:
         self.run_stop(c.SUCCESS if success else c.ABORTED)
 
 
+def mlperf_checkpoint_present(ckpt_path: str) -> bool:
+    """True iff ``ckpt_path`` resolves to an existing checkpoint (i.e. a resume).
+
+    A dependency-light mirror of ``checkpoint._resolve_latest_subdir`` so
+    ``train_ranker`` can decide cold-start vs resume BEFORE the heavy checkpoint
+    import + ``setup()``. This gates the one-time INIT_START/RUN_START markers:
+    emit them on a genuine cold start only, and never re-emit on a resume
+    relaunch (the MLPerf run spans the resume). Matches the loader's resolution:
+    empty path or a base dir with no numeric subdirs => cold start.
+    """
+    if not ckpt_path:
+        return False
+    base = ckpt_path.rstrip("/")
+    # A leaf save (numeric basename) is a resume iff that dir actually exists.
+    if os.path.basename(base).isdigit():
+        return os.path.isdir(base)
+    if not os.path.isdir(base):
+        return False
+    for name in os.listdir(base):
+        if name.isdigit() and os.path.isdir(os.path.join(base, name)):
+            return True
+    return False
+
+
 @gin.configurable
 def get_mlperf_logger(
     rank: int = 0,
@@ -346,6 +379,7 @@ def get_mlperf_logger(
     benchmark_name: str = "hstu",
     submitter_name: str = "AMD",
     submission_platform: str = "MI355X",
+    fresh: bool = True,
 ) -> Optional[MLPerfLogger]:
     """Build a configured :class:`MLPerfLogger`, or ``None`` if unavailable.
 
@@ -375,4 +409,5 @@ def get_mlperf_logger(
         benchmark_name=benchmark_name,
         submitter_name=submitter_name,
         submission_platform=resolved_platform,
+        fresh=fresh,
     )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index 50d359ef6..d5797697b 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -78,22 +78,31 @@ def _main_func(
     from generative_recommenders.dlrm_v3.train._env_bootstrap import apply_env_bootstrap
     from generative_recommenders.dlrm_v3.train.mlperf_logging_utils import (
         get_mlperf_logger,
+        mlperf_checkpoint_present,
     )
 
     gin.parse_config_file(gin_file, skip_unknown=True)
     apply_env_bootstrap()
 
-    # Rank-0-gated MLPerf logger, only for the streaming-train-eval path.
+    # Cold-start vs resume, decided from the on-disk checkpoint BEFORE setup so
+    # the one-time INIT/RUN markers fire on a genuine cold start only and are NOT
+    # re-emitted on a resume relaunch — the MLPerf run (run_start..run_stop) spans
+    # the resume as a single coherent event stream in one appended log file.
+    mlperf_resume = mlperf_checkpoint_present(os.environ.get("CKPT_PATH", ""))
+    # Rank-0-gated MLPerf logger, only for the streaming-train-eval path. `fresh`
+    # truncates the log on cold start (one run_start per file) but appends on a
+    # resume so the pre-crash events are preserved and continued.
     mlperf_logger = (
-        get_mlperf_logger(rank=rank) if mode == "streaming-train-eval" else None
+        get_mlperf_logger(rank=rank, fresh=not mlperf_resume)
+        if mode == "streaming-train-eval"
+        else None
     )
-    # Emit INIT_START before setup only on a guaranteed cold start (CKPT_PATH
-    # unset); resume relaunches skip it so the log stays balanced.
-    mlperf_init_logged = False
-    if mlperf_logger is not None and not os.environ.get("CKPT_PATH", ""):
+    # INIT_START fires before setup on a cold start only (resume continues the
+    # already-open run, whose markers were emitted by the original process).
+    mlperf_cold_start = mlperf_logger is not None and not mlperf_resume
+    if mlperf_cold_start:
         mlperf_logger.event(key=mlperf_logger.constants.CACHE_CLEAR, value=True)
         mlperf_logger.start(key=mlperf_logger.constants.INIT_START)
-        mlperf_init_logged = True
 
     # Phase 2: heavy imports. Triton kernel modules evaluate their autotune
     # decorators here, using the env vars set above.
@@ -200,12 +209,13 @@ def _main_func(
         )
     )
 
-    # MLPerf submission info + hyperparameters + INIT_STOP/RUN_START, only on a
-    # genuine cold start so resume relaunches don't reopen the run markers.
-    mlperf_run_active = (
-        mlperf_logger is not None and mlperf_init_logged and resume_cold_start
-    )
-    if mlperf_run_active:
+    # MLPerf run markers: open the run exactly once. On a cold start emit
+    # submission info + hyperparameters + INIT_STOP/RUN_START and mark the run as
+    # started (persisted in the checkpoint via metrics.mlperf_run_started). On a
+    # resume, load_dmp_checkpoint restored mlperf_run_started=True, so we skip the
+    # markers and just continue the stream. `metrics.mlperf_run_started` guards a
+    # double-emit even if cold/resume detection and the checkpoint ever disagree.
+    if mlperf_cold_start and not metrics.mlperf_run_started:
         # Submission info + hyperparameters + INIT_STOP/RUN_START, all emitted by
         # the logger (optimizer names/LRs read from gin internally). Seed is the
         # value setup() resolved and exported to $SEED.
@@ -213,6 +223,10 @@ def _main_func(
             global_batch_size=world_size * int(train_dataloader.batch_size),
             seed=int(os.environ.get("SEED", "1")),
         )
+        metrics.mlperf_run_started = True
+    # Pass the logger to the loop whenever MLPerf logging is enabled, so block /
+    # eval / train_loss / run_stop events emit on BOTH a cold start and a resume.
+    mlperf_run_active = mlperf_logger is not None and metrics.mlperf_run_started
 
     # train loop
     try:
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index 281a37b5d..ed456bde6 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1034,6 +1034,12 @@ def _make_reg(ws: int) -> List[RecMetricComputation]:
         # MLPerf `samples_count` progress unit: global trained samples, persisted
         # alongside global_step so a resumed run continues the count.
         self.cumulative_train_samples: int = 0
+        # Whether the MLPerf run markers (RUN_START etc.) were already emitted for
+        # this logical run. Checkpointed so a resume relaunch knows the run is
+        # already open and does NOT re-emit INIT_START/RUN_START (the compliance
+        # checker requires EXACTLY_ONE); the resumed process continues the same
+        # event stream and emits the single RUN_STOP at convergence/end.
+        self.mlperf_run_started: bool = False
         self._rank: int = int(rank)
         # Optional MLPerf logger + LR accessor wired by the streaming loop (duck-
         # typed to avoid a train-module import cycle); drives the train_loss event.

From 5c2b9405c7e6b52edcec50935794b120bba90f5c Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 25 Jun 2026 01:04:45 +0000
Subject: [PATCH 098/113] dlrmv4: reproducible-by-default config (seed=1,
 AUC_THRESHOLD=1.0)

Default SEED back to 1 for a fixed, reproducible weight init out of the
box ($SEED=-1 still draws a fresh random seed per run).

Default AUC_THRESHOLD to 1.0 (unreachable) in both the gin binding and
the launch_slurm.sh fallback so a streaming-train-eval run trains through
all windows by default instead of early-stopping; set
$AUC_THRESHOLD=0.80275 for the MLPerf convergence target.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin                    | 10 ++++++----
 recommendation_v4/scripts/launch_slurm.sh              |  2 +-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 8a3e5b595..dd645f06a 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -80,10 +80,11 @@ apply_env_bootstrap.TRITON_FULL_AUTOTUNE = False
 # PARSE NOTE: seed_everything() runs right before make_model() in train_ranker
 # (after the full gin parse), so this binding resolves in the second parse where
 # env_int is registered. Override per-run via $SEED.
-# Default -1 draws a fresh random seed each run; pin $SEED >= 0 to reproduce.
+# Default 1 gives a fixed, reproducible seed each run; override $SEED to vary it
+# (set $SEED = -1 to draw a fresh random seed per run).
 seed_everything.seed = @seed/env_int()
 seed/env_int.key     = "SEED"
-seed/env_int.default = -1
+seed/env_int.default = 1
 
 # $DECORRELATE_DROPOUT — re-seed torch/cuda with $SEED + rank after init so HSTU
 # dropout masks differ per data-parallel rank. 1 = on, 0 = identical masks (default).
@@ -457,10 +458,11 @@ tbp/env_path.key     = "TENSORBOARD_LOG_PATH"
 tbp/env_path.default = ""
 MetricsLogger.world_size = 8
 # MLPerf convergence target: run stops when the selected eval AUC reaches it.
-# Override via $AUC_THRESHOLD.
+# Default 1.0 is unreachable, so the run trains through all windows (no early
+# stop) out of the box; set $AUC_THRESHOLD=0.80275 for the MLPerf target.
 MetricsLogger.auc_threshold = @at/env_float()
 at/env_float.key     = "AUC_THRESHOLD"
-at/env_float.default = 0.80275
+at/env_float.default = 1.0
 # EVAL_ACCURACY + early-stop are driven by per-window AUC (window_auc) vs the
 # threshold above.
 # Lifetime-AUC backend, selectable independently for the train cumulative AUC and
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index fb71bdcb7..74d7ca154 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -311,7 +311,7 @@ orchestrate() {
       ${IN_WINDOW_CKPT_FREQ:+-e IN_WINDOW_CKPT_FREQ=$IN_WINDOW_CKPT_FREQ} \
       ${CKPT_STEP_FREQ:+-e CKPT_STEP_FREQ=$CKPT_STEP_FREQ} \
       -e TRAIN_SPLIT_PERCENTAGE=${TRAIN_SPLIT_PERCENTAGE:-1.0} \
-      -e AUC_THRESHOLD=${AUC_THRESHOLD:-0.80275} \
+      -e AUC_THRESHOLD=${AUC_THRESHOLD:-1.0} \
       ${MLPERF_SUBMISSION_PLATFORM:+-e MLPERF_SUBMISSION_PLATFORM=$MLPERF_SUBMISSION_PLATFORM} \
       -e SPLIT_SALT=${SPLIT_SALT:-0} \
       -e EVAL_HOLDOUT_TS=${EVAL_HOLDOUT_TS:--1} \

From ff55513f805b8d00b38933e7bd0ee48d9d8c93ca Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 25 Jun 2026 01:44:54 +0000
Subject: [PATCH 099/113] dlrmv4: re-enable GPUDirect RDMA by default in slurm
 worker

Restore NCCL_NET_GDR_LEVEL=5 + NCCL_DMABUF_ENABLE=1 defaults so RCCL does
true GPU<->NIC DMA over bnxt_re instead of host-memory staging (~+22%
throughput at 2 nodes; 65.7%->79.8% weak-scaling efficiency). The brcmrdma
host kernel ships the inbox peer-memory client, so GDR works with no
container/host changes; non-fatal fallback to host staging if peermem is
absent. Override with NCCL_NET_GDR_LEVEL=0.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/scripts/launch_slurm.sh | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 74d7ca154..a1b6334e3 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -571,10 +571,20 @@ worker() {
       export NCCL_IB_TIMEOUT=${NCCL_IB_TIMEOUT:-14}
       export NCCL_IGNORE_CPU_AFFINITY=${NCCL_IGNORE_CPU_AFFINITY:-1}
       export RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE:-0}
-      # GPU-Direct RDMA needs DMABUF/peermem (neither in-container here) — leave
-      # GDR off so RCCL stages through host memory (still real RDMA over bnxt_re).
-      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-0}
-      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
+      # GPU-Direct RDMA: ENABLED by default. The brcmrdma host kernel ships the
+      # inbox peer-memory client (`ib_register_peer_memory_client` in
+      # /proc/kallsyms), so RCCL does true GPU<->NIC DMA over bnxt_re instead of
+      # bouncing through host memory. Measured ~+22% throughput at 2 nodes
+      # (65.7%->79.8% weak-scaling efficiency) vs the old host-staged path.
+      # GDR_LEVEL=5 (most permissive) is required so GDR is used even when the GPU
+      # and NIC cross the CPU root complex. NCCL_DMABUF_ENABLE=1 is a harmless
+      # no-op here (kernel lacks CONFIG_DMABUF_MOVE_NOTIFY/CONFIG_PCI_P2PDMA, so
+      # peermem carries it). Enabling is non-fatal: if peermem is ever absent RCCL
+      # just logs "GDR 0" and falls back to host staging. Override with
+      # NCCL_NET_GDR_LEVEL=0 to force the legacy host-staged path.
+      export NCCL_NET_GDR_LEVEL=${NCCL_NET_GDR_LEVEL:-5}
+      export NCCL_DMABUF_ENABLE=${NCCL_DMABUF_ENABLE:-1}
+      echo "[$(date)] NCCL: RDMA over bnxt_re (GID idx ${NCCL_IB_GID_INDEX}, TC ${NCCL_IB_TC}, GDR_LEVEL=${NCCL_NET_GDR_LEVEL}, DMABUF=${NCCL_DMABUF_ENABLE}; meta64 bnxt_re config, validated)" | tee -a "$LOG"
     fi
   fi
   export NCCL_DEBUG=${NCCL_DEBUG:-WARN}

From 7b5e8690500e1f3b5ad88eccafdc23a0fadc6823 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 25 Jun 2026 01:52:05 +0000
Subject: [PATCH 100/113] =?UTF-8?q?dlrmv4:=20README=20=E2=80=94=20match=20?=
 =?UTF-8?q?launcher's=20actual=20smoke-shaped=20defaults?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The slimmed launch_slurm.sh has smoke-shaped run defaults (START_TS=150,
NUM_TRAIN_TS=1, NUM_TRAIN_BATCHES=20, per-window eval) and no SMOKE=1
toggle, so a bare submit is a short functional run — not the 299-window
reference. Document the bare submit as the smoke run and give the explicit
env-override command for the full reference sweep; drop the unimplemented
SMOKE=1 instructions.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/README.MD | 57 ++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 26 deletions(-)

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index 6476e7729..b472cb0a8 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -11,9 +11,10 @@ auto-detects its context: run inside the container it takes the single-node
 worker path; submitted via `sbatch` it orchestrates the multi-node run
 (provision + per-node launch). N=1 is byte-for-byte the legacy single-node path.
 
-A bare submit reproduces the **frozen reference run** (full 299-window sweep +
-data-fraction eval cadence) — all run-shape/cadence defaults are baked in, so no
-env knobs are required:
+A bare submit runs a **short functional smoke run** (a single capped window with
+per-window eval) — the built-in defaults are intentionally small so a bare
+submit validates the full provision + launch + train/eval path quickly without
+consuming a whole window:
 
 **Single node (8-GPU):**
 
@@ -27,11 +28,14 @@ sbatch --nodes=1 scripts/launch_slurm.sh
 sbatch --nodes=2 scripts/launch_slurm.sh
 ```
 
-For a fast functional check instead of a full run, prepend `SMOKE=1` (short
-window, capped batches, per-window eval):
+To run the **full reference sweep** instead, set the run-shape/cadence knobs
+explicitly (full 299-window sweep + data-fraction eval cadence):
 
 ```bash
-SMOKE=1 sbatch --nodes=1 scripts/launch_slurm.sh
+START_TS=0 NUM_TRAIN_TS=299 \
+NUM_TRAIN_BATCHES=0 NUM_EVAL_BATCHES=0 \
+EVAL_EVERY_N_WINDOWS=0 EVAL_EVERY_DATA_PCT=0.005 \
+sbatch --nodes=1 scripts/launch_slurm.sh
 ```
 
 Multi-node uses real RDMA (RoCEv2). The fabric/NCCL setup and every
@@ -49,26 +53,27 @@ bash scripts/launch_slurm.sh
 
 Data path resolves at runtime via `env_path` gin macros (see [yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)). Traces and any per-run outputs land in `results/<RUN_NAME>/`.
 
-### 1.1 The frozen reference shape
-
-The reference run-shape and eval cadence are the built-in defaults (set in the
-orchestrate phase of `scripts/launch_slurm.sh`), so the bare `sbatch` commands
-above ARE the reference run. The single- and multi-node launchers are identical
-except for `--nodes`; the trainer auto-derives `NNODES`/`NODE_RANK`/`MASTER_ADDR`/
-`WORLD_SIZE` from SLURM. The baked-in shape:
-
-| knob | reference default |
-|---|---|
-| `START_TS` / `NUM_TRAIN_TS` | 0 / 299 (full sweep) |
-| eval cadence | `EVAL_EVERY_DATA_PCT=0.005` (every 0.5% of the training stream — a fixed number of samples between evals, independent of node count), per-window cadence off |
-| `NUM_TRAIN_BATCHES` / `NUM_EVAL_BATCHES` | 0 / 0 (consume full windows) |
-
-To customize, override any knob via env (e.g. `RUN_NAME=...`, `LOG=...`,
-`AUC_THRESHOLD=...`). Selecting the per-window eval cadence
-(`EVAL_EVERY_N_WINDOWS>0`) automatically disables the data-fraction one (they are
-mutually exclusive). Keep all run outputs (`LOG`, checkpoints, mllog,
-TensorBoard) under a writable scratch path you own — the dataset mount is
-read-only.
+### 1.1 Run shape: smoke default vs. full reference
+
+The run-shape and eval cadence come from env-overridable defaults set in the
+orchestrate phase of `scripts/launch_slurm.sh`. The built-in defaults are
+**smoke-shaped** (small, for a fast functional check); the **full reference
+sweep** is the same launcher with the run-shape knobs overridden. The
+single- and multi-node launchers are identical except for `--nodes`; the trainer
+auto-derives `NNODES`/`NODE_RANK`/`MASTER_ADDR`/`WORLD_SIZE` from SLURM.
+
+| knob | smoke default (bare submit) | full reference (override) |
+|---|---|---|
+| `START_TS` / `NUM_TRAIN_TS` | 150 / 1 (one window) | 0 / 299 (full sweep) |
+| `NUM_TRAIN_BATCHES` / `NUM_EVAL_BATCHES` | 20 / 10 (capped) | 0 / 0 (consume full windows) |
+| eval cadence | `EVAL_EVERY_N_WINDOWS=1` (per-window) | `EVAL_EVERY_N_WINDOWS=0` + `EVAL_EVERY_DATA_PCT=0.005` (every 0.5% of the training stream — a fixed number of samples between evals, independent of node count) |
+
+Override any knob via env (e.g. `RUN_NAME=...`, `LOG=...`, `AUC_THRESHOLD=...`);
+see the full-reference command in §1 above. The per-window and data-fraction eval
+cadences are mutually exclusive — selecting one (`EVAL_EVERY_N_WINDOWS>0` or
+`EVAL_EVERY_DATA_PCT>0`) requires the other be 0. Keep all run outputs (`LOG`,
+checkpoints, mllog, TensorBoard) under a writable scratch path you own — the
+dataset mount is read-only.
 
 ## 2. Data preparation
 

From 881d92518d81474e7c37db90d864f08006ed7510 Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Thu, 25 Jun 2026 02:37:19 +0000
Subject: [PATCH 101/113] recommendation_v4: add MLPerf reference scripts,
 structure, and docs

Bring the HSTU/yambda-5b benchmark in line with MLPerf Training reference
conventions:
- add download_dataset.sh / verify_dataset.sh / run_and_time.sh wrappers
- add md5sums checksum file (placeholder hashes) for dataset verification
- restructure README.MD to the MLPerf spec (summaries, model+paper,
  hyperparameter table with tuning rules, quality target, eval frequency,
  steps-to-run)
- freeze requirements.txt to the exact Dockerfile/training_recipe versions
- add blank RCP placeholder (rcp/README.md) to be filled once convergence
  runs are generated

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/README.MD                   | 472 +++++++++++-------
 recommendation_v4/download_dataset.sh         |  39 ++
 .../md5sums_yambda_5b_processed.txt           |  22 +
 recommendation_v4/rcp/README.md               |  28 ++
 recommendation_v4/requirements.txt            |  51 +-
 recommendation_v4/run_and_time.sh             |  65 +++
 recommendation_v4/verify_dataset.sh           |  64 +++
 7 files changed, 548 insertions(+), 193 deletions(-)
 create mode 100755 recommendation_v4/download_dataset.sh
 create mode 100644 recommendation_v4/md5sums_yambda_5b_processed.txt
 create mode 100644 recommendation_v4/rcp/README.md
 create mode 100755 recommendation_v4/run_and_time.sh
 create mode 100755 recommendation_v4/verify_dataset.sh

diff --git a/recommendation_v4/README.MD b/recommendation_v4/README.MD
index b472cb0a8..e078bcf0a 100644
--- a/recommendation_v4/README.MD
+++ b/recommendation_v4/README.MD
@@ -1,115 +1,128 @@
-# Recommendation v4 — HSTU + Yambda-5b
+# Recommendation v4 — HSTU sequential recommendation (Yambda-5b)
+
+MLPerf Training reference benchmark. This is a fork of
+[meta-recsys/generative-recommenders](https://github.com/meta-recsys/generative-recommenders)
+extended to train an HSTU (Hierarchical Sequential Transduction Units) ranking
+model on the [Yambda-5b](https://huggingface.co/datasets/yandex/yambda)
+music-recommendation dataset, sized as an MLPerf-style training benchmark inside
+the `mlcommons/training` tree.
+
+## 1. Summary
+
+This benchmark trains a model that predicts what a person will listen to next.
+Given the history of songs a user has played, liked, or skipped, the model
+learns to rank which song the user is most likely to genuinely listen to (rather
+than skip) next. This is the same kind of "what should we recommend next?"
+problem that powers music and video streaming feeds. The model is trained on a
+large public dataset of anonymized music-listening events and is scored on how
+well it predicts future listens it has never seen.
+
+## 2. Benchmark overview (technical)
+
+The model is a **sequential recommender**: instead of treating each interaction
+independently (as classic click-through-rate models like DLRM-DCNv2 do), it
+consumes a user's chronologically ordered interaction history as a sequence and
+applies a Transformer-style attention stack (HSTU) over it. Each training
+example is one "anchor" listen event together with that user's prior history
+(user interaction history, or UIH) and a set of contextual/cross features. The
+supervised target is a binary `listen_plus` label (a real listen: played for at
+least 50% of the track) versus a skip.
+
+Training is **streaming / temporal-order**: the timeline is sliced into
+fixed-duration windows and the model trains on window `T` then evaluates on the
+strictly-future window `T+1`, so every reported metric is genuine
+next-period generalization with no future leakage. The quality metric is
+**AUC** on the held-out future window, and the convergence target is
+**AUC >= 0.80275** (matching the DLRM-DCNv2-style target).
+
+The reference runs on 8 GPUs (validated on AMD Instinct MI350X / MI355X and
+NVIDIA B200; see [docs/training_recipe.md](docs/training_recipe.md)) and scales
+to multi-node via SLURM.
+
+## 3. Directions — steps to run
+
+The benchmark follows the standard MLPerf reference script flow:
 
-This is a fork of [meta-recsys/generative-recommenders](https://github.com/meta-recsys/generative-recommenders) extended to train HSTU (Hierarchical Sequential Transducer Units) on the [Yambda-5b](https://huggingface.co/datasets/yandex/yambda) music-recommendation dataset, sized as an MLPerf-style training benchmark inside the `mlcommons/training` tree.
-
-For the original repository and the underlying ICML'24 paper (*Actions Speak Louder than Words*), see the upstream README at the link above. This README focuses on what this fork adds: the Yambda data pipeline, the per-pool gather strategy, and how the data feeds into the HSTU `modules/` (dlrm_v3) path.
-
-## 1. Quick start (Yambda, N×8-GPU)
-
-`scripts/launch_slurm.sh` is the single entry point for **N ≥ 1 nodes**. It
-auto-detects its context: run inside the container it takes the single-node
-worker path; submitted via `sbatch` it orchestrates the multi-node run
-(provision + per-node launch). N=1 is byte-for-byte the legacy single-node path.
+```bash
+# 0. build/enter the container (canonical frozen environment)
+docker build -t recommendation_v4 .
+docker run --rm -it --device=/dev/kfd --device=/dev/dri \
+    -v /path/to/dlrm_data:/data/mlperf_dlrm_v4 recommendation_v4
 
-A bare submit runs a **short functional smoke run** (a single capped window with
-per-window eval) — the built-in defaults are intentionally small so a bare
-submit validates the full provision + launch + train/eval path quickly without
-consuming a whole window:
+# 1. download + preprocess the dataset
+DLRM_DATA_PATH=/data/mlperf_dlrm_v4 ./download_dataset.sh
 
-**Single node (8-GPU):**
+# 2. verify the preprocessed dataset
+DLRM_DATA_PATH=/data/mlperf_dlrm_v4 ./verify_dataset.sh
 
-```bash
-sbatch --nodes=1 scripts/launch_slurm.sh
+# 3. run the benchmark to the quality target and report wall-clock time
+DLRM_DATA_PATH=/data/mlperf_dlrm_v4 ./run_and_time.sh
 ```
 
-**Multi-node (N×8-GPU):**
+- [`download_dataset.sh`](download_dataset.sh) wraps the preprocessing pipeline
+  in `generative_recommenders.dlrm_v3.preprocess_public_data` (HuggingFace
+  download + temporal split + session segmentation + item-popularity counts).
+- [`verify_dataset.sh`](verify_dataset.sh) checks the preprocessed files against
+  [`md5sums_yambda_5b_processed.txt`](md5sums_yambda_5b_processed.txt) (falls
+  back to a layout check until the canonical checksums are pinned).
+- [`run_and_time.sh`](run_and_time.sh) runs the full-reference streaming
+  train+eval sweep on a single 8-GPU host with `AUC_THRESHOLD=0.80275` and MLPerf
+  compliance logging, printing the elapsed time of the timed region.
 
-```bash
-sbatch --nodes=2 scripts/launch_slurm.sh
-```
+### 3.1 Multi-node (SLURM)
 
-To run the **full reference sweep** instead, set the run-shape/cadence knobs
-explicitly (full 299-window sweep + data-fraction eval cadence):
+For N >= 1 nodes use [`scripts/launch_slurm.sh`](scripts/launch_slurm.sh), which
+provisions the container on each node and launches the same trainer. A bare
+submit runs a small functional smoke run; set the run-shape knobs for the full
+sweep:
 
 ```bash
+# smoke (fast functional check)
+sbatch --nodes=1 scripts/launch_slurm.sh
+
+# full reference sweep
 START_TS=0 NUM_TRAIN_TS=299 \
 NUM_TRAIN_BATCHES=0 NUM_EVAL_BATCHES=0 \
 EVAL_EVERY_N_WINDOWS=0 EVAL_EVERY_DATA_PCT=0.005 \
-sbatch --nodes=1 scripts/launch_slurm.sh
-```
-
-Multi-node uses real RDMA (RoCEv2). The fabric/NCCL setup and every
-cluster-specific knob (interfaces, HCAs, GID/TC, RDMA overlay) are documented in
-[docs/multi_node_config.md](docs/multi_node_config.md) — read it before running on
-a different cluster.
-
-Override the data path or run name without editing the gin:
-
-```bash
-DLRM_DATA_PATH=/apps/chcai/dlrm_data \
-RUN_NAME=my_experiment \
-bash scripts/launch_slurm.sh
+AUC_THRESHOLD=0.80275 \
+sbatch --nodes=2 scripts/launch_slurm.sh
 ```
 
-Data path resolves at runtime via `env_path` gin macros (see [yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)). Traces and any per-run outputs land in `results/<RUN_NAME>/`.
-
-### 1.1 Run shape: smoke default vs. full reference
+Multi-node uses real RDMA (RoCEv2); the fabric/NCCL setup is documented in
+[docs/multi_node_config.md](docs/multi_node_config.md). Keep all run outputs
+(log, checkpoints, mllog, TensorBoard) under a writable scratch path you own —
+the dataset mount is read-only.
 
-The run-shape and eval cadence come from env-overridable defaults set in the
-orchestrate phase of `scripts/launch_slurm.sh`. The built-in defaults are
-**smoke-shaped** (small, for a fast functional check); the **full reference
-sweep** is the same launcher with the run-shape knobs overridden. The
-single- and multi-node launchers are identical except for `--nodes`; the trainer
-auto-derives `NNODES`/`NODE_RANK`/`MASTER_ADDR`/`WORLD_SIZE` from SLURM.
+## 4. Model
 
-| knob | smoke default (bare submit) | full reference (override) |
-|---|---|---|
-| `START_TS` / `NUM_TRAIN_TS` | 150 / 1 (one window) | 0 / 299 (full sweep) |
-| `NUM_TRAIN_BATCHES` / `NUM_EVAL_BATCHES` | 20 / 10 (capped) | 0 / 0 (consume full windows) |
-| eval cadence | `EVAL_EVERY_N_WINDOWS=1` (per-window) | `EVAL_EVERY_N_WINDOWS=0` + `EVAL_EVERY_DATA_PCT=0.005` (every 0.5% of the training stream — a fixed number of samples between evals, independent of node count) |
+The model is **HSTU** (Hierarchical Sequential Transduction Units), the
+generative-recommender architecture from Meta's ICML'24 paper *Actions Speak
+Louder than Words: Trillion-Parameter Sequential Transducers for Generative
+Recommendations* ([arXiv:2402.17152](https://arxiv.org/abs/2402.17152)).
 
-Override any knob via env (e.g. `RUN_NAME=...`, `LOG=...`, `AUC_THRESHOLD=...`);
-see the full-reference command in §1 above. The per-window and data-fraction eval
-cadences are mutually exclusive — selecting one (`EVAL_EVERY_N_WINDOWS>0` or
-`EVAL_EVERY_DATA_PCT>0`) requires the other be 0. Keep all run outputs (`LOG`,
-checkpoints, mllog, TensorBoard) under a writable scratch path you own — the
-dataset mount is read-only.
+HSTU replaces the feature-interaction stack of a classic DLRM with a stack of
+pointwise-attention "transducer" layers operating over the user's interaction
+sequence. In this benchmark (the `dlrm_v3` path):
 
-## 2. Data preparation
+- **Embeddings**: sparse tables for `item_id`, `artist_id`, `album_id`, `uid`,
+  and 7 cross-feature hashes (e.g. `user_x_artist`, `item_x_hour`), sharded
+  across GPUs with TorchRec `DistributedModelParallel`.
+- **Sequence model**: an HSTU attention stack (`HSTU_NUM_LAYERS`, default 3)
+  over the interleaved UIH, computed with a fused jagged-attention Triton kernel
+  in bf16.
+- **Supervision**: a single `listen_plus` binary task. The candidate event's
+  `action_weight` carries the supervision bit, and BCE loss is masked to
+  listen_plus candidates.
 
-```bash
-python3 -m generative_recommenders.dlrm_v3.preprocess_public_data \
-  --dataset yambda-5b --data-path /apps/chcai/dlrm_data
-```
+See `generative_recommenders/dlrm_v3/configs.py`
+(`get_hstu_configs`, `get_embedding_table_config`) for the exact architecture
+and table specs, and the upstream README for the original modeling code.
 
-This downloads the 5b variant of [yandex/yambda](https://huggingface.co/datasets/yandex/yambda) from HuggingFace, then:
+## 5. Dataset
 
-1. **Encodes** the raw `event_type` string column into a uint8 lookup (listen=0, like=1, dislike=2, unlike=3, undislike=4).
-2. **Splits** events temporally — 300 train days, 30-min gap, 1 test day — by `Global Temporal Split` (GTS).
-3. **Segments** per-user event timelines into sessions on a 30-min inactivity gap.
-4. **Computes** per-item popularity for downstream metric weighting.
-5. **Writes** the layout `DLRMv3YambdaDataset` expects:
-
-```
-<data-path>/
-├── raw/5b/multi_event.parquet                    50 GB (downloaded)
-├── shared_metadata/
-│   ├── artist_item_mapping.parquet               60 MB
-│   ├── album_item_mapping.parquet                76 MB
-│   └── embeddings.parquet                        18 GB (unused by HSTU training)
-└── processed_5b/
-    ├── train_sessions.parquet                    47 GB  ← main training input
-    ├── test_events.parquet                       152 MB
-    ├── session_index.parquet                     600 MB
-    ├── item_popularity.npy                       75 MB
-    └── split_meta.json                           anchor + boundary stats
-```
-
-For smaller variants (yambda-50m / yambda-500m) substitute the dataset name. The preprocessor takes ~2 min for 50m and ~53 min for 5b end-to-end.
-
-## 3. Yambda dataset statistics
-
-Numbers from the 5b variant, after preprocessing:
+[Yambda-5b](https://huggingface.co/datasets/yandex/yambda) is a public
+anonymized music-recommendation dataset from Yandex. The `5b` variant is used
+for the reference. Statistics after preprocessing:
 
 | | |
 |---|---|
@@ -120,23 +133,61 @@ Numbers from the 5b variant, after preprocessing:
 | Mean events per user | 4,763 |
 | Train events (300d) | 4.76 B |
 | Test events (1d) | 22.4 M |
-| Training positions (≥2039 prior events filter) | **3.23 B** |
 | Item catalog size | 9.39 M |
 
-### 3.1 Per-event-type distribution (across the full 4.76 B corpus)
+### 5.1 Per-event-type distribution (across the full 4.76 B corpus)
 
 | Pool | Definition | Count | Share |
 |---|---|---|---|
-| **listen_plus (lp)** | `is_listen AND played_ratio ≥ 50%` | 2.92 B | **61.3%** |
+| **listen_plus (lp)** | `is_listen AND played_ratio >= 50%` | 2.92 B | **61.3%** |
 | **skip** | `is_listen AND played_ratio < 50%` | 1.71 B | **35.9%** |
 | **like** | explicit thumbs-up action | 89 M | **1.9%** |
 | other | dislike / unlike / undislike | 47 M | 1.0% |
 
-The `like` pool is roughly **30× rarer** than `lp` — important context for the gather strategy below.
+The `like` pool is roughly **30x rarer** than `lp` — important context for the
+gather strategy in §6.
 
-## 4. How data is fed to HSTU
+### 5.2 Preprocessing & download
 
-For every training anchor (a LISTEN event with ≥ `min_history` prior events — frozen default `4086`, the "full `history_length` of context required" power-users filter; set `$MIN_HISTORY=0` to include ~all users plus their cold-start first event), the dataset builds a `(uih_kjt, candidate_kjt)` pair:
+`./download_dataset.sh` (which calls
+`python3 -m generative_recommenders.dlrm_v3.preprocess_public_data --dataset
+yambda-5b --data-path <root>`) downloads the 5b variant from HuggingFace, then:
+
+1. **Encodes** the raw `event_type` string into a uint8 lookup (listen=0,
+   like=1, dislike=2, unlike=3, undislike=4).
+2. **Splits** events temporally — 300 train days, 30-min gap, 1 test day — by
+   Global Temporal Split (GTS).
+3. **Segments** per-user event timelines into sessions on a 30-min inactivity
+   gap.
+4. **Computes** per-item popularity for downstream metric weighting.
+5. **Writes** the layout `DLRMv3YambdaDataset` expects:
+
+```
+<data-path>/
+├── raw/5b/multi_event.parquet                    50 GB (downloaded)
+├── shared_metadata/
+│   ├── artist_item_mapping.parquet               60 MB
+│   ├── album_item_mapping.parquet                76 MB
+│   └── embeddings.parquet                         18 GB (unused by HSTU training)
+└── processed_5b/
+    ├── train_sessions.parquet                     47 GB  ← main training input
+    ├── test_events.parquet                       152 MB
+    ├── session_index.parquet                     600 MB
+    ├── item_popularity.npy                        75 MB
+    └── split_meta.json                            anchor + boundary stats
+```
+
+For smaller variants (`yambda-50m` / `yambda-500m`) substitute the dataset name
+(`DATASET=yambda-50m ./download_dataset.sh`). Preprocessing takes ~2 min for 50m
+and ~53 min for 5b end-to-end.
+
+Integrity is verified with `./verify_dataset.sh` against
+[`md5sums_yambda_5b_processed.txt`](md5sums_yambda_5b_processed.txt).
+
+## 6. How data is fed to HSTU
+
+For every training anchor (a LISTEN event with >= `min_history` prior events),
+the dataset builds a `(uih_kjt, candidate_kjt)` pair:
 
 ```
 UIH (User Interaction History):
@@ -154,42 +205,121 @@ CANDIDATE (the LISTEN event at the anchor):
   item_dummy_watchtime
 ```
 
-The candidate's `action_weight` is **the supervision label**: HSTU's `_get_supervision_labels_and_weights` masks BCE training to `(supervision_bitmask & task_weight) > 0`, with `task_weight = 1` (LP bit) for the single `listen_plus` task — so only listen_plus candidates supervise.
+The candidate's `action_weight` is **the supervision label**: HSTU's
+`_get_supervision_labels_and_weights` masks BCE training to
+`(supervision_bitmask & task_weight) > 0`, with `task_weight = 1` (LP bit) for
+the single `listen_plus` task — so only listen_plus candidates supervise.
 
-### 4.1 Per-pool gather (the cap = L // 3 strategy)
+### 6.1 Per-pool gather (the cap = L // 3 strategy)
 
-The UIH is built by `DLRMv3YambdaDataset._gather_interleaved_history`. For each anchor, it:
+The UIH is built by `DLRMv3YambdaDataset._gather_interleaved_history`. For each
+anchor it:
 
-1. Scans the most recent `scan_window` (default 20,000) events of any type before the anchor, **clipped to user_start** so users with shorter history get a smaller window.
-2. From those, takes **the last `L // 3` events** from each of the three pools (lp, like, skip) independently.
-3. Concatenates the three streams and **re-sorts chronologically** to produce an interleaved sequence.
-4. Tags each event's pool identity into `action_weight` via OR'd bitmask (LP=1, LIKE=2, SKIP=4).
+1. Scans the most recent `scan_window` (default 20,000) events of any type
+   before the anchor, **clipped to user_start**.
+2. From those, takes **the last `L // 3` events** from each of the three pools
+   (lp, like, skip) independently.
+3. Concatenates and **re-sorts chronologically** to produce an interleaved
+   sequence.
+4. Tags each event's pool identity into `action_weight` via OR'd bitmask
+   (LP=1, LIKE=2, SKIP=4).
 
-With `L = 2039` and `max_seq_len = 2048`:
-- Per-pool cap = `L // 3 = 679`
-- Maximum total UIH = `3 × 679 = 2037` events
-- Plus `8 contextual + 1 candidate = 9` overhead → 2046 ≤ 2048 model budget (no truncation)
+With `history_length = 4086` and `max_seq_len = 4096`: per-pool cap = `4086 //
+3 = 1362`, and `3 × 1362 + 8 contextual + 1 candidate = 4095 <= 4096` (no
+truncation). Because the `like` pool is rare (1.9%) it under-fills (~105 events
+per anchor on average); the Triton jagged-attention backend skips unfilled
+slots, so the under-fill costs sequence budget but not GPU compute.
 
-### 4.2 Effective per-anchor fill on real data
+## 7. Optimizer
 
-Because the `like` pool is rare (1.9% of events) and the average user has only ~4,763 lifetime events:
+Two optimizers, configured in
+[`yambda_5b.gin`](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin):
 
-| Pool | per-pool cap (L//3) | actual avg fill per anchor | fill rate |
+| component | optimizer | gin binding | key settings |
 |---|---|---|---|
-| lp | 679 | ~673 | **99%** |
-| like | 679 | ~105 | **15%** (data-bounded, not cap-bounded) |
-| skip | 679 | ~624 | 92% |
-| **total UIH** | 2037 max | **~1402** | 69% |
-
-The `like` cap of 679 is unreachable for yambda data — at the 1.9% global like rate, filling 679 likes would require a user to have ~36k prior events, but the **longest user in the dataset has only 27,738 events total** (and the median user has 2,695). So under-fill on `like` is fundamental to the data.
-
-This means the model sees on average ~1,402 UIH events per sample, not the theoretical 2,037. With the TRITON jagged-attention backend the GPU only does work for the actual events, so the under-fill costs **sequence budget but not GPU compute** — no wasted attention work, just less context per sample than the budget suggests.
-
-## 5. Streaming (temporal-order) training
-
-`scripts/launch_slurm.sh` defaults to `--mode streaming-train-eval`, which
-trains Yambda in strict wall-clock order instead of shuffling the whole corpus.
-The timeline is sliced into fixed-duration **windows** (default 1 day,
+| Dense params (HSTU blocks, MLPs) | **Adam** | `dense_optimizer_factory_and_class.*` | lr `DENSE_LR`, betas (0.95, 0.999), eps 1e-8, weight_decay 0 |
+| Sparse embedding tables | **RowWiseAdagrad** (fused FBGEMM TBE) | `sparse_optimizer_factory_and_class.*` | lr `SPARSE_LR`, eps 1e-8, weight_decay 0 |
+
+Gradient clipping (`GRAD_CLIP_NORM`, default `max_norm=1.0`) is applied to the
+dense parameters on the streaming path; the fused sparse optimizer is
+unaffected. Training is bf16 mixed precision (`make_model.bf16_training=True`).
+
+## 8. Hyperparameters
+
+All tunable hyperparameters live in
+[`yambda_5b.gin`](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin)
+(the config-file source of truth) and are **overridable via environment
+variables** (the env value takes precedence over the gin default, per MLPerf
+CONTRIBUTING rule 4d). The gin macros (`@env_int`, `@env_float`, `@env_str`)
+enforce the correct type for each parameter.
+
+| hyperparameter | env var | gin binding | type | default | tuning rule |
+|---|---|---|---|---|---|
+| Per-rank batch size | `BATCH_SIZE` | `batch_size` | int | 1024 | positive integer (global batch = `BATCH_SIZE × world_size`) |
+| Dense learning rate | `DENSE_LR` | `dense_optimizer_factory_and_class.learning_rate` | float | 1e-7 | positive float |
+| Sparse learning rate | `SPARSE_LR` | `sparse_optimizer_factory_and_class.learning_rate` | float | 1e-7 | positive float |
+| Grad clip max-norm | `GRAD_CLIP_NORM` | `streaming_train_eval_loop.grad_clip_norm` | float | 1.0 | float >= 0 (0 disables) |
+| RNG seed | `SEED` | `seed_everything.seed` | int | 1 | any integer (-1 = random per run) |
+| HSTU attention layers | `HSTU_NUM_LAYERS` | `get_hstu_configs.hstu_attn_num_layers` | int | 3 | positive integer |
+| UIH history length | `HISTORY_LENGTH` | `get_dataset.history_length` | int | 4086 | positive integer (per-pool cap = L//3) |
+| Max sequence length | `MAX_SEQ_LEN` | `get_hstu_configs.max_seq_len` | int | 4096 | positive integer (>= `history_length + 9`) |
+| History strategy | `HISTORY_STRATEGY` | `get_dataset.history_strategy` | str | `interleaved` | one of `interleaved` \| `last_n` |
+| Min history (anchor floor) | `MIN_HISTORY` | `get_dataset.min_history` | int | 4086 | integer >= 0 |
+| Train user split | `TRAIN_SPLIT_PERCENTAGE` | `*.train_split_percentage` | float | 1.0 | float in (0, 1] |
+| Streaming shuffle fraction | `STREAMING_SHUFFLE_FRACTION` | `get_dataset.streaming_shuffle_fraction` | float | 0.0 | float in [0, 1] |
+| Streaming shuffle seed | `STREAMING_SHUFFLE_SEED` | `get_dataset.streaming_shuffle_seed` | int | 0 | any integer |
+| Split salt | `SPLIT_SALT` | `get_dataset.split_salt` | int | 0 | any integer |
+| Start window | `START_TS` | `streaming_train_eval_loop.start_ts` | int | 150 | integer >= 0 |
+| Number of train windows | `NUM_TRAIN_TS` | `streaming_train_eval_loop.num_train_ts` | int | 149 | positive integer (clamped to available) |
+| Sparse A2A fwd precision | `SPARSE_A2A_FWD` | `make_optimizer_and_shard.sparse_a2a_forward_precision` | str | `fp32` | one of `fp32` \| `bf16` \| `fp16` |
+| Sparse A2A bwd precision | `SPARSE_A2A_BWD` | `make_optimizer_and_shard.sparse_a2a_backward_precision` | str | `fp32` | one of `fp32` \| `bf16` \| `fp16` |
+
+Non-tunable / fixed reference values (optimizer betas (0.95, 0.999), eps 1e-8,
+weight_decay 0, bf16 training, streaming window = 86400 s) are pinned in the gin
+file. Submitters tuning hyperparameters must follow the allowed values above and
+the
+[MLPerf training rules](https://github.com/mlcommons/training_policies/blob/master/training_rules.adoc#hyperparameters).
+
+## 9. Quality target & evaluation
+
+- **Metric**: AUC on the held-out future evaluation window (`window_auc` for the
+  `listen_plus` task), computed by `MetricsLogger` in
+  `generative_recommenders/dlrm_v3/utils.py`.
+- **Target**: **eval AUC >= 0.80275**. Set via `AUC_THRESHOLD=0.80275`
+  (`MetricsLogger.auc_threshold`); the run logs `RUN_STOP` with `SUCCESS` and
+  stops once the target is reached. The gin default of `1.0` is unreachable
+  (trains all windows with no early stop) and is overridden by the reference
+  scripts.
+- **Evaluation frequency**: the full-reference run uses
+  `EVAL_EVERY_DATA_PCT=0.005` — evaluate every 0.5% of the training stream
+  (~200 evenly-data-spaced eval points), independent of node count. The
+  alternative per-window cadence (`EVAL_EVERY_N_WINDOWS`) is mutually exclusive.
+- **Evaluation set**: a fixed held-out future window (`eval_holdout_ts`, default
+  `start_ts + num_train_ts`); with `TRAIN_SPLIT_PERCENTAGE < 1.0` the held-out
+  users' anchors over that window form the eval set. The temporal one-window
+  lead guarantees no future leakage (see §11).
+
+Evaluation is always one window ahead of training, so reported AUC is genuine
+next-period generalization.
+
+## 10. Reference Convergence Points (RCP)
+
+*Placeholder — to be generated.*
+
+RCPs have **not yet been generated** for this benchmark. Per the MLPerf
+[CONTRIBUTING guidance](https://github.com/mlcommons/training_policies/blob/master/CONTRIBUTING.md),
+RCPs must be generated for at least 3 reasonable batch sizes using at least 2N
+seeds (N = number of submission runs), in FP32 or BF16, with the exact precision
+recorded in the RCP JSON. The convergence curves (steps/samples to reach
+AUC >= 0.80275) will be added under [`rcp/`](rcp/) once the convergence runs are
+complete. This section is intentionally left blank for now.
+
+## 11. Streaming (temporal-order) training
+
+`scripts/launch_slurm.sh` and `run_and_time.sh` default to
+`--mode streaming-train-eval`, which trains Yambda in strict wall-clock order
+instead of shuffling the whole corpus. The timeline is sliced into
+fixed-duration **windows** (default 1 day,
 `get_dataset.streaming_window_seconds = 86400`), and the loop walks them forward:
 
 ```
@@ -203,76 +333,48 @@ window T:    train          window T+1:  eval (then train)   window T+2: eval (t
 i.e. for each step it **trains window T, then evaluates window T+1** before
 advancing — always predicting the immediate future from the past.
 
-### 5.1 Temporal guarantee
+### 11.1 Temporal guarantee
 
 The streaming path enforces **no future leakage** at two levels:
 
-1. **Across windows** — a window is the set of anchors whose *target/candidate*
+1. **Across windows** — a window is the set of anchors whose target/candidate
    timestamp falls in `[t_min + T·W, t_min + (T+1)·W)`. Training only ever sees
-   windows `≤ T`; the evaluation window `T+1` is strictly in the future of every
-   training anchor it is scored against. Eval always leads train by exactly one
-   window, so reported eval NE/AUC is genuine next-period generalization, never
-   an in-sample measurement.
-2. **Within an anchor** — history is still gathered **causally**: the UIH scan
-   is `scan_start:flat_pos` (events strictly before the anchor), so even though a
-   long user history may reach back across earlier windows, no event at or after
-   the anchor's timestamp can enter its features. Forward-time windowing and
-   causal history are independent guarantees, and both hold simultaneously.
-
-Note this is a *temporal* split on the training stream — distinct from the
-preprocessing GTS split (§2) that carves off the final test day. Windows are
+   windows `<= T`; the evaluation window `T+1` is strictly in the future of every
+   training anchor it is scored against.
+2. **Within an anchor** — history is gathered **causally**: the UIH scan is
+   `scan_start:flat_pos` (events strictly before the anchor), so no event at or
+   after the anchor's timestamp can enter its features.
+
+This is a *temporal* split on the training stream — distinct from the
+preprocessing GTS split (§5) that carves off the final test day. Windows are
 indexed off the per-anchor target timestamp via a lazily-built, mmap'd
-`anchor_ts_L{H}[_m{MIN_HISTORY}].npy` cache (built once on first use; the
-default non-streaming path never touches it). The anchor `positions` and
-`anchor_ts` arrays are keyed by `(history_length, min_history)` so different
-floors don't collide and the expensive flat store is shared across them.
+`anchor_ts` cache keyed by `(history_length, min_history)`.
 
-### 5.2 Knobs
+### 11.2 Streaming knobs
 
-All configurable via gin ([yambda_5b.gin](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin))
-with env overrides:
+All configurable via
+[`yambda_5b.gin`](generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin) with
+env overrides:
 
 | env | gin | default | meaning |
 |---|---|---|---|
-| `START_TS` | `streaming_train_eval_loop.start_ts` | 150 | first window (early windows are near-empty warm-up; start dense) |
-| `NUM_TRAIN_TS` | `streaming_train_eval_loop.num_train_ts` | 30 | number of train windows (clamped to available) |
-| `PERSISTENT_LOADER` | `streaming_train_eval_loop.persistent_loader` | 1 | reuse one worker pool across windows (no per-window respawn) |
-| `DOUBLE_BUFFER` | `streaming_train_eval_loop.double_buffer` | 1 | prepare the next window in a background thread during compute |
-| `EVAL_EACH_WINDOW` | `streaming_train_eval_loop.eval_each_window` | 1 | eval window T+1 after training window T |
-| `MIN_HISTORY` | `get_dataset.min_history` | 4086 | anchor-eligibility floor: min prior events for a LISTEN to be a sample (frozen default 4086 = full-history power-users filter; 0 = ~all users incl. cold-start) |
-| — | `streaming_train_eval_loop.num_train_batches` / `num_eval_batches` | unset | cap per-window steps (unset = consume full window) |
-
-### 5.3 Hiding the window-reset overhead
-
-Advancing to a new window has a fixed cost — selecting the window's anchor
-indices and warming the dataloader's first batch — that, done naively, stalls
-training at every window boundary. Three layers drive it to ~0:
-
-1. **Persistent loader** (`persistent_loader=1`). The naive path recreates a
-   `DataLoader` per window, re-forking workers and paying first-batch warmup
-   each time (~11 s/window). Instead we build **one** `DataLoader` backed by a
-   stateful `StreamingWindowSampler` whose index set is swapped per window
-   (`set_window`), so workers fork once and persist. This removes the respawn
-   but still pays the index-mask + first-batch stall (~3.6 s/window).
-2. **Double buffering** (`double_buffer=1`). Two pre-forked worker pools
-   ping-pong: while the current window trains on pool A, the *next* window's
-   index mask (`window_indices`, a GIL-releasing NumPy `np.where`) and
-   first-batch prefetch are prepared on pool B in a **background thread**, so
-   that work overlaps GPU compute. The boundary train batch then arrives warm —
-   measured train first-batch data-wait drops to **~1–3 ms**. Pools are forked
-   up front on the main thread (never inside the background thread), so a forking
-   worker can never race a thread holding a lock.
-3. **Eval prefetch one window ahead.** With `eval_each_window=1` the eval window
-   (`T+1`) is prepared *before* training window `T` runs, so the idle eval pool
-   prefetches its first batches concurrently with train compute. This hides the
-   eval-side first-batch stall (**~0.55 s → ~2 ms**). It is safe because a
-   sample's content depends only on the sampler's window indices, not on any
-   train/eval flag.
-
-Net effect: steady-state throughput matches the non-streaming baseline and the
-per-window reset is effectively free; the only remaining one-time cost is the
-process cold start (CUDA-graph capture + the first lazy `anchor_ts` mmap).
-
-## 6. License
+| `START_TS` | `streaming_train_eval_loop.start_ts` | 150 | first window (early windows are near-empty warm-up) |
+| `NUM_TRAIN_TS` | `streaming_train_eval_loop.num_train_ts` | 149 | number of train windows (clamped to available) |
+| `PERSISTENT_LOADER` | `streaming_train_eval_loop.persistent_loader` | 1 | reuse one worker pool across windows |
+| `DOUBLE_BUFFER` | `streaming_train_eval_loop.double_buffer` | 1 | prepare the next window in a background thread |
+| `EVAL_EVERY_N_WINDOWS` | `streaming_train_eval_loop.eval_every_n_windows` | 1 | eval cadence by window count (0 to use data-pct) |
+| `EVAL_EVERY_DATA_PCT` | `streaming_train_eval_loop.eval_every_data_pct` | 0.0 | eval cadence by fraction of train data (full ref: 0.005) |
+| `MIN_HISTORY` | `get_dataset.min_history` | 4086 | anchor-eligibility floor (0 = ~all users incl. cold-start) |
+
+### 11.3 Checkpointing & resume
+
+The streaming loop is resume-aware: set `CKPT_PATH` to enable DMP checkpoint
+save/load (auto-resolves to the highest-numbered subdir), with retention via
+`KEEP_LAST_N` and cadences `IN_WINDOW_CKPT_FREQ` / `CKPT_STEP_FREQ` /
+`CKPT_TIME_INTERVAL_S`. The MLPerf run state (run-started flag, global sample
+count) is persisted across resume so compliance logging is continuous. See
+`generative_recommenders/dlrm_v3/checkpoint.py`.
+
+## 12. License
 
 Apache 2.0 (inherited from upstream).
diff --git a/recommendation_v4/download_dataset.sh b/recommendation_v4/download_dataset.sh
new file mode 100755
index 000000000..d02f382dc
--- /dev/null
+++ b/recommendation_v4/download_dataset.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# MLPerf Training reference script: download + preprocess the dataset.
+#
+# Downloads the Yambda dataset from HuggingFace (yandex/yambda) and runs the
+# preprocessing pipeline (event-type encoding, temporal GTS split, session
+# segmentation, item-popularity counts) into the on-disk layout that
+# DLRMv3YambdaDataset consumes. This is a thin wrapper over
+#   generative_recommenders.dlrm_v3.preprocess_public_data
+# so the full reference data pipeline lives in one place.
+#
+# Usage:
+#   DLRM_DATA_PATH=/path/to/dlrm_data ./download_dataset.sh
+#   DATASET=yambda-50m DLRM_DATA_PATH=/path/to/dlrm_data ./download_dataset.sh
+#
+# Env:
+#   DATASET         dataset variant (default: yambda-5b). One of
+#                   kuairand-1k | kuairand-27k | yambda-50m | yambda-500m | yambda-5b
+#   DLRM_DATA_PATH  destination data root (required).
+set -euo pipefail
+
+DATASET="${DATASET:-yambda-5b}"
+: "${DLRM_DATA_PATH:?Set DLRM_DATA_PATH to the destination data root}"
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "${REPO_ROOT}"
+
+echo "[download_dataset] dataset=${DATASET} data-path=${DLRM_DATA_PATH}"
+mkdir -p "${DLRM_DATA_PATH}"
+
+python3 -m generative_recommenders.dlrm_v3.preprocess_public_data \
+    --dataset "${DATASET}" \
+    --data-path "${DLRM_DATA_PATH}"
+
+echo "[download_dataset] done. Preprocessed layout under ${DLRM_DATA_PATH}:"
+echo "  raw/5b/multi_event.parquet"
+echo "  shared_metadata/{artist,album}_item_mapping.parquet, embeddings.parquet"
+echo "  processed_5b/{train_sessions,test_events,session_index}.parquet"
+echo "  processed_5b/item_popularity.npy, processed_5b/split_meta.json"
+echo "[download_dataset] verify integrity with: ./verify_dataset.sh"
diff --git a/recommendation_v4/md5sums_yambda_5b_processed.txt b/recommendation_v4/md5sums_yambda_5b_processed.txt
new file mode 100644
index 000000000..82998cca3
--- /dev/null
+++ b/recommendation_v4/md5sums_yambda_5b_processed.txt
@@ -0,0 +1,22 @@
+# MD5 checksums for the preprocessed yambda-5b dataset (processed_5b/).
+#
+# Format: standard `md5sum` output -> "<md5hash>  <relative_path>".
+# Paths are relative to ${DLRM_DATA_PATH}/processed_5b/.
+#
+# These hashes are PLACEHOLDERS (TODO). They must be generated from a canonical
+# preprocessing run before this benchmark is submitted, e.g.:
+#
+#   cd "${DLRM_DATA_PATH}/processed_5b"
+#   md5sum train_sessions.parquet test_events.parquet session_index.parquet \
+#          item_popularity.npy split_meta.json \
+#     > <repo>/md5sums_yambda_5b_processed.txt
+#
+# Until then `verify_dataset.sh` falls back to an existence/layout check and
+# warns that checksums are not yet pinned.
+#
+# TODO(rcp/data): replace the lines below with real md5 hashes.
+TODO_GENERATE_HASH  train_sessions.parquet
+TODO_GENERATE_HASH  test_events.parquet
+TODO_GENERATE_HASH  session_index.parquet
+TODO_GENERATE_HASH  item_popularity.npy
+TODO_GENERATE_HASH  split_meta.json
diff --git a/recommendation_v4/rcp/README.md b/recommendation_v4/rcp/README.md
new file mode 100644
index 000000000..02a977862
--- /dev/null
+++ b/recommendation_v4/rcp/README.md
@@ -0,0 +1,28 @@
+# Reference Convergence Points (RCP)
+
+**Status: placeholder — RCPs not yet generated. Intentionally left blank.**
+
+This directory will hold the Reference Convergence Points for the
+recommendation_v4 (HSTU / yambda-5b) benchmark once convergence runs are
+complete.
+
+Per the MLPerf Training
+[CONTRIBUTING guidance](https://github.com/mlcommons/training_policies/blob/master/CONTRIBUTING.md)
+("Some things to note while generating reference convergence points"):
+
+- Use FP32 or BF16 precision and record the exact precision used in the RCP JSON.
+- Generate RCPs for at least **3 reasonable batch sizes**.
+- Run RCPs with an eval frequency **higher** than the chosen benchmark eval
+  frequency (more data points for picking the target accuracy).
+- Run at least **2N seeds**, where N = number of submission runs.
+
+The convergence target for this benchmark is **eval AUC >= 0.80275** (see
+[../README.MD](../README.MD) §9). The RCP JSON files and convergence-curve plots
+(samples-to-converge vs. batch size / seed) will be committed here.
+
+## TODO
+
+- [ ] Run >= 2N-seed convergence sweeps at >= 3 batch sizes.
+- [ ] Record precision (FP32/BF16) per the rules.
+- [ ] Add `rcp_<batchsize>.json` files in the mlperf_logging RCP format.
+- [ ] Add convergence-curve plots and the chosen target-accuracy justification.
diff --git a/recommendation_v4/requirements.txt b/recommendation_v4/requirements.txt
index d1aba1e95..852aa149b 100644
--- a/recommendation_v4/requirements.txt
+++ b/recommendation_v4/requirements.txt
@@ -1,8 +1,43 @@
-torch>=2.6.0
-fbgemm_gpu>=1.1.0
-torchrec>=1.1.0
-gin_config>=0.5.0
-pandas>=2.2.0
-tensorboard>=2.19.0
-pybind11
-git+https://github.com/mlcommons/logging.git@6.0.0-rc6
+# Frozen dependency versions for the recommendation_v4 (HSTU / yambda-5b) MLPerf
+# reference. The CANONICAL, fully-reproducible environment is the Dockerfile
+# (built on rocm/primus:v26.3); see docs/training_recipe.md for the per-platform
+# (MI350X / B200) install commands and rationale. The pins below mirror that
+# stack. torch / torchvision / torchaudio / fbgemm_gpu / torchrec are
+# accelerator-specific and MUST be installed with --no-deps from the matching
+# index (see Dockerfile) so pip does not clobber the +rocm wheels.
+
+# --- accelerator stack (install via Dockerfile; --no-deps, matching index) ---
+# torch==2.12.0+rocm7.2          # --index-url https://download.pytorch.org/whl/rocm7.2
+# torchvision==0.27.0+rocm7.2
+# torchaudio==2.11.0+rocm7.2
+# fbgemm_gpu                     # built from FBGEMM commit 10b775730212923f65f7b78f79b6a01d80cf3c29 for gfx950
+torch==2.12.0
+fbgemm_gpu==1.7.0
+torchrec @ git+https://github.com/pytorch/torchrec.git@v2026.06.01.00
+
+# --- data / config / logging -------------------------------------------------
+polars-u64-idx==1.33.1
+gin-config==0.5.0
+absl-py==2.1.0
+pandas==2.2.3
+pyarrow==17.0.0
+numpy==1.26.4
+xxhash==3.5.0
+datasets==3.2.0
+huggingface_hub==0.27.0
+
+# --- metrics / training utils ------------------------------------------------
+torchmetrics==1.0.3
+tensordict==0.6.2
+tensorboard==2.19.0
+pyre-extensions==0.0.32
+iopath==0.1.10
+typing-inspect==0.9.0
+psutil==6.1.0
+tqdm==4.67.1
+pyyaml==6.0.2
+pybind11==2.13.6
+lightning-utilities==0.11.9
+
+# --- MLPerf compliance logging (pinned to the Training 6.0 tag) --------------
+mlperf-logging @ git+https://github.com/mlcommons/logging.git@6.0.0-rc6
diff --git a/recommendation_v4/run_and_time.sh b/recommendation_v4/run_and_time.sh
new file mode 100755
index 000000000..a0207e71c
--- /dev/null
+++ b/recommendation_v4/run_and_time.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# MLPerf Training reference script: run the benchmark and report wall-clock time.
+#
+# Runs the full-reference HSTU / yambda-5b streaming train+eval sweep to the
+# MLPerf quality target (eval AUC >= 0.80275) and prints the elapsed time of the
+# timed region. This is the canonical single-host (8-GPU) entry point; for
+# multi-node SLURM launches use scripts/launch_slurm.sh (which calls into the
+# same trainer).
+#
+# Usage:
+#   DLRM_DATA_PATH=/path/to/dlrm_data ./run_and_time.sh
+#
+# Env (run shape / cadence -- defaults are the FULL reference sweep):
+#   DLRM_DATA_PATH        data root (required).
+#   SEED                  RNG seed (default 1).
+#   START_TS / NUM_TRAIN_TS   window range (default 0 / 299 = full sweep).
+#   EVAL_EVERY_DATA_PCT   eval cadence as a fraction of train data (default 0.005).
+#   AUC_THRESHOLD         convergence target (default 0.80275).
+#   GPUS_PER_NODE         GPUs on this host (default 8).
+#   RUN_NAME              results dir name under results/ (default reference_run).
+set -euo pipefail
+
+: "${DLRM_DATA_PATH:?Set DLRM_DATA_PATH to the data root}"
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+cd "${REPO_ROOT}"
+
+# ---- Reference run shape (full sweep to the quality target) -----------------
+export SEED="${SEED:-1}"
+export START_TS="${START_TS:-0}"
+export NUM_TRAIN_TS="${NUM_TRAIN_TS:-299}"
+export NUM_TRAIN_BATCHES="${NUM_TRAIN_BATCHES:-0}"
+export NUM_EVAL_BATCHES="${NUM_EVAL_BATCHES:-0}"
+export EVAL_EVERY_N_WINDOWS="${EVAL_EVERY_N_WINDOWS:-0}"
+export EVAL_EVERY_DATA_PCT="${EVAL_EVERY_DATA_PCT:-0.005}"
+export AUC_THRESHOLD="${AUC_THRESHOLD:-0.80275}"
+export RUN_NAME="${RUN_NAME:-reference_run}"
+
+# ---- Single-host distributed topology (override for multi-node) -------------
+export GPUS_PER_NODE="${GPUS_PER_NODE:-8}"
+export NNODES="${NNODES:-1}"
+export NODE_RANK="${NODE_RANK:-0}"
+export WORLD_SIZE="${WORLD_SIZE:-$((NNODES * GPUS_PER_NODE))}"
+export MASTER_ADDR="${MASTER_ADDR:-127.0.0.1}"
+export MASTER_PORT="${MASTER_PORT:-29500}"
+
+# ---- MLPerf compliance logging ----------------------------------------------
+export MLPERF_LOGGING="${MLPERF_LOGGING:-1}"
+export MLPERF_LOG_PATH="${MLPERF_LOG_PATH:-${REPO_ROOT}/results/${RUN_NAME}/mlperf/yambda_5b_mlperf.log}"
+export MLPERF_SUBMISSION_PLATFORM="${MLPERF_SUBMISSION_PLATFORM:-MI355X}"
+mkdir -p "$(dirname "${MLPERF_LOG_PATH}")"
+
+# ---- Timed region -----------------------------------------------------------
+# Pull the start timestamp into a clear region per the MLPerf run_and_time.sh idiom.
+start=$(date +%s)
+echo "STARTING TIMING RUN AT $(date -u '+%Y-%m-%d %r')"
+
+python -m generative_recommenders.dlrm_v3.train.train_ranker \
+    --dataset yambda-5b \
+    --mode streaming-train-eval
+
+end=$(date +%s)
+result=$(( end - start ))
+echo "ENDING TIMING RUN AT $(date -u '+%Y-%m-%d %r')"
+echo "RESULT,recommendation_v4_hstu_yambda_5b,${SEED},${result},$(whoami),$(date -u '+%Y-%m-%d %r')"
diff --git a/recommendation_v4/verify_dataset.sh b/recommendation_v4/verify_dataset.sh
new file mode 100755
index 000000000..839ccb91f
--- /dev/null
+++ b/recommendation_v4/verify_dataset.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+# MLPerf Training reference script: verify the preprocessed dataset.
+#
+# Checks the integrity of the preprocessed dataset under
+#   ${DLRM_DATA_PATH}/${PROCESSED_SUBDIR}
+# against md5sums_yambda_5b_processed.txt (standard `md5sum -c` format).
+#
+# If the checksum file still contains placeholder hashes (TODO_GENERATE_HASH),
+# the script falls back to an existence/layout check and warns that the
+# canonical checksums have not been pinned yet.
+#
+# Usage:
+#   DLRM_DATA_PATH=/path/to/dlrm_data ./verify_dataset.sh
+#
+# Env:
+#   DLRM_DATA_PATH    data root (required).
+#   PROCESSED_SUBDIR  processed subdir under the data root (default: processed_5b).
+set -euo pipefail
+
+: "${DLRM_DATA_PATH:?Set DLRM_DATA_PATH to the data root}"
+PROCESSED_SUBDIR="${PROCESSED_SUBDIR:-processed_5b}"
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CHECKSUM_FILE="${REPO_ROOT}/md5sums_yambda_5b_processed.txt"
+PROCESSED_DIR="${DLRM_DATA_PATH}/${PROCESSED_SUBDIR}"
+
+echo "[verify_dataset] processed dir: ${PROCESSED_DIR}"
+
+if [[ ! -d "${PROCESSED_DIR}" ]]; then
+    echo "[verify_dataset] ERROR: ${PROCESSED_DIR} does not exist. Run ./download_dataset.sh first." >&2
+    exit 1
+fi
+
+EXPECTED_FILES=(
+    train_sessions.parquet
+    test_events.parquet
+    session_index.parquet
+    item_popularity.npy
+    split_meta.json
+)
+
+# Detect whether the checksum file has real (32 hex char) hashes or placeholders.
+if grep -qiE '^[0-9a-f]{32}[[:space:]]' "${CHECKSUM_FILE}"; then
+    echo "[verify_dataset] checking md5 checksums from ${CHECKSUM_FILE}"
+    (cd "${PROCESSED_DIR}" && md5sum -c "${CHECKSUM_FILE}")
+    echo "[verify_dataset] OK: all checksums match."
+else
+    echo "[verify_dataset] WARNING: ${CHECKSUM_FILE} contains placeholder hashes;" >&2
+    echo "[verify_dataset]          falling back to existence/layout check only." >&2
+    missing=0
+    for f in "${EXPECTED_FILES[@]}"; do
+        if [[ -s "${PROCESSED_DIR}/${f}" ]]; then
+            echo "  OK   ${f}"
+        else
+            echo "  MISS ${f}" >&2
+            missing=1
+        fi
+    done
+    if [[ "${missing}" -ne 0 ]]; then
+        echo "[verify_dataset] ERROR: one or more expected files are missing/empty." >&2
+        exit 1
+    fi
+    echo "[verify_dataset] layout OK (checksums NOT yet pinned -- see TODO in ${CHECKSUM_FILE})."
+fi

From 4cf4d859f9e7c4e268f7c0969817ef43c30e391d Mon Sep 17 00:00:00 2001
From: Chris Cai <chcai@amd.com>
Date: Thu, 25 Jun 2026 18:53:45 +0000
Subject: [PATCH 102/113] recommendation_v4: prune
 inference/AOT/CUDA-cpp/research + non-yambda configs

Remove subtrees unused by the yambda-5b TRITON training path:
- dlrm_v3/inference/ (incl thirdparty/loadgen)
- ops/cpp/ (CUTLASS CUDA kernels) and ops/triton_aot/ (AOT-inference kernels)
- generative_recommenders/research/ and its entrypoints (main.py,
  run_fractal_expansion.py, repo-root preprocess_public_data.py)
- configs/{ml-1m,ml-20m,ml-3b,amzn-books} and non-yambda train gins
  (keep yambda_5b.gin + debug.gin)
- ops/benchmarks/hstu_attention_bench.py (dangling ops.cpp import)

The TRITON (default) and PYTORCH kernel paths are unaffected: aot_* calls are
gated behind HammerKernel.TRITON_INFERENCE and ops/triton/* has no cpp/aot deps.
Validated by import smoke + a streaming-train-eval e2e smoke (rc=0).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../hstu-sampled-softmax-n512-final.gin       |   49 -
 .../hstu-sampled-softmax-n512-large-final.gin |   49 -
 .../sasrec-sampled-softmax-n512-final.gin     |   50 -
 .../ml-1m/hstu-sampled-softmax-n128-final.gin |   45 -
 .../hstu-sampled-softmax-n128-large-final.gin |   45 -
 .../sasrec-sampled-softmax-n128-final.gin     |   44 -
 .../hstu-sampled-softmax-n128-final.gin       |   45 -
 .../hstu-sampled-softmax-n128-large-final.gin |   45 -
 .../sasrec-sampled-softmax-n128-final.gin     |   44 -
 ...tu-sampled-softmax-n96-seqlen500-final.gin |   42 -
 ...pled-softmax-n96-seqlen500-large-final.gin |   42 -
 ...ec-sampled-softmax-n96-seqlen500-final.gin |   42 -
 .../dlrm_v3/inference/README.md               |   88 -
 .../dlrm_v3/inference/accuracy.py             |   86 -
 .../dlrm_v3/inference/cpp/hstu_runner.cpp     |  215 --
 .../dlrm_v3/inference/data_producer.py        |  227 --
 .../dlrm_v3/inference/dense_predict_module.py |   96 -
 .../dlrm_v3/inference/end_to_end_test.py      |  795 -----
 .../dlrm_v3/inference/gin/debug.gin           |   13 -
 .../dlrm_v3/inference/gin/kuairand_1k.gin     |   14 -
 .../dlrm_v3/inference/gin/movielens_13b.gin   |   16 -
 .../dlrm_v3/inference/gin/streaming_100b.gin  |   15 -
 .../dlrm_v3/inference/gin/streaming_400m.gin  |   15 -
 .../dlrm_v3/inference/inference_modules.py    |  253 --
 .../dlrm_v3/inference/main.py                 |  805 -----
 .../dlrm_v3/inference/mlperf.conf             |   98 -
 .../dlrm_v3/inference/model_family.py         |  705 ----
 .../inference/sparse_predict_module.py        |  106 -
 .../dlrm_v3/inference/tests/inference_test.py |   39 -
 .../inference/tests/test_scripted_parity.py   |  236 --
 .../thirdparty/loadgen/.clang-format          |    2 -
 .../thirdparty/loadgen/CMakeLists.txt         |  113 -
 .../inference/thirdparty/loadgen/MANIFEST.in  |    2 -
 .../inference/thirdparty/loadgen/README.md    |  223 --
 .../thirdparty/loadgen/README_BUILD.md        |   47 -
 .../thirdparty/loadgen/README_FAQ.md          |   78 -
 .../inference/thirdparty/loadgen/VERSION.txt  |    1 -
 .../thirdparty/loadgen/benchmark/.gitignore   |    2 -
 .../thirdparty/loadgen/benchmark/README.md    |   10 -
 .../thirdparty/loadgen/benchmark/repro.cpp    |  296 --
 .../thirdparty/loadgen/benchmark/run.sh       |   21 -
 .../thirdparty/loadgen/benchmark/run_debug.sh |   21 -
 .../thirdparty/loadgen/bindings/c_api.cc      |  176 -
 .../thirdparty/loadgen/bindings/c_api.h       |   95 -
 .../thirdparty/loadgen/bindings/python_api.cc |  484 ---
 .../thirdparty/loadgen/demos/lon/README.md    |   67 -
 .../loadgen/demos/lon/py_demo_server_lon.py   |  191 -
 .../demos/lon/sut_over_network_demo.py        |   88 -
 .../loadgen/demos/py_demo_multi_stream.py     |   86 -
 .../loadgen/demos/py_demo_offline.py          |   81 -
 .../loadgen/demos/py_demo_server.py           |   74 -
 .../loadgen/demos/py_demo_single_stream.py    |   84 -
 .../token_metrics/py_demo_multi_stream.py     |  142 -
 .../demos/token_metrics/py_demo_offline.py    |  130 -
 .../token_metrics/py_demo_offline_inferred.py |  130 -
 .../demos/token_metrics/py_demo_server.py     |  132 -
 .../token_metrics/py_demo_server_inferred.py  |  125 -
 .../token_metrics/py_demo_single_stream.py    |  129 -
 .../loadgen/diagram_network_submission.png    |  Bin 51192 -> 0 bytes
 .../thirdparty/loadgen/diagram_submission.png |  Bin 36510 -> 0 bytes
 .../thirdparty/loadgen/docs/src/BUILD.gn      |   33 -
 .../thirdparty/loadgen/docs/src/README.md     |   34 -
 .../thirdparty/loadgen/docs/src/doxygen.cfg   | 2495 -------------
 .../loadgen/docs/src/doxygen_footer.html      |   26 -
 .../loadgen/docs/src/doxygen_header.html      |   49 -
 .../docs/src/doxygen_html_generator.py        |   37 -
 .../loadgen/docs/src/doxygen_layout.xml       |  211 --
 .../loadgen/docs/src/doxygen_stylesheet.css   | 1629 ---------
 .../docs/src/loadgen_integration_diagram.dia  |  Bin 1943 -> 0 bytes
 .../loadgen/docs/src/mlperf_icon.png          |  Bin 4632 -> 0 bytes
 .../docs/src/mlperf_logo_horizontal_color.svg |   55 -
 .../thirdparty/loadgen/early_stopping.cc      |  117 -
 .../thirdparty/loadgen/early_stopping.h       |   27 -
 .../loadgen/generated/version_generated.cc    |   98 -
 .../loadgen/issue_query_controller.cc         |  552 ---
 .../loadgen/issue_query_controller.h          |  215 --
 .../inference/thirdparty/loadgen/loadgen.cc   | 1345 -------
 .../inference/thirdparty/loadgen/loadgen.h    |  103 -
 .../loadgen/loadgen_integration_diagram.svg   |   85 -
 .../inference/thirdparty/loadgen/logging.cc   | 1301 -------
 .../inference/thirdparty/loadgen/logging.h    |  816 -----
 .../inference/thirdparty/loadgen/mlperf.conf  |  164 -
 .../thirdparty/loadgen/mlperf_conf.h          |  167 -
 .../thirdparty/loadgen/pyproject.toml         |    7 -
 .../loadgen/query_dispatch_library.h          |   42 -
 .../thirdparty/loadgen/query_sample.h         |   91 -
 .../thirdparty/loadgen/query_sample_library.h |   75 -
 .../thirdparty/loadgen/requirements.txt       |    1 -
 .../inference/thirdparty/loadgen/results.cc   |  856 -----
 .../inference/thirdparty/loadgen/results.h    |  128 -
 .../inference/thirdparty/loadgen/setup.py     |  136 -
 .../thirdparty/loadgen/system_under_test.h    |   67 -
 .../thirdparty/loadgen/test_settings.h        |  329 --
 .../loadgen/test_settings_internal.cc         |  800 -----
 .../loadgen/test_settings_internal.h          |  182 -
 .../thirdparty/loadgen/tests/BUILD.gn         |   25 -
 .../thirdparty/loadgen/tests/README.md        |   42 -
 .../thirdparty/loadgen/tests/basic.cc         |  314 --
 .../thirdparty/loadgen/tests/loadgen_test.h   |  198 --
 .../loadgen/tests/loadgen_test_main.cc        |   33 -
 .../loadgen/tests/perftests_null_sut.cc       |  230 --
 .../loadgen/tests/perftests_null_sut.py       |   61 -
 .../loadgen/tools/mlperf-trace.ipynb          |  441 ---
 .../inference/thirdparty/loadgen/utils.cc     |  124 -
 .../inference/thirdparty/loadgen/utils.h      |   70 -
 .../inference/thirdparty/loadgen/version.cc   |   85 -
 .../inference/thirdparty/loadgen/version.h    |   39 -
 .../thirdparty/loadgen/version_generator.py   |  141 -
 .../dlrm_v3/inference/ts_types.py             |   70 -
 .../dlrm_v3/inference/user.conf               |    5 -
 .../dlrm_v3/train/gin/kuairand_1k.gin         |   41 -
 .../dlrm_v3/train/gin/movielens_13b.gin       |   41 -
 .../dlrm_v3/train/gin/movielens_18b.gin       |   56 -
 .../dlrm_v3/train/gin/movielens_1m.gin        |   38 -
 .../dlrm_v3/train/gin/movielens_20m.gin       |   56 -
 .../dlrm_v3/train/gin/streaming_100b.gin      |   52 -
 .../dlrm_v3/train/gin/streaming_200b.gin      |   63 -
 .../dlrm_v3/train/gin/streaming_400m.gin      |   61 -
 .../ops/benchmarks/hstu_attention_bench.py    |  406 ---
 .../concat_1d_jagged_jagged_bench.py          |  125 -
 .../benchmarks/jagged_transpose_1d_bench.py   |  117 -
 .../replace_last_n_with_jagged_bench.py       |  150 -
 .../split_1d_jagged_jagged_bench.py           |  116 -
 .../generative_recommenders/ops/cpp/common.h  |   60 -
 .../ops/cpp/complete_cumsum.cpp               |   44 -
 .../ops/cpp/complete_cumsum.cu                |   51 -
 .../ops/cpp/concat_1d_jagged_jagged.cpp       |  111 -
 .../ops/cpp/concat_1d_jagged_jagged.cu        |  130 -
 .../ops/cpp/cpp_ops.cpp                       |  207 --
 .../ops/cpp/cuda_hstu_attention.py            |  193 -
 .../cpp/cuda_hstu_preprocess_and_attention.py |  668 ----
 .../ops/cpp/expand_1d_jagged_to_dense.cpp     |   97 -
 .../ops/cpp/expand_1d_jagged_to_dense.cu      |  103 -
 .../hstu_attention/copy_sm90_bulk_reduce.h    |   66 -
 .../ops/cpp/hstu_attention/epilogue_bwd.h     |  481 ---
 .../ops/cpp/hstu_attention/epilogue_fwd.h     |  550 ---
 .../ops/cpp/hstu_attention/flash.h            |  157 -
 .../ops/cpp/hstu_attention/flash_api.cpp      |  322 --
 .../ops/cpp/hstu_attention/flash_api_cpu.cpp  |  256 --
 .../hstu_attention/flash_bwd_kernel_sm90.h    |  402 ---
 .../flash_bwd_launch_template.h               |  492 ---
 .../flash_bwd_postprocess_kernel.h            |  348 --
 .../flash_bwd_preprocess_kernel.h             |  349 --
 .../ops/cpp/hstu_attention/flash_common.cpp   | 1165 ------
 .../ops/cpp/hstu_attention/flash_common.h     |  149 -
 .../cpp/hstu_attention/flash_common_cpu.cpp   |  172 -
 .../ops/cpp/hstu_attention/flash_common_cpu.h |  114 -
 .../hstu_attention/flash_fwd_kernel_sm90.h    |  511 ---
 .../flash_fwd_launch_template.h               |  376 --
 .../cpp/hstu_attention/generate_kernels.py    |  236 --
 ...lash_bwd_hdim128_bf16_softmaxfalse_sm90.cu |   33 -
 ...flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu |   33 -
 ...lash_bwd_hdim128_fp16_softmaxfalse_sm90.cu |   33 -
 ...flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu |   33 -
 ...lash_bwd_hdim192_bf16_softmaxfalse_sm90.cu |   33 -
 ...flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu |   33 -
 ...lash_bwd_hdim192_fp16_softmaxfalse_sm90.cu |   33 -
 ...flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu |   33 -
 ...lash_bwd_hdim256_bf16_softmaxfalse_sm90.cu |   33 -
 ...flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu |   33 -
 ...lash_bwd_hdim256_fp16_softmaxfalse_sm90.cu |   33 -
 ...flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu |   33 -
 ...flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu |   33 -
 .../flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu |   33 -
 ...flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu |   33 -
 .../flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu |   33 -
 ...flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu |   33 -
 .../flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu |   33 -
 ...flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu |   33 -
 .../flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu |   33 -
 ...lash_fwd_hdim128_bf16_softmaxfalse_sm90.cu |   33 -
 ...flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu |   33 -
 ...lash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu |   33 -
 ...flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu |   33 -
 ...lash_fwd_hdim128_fp16_softmaxfalse_sm90.cu |   33 -
 ...flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu |   33 -
 ...lash_fwd_hdim192_bf16_softmaxfalse_sm90.cu |   33 -
 ...flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu |   33 -
 ...lash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu |   33 -
 ...flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu |   33 -
 ...lash_fwd_hdim192_fp16_softmaxfalse_sm90.cu |   33 -
 ...flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu |   33 -
 ...lash_fwd_hdim256_bf16_softmaxfalse_sm90.cu |   33 -
 ...flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu |   33 -
 ...lash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu |   33 -
 ...flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu |   33 -
 ...lash_fwd_hdim256_fp16_softmaxfalse_sm90.cu |   33 -
 ...flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu |   33 -
 ...flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu |   33 -
 .../flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu |   33 -
 ...flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu |   33 -
 .../flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu |   33 -
 ...flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu |   33 -
 .../flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu |   33 -
 ...flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu |   33 -
 .../flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu |   33 -
 ...flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu |   33 -
 .../flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu |   33 -
 ...flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu |   33 -
 .../flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu |   33 -
 .../mainloop_bwd_sm90_tma_gmma_ws.h           | 3166 -----------------
 .../mainloop_fwd_sm90_tma_gmma_ws.h           | 2180 ------------
 .../ops/cpp/hstu_attention/mask.h             |  396 ---
 .../ops/cpp/hstu_attention/named_barrier.h    |  101 -
 .../ops/cpp/hstu_attention/seqlen.h           |  134 -
 .../hstu_attention/sm90_pipeline_no_cluster.h |  150 -
 .../ops/cpp/hstu_attention/softmax.h          |  256 --
 .../ops/cpp/hstu_attention/static_switch.h    |  135 -
 .../ops/cpp/hstu_attention/tile_scheduler.h   |  616 ----
 .../ops/cpp/hstu_attention/tile_size.h        |  220 --
 .../ops/cpp/hstu_attention/utils.h            |  789 ----
 .../ops/cpp/hstu_attention/version.txt        |    1 -
 .../ops/cpp/jagged_transpose_1d.cpp           |  130 -
 .../ops/cpp/jagged_transpose_1d.cu            |  127 -
 .../ops/cpp/replace_last_n_with_jagged.cpp    |  139 -
 .../ops/cpp/replace_last_n_with_jagged.cu     |  156 -
 .../generative_recommenders/ops/cpp/setup.py  |  487 ---
 .../ops/cpp/sort_kv_pairs_cuda.cpp            |   40 -
 .../sort_kv_pairs_cuda_kernels_template.cu    |   82 -
 .../cpp/sort_kv_pairs_cuda_kernels_template.h |   15 -
 .../ops/cpp/split_1d_jagged_jagged.cpp        |  136 -
 .../ops/cpp/split_1d_jagged_jagged.cu         |  147 -
 .../cpp/tests/concat_1d_jagged_jagged_test.py |  135 -
 .../ops/cpp/tests/hstu_mha_cpu_test.py        |   39 -
 .../ops/cpp/tests/jagged_transpose_1d_test.py |  132 -
 .../tests/replace_last_n_with_jagged_test.py  |  105 -
 .../cpp/tests/split_1d_jagged_jagged_test.py  |  100 -
 .../ops/triton_aot/README.md                  |   54 -
 .../ops/triton_aot/compile/arg_descriptor.py  |  146 -
 .../ops/triton_aot/compile/codegen.py         |  780 ----
 .../ops/triton_aot/compile/compile_state.py   |  409 ---
 .../ops/triton_aot/compile/pipeline.py        |  300 --
 .../ops/triton_aot/compile/spec_processing.py |  593 ---
 .../ops/triton_aot/compile/stable_types.py    |   35 -
 .../triton_aot/compile/triton_aot_compile.py  |  149 -
 .../ops/triton_aot/compile/utils.py           |   47 -
 .../ops/triton_aot/preprocess.py              |   76 -
 .../ops/triton_aot/shared/compat.py           |   91 -
 .../ops/triton_aot/shared/spec_conversion.py  |  389 --
 .../ops/triton_aot/shared/types.py            |   58 -
 .../triton_aot/templates/embedded_cubins.cpp  |    7 -
 .../ops/triton_aot/templates/kernel.cpp       |  104 -
 .../ops/triton_aot/templates/kernel.h         |   36 -
 .../triton_aot/templates/template_utils.py    |   96 -
 .../ops/triton_aot/templates/torch_op.cpp     |   22 -
 .../ops/triton_aot/transform/import_utils.py  |   89 -
 .../transform/kernel_wrapper_codegen.py       |  500 ---
 .../triton_aot/transform/replace_kernels.py   |  137 -
 .../triton_aot/transform/transform_kernels.py |   29 -
 .../ops/triton_aot/triton_addmm.py            |  347 --
 .../ops/triton_aot/triton_concat_2d_jagged.py |  183 -
 .../triton_group_norm_mul_dropout.py          |  124 -
 .../ops/triton_aot/triton_layer_norm.py       |  119 -
 .../triton_layer_norm_mul_dropout.py          |  162 -
 .../ops/triton_aot/triton_position.py         |  176 -
 .../triton_ragged_hstu_attention.py           |  366 --
 .../ops/triton_aot/triton_rms_norm.py         |  114 -
 .../ops/triton_aot/triton_split_2d_jagged.py  |  138 -
 .../ops/triton_aot/types.py                   |  181 -
 .../research/data/dataset.py                  |  248 --
 .../research/data/eval.py                     |  263 --
 .../research/data/item_features.py            |   29 -
 .../research/data/preprocessor.py             |  474 ---
 .../research/data/reco_dataset.py             |  176 -
 .../research/indexing/candidate_index.py      |  179 -
 .../research/indexing/utils.py                |   43 -
 .../research/modeling/initialization.py       |   35 -
 .../sequential/autoregressive_losses.py       |  477 ---
 .../modeling/sequential/embedding_modules.py  |  108 -
 .../modeling/sequential/encoder_utils.py      |  150 -
 .../research/modeling/sequential/features.py  |   94 -
 .../research/modeling/sequential/hstu.py      |  808 -----
 .../input_features_preprocessors.py           |  259 --
 .../sequential/losses/sampled_softmax.py      |  193 -
 .../sequential/output_postprocessors.py       |   82 -
 .../research/modeling/sequential/sasrec.py    |  316 --
 .../research/modeling/sequential/utils.py     |  129 -
 .../research/modeling/similarity_module.py    |   68 -
 .../research/modeling/similarity_utils.py     |  222 --
 .../rails/indexing/candidate_index.py         |   41 -
 .../research/rails/indexing/mips_top_k.py     |   80 -
 .../research/rails/indexing/mol_top_k.py      |  132 -
 .../similarities/dot_product_similarity_fn.py |   68 -
 .../research/rails/similarities/layers.py     |   82 -
 .../research/rails/similarities/module.py     |   55 -
 .../rails/similarities/mol/embeddings_fn.py   |   52 -
 .../similarities/mol/item_embeddings_fn.py    |   99 -
 .../similarities/mol/query_embeddings_fn.py   |  164 -
 .../rails/similarities/mol/similarity_fn.py   |  388 --
 .../research/trainer/data_loader.py           |   57 -
 .../research/trainer/train.py                 |  532 ---
 recommendation_v4/main.py                     |   82 -
 recommendation_v4/preprocess_public_data.py   |   32 -
 recommendation_v4/run_fractal_expansion.py    |  588 ---
 294 files changed, 55303 deletions(-)
 delete mode 100644 recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin
 delete mode 100644 recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin
 delete mode 100644 recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin
 delete mode 100644 recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin
 delete mode 100644 recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin
 delete mode 100644 recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin
 delete mode 100644 recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin
 delete mode 100644 recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin
 delete mode 100644 recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin
 delete mode 100644 recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin
 delete mode 100644 recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin
 delete mode 100644 recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/loadgen_integration_diagram.dia
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_icon.png
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h
 delete mode 100755 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin
 delete mode 100644 recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin
 delete mode 100644 recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/common.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/setup.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/README.md
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py
 delete mode 100644 recommendation_v4/generative_recommenders/ops/triton_aot/types.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/data/dataset.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/data/eval.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/data/item_features.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/data/preprocessor.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/data/reco_dataset.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/indexing/candidate_index.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/indexing/utils.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/initialization.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/features.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/similarity_module.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/layers.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/module.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/trainer/data_loader.py
 delete mode 100644 recommendation_v4/generative_recommenders/research/trainer/train.py
 delete mode 100644 recommendation_v4/main.py
 delete mode 100644 recommendation_v4/preprocess_public_data.py
 delete mode 100644 recommendation_v4/run_fractal_expansion.py

diff --git a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin
deleted file mode 100644
index 8fb8b258c..000000000
--- a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-final.gin
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Frozen config, validated on 04/12/2024.
-# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in
-# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
-#
-# Run this as:
-# mkdir -p logs/amzn-books-l50/
-# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/hstu-sampled-softmax-n512-final.gin --master_port=12346 2>&1 | tee logs/amzn-books-l50/hstu-sampled-softmax-n512-final.log
-
-train_fn.dataset_name = "amzn-books"
-train_fn.max_sequence_length = 50
-train_fn.local_batch_size = 128
-train_fn.eval_batch_size = 128
-
-train_fn.main_module = "HSTU"
-train_fn.dropout_rate = 0.5
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.item_embedding_dim = 64
-
-hstu_encoder.num_blocks = 4
-hstu_encoder.num_heads = 4
-hstu_encoder.dv = 16
-hstu_encoder.dqk = 16
-hstu_encoder.linear_dropout_rate = 0.5
-
-train_fn.eval_interval = 4000
-train_fn.num_epochs = 201
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.interaction_module_type = "DotProduct"
-train_fn.top_k_method = "MIPSBruteForceTopK"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 512
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-train_fn.full_eval_every_n = 5
-train_fn.partial_eval_num_iters = 64
-
-create_data_loader.prefetch_factor = 1024
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin b/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin
deleted file mode 100644
index 097d4cbc7..000000000
--- a/recommendation_v4/configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Frozen config, validated on 04/12/2024.
-# Based on HSTU-large results in
-# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
-#
-# Run this as:
-# mkdir -p logs/amzn-books-l50/
-# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/hstu-sampled-softmax-n512-large-final.gin --master_port=12346 2>&1 | tee logs/amzn-books-l50/hstu-sampled-softmax-n512-large-final2.log
-
-train_fn.dataset_name = "amzn-books"
-train_fn.max_sequence_length = 50
-train_fn.local_batch_size = 128
-train_fn.eval_batch_size = 128
-
-train_fn.main_module = "HSTU"
-train_fn.dropout_rate = 0.5
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.item_embedding_dim = 64
-
-hstu_encoder.num_blocks = 16
-hstu_encoder.num_heads = 8
-hstu_encoder.dv = 8
-hstu_encoder.dqk = 8
-hstu_encoder.linear_dropout_rate = 0.5
-
-train_fn.eval_interval = 4000
-train_fn.num_epochs = 201
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.interaction_module_type = "DotProduct"
-train_fn.top_k_method = "MIPSBruteForceTopK"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 512
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-train_fn.full_eval_every_n = 5
-train_fn.partial_eval_num_iters = 64
-
-create_data_loader.prefetch_factor = 1024
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin b/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin
deleted file mode 100644
index bc899c9fb..000000000
--- a/recommendation_v4/configs/amzn-books/sasrec-sampled-softmax-n512-final.gin
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Frozen config, validated on 04/12/2024.
-# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
-#
-# Run this as:
-# mkdir -p logs/amzn-books-l50/
-# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/amzn-books/sasrec-sampled-softmax-n512-final.gin  --master_port=12346 2>&1 | tee logs/amzn-books-l50/sasrec-sampled-softmax-n512-final.log
-
-train_fn.dataset_name = "amzn-books"
-train_fn.max_sequence_length = 50
-train_fn.local_batch_size = 128
-train_fn.eval_batch_size = 128
-
-train_fn.main_module = "SASRec"
-train_fn.dropout_rate = 0.5
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.item_embedding_dim = 64
-
-sasrec_encoder.num_blocks = 4
-sasrec_encoder.num_heads = 4
-sasrec_encoder.ffn_dropout_rate = 0.5
-sasrec_encoder.ffn_hidden_dim = 64
-sasrec_encoder.ffn_activation_fn = "relu"
-
-train_fn.eval_interval = 4000
-train_fn.num_epochs = 201
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.save_ckpt_every_n = 10
-
-train_fn.interaction_module_type = "DotProduct"
-train_fn.top_k_method = "MIPSBruteForceTopK"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 512
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-train_fn.full_eval_every_n = 5
-train_fn.partial_eval_num_iters = 64
-
-create_data_loader.prefetch_factor = 1024
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin
deleted file mode 100644
index 841b1c80a..000000000
--- a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-final.gin
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Frozen config, validated on 04/11/2024.
-# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in
-# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
-#
-# Run this as:
-# mkdir -p logs/ml-1m-l200/
-# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-1m-l200/hstu-sampled-softmax-n128-final.log
-
-train_fn.dataset_name = "ml-1m"
-train_fn.max_sequence_length = 200
-train_fn.local_batch_size = 128
-
-train_fn.main_module = "HSTU"
-train_fn.dropout_rate = 0.2
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.num_epochs = 101
-train_fn.item_embedding_dim = 50
-
-hstu_encoder.num_blocks = 2
-hstu_encoder.num_heads = 1
-hstu_encoder.dqk = 50
-hstu_encoder.dv = 50
-hstu_encoder.linear_dropout_rate = 0.2
-
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.interaction_module_type = "DotProduct"
-train_fn.top_k_method = "MIPSBruteForceTopK"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 128
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-
-create_data_loader.prefetch_factor = 128
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin b/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin
deleted file mode 100644
index 7ffc7ef64..000000000
--- a/recommendation_v4/configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Frozen config, validated on 04/11/2024.
-# Based on HSTU-large results in
-# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
-#
-# Run this as:
-# mkdir -p logs/ml-1m-l200/
-# CUDA_VISIBLE_DEVICES=1 python3 main.py --gin_config_file=configs/ml-1m/hstu-sampled-softmax-n128-large-final.gin --master_port=12346 2>&1 | tee logs/ml-1m-l200/hstu-sampled-softmax-n128-large-final.log
-
-train_fn.dataset_name = "ml-1m"
-train_fn.max_sequence_length = 200
-train_fn.local_batch_size = 128
-
-train_fn.main_module = "HSTU"
-train_fn.dropout_rate = 0.2
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.num_epochs = 101
-train_fn.item_embedding_dim = 50
-
-hstu_encoder.num_blocks = 8
-hstu_encoder.num_heads = 2
-hstu_encoder.dqk = 25
-hstu_encoder.dv = 25
-hstu_encoder.linear_dropout_rate = 0.2
-
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.interaction_module_type = "DotProduct"
-train_fn.top_k_method = "MIPSBruteForceTopK"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 128
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-
-create_data_loader.prefetch_factor = 128
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin
deleted file mode 100644
index ead7bb21c..000000000
--- a/recommendation_v4/configs/ml-1m/sasrec-sampled-softmax-n128-final.gin
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Frozen config, validated on 04/11/2024.
-# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
-#
-# Run this as:
-# mkdir -p logs/ml-1m-l200/
-# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-1m/sasrec-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-1m-l200/sasrec-sampled-softmax-n128-final.log
-
-train_fn.dataset_name = "ml-1m"
-train_fn.max_sequence_length = 200
-train_fn.local_batch_size = 128
-
-train_fn.main_module = "SASRec"
-train_fn.dropout_rate = 0.2
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.num_epochs = 101
-train_fn.item_embedding_dim = 50
-
-sasrec_encoder.num_blocks = 2
-sasrec_encoder.num_heads = 1
-sasrec_encoder.ffn_dropout_rate = 0.2
-sasrec_encoder.ffn_hidden_dim = 50
-sasrec_encoder.ffn_activation_fn = "relu"
-
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.top_k_method = "MIPSBruteForceTopK"
-train_fn.interaction_module_type = "DotProduct"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 128
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-
-create_data_loader.prefetch_factor = 128
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin
deleted file mode 100644
index 5823ad5b6..000000000
--- a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-final.gin
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Frozen config, validated on 04/12/2024.
-# Based on HSTU results (w/ identical configurations as a SotA Transformer baseline) in
-# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
-#
-# Run this as:
-# mkdir -p logs/ml-20m-l200/
-# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-20m/hstu-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/hstu-sampled-softmax-n128-final.log
-
-train_fn.dataset_name = "ml-20m"
-train_fn.max_sequence_length = 200
-train_fn.local_batch_size = 128
-
-train_fn.main_module = "HSTU"
-train_fn.dropout_rate = 0.2
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.num_epochs = 101
-train_fn.item_embedding_dim = 256
-
-hstu_encoder.num_blocks = 4
-hstu_encoder.num_heads = 4
-hstu_encoder.dv = 64
-hstu_encoder.dqk = 64
-hstu_encoder.linear_dropout_rate = 0.2
-
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.interaction_module_type = "DotProduct"
-train_fn.top_k_method = "MIPSBruteForceTopK"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 128
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-
-create_data_loader.prefetch_factor = 128
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin b/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin
deleted file mode 100644
index 0199afa24..000000000
--- a/recommendation_v4/configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Frozen config, validated on 04/12/2024.
-# Based on HSTU-large results in
-# Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations (https://arxiv.org/abs/2402.17152).
-#
-# Run this as:
-# mkdir -p logs/ml-20m-l200/
-# CUDA_VISIBLE_DEVICES=0 PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True python3 main.py --gin_config_file=configs/ml-20m/hstu-sampled-softmax-n128-large-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/hstu-sampled-softmax-n128-large-final.log
-
-train_fn.dataset_name = "ml-20m"
-train_fn.max_sequence_length = 200
-train_fn.local_batch_size = 128
-
-train_fn.main_module = "HSTU"
-train_fn.dropout_rate = 0.2
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.num_epochs = 101
-train_fn.item_embedding_dim = 256
-
-hstu_encoder.num_blocks = 16
-hstu_encoder.num_heads = 8
-hstu_encoder.dv = 32
-hstu_encoder.dqk = 32
-hstu_encoder.linear_dropout_rate = 0.2
-
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.interaction_module_type = "DotProduct"
-train_fn.top_k_method = "MIPSBruteForceTopK"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 128
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-
-create_data_loader.prefetch_factor = 128
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin b/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin
deleted file mode 100644
index 3c666f802..000000000
--- a/recommendation_v4/configs/ml-20m/sasrec-sampled-softmax-n128-final.gin
+++ /dev/null
@@ -1,44 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Frozen config, validated on 04/12/2024.
-# Based on baseline settings in Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
-#
-# Run this as:
-# mkdir -p logs/ml-20m-l200/
-# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-20m/sasrec-sampled-softmax-n128-final.gin --master_port=12345 2>&1 | tee logs/ml-20m-l200/sasrec-sampled-softmax-n128-final.log
-
-train_fn.dataset_name = "ml-20m"
-train_fn.max_sequence_length = 200
-train_fn.local_batch_size = 128
-
-train_fn.main_module = "SASRec"
-train_fn.dropout_rate = 0.2
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.num_epochs = 101
-train_fn.item_embedding_dim = 256
-
-sasrec_encoder.num_blocks = 4
-sasrec_encoder.num_heads = 4
-sasrec_encoder.ffn_dropout_rate = 0.2
-sasrec_encoder.ffn_hidden_dim = 256
-sasrec_encoder.ffn_activation_fn = "relu"
-
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.top_k_method = "MIPSBruteForceTopK"
-train_fn.interaction_module_type = "DotProduct"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 128
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-
-create_data_loader.prefetch_factor = 128
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin
deleted file mode 100644
index ac7a85350..000000000
--- a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Run this as:
-# mkdir -p logs/ml-3b-l500/
-# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/hstu-sampled-softmax-n96-seqlen500-final.log
-
-train_fn.dataset_name = "ml-3b"
-train_fn.max_sequence_length = 500
-train_fn.local_batch_size = 96
-train_fn.eval_batch_size = 96
-
-train_fn.main_module = "HSTU"
-train_fn.dropout_rate = 0.2
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.num_epochs = 100
-train_fn.item_embedding_dim = 256
-
-hstu_encoder.num_blocks = 4
-hstu_encoder.num_heads = 4
-hstu_encoder.dv = 64
-hstu_encoder.dqk = 64
-hstu_encoder.linear_dropout_rate = 0.2
-
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.interaction_module_type = "DotProduct"
-train_fn.top_k_method = "MIPSBruteForceTopK"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 128
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-
-create_data_loader.prefetch_factor = 128
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin b/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin
deleted file mode 100644
index a30ad3657..000000000
--- a/recommendation_v4/configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Run this as:
-# mkdir -p logs/ml-3b-l500/
-# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/hstu-sampled-softmax-n96-seqlen500-large-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/hstu-sampled-softmax-n96-seqlen500-large-final.log
-
-train_fn.dataset_name = "ml-3b"
-train_fn.max_sequence_length = 500
-train_fn.local_batch_size = 96
-train_fn.eval_batch_size = 96
-
-train_fn.main_module = "HSTU"
-train_fn.dropout_rate = 0.2
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.num_epochs = 100
-train_fn.item_embedding_dim = 256
-
-hstu_encoder.num_blocks = 16
-hstu_encoder.num_heads = 8
-hstu_encoder.dv = 32
-hstu_encoder.dqk = 32
-hstu_encoder.linear_dropout_rate = 0.2
-
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.interaction_module_type = "DotProduct"
-train_fn.top_k_method = "MIPSBruteForceTopK"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 128
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-
-create_data_loader.prefetch_factor = 128
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin b/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin
deleted file mode 100644
index 034c478b4..000000000
--- a/recommendation_v4/configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Run this as:
-# mkdir -p logs/ml-3b-l500/
-# CUDA_VISIBLE_DEVICES=0 python3 main.py --gin_config_file=configs/ml-3b/sasrec-sampled-softmax-n96-seqlen500-final.gin --master_port=12345 2>&1 | tee logs/ml-3b-l500/sasrec-sampled-softmax-n96-seqlen500-final.log
-
-train_fn.dataset_name = "ml-3b"
-train_fn.max_sequence_length = 500
-train_fn.local_batch_size = 96
-train_fn.eval_batch_size = 96
-
-train_fn.main_module = "SASRec"
-train_fn.dropout_rate = 0.2
-train_fn.user_embedding_norm = "l2_norm"
-train_fn.num_epochs = 100
-train_fn.item_embedding_dim = 256
-
-sasrec_encoder.num_blocks = 4
-sasrec_encoder.num_heads = 4
-sasrec_encoder.ffn_dropout_rate = 0.2
-sasrec_encoder.ffn_hidden_dim = 256
-sasrec_encoder.ffn_activation_fn = "relu"
-
-train_fn.learning_rate = 1e-3
-train_fn.weight_decay = 0
-train_fn.num_warmup_steps = 0
-
-train_fn.top_k_method = "MIPSBruteForceTopK"
-train_fn.interaction_module_type = "DotProduct"
-
-train_fn.loss_module = "SampledSoftmaxLoss"
-train_fn.num_negatives = 128
-
-train_fn.sampling_strategy = "local"
-train_fn.temperature = 0.05
-train_fn.item_l2_norm = True
-train_fn.l2_norm_eps = 1e-6
-
-train_fn.enable_tf32 = True
-
-create_data_loader.prefetch_factor = 128
-create_data_loader.num_workers = 8
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md
deleted file mode 100644
index ef1c9686d..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/README.md
+++ /dev/null
@@ -1,88 +0,0 @@
-# MLPerf Inference reference implementation for DLRMv3
-
-## Install dependencies
-
-The reference implementation has been tested on a single host, with x86_64 CPUs
-and 8 NVIDIA H100/B200 GPUs. Dependencies can be installed below,
-
-```
-cd generative_recommenders/
-pip install -e .
-```
-
-## Build loadgen
-
-```
-cd generative_recommenders/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/
-CFLAGS="-std=c++14 -O3" python -m pip install .
-```
-
-## Dataset download
-
-DLRMv3 uses a synthetic dataset specifically designed to match the model and
-system characteristics of large-scale sequential recommendation (large item set
-and long average sequence length for each request). To generate the dataset used
-for both training and inference, run
-
-```
-cd generative_recommenders/dlrm_v3/
-python streaming_synthetic_data.py
-```
-
-The generated dataset has 2TB size, and contains 5 million users interacting
-with a billion items over 100 timestamps.
-
-Only 1% of the dataset is used in the inference benchmark. The sampled DLRMv3
-dataset and trained checkpoint are available at
-https://inference.mlcommons-storage.org/.
-
-Script to download the sampled dataset used in inference benchmark:
-
-```
-bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-dataset.uri
-```
-
-Script to download the 1TB trained checkpoint:
-
-```
-bash <(curl -s https://raw.githubusercontent.com/mlcommons/r2-downloader/refs/heads/main/mlc-r2-downloader.sh) https://inference.mlcommons-storage.org/metadata/dlrm-v3-checkpoint.uri
-```
-
-## Inference benchmark
-
-```
-cd generative_recommenders/generative_recommenders/dlrm_v3/inference/
-WORLD_SIZE=8 python main.py --dataset sampled-streaming-100b
-```
-
-The config file is listed in `dlrm_v3/inference/gin/streaming_100b.gin`.
-`WORLD_SIZE` is the number of GPUs used in the inference benchmark.
-
-To load checkpoint from training, modify `run.model_path` inside the inference
-gin config file. (We will relase the checkpoint soon.)
-
-To achieve the best performance, tune `run.target_qps` and `run.batch_size` in
-the config file.
-
-## Accuracy test
-
-Set `run.compute_eval` will run the accuracy test and dump prediction outputs in
-`mlperf_log_accuracy.json`. To check the accuracy, run
-
-```
-python accuracy.py --path path/to/mlperf_log_accuracy.json
-```
-
-We use normalized entropy (NE), accuracy, and AUC as the metrics to evaluate the model quality. For accepted submissions, all three metrics (NE, Accuracy, AUC) must be within 99% of the reference implementation values. The accuracy for the reference implementation evaluated on 34,996 requests across 10 inference timestamps are listed below:
-
-```
-NE: 86.687%
-Accuracy: 69.651%
-AUC: 78.663%
-```
-
-## Run unit tests
-
-```
-python tests/inference_test.py
-```
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py
deleted file mode 100644
index 19242f7bd..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/accuracy.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-"""
-Tool to calculate accuracy for loadgen accuracy output found in mlperf_log_accuracy.json
-"""
-
-import argparse
-import json
-import logging
-
-import numpy as np
-import torch
-from generative_recommenders.dlrm_v3.configs import get_hstu_configs
-from generative_recommenders.dlrm_v3.utils import MetricsLogger
-
-logger: logging.Logger = logging.getLogger("main")
-
-
-def get_args() -> argparse.Namespace:
-    """Parse commandline."""
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--path",
-        required=True,
-        help="path to mlperf_log_accuracy.json",
-    )
-    args = parser.parse_args()
-    return args
-
-
-def main() -> None:
-    """
-    Main function to calculate accuracy metrics from loadgen output.
-
-    Reads the mlperf_log_accuracy.json file, parses the results, and computes
-    accuracy metrics using the MetricsLogger. Each result entry contains
-    predictions, labels, and weights packed as float32 numpy arrays.
-    """
-    args = get_args()
-    logger.warning("Parsing loadgen accuracy log...")
-    with open(args.path, "r") as f:
-        results = json.load(f)
-    hstu_config = get_hstu_configs(dataset="sampled-streaming-100b")
-    metrics = MetricsLogger(
-        multitask_configs=hstu_config.multitask_configs,
-        batch_size=1,
-        window_size=3000,
-        device=torch.device("cpu"),
-        rank=0,
-    )
-    logger.warning(f"results have {len(results)} entries")
-    for result in results:
-        data = np.frombuffer(bytes.fromhex(result["data"]), np.float32)
-        num_candidates = data[-1].astype(int)
-        assert len(data) == 1 + num_candidates * 3
-        mt_target_preds = torch.from_numpy(data[0:num_candidates])
-        mt_target_labels = torch.from_numpy(data[num_candidates : num_candidates * 2])
-        mt_target_weights = torch.from_numpy(
-            data[num_candidates * 2 : num_candidates * 3]
-        )
-        num_candidates = torch.tensor([num_candidates])
-        metrics.update(
-            predictions=mt_target_preds.view(1, -1),
-            labels=mt_target_labels.view(1, -1),
-            weights=mt_target_weights.view(1, -1),
-            num_candidates=num_candidates,
-        )
-    for k, v in metrics.compute().items():
-        logger.warning(f"{k}: {v}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp b/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp
deleted file mode 100644
index d4d0d4082..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/cpp/hstu_runner.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-//
-// End-to-end runner for the HSTU torch.jit / torch.package artifacts produced
-// by generative_recommenders/dlrm_v3/inference/packager.py and exercised by
-// :end_to_end_test.
-//
-// CLI:
-//   hstu_runner [--aott_library <lib.so> ...] <sparse.pt> <dense.pt>
-//       <inputs.pt> <output.pt>
-//
-// Where:
-//   sparse.pt   ScriptModule whose forward(uih, candidates) returns
-//               Tuple[Dict[str,Tensor], Dict[str,Tensor],
-//                     Dict[str,Tensor], Tensor, Tensor]
-//   dense.pt    ScriptModule (cuda:0, bf16) whose forward(...) returns
-//               Tuple[Tensor, Optional[Tensor], Optional[Tensor]]
-//   inputs.pt   ScriptModule whose forward() returns
-//               Tuple[KeyedJaggedTensor, KeyedJaggedTensor]
-//   output.pt   torch::pickle_save destination for the predictions tensor;
-//               readable from Python as ``torch.load(output.pt)``.
-
-#include <dlfcn.h>
-
-#include <fstream>
-#include <functional>
-#include <iostream>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-#include <torch/csrc/jit/serialization/import.h>
-#include <torch/script.h>
-
-namespace {
-
-struct RunnerArgs {
-  std::vector<std::string> aottLibraryPaths;
-  std::string sparsePath;
-  std::string densePath;
-  std::string inputsPath;
-  std::string outputPath;
-};
-
-RunnerArgs parseArgs(int argc, char** argv) {
-  RunnerArgs args;
-  std::vector<std::string> positional;
-  for (int i = 1; i < argc; ++i) {
-    const std::string arg{argv[i]};
-    if (arg == "--aott_library") {
-      if (++i >= argc) {
-        throw std::runtime_error("--aott_library requires a path");
-      }
-      args.aottLibraryPaths.emplace_back(argv[i]);
-    } else {
-      positional.push_back(arg);
-    }
-  }
-
-  if (positional.size() != 4) {
-    throw std::runtime_error(
-        "Usage: hstu_runner [--aott_library <lib.so> ...] <sparse.pt> "
-        "<dense.pt> <inputs.pt> <output.pt>");
-  }
-  args.sparsePath = positional[0];
-  args.densePath = positional[1];
-  args.inputsPath = positional[2];
-  args.outputPath = positional[3];
-  return args;
-}
-
-void loadAottLibraries(
-    const std::vector<std::string>& libraryPaths,
-    const std::function<void(const std::string&)>& log) {
-  for (const auto& path : libraryPaths) {
-    log("[runner] loading AOT-T library " + path);
-    void* handle = dlopen(path.c_str(), RTLD_GLOBAL | RTLD_NOW);
-    if (handle == nullptr) {
-      throw std::runtime_error(
-          "failed to dlopen AOT-T library " + path + ": " + dlerror());
-    }
-  }
-}
-
-torch::jit::Module loadModule(const std::string& path) {
-  // @patternlint-disable-next-line no-torch-low-level-api
-  auto m = torch::jit::load(path);
-  m.eval();
-  return m;
-}
-
-// Walk a Dict<str, Tensor> and replace every value with .to(device) (and
-// optionally .to(bfloat16)). C++ analog of move_sparse_output_to_device.
-void moveDictToDevice(
-    c10::impl::GenericDict& d,
-    const torch::Device& device,
-    bool toBfloat16) {
-  for (auto& kv : d) {
-    auto t = kv.value().toTensor().to(device);
-    if (toBfloat16) {
-      t = t.to(torch::kBFloat16);
-    }
-    d.insert_or_assign(kv.key(), t);
-  }
-}
-
-void writePickle(const torch::Tensor& t, const std::string& path) {
-  // torch::pickle_save returns a byte buffer in the same wire format as
-  // ``torch.save(tensor, ...)``, so the Python side can read it with
-  // ``torch.load(path)``.
-  const auto data = torch::jit::pickle_save(c10::IValue(t));
-  std::ofstream out(path, std::ios::binary);
-  if (!out) {
-    throw std::runtime_error("failed to open output: " + path);
-  }
-  out.write(data.data(), static_cast<std::streamsize>(data.size()));
-}
-
-} // namespace
-
-int main(int argc, char** argv) {
-  RunnerArgs args;
-  try {
-    args = parseArgs(argc, argv);
-  } catch (const std::exception& e) {
-    std::cerr << e.what() << '\n';
-    return 1;
-  }
-
-  // Log to a file next to the output so we can inspect even if
-  // buck2 swallows stderr.
-  const std::string logPath = args.outputPath + ".log";
-  std::ofstream logFile(logPath);
-  auto log = [&](const std::string& msg) {
-    logFile << msg << std::endl;
-    logFile.flush();
-    std::cerr << msg << std::endl;
-  };
-
-  try {
-    log("[runner] step 0: loading AOT-T libraries");
-    loadAottLibraries(args.aottLibraryPaths, log);
-    log("[runner] step 0 done: loaded " +
-        std::to_string(args.aottLibraryPaths.size()) + " AOT-T libraries");
-
-    log("[runner] step 1: loading sparse module from " + args.sparsePath);
-    auto sparse = loadModule(args.sparsePath);
-
-    log("[runner] step 2: loading dense module from " + args.densePath);
-    auto dense = loadModule(args.densePath);
-
-    log("[runner] step 3: loading inputs module from " + args.inputsPath);
-    auto inputs = loadModule(args.inputsPath);
-
-    log("[runner] step 4: running inputs.forward()");
-    auto inputsTuple = inputs.forward({}).toTuple();
-    auto uihLengths = inputsTuple->elements()[0];
-    auto uihValues = inputsTuple->elements()[1];
-    auto candidatesLengths = inputsTuple->elements()[2];
-    auto candidatesValues = inputsTuple->elements()[3];
-    log("[runner] step 4 done: got 4 input tensors");
-
-    log("[runner] step 5: running sparse.forward()");
-    std::vector<c10::IValue> sparseInputs{
-        uihLengths, uihValues, candidatesLengths, candidatesValues};
-    auto sparseOut = sparse.forward(sparseInputs).toTuple();
-    log("[runner] step 5 done: sparse forward returned " +
-        std::to_string(sparseOut->elements().size()) + " elements");
-
-    log("[runner] step 6: unpacking sparse output dicts");
-    auto seqEmbValues = sparseOut->elements()[0].toGenericDict();
-    auto seqEmbLengths = sparseOut->elements()[1].toGenericDict();
-    auto payloadFeatures = sparseOut->elements()[2].toGenericDict();
-    auto uihSeqLengths = sparseOut->elements()[3].toTensor();
-    auto numCandidates = sparseOut->elements()[4].toTensor();
-    log("[runner] step 6 done: unpacked dicts");
-
-    log("[runner] step 7: moving dicts to cuda:0");
-    const auto device = torch::Device(torch::kCUDA, 0);
-    moveDictToDevice(seqEmbValues, device, /*toBfloat16=*/true);
-    log("[runner] step 7a: seqEmbValues moved");
-    moveDictToDevice(seqEmbLengths, device, /*toBfloat16=*/false);
-    log("[runner] step 7b: seqEmbLengths moved");
-    moveDictToDevice(payloadFeatures, device, /*toBfloat16=*/false);
-    log("[runner] step 7c: payloadFeatures moved");
-    uihSeqLengths = uihSeqLengths.to(device);
-    numCandidates = numCandidates.to(device);
-    log("[runner] step 7 done: all on cuda:0");
-
-    log("[runner] step 8: running dense.forward()");
-    std::vector<c10::IValue> denseInputs{
-        seqEmbValues,
-        seqEmbLengths,
-        payloadFeatures,
-        uihSeqLengths,
-        numCandidates,
-    };
-    auto denseOut = dense.forward(denseInputs);
-    log("[runner] step 8 done: dense forward returned");
-
-    auto preds = denseOut.toTensor().detach().cpu();
-    log("[runner] step 9: preds on cpu");
-
-    std::cout << "preds shape: " << preds.sizes() << '\n';
-    std::cout << "preds sum:   "
-              << preds.to(torch::kFloat32).sum().item<float>() << '\n';
-
-    writePickle(preds, args.outputPath);
-    std::cout << "wrote " << args.outputPath << '\n';
-    log("[runner] step 10: done, wrote output");
-    return 0;
-  } catch (const std::exception& e) {
-    log(std::string("hstu_runner FAILED: ") + e.what());
-    return 1;
-  }
-}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py
deleted file mode 100644
index 6a8db77c8..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/data_producer.py
+++ /dev/null
@@ -1,227 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-"""
-Data producer module for DLRMv3 inference.
-
-This module provides classes for producing and managing query data during inference,
-supporting both single-threaded and multi-threaded data production modes.
-"""
-
-import logging
-import threading
-import time
-from queue import Queue
-from typing import List, Optional, Tuple, Union
-
-import torch
-from generative_recommenders.dlrm_v3.datasets.dataset import Dataset, Samples
-
-logging.basicConfig(level=logging.INFO)
-logger: logging.Logger = logging.getLogger("data_producer")
-
-
-class QueryItem:
-    """
-    Container for a query item to be processed by the inference thread pool.
-
-    Attributes:
-        query_ids: List of unique identifiers for the queries in this batch.
-        samples: The sample data containing features for the queries.
-        start: Time when the query was first received.
-        dt_queue: Time spent in the queue before processing.
-        dt_batching: Time spent on batching the data.
-    """
-
-    def __init__(
-        self,
-        query_ids: List[int],
-        samples: Samples,
-        start: float,
-        dt_queue: float,
-        dt_batching: float,
-    ) -> None:
-        self.query_ids = query_ids
-        self.samples = samples
-        self.start: float = start
-        self.dt_queue: float = dt_queue
-        self.dt_batching: float = dt_batching
-
-
-class SingleThreadDataProducer:
-    """
-    Single-threaded data producer for synchronous query processing.
-
-    This producer processes queries on the main thread without any parallelism,
-    suitable for debugging or low-throughput scenarios.
-
-    Args:
-        ds: The dataset to fetch samples from.
-        run_one_item: Callback function to process a single QueryItem.
-    """
-
-    def __init__(self, ds: Dataset, run_one_item) -> None:  # pyre-ignore [2]
-        self.ds = ds
-        self.run_one_item = run_one_item  # pyre-ignore [4]
-
-    def enqueue(
-        self, query_ids: List[int], content_ids: List[int], t0: float, dt_queue: float
-    ) -> None:
-        """
-        Enqueue queries for immediate synchronous processing.
-
-        Args:
-            query_ids: List of unique query identifiers.
-            content_ids: List of content/sample identifiers to fetch.
-            t0: Timestamp when the query batch was created.
-            dt_queue: Time spent waiting in the queue.
-        """
-        with torch.profiler.record_function("data batching"):
-            t0_batching: float = time.time()
-            samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids)
-            dt_batching: float = time.time() - t0_batching
-            if isinstance(samples, Samples):
-                query = QueryItem(
-                    query_ids=query_ids,
-                    samples=samples,
-                    start=t0,
-                    dt_queue=dt_queue,
-                    dt_batching=dt_batching,
-                )
-                self.run_one_item(query)
-            else:
-                start_idx = 0
-                for sample in samples:
-                    batch_size: int = sample.batch_size()
-                    query = QueryItem(
-                        query_ids=query_ids[start_idx : start_idx + batch_size],
-                        samples=sample,
-                        start=t0,
-                        dt_queue=dt_queue,
-                        dt_batching=dt_batching,
-                    )
-                    start_idx += batch_size
-                    self.run_one_item(query)
-
-    def finish(self) -> None:
-        """Finalize the producer. No-op for single-threaded mode."""
-        pass
-
-
-class MultiThreadDataProducer:
-    """
-    Multi-threaded data producer for parallel query processing.
-
-    Uses a thread pool to fetch and batch data in parallel with model inference,
-    improving throughput for high-load scenarios.
-
-    Args:
-        ds: The dataset to fetch samples from.
-        threads: Number of worker threads to use.
-        run_one_item: Callback function to process a single QueryItem.
-    """
-
-    def __init__(
-        self,
-        ds: Dataset,
-        threads: int,
-        run_one_item,  # pyre-ignore [2]
-    ) -> None:
-        queue_size_multiplier = 4
-        self.ds = ds
-        self.threads = threads
-        self.run_one_item = run_one_item  # pyre-ignore [4]
-        self.tasks: Queue[Optional[Tuple[List[int], List[int], float, float]]] = Queue(
-            maxsize=threads * queue_size_multiplier
-        )
-        self.workers: List[threading.Thread] = []
-        for _ in range(self.threads):
-            worker = threading.Thread(target=self.handle_tasks, args=(self.tasks,))
-            worker.daemon = True
-            self.workers.append(worker)
-            worker.start()
-
-    def handle_tasks(
-        self, tasks_queue: Queue[Optional[Tuple[List[int], List[int], float, float]]]
-    ) -> None:
-        """
-        Worker thread main loop to process tasks from the queue.
-
-        Each worker maintains its own CUDA stream for parallel execution.
-
-        Args:
-            tasks_queue: Queue containing task tuples or None for termination.
-        """
-        stream = torch.cuda.Stream()
-        while True:
-            query_and_content_ids = tasks_queue.get()
-            if query_and_content_ids is None:
-                tasks_queue.task_done()
-                break
-            query_ids, content_ids, t0, dt_queue = query_and_content_ids
-            t0_batching: float = time.time()
-            samples: Union[Samples, List[Samples]] = self.ds.get_samples(content_ids)
-            dt_batching: float = time.time() - t0_batching
-            if isinstance(samples, Samples):
-                qitem = QueryItem(
-                    query_ids=query_ids,
-                    samples=samples,
-                    start=t0,
-                    dt_queue=dt_queue,
-                    dt_batching=dt_batching,
-                )
-                with torch.inference_mode(), torch.cuda.stream(stream):
-                    self.run_one_item(qitem)
-            else:
-                start_idx = 0
-                for sample in samples:
-                    batch_size: int = sample.batch_size()
-                    qitem = QueryItem(
-                        query_ids=query_ids[start_idx : start_idx + batch_size],
-                        samples=sample,
-                        start=t0,
-                        dt_queue=dt_queue,
-                        dt_batching=dt_batching,
-                    )
-                    start_idx += batch_size
-                    with torch.inference_mode(), torch.cuda.stream(stream):
-                        self.run_one_item(qitem)
-            tasks_queue.task_done()
-
-    def enqueue(
-        self, query_ids: List[int], content_ids: List[int], t0: float, dt_queue: float
-    ) -> None:
-        """
-        Enqueue queries for asynchronous processing by worker threads.
-
-        Args:
-            query_ids: List of unique query identifiers.
-            content_ids: List of content/sample identifiers to fetch.
-            t0: Timestamp when the query batch was created.
-            dt_queue: Time spent waiting in the queue.
-        """
-        with torch.profiler.record_function("data batching"):
-            self.tasks.put((query_ids, content_ids, t0, dt_queue))
-
-    def finish(self) -> None:
-        """
-        Signal all worker threads to terminate and wait for completion.
-
-        Sends None to each worker to trigger graceful shutdown.
-        """
-        for _ in self.workers:
-            self.tasks.put(None)
-        for worker in self.workers:
-            worker.join()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py
deleted file mode 100644
index add2781bc..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/dense_predict_module.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-
-"""
-TorchScript-friendly wrapper for the HSTU dense path (GPU transformer).
-
-``HSTUDenseScriptModule`` accepts the *flattened* sparse-output dicts produced
-by :class:`HSTUSparseScriptModule`, reconstructs ``Dict[str,
-SequenceEmbedding]`` for the existing :meth:`DlrmHSTU.main_forward` and
-returns a 3-tuple of ``(preds, labels, weights)`` -- the only fields the
-predictor actually consumes.
-"""
-
-from typing import Dict
-
-import torch
-from generative_recommenders.dlrm_v3.inference.inference_modules import get_hstu_model
-from generative_recommenders.dlrm_v3.inference.ts_types import (
-    SeqEmbLengths,
-    SeqEmbValues,
-    unflatten_seq_embeddings,
-)
-from generative_recommenders.modules.dlrm_hstu import DlrmHSTU, DlrmHSTUConfig
-from torchrec.modules.embedding_configs import EmbeddingConfig
-
-
-class HSTUDenseScriptModule(torch.nn.Module):
-    """Script-friendly dense module.
-
-    The wrapper owns a dense-only :class:`DlrmHSTU` (no
-    ``_embedding_collection``) and delegates to ``main_forward`` after
-    reconstructing the ``SequenceEmbedding`` NamedTuple form.
-    """
-
-    def __init__(
-        self,
-        hstu_config: DlrmHSTUConfig,
-        table_config: Dict[str, EmbeddingConfig],
-    ) -> None:
-        super().__init__()
-        self._hstu_model: DlrmHSTU = get_hstu_model(
-            table_config=table_config,
-            hstu_config=hstu_config,
-            table_device="cpu",
-            is_dense=True,
-        )
-
-    def forward(
-        self,
-        seq_emb_values: SeqEmbValues,
-        seq_emb_lengths: SeqEmbLengths,
-        payload_features: Dict[str, torch.Tensor],
-        uih_seq_lengths: torch.Tensor,
-        num_candidates: torch.Tensor,
-    ) -> torch.Tensor:
-        # TorchScript supports ``int(tensor.item())`` on a 0-d tensor.
-        max_uih_len: int = int(uih_seq_lengths.max().item())
-        max_num_candidates: int = int(num_candidates.max().item())
-
-        seq_embeddings = unflatten_seq_embeddings(seq_emb_values, seq_emb_lengths)
-
-        (
-            _,
-            _,
-            _,
-            mt_target_preds,
-            _mt_target_labels,
-            _mt_target_weights,
-        ) = self._hstu_model.main_forward(
-            seq_embeddings=seq_embeddings,
-            payload_features=payload_features,
-            max_uih_len=max_uih_len,
-            uih_seq_lengths=uih_seq_lengths,
-            max_num_candidates=max_num_candidates,
-            num_candidates=num_candidates,
-        )
-        assert mt_target_preds is not None
-        # Return just the predictions tensor; labels/weights are unused by
-        # the predictor at inference time and would force ``Optional[Tensor]``
-        # in the return type, which torch.jit.trace rejects ("Only tensors,
-        # lists, tuples of tensors, or dictionary of tensors can be output
-        # from traced functions").
-        return mt_target_preds
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py
deleted file mode 100644
index f1b956d9c..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/end_to_end_test.py
+++ /dev/null
@@ -1,795 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-
-"""
-End-to-end smoke test for the HSTU TorchScript + C++ deployment pipeline.
-
-What this binary does, in order:
-
-1. Build a synthetic batch (uih_kjt, candidates_kjt) via :func:`get_random_data`.
-2. Build the eager :class:`HSTUSparseScriptModule` and
-   :class:`HSTUDenseScriptModule`.
-3. Run them eagerly to obtain the reference ``preds_eager``.
-4. ``torch.jit.script`` + save:
-       - ``sparse.pt`` (CPU)
-       - ``dense.pt``  (cuda:0, bf16)
-       - ``inputs.pt`` (an :class:`InputsBundle` ScriptModule whose
-         ``forward()`` returns ``Tuple[KeyedJaggedTensor, KeyedJaggedTensor]``)
-5. Run the C++ runner
-       ``hstu_runner [--aott_library <lib.so> ...] <sparse.pt> <dense.pt> <inputs.pt> <preds_cpp.pt>``.
-6. ``torch.load`` the runner's output and compare against ``preds_eager``
-   with :func:`torch.testing.assert_close` (loose tolerance because the
-   scripted path may use either the PyTorch fallback trace or AOT-T-loaded
-   Triton inference kernels).
-
-Usage (manual override of the runner path):
-
-    buck2 run @mode/opt //generative_recommenders/dlrm_v3/inference:end_to_end_test \\
-        -- --cpp_runner /path/to/hstu_runner
-
-By default the binary locates the runner via ``libfb.py.parutil`` -- it ships
-inside the par as a resource (see BUCK).
-"""
-
-import argparse
-import logging
-import os
-import shutil
-import sys
-import tempfile
-from typing import Any, Dict, List, Tuple
-
-import torch
-from generative_recommenders.dlrm_v3.configs import (
-    get_embedding_table_config,
-    get_hstu_configs,
-)
-from generative_recommenders.dlrm_v3.datasets.dataset import get_random_data
-from generative_recommenders.dlrm_v3.inference.dense_predict_module import (
-    HSTUDenseScriptModule,
-)
-from generative_recommenders.dlrm_v3.inference.sparse_predict_module import (
-    HSTUSparseScriptModule,
-)
-from generative_recommenders.dlrm_v3.inference.ts_types import (
-    SeqEmbLengths,
-    SeqEmbValues,
-    unflatten_seq_embeddings,
-)
-from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
-from security.frameworks.python.exec.subprocess import TrustedSubprocessWithList
-from torchrec.modules.embedding_configs import EmbeddingConfig
-from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
-
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-_DEFAULT_DATASET = "kuairand-1k"
-
-
-class InputsBundle(torch.nn.Module):
-    """Scripted holder for the test inputs.
-
-    Returns the constituent tensors of the two KJTs as a 4-tuple
-    ``(uih_lengths, uih_values, candidates_lengths, candidates_values)`` so
-    the traced sparse module can rebuild the KJTs inside its forward (KJT
-    instances themselves are not traceable inputs).
-    """
-
-    def __init__(
-        self,
-        uih_kjt: KeyedJaggedTensor,
-        candidates_kjt: KeyedJaggedTensor,
-    ) -> None:
-        super().__init__()
-        self.register_buffer("uih_lengths", uih_kjt.lengths())
-        self.register_buffer("uih_values", uih_kjt.values())
-        self.register_buffer("candidates_lengths", candidates_kjt.lengths())
-        self.register_buffer("candidates_values", candidates_kjt.values())
-
-    def forward(
-        self,
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        return (
-            self.uih_lengths,
-            self.uih_values,
-            self.candidates_lengths,
-            self.candidates_values,
-        )
-
-
-class _SparseTraceShim(torch.nn.Module):
-    """Adapter that takes raw tensors and rebuilds the KJTs inside forward.
-
-    ``torch.jit.trace`` does not accept ``KeyedJaggedTensor`` (or any
-    non-Tensor / non-collection-of-Tensor type) as a top-level forward
-    input, so we make the traced boundary tensor-only and bake the
-    ``List[str]`` of feature keys in as Python constants captured by the
-    closure / module attribute.
-    """
-
-    def __init__(
-        self,
-        sparse_module: HSTUSparseScriptModule,
-        uih_keys: List[str],
-        candidates_keys: List[str],
-    ) -> None:
-        super().__init__()
-        self._sparse_module: HSTUSparseScriptModule = sparse_module
-        self._uih_keys: List[str] = uih_keys
-        self._candidates_keys: List[str] = candidates_keys
-
-    def forward(
-        self,
-        uih_lengths: torch.Tensor,
-        uih_values: torch.Tensor,
-        candidates_lengths: torch.Tensor,
-        candidates_values: torch.Tensor,
-    ) -> Tuple[
-        SeqEmbValues,
-        SeqEmbLengths,
-        Dict[str, torch.Tensor],
-        torch.Tensor,
-        torch.Tensor,
-    ]:
-        uih_kjt = KeyedJaggedTensor(
-            keys=self._uih_keys,
-            lengths=uih_lengths,
-            values=uih_values,
-        )
-        candidates_kjt = KeyedJaggedTensor(
-            keys=self._candidates_keys,
-            lengths=candidates_lengths,
-            values=candidates_values,
-        )
-        return self._sparse_module(
-            uih_features=uih_kjt, candidates_features=candidates_kjt
-        )
-
-
-class _DenseAottTraceShim(torch.nn.Module):
-    """FX-traceable dense adapter for the representative AOT-T shape."""
-
-    def __init__(
-        self,
-        dense_module: HSTUDenseScriptModule,
-        max_uih_len: int,
-        max_num_candidates: int,
-        total_uih_len: int,
-        total_targets: int,
-    ) -> None:
-        super().__init__()
-        self._dense_module: HSTUDenseScriptModule = dense_module
-        self._max_uih_len: int = max_uih_len
-        self._max_num_candidates: int = max_num_candidates
-        self._total_uih_len: int = total_uih_len
-        self._total_targets: int = total_targets
-
-    def forward(
-        self,
-        seq_emb_values: SeqEmbValues,
-        seq_emb_lengths: SeqEmbLengths,
-        payload_features: Dict[str, torch.Tensor],
-        uih_seq_lengths: torch.Tensor,
-        num_candidates: torch.Tensor,
-    ) -> torch.Tensor:
-        seq_embeddings = unflatten_seq_embeddings(seq_emb_values, seq_emb_lengths)
-
-        (
-            _,
-            _,
-            _,
-            mt_target_preds,
-            _mt_target_labels,
-            _mt_target_weights,
-        ) = self._dense_module._hstu_model.main_forward(
-            seq_embeddings=seq_embeddings,
-            payload_features=payload_features,
-            max_uih_len=self._max_uih_len,
-            uih_seq_lengths=uih_seq_lengths,
-            max_num_candidates=self._max_num_candidates,
-            num_candidates=num_candidates,
-            total_uih_len=self._total_uih_len,
-            total_targets=self._total_targets,
-        )
-        assert mt_target_preds is not None
-        return mt_target_preds
-
-
-def _dense_aott_concrete_args(
-    dense_inputs: Tuple[
-        Dict[str, torch.Tensor],
-        Dict[str, torch.Tensor],
-        Dict[str, torch.Tensor],
-        torch.Tensor,
-        torch.Tensor,
-    ],
-) -> Dict[str, Any]:
-    from torch.fx._symbolic_trace import PH
-
-    seq_emb_values, seq_emb_lengths, payload_features, _, _ = dense_inputs
-    return {
-        "seq_emb_values": {key: PH for key in seq_emb_values},
-        "seq_emb_lengths": {key: PH for key in seq_emb_lengths},
-        "payload_features": {key: PH for key in payload_features},
-    }
-
-
-def _find_cpp_runner() -> str:
-    """Locate the bundled hstu_runner binary.
-
-    Tries ``importlib.resources`` (the canonical fbcode resource resolver,
-    works whether the binary is in a par or unpacked), and falls back to
-    looking next to ``sys.argv[0]``.
-    """
-    try:
-        from importlib.resources import files
-
-        path = files("generative_recommenders.dlrm_v3.inference.cpp").joinpath(
-            "hstu_runner"
-        )
-        if path.is_file():
-            return str(path)
-    except Exception as exc:
-        logger.debug("importlib.resources lookup failed: %s", exc)
-
-    candidate = os.path.join(
-        os.path.dirname(os.path.abspath(sys.argv[0])), "hstu_runner"
-    )
-    if os.path.exists(candidate):
-        return candidate
-
-    raise RuntimeError(
-        "Could not find hstu_runner binary. "
-        "Pass --cpp_runner=<path> or build the cpp_binary target first."
-    )
-
-
-def _eager_run(
-    sparse_module: HSTUSparseScriptModule,
-    dense_module: HSTUDenseScriptModule,
-    uih_kjt: KeyedJaggedTensor,
-    candidates_kjt: KeyedJaggedTensor,
-    device: torch.device,
-) -> torch.Tensor:
-    """Reference path: sparse → device-move + bf16 → dense, all in Python."""
-    with torch.no_grad():
-        seq_emb_values, seq_emb_lengths, payload, uih_lens, num_cands = sparse_module(
-            uih_features=uih_kjt, candidates_features=candidates_kjt
-        )
-        seq_emb_values = {
-            k: v.to(device).to(torch.bfloat16) for k, v in seq_emb_values.items()
-        }
-        seq_emb_lengths = {k: v.to(device) for k, v in seq_emb_lengths.items()}
-        payload = {k: v.to(device) for k, v in payload.items()}
-        uih_lens = uih_lens.to(device)
-        num_cands = num_cands.to(device)
-        preds = dense_module(
-            seq_emb_values, seq_emb_lengths, payload, uih_lens, num_cands
-        )
-    return preds.detach().to(torch.float32).cpu()
-
-
-def _find_aott_libraries() -> List[str]:
-    from generative_recommenders.ops.triton_aot.compile.compile_state import (
-        get_aott_compile_path,
-    )
-
-    compile_path = get_aott_compile_path()
-    libraries: List[str] = []
-    for root, _, files in os.walk(compile_path):
-        for filename in files:
-            if filename.endswith(".so"):
-                libraries.append(os.path.join(root, filename))
-    return sorted(libraries)
-
-
-def _copy_aott_libraries_to_workdir(
-    library_paths: List[str], workdir: str
-) -> List[str]:
-    copied: List[str] = []
-    for index, path in enumerate(library_paths):
-        dst = os.path.join(workdir, f"aott_{index}_{os.path.basename(path)}")
-        shutil.copy2(path, dst)
-        copied.append(dst)
-    return copied
-
-
-def _load_aott_libraries_for_python(library_paths: List[str]) -> None:
-    for library_path in library_paths:
-        logger.info("Python roundtrip: loading AOT-T library %s", library_path)
-        torch.ops.load_library(library_path)
-
-
-def _save_aott_dense_module(
-    dense_module: HSTUDenseScriptModule,
-    dense_inputs: Tuple[
-        Dict[str, torch.Tensor],
-        Dict[str, torch.Tensor],
-        Dict[str, torch.Tensor],
-        torch.Tensor,
-        torch.Tensor,
-    ],
-    dense_path: str,
-    workdir: str,
-    atol: float,
-    rtol: float,
-) -> List[str]:
-    """Lower the dense module with AOT-T and save a TorchScript artifact.
-
-    This follows the AOT-T example flow:
-
-    1. FX trace the module.
-    2. Unwrap outer `aot_triton_kernel_wrapper_*` nodes.
-    3. Run representative CUDA inputs under `TritonAOTCompile`.
-    4. `transform_kernels` to replace wrappers with `torch.ops.triton_aot.*`.
-    5. Script and save the transformed dense module.
-
-    The full HSTU dense wrapper has historically needed tracing rather than FX,
-    so failures here are reported with context and the default path remains the
-    D102 traced TorchScript fallback.
-    """
-    from generative_recommenders.ops.triton_aot.compile.triton_aot_compile import (
-        TritonAOTCompile,
-    )
-    from generative_recommenders.ops.triton_aot.preprocess import (
-        unwrap_aott_wrapper_nodes,
-    )
-    from generative_recommenders.ops.triton_aot.transform.transform_kernels import (
-        transform_kernels,
-    )
-    from tgif.fx.tgif_tracer import TGIFTracer
-
-    max_uih_len = int(dense_inputs[3].max().item())
-    max_num_candidates = int(dense_inputs[4].max().item())
-    total_uih_len = int(dense_inputs[3].sum().item())
-    total_targets = int(dense_inputs[4].sum().item())
-    trace_shim = _DenseAottTraceShim(
-        dense_module=dense_module,
-        max_uih_len=max_uih_len,
-        max_num_candidates=max_num_candidates,
-        total_uih_len=total_uih_len,
-        total_targets=total_targets,
-    ).eval()
-
-    logger.info(
-        "AOT-T dense: FX tracing representative shape "
-        "(max_uih_len=%d, max_num_candidates=%d, "
-        "total_uih_len=%d, total_targets=%d)...",
-        max_uih_len,
-        max_num_candidates,
-        total_uih_len,
-        total_targets,
-    )
-    try:
-        fx_dense = TGIFTracer().symbolic_trace(
-            trace_shim,
-            concrete_args=_dense_aott_concrete_args(dense_inputs),
-        )
-        lowered_dense = unwrap_aott_wrapper_nodes(fx_dense, TGIFTracer())
-    except Exception as exc:
-        raise RuntimeError(
-            "AOT-T dense lowering requires an FX-traceable dense entry point. "
-            "Use --dense_backend=torchscript to fall back to the D102 traced "
-            "TorchScript path."
-        ) from exc
-
-    logger.info("AOT-T dense: compiling Triton kernels from sample inputs...")
-    with torch.no_grad():
-        with TritonAOTCompile():
-            ref_output = lowered_dense(*dense_inputs)
-
-    original_code = lowered_dense.code
-    lowered_dense = transform_kernels(lowered_dense)
-    if lowered_dense.code == original_code:
-        logger.warning(
-            "AOT-T dense: transform_kernels did not change the FX graph. "
-            "This usually means no aot_triton_kernel_wrapper_* nodes were "
-            "present in the dense path."
-        )
-
-    libraries = _find_aott_libraries()
-    if not libraries:
-        raise RuntimeError(
-            "AOT-T dense lowering produced no .so files. Ensure the dense path "
-            "uses HammerKernel.TRITON_INFERENCE branches backed by triton_aot ops."
-        )
-
-    with torch.no_grad():
-        lowered_output = lowered_dense(*dense_inputs)
-    torch.testing.assert_close(ref_output, lowered_output, atol=atol, rtol=rtol)
-
-    logger.info("AOT-T dense: tracing transformed module...")
-    torch.jit.trace(
-        lowered_dense,
-        example_inputs=dense_inputs,
-        strict=False,
-        check_trace=False,
-    ).save(dense_path)
-    copied_libraries = _copy_aott_libraries_to_workdir(libraries, workdir)
-    logger.info("AOT-T dense: copied %d library file(s)", len(copied_libraries))
-    return copied_libraries
-
-
-def _build_synthetic_inputs(
-    hstu_config: DlrmHSTUConfig,
-    table_config: Dict[str, EmbeddingConfig],
-    uih_max_seq_len: int,
-) -> Tuple[KeyedJaggedTensor, KeyedJaggedTensor]:
-    contextual: List[str] = list(hstu_config.contextual_feature_to_max_length.keys())
-    # The kuairand-1k dataset has tiny embedding tables for some contextual
-    # features (e.g. user_active_degree has num_embeddings=8). Clamp the
-    # random value range so every index stays in range for every table.
-    min_rows = min(t.num_embeddings for t in table_config.values())
-    value_bound = max(2, min_rows)
-    logger.info(
-        "synthetic value_bound=%d (min table rows=%d across %d tables)",
-        value_bound,
-        min_rows,
-        len(table_config),
-    )
-    return get_random_data(
-        contexual_features=contextual,
-        hstu_uih_keys=hstu_config.hstu_uih_feature_names,
-        hstu_candidates_keys=hstu_config.hstu_candidate_feature_names,
-        uih_max_seq_len=uih_max_seq_len,
-        max_num_candidates=hstu_config.max_num_candidates_inference,
-        value_bound=value_bound,
-    )
-
-
-def _parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description=__doc__)
-    parser.add_argument(
-        "--cpp_runner",
-        type=str,
-        default=None,
-        help="Path to the hstu_runner binary; default: bundled resource.",
-    )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        default=_DEFAULT_DATASET,
-        help="Dataset key for HSTU/embedding configs.",
-    )
-    parser.add_argument(
-        "--device", type=str, default="cuda:0", help="Dense-module device."
-    )
-    parser.add_argument(
-        "--uih_max_seq_len",
-        type=int,
-        default=128,
-        help="Max UIH length for the synthetic batch.",
-    )
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--atol", type=float, default=1e-2)
-    parser.add_argument("--rtol", type=float, default=1e-2)
-    parser.add_argument(
-        "--dense_backend",
-        choices=("torchscript", "aott"),
-        default="torchscript",
-        help="Dense artifact backend. aott lowers TRITON_INFERENCE wrappers and passes compiled libraries to the C++ runner.",
-    )
-    parser.add_argument(
-        "--aott_library",
-        action="append",
-        default=[],
-        help="Additional prebuilt AOT-T shared library to dlopen before loading dense.pt. May be repeated.",
-    )
-    parser.add_argument(
-        "--keep_workdir",
-        action="store_true",
-        help="Do not delete the temp dir holding the saved artifacts.",
-    )
-    return parser.parse_args()
-
-
-def main() -> None:  # noqa: C901
-    logging.basicConfig(level=logging.INFO, format="[e2e] %(message)s", force=True)
-    logger.setLevel(logging.DEBUG)
-    args = _parse_args()
-
-    if not torch.cuda.is_available():
-        logger.error("CUDA is required; aborting.")
-        sys.exit(2)
-
-    runner_path = args.cpp_runner or _find_cpp_runner()
-    logger.info("Using C++ runner: %s", runner_path)
-
-    torch.manual_seed(args.seed)
-    device = torch.device(args.device)
-    torch.cuda.set_device(device)
-
-    hstu_config = get_hstu_configs(args.dataset)
-    table_config = get_embedding_table_config(args.dataset)
-
-    uih_kjt, candidates_kjt = _build_synthetic_inputs(
-        hstu_config, table_config, args.uih_max_seq_len
-    )
-
-    sparse_module = HSTUSparseScriptModule(
-        table_config=table_config,
-        hstu_config=hstu_config,
-        use_no_copy_embedding_collection=True,
-    ).eval()
-    dense_module = (
-        HSTUDenseScriptModule(hstu_config=hstu_config, table_config=table_config)
-        .to(torch.bfloat16)
-        .to(device)
-        .eval()
-    )
-
-    from generative_recommenders.common import HammerKernel
-
-    dense_kernel = (
-        HammerKernel.TRITON_INFERENCE
-        if args.dense_backend == "aott"
-        else HammerKernel.PYTORCH
-    )
-    sparse_module._sparse._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH)
-    dense_module._hstu_model.set_hammer_kernel(dense_kernel)
-
-    # Diagnostic: walk every HammerModule submodule and print its effective
-    # kernel selection, so any submodule that didn't pick up the override
-    # surfaces immediately. Triton/Triton-CC selections will fail at trace
-    # time, so this print is critical for triaging the next iteration if
-    # tracing fails.
-    from generative_recommenders.common import HammerModule as _HM
-
-    for name, m in list(sparse_module.named_modules()) + list(
-        dense_module.named_modules()
-    ):
-        if isinstance(m, _HM):
-            logger.info(
-                "kernel-pin %-60s -> %s (is_inference=%s, use_triton_cc=%s)",
-                name or "<root>",
-                m.hammer_kernel().value,
-                m._is_inference,
-                m._use_triton_cc,
-            )
-
-    # === 1. Eager reference ===
-    logger.info("Running eager reference...")
-    preds_eager = _eager_run(
-        sparse_module, dense_module, uih_kjt, candidates_kjt, device
-    )
-    logger.info(
-        "preds_eager shape=%s sum=%.6f",
-        tuple(preds_eager.shape),
-        preds_eager.sum().item(),
-    )
-
-    # === 2. Trace/lower + save ===
-    # The default path keeps D102's trace-based TorchScript artifact. The
-    # AOT-T path follows ModelStore's compile/transform flow and saves a
-    # scripted FX module whose Triton kernels dispatch through torch.ops.
-    workdir = tempfile.mkdtemp(prefix="hstu_e2e_")
-    sparse_path = os.path.join(workdir, "sparse.pt")
-    dense_path = os.path.join(workdir, "dense.pt")
-    inputs_path = os.path.join(workdir, "inputs.pt")
-    cpp_out_path = os.path.join(workdir, "preds_cpp.pt")
-    eager_out_path = os.path.join(workdir, "preds_eager.pt")
-    aott_library_paths: List[str] = list(args.aott_library)
-    python_roundtrip_aott_library_paths: List[str] = list(args.aott_library)
-    logger.info("workdir: %s", workdir)
-
-    # Re-run sparse eagerly to capture an example output that can drive the
-    # dense trace.
-    with torch.no_grad():
-        sparse_out = sparse_module(
-            uih_features=uih_kjt, candidates_features=candidates_kjt
-        )
-        seq_emb_values = {
-            k: v.to(device).to(torch.bfloat16) for k, v in sparse_out[0].items()
-        }
-        seq_emb_lengths = {k: v.to(device) for k, v in sparse_out[1].items()}
-        payload = {k: v.to(device) for k, v in sparse_out[2].items()}
-        uih_lens = sparse_out[3].to(device)
-        num_cands = sparse_out[4].to(device)
-
-    logger.info("Tracing sparse module via raw-tensor shim (CPU)...")
-    sparse_shim = _SparseTraceShim(
-        sparse_module=sparse_module,
-        uih_keys=list(uih_kjt.keys()),
-        candidates_keys=list(candidates_kjt.keys()),
-    )
-    traced_sparse = torch.jit.trace(
-        sparse_shim,
-        example_inputs=(
-            uih_kjt.lengths(),
-            uih_kjt.values(),
-            candidates_kjt.lengths(),
-            candidates_kjt.values(),
-        ),
-        strict=False,
-        check_trace=False,
-    )
-    traced_sparse.save(sparse_path)
-
-    dense_inputs = (
-        seq_emb_values,
-        seq_emb_lengths,
-        payload,
-        uih_lens,
-        num_cands,
-    )
-    if args.dense_backend == "aott":
-        logger.info("Lowering dense module with AOT-T...")
-        generated_aott_library_paths = _save_aott_dense_module(
-            dense_module,
-            dense_inputs,
-            dense_path,
-            workdir,
-            args.atol,
-            args.rtol,
-        )
-        aott_library_paths.extend(generated_aott_library_paths)
-    else:
-        logger.info("Tracing dense module (cuda:0, bf16)...")
-        traced_dense = torch.jit.trace(
-            dense_module,
-            example_inputs=dense_inputs,
-            strict=False,
-            check_trace=False,
-        )
-        traced_dense.save(dense_path)
-
-    logger.info("Scripting + saving inputs bundle...")
-    torch.jit.script(InputsBundle(uih_kjt, candidates_kjt)).save(inputs_path)
-    torch.save(preds_eager, eager_out_path)
-
-    # === 2.5. Python-side roundtrip verification ===
-    # Load the saved traced artifacts back in Python and verify they produce
-    # the same results as the eager run. This proves the artifacts are correct
-    # independently of the C++ runner.
-    logger.info("Python roundtrip: loading traced artifacts back...")
-    if python_roundtrip_aott_library_paths:
-        _load_aott_libraries_for_python(python_roundtrip_aott_library_paths)
-    rt_inputs = torch.jit.load(inputs_path)
-    rt_sparse = torch.jit.load(sparse_path)
-    rt_dense = torch.jit.load(dense_path)
-
-    with torch.no_grad():
-        rt_uih_l, rt_uih_v, rt_cand_l, rt_cand_v = rt_inputs()
-        logger.info(
-            "  rt inputs: uih_l=%s uih_v=%s cand_l=%s cand_v=%s",
-            rt_uih_l.shape,
-            rt_uih_v.shape,
-            rt_cand_l.shape,
-            rt_cand_v.shape,
-        )
-
-        rt_sparse_out = rt_sparse(rt_uih_l, rt_uih_v, rt_cand_l, rt_cand_v)
-
-        for i, elem in enumerate(rt_sparse_out):
-            if isinstance(elem, dict):
-                for k, v in elem.items():
-                    has_nan = torch.isnan(v).any().item()
-                    has_inf = torch.isinf(v).any().item()
-                    logger.info(
-                        "  sparse_out[%d][%s] shape=%s dtype=%s nan=%s inf=%s",
-                        i,
-                        k,
-                        tuple(v.shape),
-                        v.dtype,
-                        has_nan,
-                        has_inf,
-                    )
-            elif isinstance(elem, torch.Tensor):
-                logger.info(
-                    "  sparse_out[%d] shape=%s dtype=%s nan=%s inf=%s",
-                    i,
-                    tuple(elem.shape),
-                    elem.dtype,
-                    torch.isnan(elem).any().item(),
-                    torch.isinf(elem).any().item(),
-                )
-
-        rt_sev = {
-            k: v.to(device).to(torch.bfloat16) for k, v in rt_sparse_out[0].items()
-        }
-        rt_sel = {k: v.to(device) for k, v in rt_sparse_out[1].items()}
-        rt_pay = {k: v.to(device) for k, v in rt_sparse_out[2].items()}
-        rt_uih = rt_sparse_out[3].to(device)
-        rt_nc = rt_sparse_out[4].to(device)
-
-        preds_rt = rt_dense(rt_sev, rt_sel, rt_pay, rt_uih, rt_nc)
-
-    preds_rt_cpu = preds_rt.detach().to(torch.float32).cpu()
-    logger.info(
-        "preds_roundtrip shape=%s sum=%.6f nan=%s inf=%s",
-        tuple(preds_rt_cpu.shape),
-        preds_rt_cpu.sum().item(),
-        torch.isnan(preds_rt_cpu).any().item(),
-        torch.isinf(preds_rt_cpu).any().item(),
-    )
-
-    try:
-        torch.testing.assert_close(
-            preds_eager, preds_rt_cpu, atol=args.atol, rtol=args.rtol
-        )
-    except AssertionError as e:
-        logger.error("PYTHON ROUNDTRIP PARITY FAILED: %s", e)
-        if not args.keep_workdir:
-            logger.info("(workdir kept for inspection: %s)", workdir)
-        sys.exit(1)
-    logger.info("PYTHON ROUNDTRIP PASSED (atol=%g rtol=%g)", args.atol, args.rtol)
-
-    # === 3. Invoke C++ runner ===
-    runner_args: List[str] = []
-    for library_path in aott_library_paths:
-        runner_args.extend(["--aott_library", library_path])
-    runner_args.extend([sparse_path, dense_path, inputs_path, cpp_out_path])
-
-    logger.info("Running C++: %s %s", runner_path, " ".join(runner_args))
-    # pyre-fixme[6]: TrustedSubprocessWithList requires Literal[str] but this
-    # runner is resolved from a built resource or explicit test argument.
-    result = TrustedSubprocessWithList.run(
-        executable=runner_path,
-        cmd_args=runner_args,
-        capture_output=True,
-        text=True,
-        check=False,
-    )
-    if result.stdout:
-        logger.info("--- runner stdout ---\n%s", result.stdout.rstrip())
-    if result.stderr:
-        logger.info("--- runner stderr ---\n%s", result.stderr.rstrip())
-    if result.returncode != 0:
-        if result.returncode == -11:
-            logger.warning(
-                "C++ runner SIGSEGV (exit -11). This is a known issue with "
-                "torch-cpp-cuda static initialization on some machines. "
-                "Python roundtrip verification passed above. "
-                "Artifacts in: %s",
-                workdir,
-            )
-            args.keep_workdir = True
-        else:
-            logger.error("C++ runner exited with code %d", result.returncode)
-        if not args.keep_workdir:
-            shutil.rmtree(workdir, ignore_errors=True)
-        sys.exit(result.returncode)
-
-    # === 4. Compare ===
-    if not os.path.exists(cpp_out_path):
-        logger.error("C++ runner did not produce %s", cpp_out_path)
-        sys.exit(1)
-    preds_cpp = torch.load(cpp_out_path, weights_only=False).to(torch.float32).cpu()
-    logger.info(
-        "preds_cpp   shape=%s sum=%.6f",
-        tuple(preds_cpp.shape),
-        preds_cpp.sum().item(),
-    )
-
-    try:
-        torch.testing.assert_close(
-            preds_eager, preds_cpp, atol=args.atol, rtol=args.rtol
-        )
-    except AssertionError as e:
-        logger.error("PARITY FAILED: %s", e)
-        if not args.keep_workdir:
-            logger.info("(workdir kept for inspection: %s)", workdir)
-        sys.exit(1)
-
-    logger.info("PASSED: eager and C++ agree (atol=%g rtol=%g)", args.atol, args.rtol)
-    if not args.keep_workdir:
-        shutil.rmtree(workdir, ignore_errors=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin
deleted file mode 100644
index e2025dee0..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/debug.gin
+++ /dev/null
@@ -1,13 +0,0 @@
-run.model_path = ""
-run.scenario_name = "Server"
-run.batchsize = 16
-run.output_trace = False
-run.data_producer_threads = 4
-run.compute_eval = False
-run.find_peak_performance = False
-run.train_split_percentage = 0.75
-
-# below will override mlperf rules compliant settings - don't use for official submission
-run.target_qps = 2000
-run.num_queries = 10000
-run.numpy_rand_seed = 123
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin
deleted file mode 100644
index a770aa014..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/kuairand_1k.gin
+++ /dev/null
@@ -1,14 +0,0 @@
-# run.model_path = "/home/linjianma/ckpts/kuairand_1k/2025_01_12_17_56_43/"
-run.scenario_name = "Server"
-run.batchsize = 16
-run.output_trace = False
-run.data_producer_threads = 4
-run.compute_eval = False
-run.find_peak_performance = False
-run.train_split_percentage = 0.75
-
-# below will override mlperf rules compliant settings - don't use for official submission
-run.target_qps = 2000
-run.num_queries = 10000
-run.numpy_rand_seed = 123
-run.dataset_path_prefix = "/home/linjianma"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin
deleted file mode 100644
index 3121ac0e7..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/movielens_13b.gin
+++ /dev/null
@@ -1,16 +0,0 @@
-run.model_path = ""
-run.scenario_name = "Server"
-run.batchsize = 5
-run.output_trace = False
-run.data_producer_threads = 8
-run.compute_eval = False
-run.find_peak_performance = False
-run.train_split_percentage = 0.75
-run.sparse_quant = False
-
-# below will override mlperf rules compliant settings - don't use for official submission
-run.target_qps = 5000
-run.num_queries = 30000
-run.numpy_rand_seed = 123
-run.dataset_path_prefix = "/home/linjianma"
-run.dataset_percentage = 0.0625
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin
deleted file mode 100644
index 0655734c2..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_100b.gin
+++ /dev/null
@@ -1,15 +0,0 @@
-# run.model_path = "/home/linjianma/ckpts/streaming_100b/89/"
-run.scenario_name = "Server"
-run.batchsize = 10
-run.output_trace = False
-run.data_producer_threads = 16
-run.compute_eval = False
-run.find_peak_performance = False
-run.sparse_quant = False
-run.numpy_rand_seed = 123
-run.dataset_path_prefix = "/home/linjianma"
-run.dataset_percentage = 0.001
-run.warmup_ratio = 0.3
-run.num_queries = 20000
-# Needs to be tuned for different implementations to balance latency and throughput
-run.target_qps = 1000
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin
deleted file mode 100644
index eed13e0ff..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/gin/streaming_400m.gin
+++ /dev/null
@@ -1,15 +0,0 @@
-run.model_path = ""
-run.scenario_name = "Server"
-run.batchsize = 5
-run.output_trace = False
-run.data_producer_threads = 8
-run.compute_eval = False
-run.find_peak_performance = False
-run.train_split_percentage = 0.75
-run.sparse_quant = False
-
-# below will override mlperf rules compliant settings - don't use for official submission
-run.target_qps = 5000
-run.numpy_rand_seed = 123
-run.dataset_path_prefix = "/home/linjianma"
-run.dataset_percentage = 0.00625
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py
deleted file mode 100644
index cb567df63..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/inference_modules.py
+++ /dev/null
@@ -1,253 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-"""
-Inference modules for DLRMv3.
-
-This module provides inference-specific components for the HSTU model,
-including sparse inference modules and utilities for moving tensors between devices.
-"""
-
-from typing import Dict, Optional, Tuple
-
-import torch
-import torchrec
-from generative_recommenders.modules.dlrm_hstu import (
-    DlrmHSTU,
-    DlrmHSTUConfig,
-    SequenceEmbedding,
-)
-from torchrec.modules.embedding_modules import (
-    EmbeddingBagCollection,
-    EmbeddingCollection,
-)
-from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor
-from torchrec.sparse.tensor_dict import maybe_td_to_kjt
-
-
-IS_INFERENCE: bool = True
-
-
-class _NoCopyEmbeddingCollection(torchrec.EmbeddingCollection):
-    """
-    EmbeddingCollection variant that skips the dtype-cast copy in
-    ``EmbeddingCollection.forward`` and clamps indices into the hash-size
-    range. This is the script-mode replacement for the
-    ``functools.partial`` monkey-patch in
-    :func:`generative_recommenders.dlrm_v3.inference.model_family.ec_patched_forward_wo_embedding_copy`.
-
-    The body mirrors that helper exactly so that the eager and scripted paths
-    produce the same embeddings.
-    """
-
-    def forward(
-        self,
-        features: KeyedJaggedTensor,
-    ) -> Dict[str, JaggedTensor]:
-        features = maybe_td_to_kjt(features, None)
-        feature_embeddings: Dict[str, JaggedTensor] = {}
-        jt_dict: Dict[str, JaggedTensor] = features.to_dict()
-        # Inline HASH_SIZE_1B - 1 as a literal so TorchScript can see it; the
-        # imported module-level constant is treated as an opaque "closed-over
-        # global" by jit.script and would fail with
-        # "python value of type 'int' cannot be used as a value".
-        max_index: int = 999_999_999  # HASH_SIZE_1B - 1
-        for i, emb_module in enumerate(self.embeddings.values()):
-            feature_names = self._feature_names[i]
-            embedding_names = self._embedding_names_by_table[i]
-            for j, embedding_name in enumerate(embedding_names):
-                feature_name = feature_names[j]
-                f = jt_dict[feature_name]
-                indices = torch.clamp(f.values(), min=0, max=max_index)
-                lookup = emb_module(input=indices)
-                feature_embeddings[embedding_name] = JaggedTensor(
-                    values=lookup,
-                    lengths=f.lengths(),
-                    weights=f.values() if self._need_indices else None,
-                )
-        return feature_embeddings
-
-
-def set_is_inference(is_inference: bool = False) -> None:
-    """
-    Set the global inference mode flag.
-
-    Args:
-        is_inference: If True, model operates in inference mode (no labels/weights).
-                     If False, model operates in training/eval mode with labels.
-    """
-    global IS_INFERENCE
-    IS_INFERENCE = is_inference
-
-
-def get_hstu_model(
-    table_config,
-    hstu_config: DlrmHSTUConfig,
-    table_device: str = "meta",
-    max_hash_size: Optional[int] = None,
-    is_dense: bool = False,
-) -> DlrmHSTU:
-    """
-    Create and initialize an HSTU model for inference.
-
-    Args:
-        table_config: Dictionary of embedding table configurations.
-        hstu_config: HSTU model configuration object.
-        table_device: Device to place embedding tables on ('meta', 'cpu', or 'cuda').
-        max_hash_size: Optional maximum hash size to cap embedding table sizes.
-        is_dense: If True, creates model for dense-only operations.
-
-    Returns:
-        Initialized DlrmHSTU model in eval mode.
-    """
-    if max_hash_size is not None:
-        for t in table_config.values():
-            t.num_embeddings = (
-                max_hash_size if t.num_embeddings > max_hash_size else t.num_embeddings
-            )
-    model = DlrmHSTU(
-        hstu_configs=hstu_config,
-        embedding_tables=table_config,
-        is_inference=IS_INFERENCE,
-        is_dense=is_dense,
-    )
-    model.eval()
-    model.recursive_setattr("_use_triton_cc", False)
-    for _, module in model.named_modules():
-        if isinstance(module, EmbeddingBagCollection) or isinstance(
-            module, EmbeddingCollection
-        ):
-            module.to_empty(device=table_device)
-            # to_empty leaves parameters uninitialized; fill with small random
-            # values so downstream bf16 ops don't produce NaN from
-            # uninitialized memory.
-            for p in module.parameters():
-                if not p.is_meta:
-                    torch.nn.init.uniform_(p, -0.01, 0.01)
-    return model
-
-
-class HSTUSparseInferenceModule(torch.nn.Module):
-    """
-    Module for sparse (embedding) inference operations.
-
-    Handles embedding lookups and preprocessing for the HSTU model,
-    running on CPU to handle large embedding tables.
-
-    Args:
-        table_config: Dictionary of embedding table configurations.
-        hstu_config: HSTU model configuration object.
-    """
-
-    def __init__(
-        self,
-        table_config,
-        hstu_config: DlrmHSTUConfig,
-    ) -> None:
-        super().__init__()
-        self._hstu_model: DlrmHSTU = get_hstu_model(
-            table_config,
-            hstu_config,
-            table_device="cpu",
-        )
-
-    def forward(
-        self,
-        uih_features: KeyedJaggedTensor,
-        candidates_features: KeyedJaggedTensor,
-    ) -> Tuple[
-        Dict[str, SequenceEmbedding],
-        Dict[str, torch.Tensor],
-        int,
-        torch.Tensor,
-        int,
-        torch.Tensor,
-    ]:
-        """
-        Run sparse preprocessing and embedding lookups.
-
-        Args:
-            uih_features: User interaction history features as KeyedJaggedTensor.
-            candidates_features: Candidate item features as KeyedJaggedTensor.
-
-        Returns:
-            Tuple containing:
-                - seq_embeddings: Dictionary of sequence embeddings per feature.
-                - payload_features: Dictionary of payload feature tensors.
-                - max_uih_len: Maximum user interaction history length.
-                - uih_seq_lengths: Tensor of UIH sequence lengths per batch item.
-                - max_num_candidates: Maximum number of candidates.
-                - num_candidates: Tensor of candidate counts per batch item.
-        """
-        (
-            seq_embeddings,
-            payload_features,
-            max_uih_len,
-            uih_seq_lengths,
-            max_num_candidates,
-            num_candidates,
-        ) = self._hstu_model.preprocess(
-            uih_features=uih_features,
-            candidates_features=candidates_features,
-        )
-        return (
-            seq_embeddings,
-            payload_features,
-            max_uih_len,
-            uih_seq_lengths,
-            max_num_candidates,
-            num_candidates,
-        )
-
-
-def move_sparse_output_to_device(
-    seq_embeddings: Dict[str, SequenceEmbedding],
-    payload_features: Dict[str, torch.Tensor],
-    uih_seq_lengths: torch.Tensor,
-    num_candidates: torch.Tensor,
-    device: torch.device,
-) -> Tuple[
-    Dict[str, SequenceEmbedding],
-    Dict[str, torch.Tensor],
-    torch.Tensor,
-    torch.Tensor,
-]:
-    """
-    Move sparse module outputs from CPU to the target device (typically GPU).
-
-    Converts embeddings to bfloat16 for efficient GPU computation.
-
-    Args:
-        seq_embeddings: Dictionary of sequence embeddings to move.
-        payload_features: Dictionary of payload features to move.
-        uih_seq_lengths: UIH sequence lengths tensor to move.
-        num_candidates: Number of candidates tensor to move.
-        device: Target device (e.g., torch.device('cuda:0')).
-
-    Returns:
-        Tuple of moved tensors on the target device.
-    """
-    num_candidates = num_candidates.to(device)
-    uih_seq_lengths = uih_seq_lengths.to(device)
-    seq_embeddings = {
-        k: SequenceEmbedding(
-            lengths=seq_embeddings[k].lengths.to(device),
-            embedding=seq_embeddings[k].embedding.to(device).to(torch.bfloat16),
-        )
-        for k in seq_embeddings.keys()
-    }
-    for k, v in payload_features.items():
-        payload_features[k] = v.to(device)
-    return seq_embeddings, payload_features, uih_seq_lengths, num_candidates
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py
deleted file mode 100644
index 00e334119..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/main.py
+++ /dev/null
@@ -1,805 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-"""
-mlperf dlrm_v3 inference benchmarking tool.
-"""
-
-import argparse
-import array
-import logging
-import random
-import threading
-
-logging.basicConfig(level=logging.INFO)
-import os
-import sys
-import time
-from typing import Any, Dict, List, Optional, Union
-
-import gin
-
-# pyre-ignore [21]
-import mlperf_loadgen as lg  # @manual
-import numpy as np
-import torch
-from generative_recommenders.common import set_dev_mode, set_verbose_level
-from generative_recommenders.dlrm_v3.configs import (
-    get_embedding_table_config,
-    get_hstu_configs,
-)
-from generative_recommenders.dlrm_v3.datasets.dataset import Dataset, Samples
-from generative_recommenders.dlrm_v3.datasets.synthetic_streaming import (
-    DLRMv3SyntheticStreamingDataset,
-)
-from generative_recommenders.dlrm_v3.inference.data_producer import (
-    MultiThreadDataProducer,
-    QueryItem,
-    SingleThreadDataProducer,
-)
-from generative_recommenders.dlrm_v3.inference.inference_modules import set_is_inference
-from generative_recommenders.dlrm_v3.inference.model_family import HSTUModelFamily
-from generative_recommenders.dlrm_v3.utils import (
-    get_dataset,
-    profiler_or_nullcontext,
-    SUPPORTED_DATASETS,
-)
-
-
-logger: logging.Logger = logging.getLogger("main")
-
-torch.multiprocessing.set_start_method("spawn", force=True)
-
-USER_CONF = f"{os.path.dirname(__file__)}/user.conf"
-
-SUPPORTED_CONFIGS = {
-    "debug": "debug.gin",
-    "kuairand-1k": "kuairand_1k.gin",
-    "movielens-13b": "movielens_13b.gin",
-    "streaming-400m": "streaming_400m.gin",
-    "sampled-streaming-100b": "streaming_100b.gin",
-}
-
-
-SCENARIO_MAP = {  # pyre-ignore [5]
-    "Server": lg.TestScenario.Server,
-    "Offline": lg.TestScenario.Offline,
-}
-
-
-def get_args():  # pyre-ignore [3]
-    """Parse commandline."""
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--dataset", default="debug", choices=SUPPORTED_DATASETS, help="dataset"
-    )
-    args, unknown_args = parser.parse_known_args()
-    logger.warning(f"unknown_args: {unknown_args}")
-    return args
-
-
-class Runner:
-    """
-    Orchestrates inference benchmark execution.
-
-    Manages data production, model inference, and result collection for
-    MLPerf LoadGen-based benchmarking.
-
-    Args:
-        model: The HSTU model family instance for making predictions.
-        ds: Dataset to fetch samples from.
-        num_queries: Total number of queries to process.
-        data_producer_threads: Number of threads for data loading (default: 1).
-        batchsize: Batch size for inference (default: 128).
-        compute_eval: Whether to compute evaluation metrics (default: False).
-    """
-
-    def __init__(
-        self,
-        model: HSTUModelFamily,
-        ds: Dataset,
-        num_queries: int,
-        data_producer_threads: int = 1,
-        batchsize: int = 128,
-        compute_eval: bool = False,
-    ) -> None:
-        self.model = model
-        if data_producer_threads == 1:
-            self.data_producer: Union[
-                MultiThreadDataProducer, SingleThreadDataProducer
-            ] = SingleThreadDataProducer(ds, self.run_one_item)
-        else:
-            self.data_producer = MultiThreadDataProducer(
-                ds, data_producer_threads, self.run_one_item
-            )
-        self.batchsize = batchsize
-        self.compute_eval = compute_eval
-        self.reset_states(num_queries=num_queries)
-
-    def reset_states(self, num_queries: int) -> None:
-        """
-        Reset all internal state for a new benchmark run.
-
-        Args:
-            num_queries: Number of queries expected in this run.
-        """
-        self.result_timing: List[Dict[str, float]] = []
-        self.result_batches: List[int] = []
-        self.current_query_ids: List[int] = []
-        self.current_content_ids: List[int] = []
-        self.current_t0: List[float] = []
-        self.num_queries: int = num_queries
-        self.processed_queries: int = 0
-
-    def run_one_item(self, qitem: QueryItem) -> None:
-        """
-        Process a single query item through model inference.
-
-        Runs prediction, records timing metrics, and sends results back to LoadGen.
-
-        Args:
-            qitem: Query item containing batch of samples to process.
-        """
-        try:
-            t0_prediction: float = time.time()
-            prediction_output = self.model.predict(qitem.samples)
-            dt_prediction: float = time.time() - t0_prediction
-            assert prediction_output is not None
-            (
-                mt_target_preds,
-                mt_target_labels,
-                mt_target_weights,
-                dt_sparse,
-                dt_dense,
-            ) = prediction_output
-            if self.compute_eval:
-                assert mt_target_labels is not None
-                assert mt_target_weights is not None
-            self.result_timing.append(
-                {
-                    "total": time.time() - qitem.start,
-                    "prediction": dt_prediction,
-                    "queue": qitem.dt_queue,
-                    "batching": qitem.dt_batching,
-                    "sparse": dt_sparse,
-                    "dense": dt_dense,
-                }
-            )
-            self.result_batches.append(len(qitem.query_ids))
-        except Exception as ex:  # pylint: disable=broad-except
-            logger.error("thread: failed, %s", ex)
-        finally:
-            candidate_size = mt_target_preds.size(1) // len(qitem.query_ids)
-            if not self.compute_eval:
-                for i, query_id in enumerate(qitem.query_ids):
-                    query_mt_target_preds = (
-                        mt_target_preds[  # pyre-ignore [61]
-                            0,
-                            candidate_size * i : candidate_size * (i + 1),
-                        ]
-                        .view(-1)
-                        .float()
-                        .numpy()
-                    )
-                    response_array = array.array("B", query_mt_target_preds.tobytes())
-                    bi = response_array.buffer_info()
-                    # since we send buffer to loadgen, needs `response_array` in memory during send
-                    lg.QuerySamplesComplete(
-                        [lg.QuerySampleResponse(query_id, bi[0], bi[1])]
-                    )
-            else:
-                for i, query_id in enumerate(qitem.query_ids):
-                    query_mt_target_preds = (
-                        mt_target_preds[  # pyre-ignore [61]
-                            0, candidate_size * i : candidate_size * (i + 1)
-                        ]
-                        .view(-1)
-                        .float()
-                        .numpy()
-                    )
-                    query_mt_target_labels = (
-                        mt_target_labels[  # pyre-ignore [16,61]
-                            0, candidate_size * i : candidate_size * (i + 1)
-                        ]
-                        .view(-1)
-                        .float()
-                        .numpy()
-                    )
-                    query_mt_target_weights = (
-                        mt_target_weights[  # pyre-ignore [61]
-                            0, candidate_size * i : candidate_size * (i + 1)
-                        ]
-                        .view(-1)
-                        .float()
-                        .numpy()
-                    )
-                    np_array = np.concatenate(
-                        [
-                            query_mt_target_preds,
-                            query_mt_target_labels,
-                            query_mt_target_weights,
-                            np.array([candidate_size]).astype(np.float32),
-                        ]
-                    )
-                    response_array = array.array("B", np_array.tobytes())
-                    bi = response_array.buffer_info()
-                    # since we send buffer to loadgen, needs `response_array` in memory during send
-                    lg.QuerySamplesComplete(
-                        [lg.QuerySampleResponse(query_id, bi[0], bi[1])]
-                    )
-
-    def enqueue(self, query_samples, t0: float) -> None:  # pyre-ignore [2]
-        """
-        Enqueue query samples for batch processing.
-
-        Collects samples until batch size is reached, then dispatches to data producer.
-
-        Args:
-            query_samples: List of LoadGen query sample objects.
-            t0: Timestamp when this batch started.
-        """
-        self.current_query_ids.extend([q.id for q in query_samples])
-        self.current_content_ids.extend([q.index for q in query_samples])
-        self.current_t0.append(t0)
-        self.processed_queries += len(query_samples)
-        t0: float = min(self.current_t0)
-        dt_queue: float = max(self.current_t0) - min(self.current_t0)
-        if (
-            self.processed_queries >= self.num_queries
-            or len(self.current_query_ids) >= self.batchsize
-        ):
-            for i in range(len(self.current_query_ids) // self.batchsize):
-                self.data_producer.enqueue(
-                    query_ids=self.current_query_ids[
-                        i * self.batchsize : (i + 1) * self.batchsize
-                    ],
-                    content_ids=self.current_content_ids[
-                        i * self.batchsize : (i + 1) * self.batchsize
-                    ],
-                    t0=t0,
-                    dt_queue=dt_queue,
-                )
-            remaining_s: int = len(self.current_query_ids) % self.batchsize
-            if remaining_s > 0:
-                self.data_producer.enqueue(
-                    query_ids=self.current_query_ids[-remaining_s:],
-                    content_ids=self.current_content_ids[-remaining_s:],
-                    t0=t0,
-                    dt_queue=dt_queue,
-                )
-            self.current_query_ids = []
-            self.current_content_ids = []
-            self.current_t0 = []
-
-    def finish(self) -> None:
-        """Signal data producer to finish and wait for completion."""
-        self.data_producer.finish()
-
-
-def add_results(
-    final_results: Dict[str, Any],
-    result_timing: List[Dict[str, float]],
-    result_batches: List[int],
-) -> None:
-    """
-    Aggregate and log benchmark results.
-
-    Computes percentile statistics and QPS metrics from timing data.
-
-    Args:
-        final_results: Dictionary to populate with aggregated results.
-        result_timing: List of timing dictionaries for each batch.
-        result_batches: List of batch sizes processed.
-    """
-    percentiles: list[float] = [50.0, 80.0, 90.0, 95.0, 99.0, 99.9]
-    buckets_dict: Dict[str, List[float]] = {}
-    buckets_str_dict: Dict[str, str] = {}
-    total_timing: list[float] = [result["total"] for result in result_timing]
-    for key in ["total", "prediction", "queue", "batching", "sparse", "dense"]:
-        timing: list[float] = [result[key] for result in result_timing]
-        buckets: List[float] = np.percentile(timing, percentiles).tolist()
-        buckets_str: str = ",".join(
-            ["| {}:{:.4f}| ".format(p, b) for p, b in zip(percentiles, buckets)]
-        )
-        buckets_dict[key] = buckets
-        buckets_str_dict[key] = buckets_str
-    total_batches = sum(result_batches)
-
-    final_results["good"] = len(total_timing)
-    final_results["avg_time"] = np.mean(total_timing)
-    final_results["percentiles"] = {
-        str(k): v for k, v in zip(percentiles, buckets_dict["total"])
-    }
-    final_results["qps"] = total_batches / final_results["took"]
-    final_results["count"] = total_batches
-
-    for i, timing in enumerate(result_timing):
-        logger.warning(f"timing of {i}: {timing}")
-
-    logger.warning(
-        "{} qps={:.2f}, avg_query_time={:.4f}, time={:.3f}, queries={}, tiles={}".format(
-            final_results["scenario"],
-            final_results["qps"],
-            final_results["avg_time"],
-            final_results["took"],
-            len(result_timing),
-            buckets_str_dict["total"],
-        )
-    )
-    for key in ["prediction", "queue", "batching", "sparse", "dense"]:
-        logger.warning(f"{key}: {buckets_str_dict[key]}")
-
-
-def get_num_queries(
-    input_size: Optional[int],
-    one_pass_size: int,
-    scenario_name: str,
-    offline_target_qps: int,
-    target_duration: float,
-) -> int:
-    """
-    Determine the number of queries to run based on scenario and settings.
-
-    Args:
-        input_size: User-specified query count (None to use defaults).
-        one_pass_size: Size of one complete pass through the dataset.
-        scenario_name: MLPerf scenario name ('Server' or 'Offline').
-        offline_target_qps: Target QPS for offline scenario.
-        target_duration: Target duration in milliseconds.
-
-    Returns:
-        Number of queries to execute in the benchmark run.
-    """
-    if scenario_name == "Offline":
-        # consistent with https://github.com/mlcommons/inference/blob/8999c4d686f6e4a180da14597c97063fce7c9f33/loadgen/test_settings_internal.cc#L147
-        return int(1.1 * target_duration / 1000 * offline_target_qps)
-    else:
-        if input_size is None:
-            return one_pass_size
-        return input_size
-
-
-class StreamingQuerySampler:
-    """
-    Sampler for streaming dataset
-    The execution order is determined by `StreamingQuerySampler.run_order`, not by the QSL or input query ID.
-    This ensures that queries are executed according to their timestamp constraints.
-    """
-
-    def __init__(
-        self,
-        ds: DLRMv3SyntheticStreamingDataset,
-        dataset_percentage: float,
-        scenario_name: str,
-        offline_target_qps: int,
-        target_duration: float,
-        input_queries: Optional[int] = None,
-        compute_eval: bool = False,
-    ) -> None:
-        self.ds: DLRMv3SyntheticStreamingDataset = ds
-        self.ds.is_inference = True
-        self.inference_ts: int = self.ds.total_ts - self.ds.train_ts
-        self.start_ts: int = self.ds.train_ts
-        self.dataset_percentage: float = dataset_percentage
-        self.num_unique_requests: List[int] = self.get_num_unique_requests(
-            warmup_ratio=1.0
-        )
-        self.num_unique_requests_cumsum: List[int] = np.cumsum(
-            self.num_unique_requests
-        ).tolist()
-        self.total_requests: int = sum(self.num_unique_requests)
-        self.run_order: List[List[int]] = self.build_random_exec_order()
-        self.ts_idx: int = 0
-        self.ts_processed_cnt: int = 0
-        self.last_loaded: float = -1.0
-        num_queries: int = get_num_queries(
-            input_size=input_queries,
-            one_pass_size=self.total_requests,
-            scenario_name=scenario_name,
-            offline_target_qps=offline_target_qps,
-            target_duration=target_duration,
-        )
-        logger.warning(
-            f"StreamingQuerySampler constructred to handle {num_queries} queries"
-        )
-        self.num_repeats: int = (
-            max(1, num_queries // self.total_requests) if not compute_eval else 1
-        )
-        self.remaining_queries: int = (
-            num_queries % self.total_requests if not compute_eval else 0
-        )
-        self._lock = threading.Lock()
-
-    def get_num_unique_requests(self, warmup_ratio: float) -> List[int]:
-        """
-        Calculate number of unique requests per timestamp.
-
-        Args:
-            warmup_ratio: Fraction of users to include in warmup.
-
-        Returns:
-            List of request counts per timestamp.
-        """
-        num_unique_requests = [
-            int(
-                self.ds.ts_to_users_cumsum[t][-1]
-                * self.dataset_percentage
-                * warmup_ratio
-            )
-            for t in range(self.start_ts, self.start_ts + self.inference_ts)
-        ]
-        return num_unique_requests
-
-    def build_random_exec_order(self) -> List[List[int]]:
-        """
-        Build randomized execution order for each timestamp.
-
-        Returns:
-            List of shuffled index lists, one per timestamp.
-        """
-        order = []
-        for req_size in self.num_unique_requests:
-            within_ts_order = list(range(req_size))
-            random.shuffle(within_ts_order)
-            order.append(within_ts_order)
-        return order
-
-    def init_sut(self) -> None:
-        """Initialize System Under Test state for a new benchmark run."""
-        self.ts_idx = 0
-        self.ts_processed_cnt = 0
-        self.ds.set_ts(self.start_ts)
-
-    def load_query_samples(self, query_ids: List[Optional[int]]) -> None:
-        """
-        Load query samples into memory for the benchmark.
-
-        Args:
-            query_ids: List of query identifiers to load.
-        """
-        length = len(query_ids)
-        ts_idx: int = 0
-        while self.num_unique_requests_cumsum[ts_idx] < length:
-            ts_idx += 1
-        for i in range(0, ts_idx):
-            self.ds.set_ts(i + self.start_ts)
-            self.ds.load_query_samples(self.run_order[i])
-        self.ds.set_ts(ts_idx + self.start_ts)
-        delta_length = (
-            length
-            if ts_idx == 0
-            else length - self.num_unique_requests_cumsum[ts_idx - 1]
-        )
-        self.ds.load_query_samples(self.run_order[ts_idx][:delta_length])
-        self.init_sut()
-        self.last_loaded = time.time()
-
-    def unload_query_samples(self, sample_list: List[int]) -> None:
-        """
-        Unload query samples from memory.
-
-        Args:
-            sample_list: List of sample identifiers to unload.
-        """
-        self.ds.unload_query_samples(sample_list)
-
-    def get_samples(self, id_list: List[int]) -> List[Samples]:
-        """
-        Get samples for a batch of queries, handling timestamp boundaries.
-
-        Args:
-            id_list: List of query identifiers.
-
-        Returns:
-            List of Samples objects, potentially spanning multiple timestamps.
-        """
-        batch_size: int = len(id_list)
-        with self._lock:
-            curr_ts_idx: int = self.ts_idx
-            curr_ts_unique_requests: int = self.num_unique_requests[curr_ts_idx]
-            curr_ts_queries: int = curr_ts_unique_requests * self.num_repeats
-            if curr_ts_idx == self.inference_ts - 1:
-                curr_ts_queries += self.remaining_queries
-            begin_query_idx: int = self.ts_processed_cnt
-            end_query_idx: int = min(begin_query_idx + batch_size, curr_ts_queries)
-            begin_request_idx: int = begin_query_idx % curr_ts_unique_requests
-            end_request_idx: int = end_query_idx % curr_ts_unique_requests
-            if begin_query_idx + batch_size >= curr_ts_queries:
-                self.ts_idx += 1
-                self.ts_processed_cnt = begin_query_idx + batch_size - curr_ts_queries
-            else:
-                self.ts_processed_cnt = begin_query_idx + batch_size
-        # requests of current ts
-        outputs: List[Samples] = []
-        if end_request_idx > begin_request_idx:
-            output: Samples = self.ds.get_samples_with_ts(
-                self.run_order[curr_ts_idx][begin_request_idx:end_request_idx],
-                curr_ts_idx + self.start_ts,
-            )
-            outputs.append(output)
-        else:
-            if begin_request_idx < curr_ts_unique_requests:
-                output: Samples = self.ds.get_samples_with_ts(
-                    self.run_order[curr_ts_idx][begin_request_idx:],
-                    curr_ts_idx + self.start_ts,
-                )
-                outputs.append(output)
-            if end_request_idx > 0:
-                output = self.ds.get_samples_with_ts(
-                    self.run_order[curr_ts_idx][0:end_request_idx],
-                    curr_ts_idx + self.start_ts,
-                )
-                outputs.append(output)
-        # requests of next ts
-        if begin_query_idx + batch_size > curr_ts_queries:
-            output: Samples = self.ds.get_samples_with_ts(
-                self.run_order[curr_ts_idx + 1][
-                    : begin_query_idx + batch_size - curr_ts_queries
-                ],
-                curr_ts_idx + 1 + self.start_ts,
-            )
-            outputs.append(output)
-        return outputs
-
-    def get_item_count(self) -> int:
-        """
-        Get total number of items in the dataset.
-
-        Returns:
-            Total request count across all timestamps.
-        """
-        return self.total_requests
-
-
-@gin.configurable
-def run(
-    dataset: str = "sampled-streaming-100b",
-    model_path: str = "",
-    scenario_name: str = "Server",
-    batchsize: int = 16,
-    output_trace: bool = False,
-    data_producer_threads: int = 4,
-    compute_eval: bool = False,
-    find_peak_performance: bool = False,
-    dataset_path_prefix: str = "",
-    train_split_percentage: float = 0.75,
-    warmup_ratio: float = 0.1,
-    target_qps: Optional[int] = None,
-    num_queries: Optional[int] = None,
-    numpy_rand_seed: int = 123,
-    sparse_quant: bool = False,
-    dataset_percentage: float = 1.0,
-) -> None:
-    """
-    Execute the MLPerf DLRMv3 inference benchmark.
-
-    Sets up the model, dataset, and LoadGen infrastructure, then runs
-    warmup and official benchmark phases.
-
-    Args:
-        dataset: Dataset identifier to use.
-        model_path: Path to model checkpoint directory.
-        scenario_name: MLPerf scenario ('Server' or 'Offline').
-        batchsize: Batch size for inference.
-        output_trace: Whether to output profiling traces.
-        data_producer_threads: Number of data loading threads.
-        compute_eval: Whether to compute accuracy metrics.
-        find_peak_performance: Whether to run peak performance finding mode.
-        dataset_path_prefix: Prefix path for dataset files.
-        warmup_ratio: Fraction of data to use for warmup.
-        target_qps: Target queries per second.
-        num_queries: Number of queries to run (None for automatic).
-        numpy_rand_seed: Random seed for reproducibility.
-        sparse_quant: Whether to quantize sparse embeddings.
-        dataset_percentage: Fraction of dataset to use.
-    """
-    set_dev_mode(False)
-    if scenario_name not in SCENARIO_MAP:
-        raise NotImplementedError("valid scanarios:" + str(list(SCENARIO_MAP.keys())))
-    scenario = SCENARIO_MAP[scenario_name]
-    np.random.seed(numpy_rand_seed)
-    random.seed(numpy_rand_seed)
-
-    hstu_config = get_hstu_configs(dataset)
-    hstu_config.max_num_candidates = hstu_config.max_num_candidates_inference
-    table_config = get_embedding_table_config(dataset)
-    set_is_inference(is_inference=not compute_eval)
-
-    user_conf = os.path.abspath(USER_CONF)
-    if not os.path.exists(user_conf):
-        logger.error("{} not found".format(user_conf))
-        sys.exit(1)
-
-    settings = lg.TestSettings()
-    settings.FromConfig(user_conf, model_path, scenario_name)
-    settings.scenario = scenario
-    settings.mode = lg.TestMode.PerformanceOnly
-    if compute_eval:
-        settings.mode = lg.TestMode.AccuracyOnly
-    if find_peak_performance:
-        settings.mode = lg.TestMode.FindPeakPerformance
-    if target_qps:
-        settings.server_target_qps = float(target_qps)
-        settings.offline_expected_qps = float(target_qps)
-
-    model_family = HSTUModelFamily(
-        hstu_config=hstu_config,
-        table_config=table_config,
-        sparse_quant=sparse_quant,
-        output_trace=output_trace,
-        compute_eval=compute_eval,
-    )
-    is_streaming: bool = "streaming" in dataset
-    dataset, kwargs = get_dataset(dataset, dataset_path_prefix)
-
-    ds: Dataset = dataset(
-        hstu_config=hstu_config,
-        embedding_config=table_config,
-        is_inference=not compute_eval,
-        **kwargs,
-    )
-    if is_streaming:
-        ds = StreamingQuerySampler(  # pyre-ignore
-            ds=ds,  # pyre-ignore [6]
-            dataset_percentage=dataset_percentage,
-            input_queries=num_queries,
-            compute_eval=compute_eval,
-            scenario_name=scenario_name,
-            offline_target_qps=settings.offline_expected_qps,
-            target_duration=settings.min_duration_ms,
-        )
-    model_family.load(model_path)
-
-    # warmup
-    for autotune_bs in range(batchsize, 0, -1):
-        logger.warning(f"Autotune for batch size {autotune_bs}")
-        warmup_ids = list(range(autotune_bs))
-        ds.load_query_samples(warmup_ids)
-        for _ in range(4 * int(os.environ.get("WORLD_SIZE", 1))):
-            if is_streaming:
-                ds.init_sut()  # pyre-ignore [16]
-            sample: Union[Samples, List[Samples]] = ds.get_samples(warmup_ids)
-            if isinstance(sample, Samples):
-                model_family.predict(sample)
-            else:
-                for s in sample:
-                    model_family.predict(s)
-        ds.unload_query_samples(None)
-    for h in logger.handlers:
-        h.flush()
-    logger.info("Model forward warmup done")
-
-    count = int(
-        ds.get_item_count() * dataset_percentage
-        if not is_streaming
-        else ds.get_item_count()
-    )
-    train_size: int = round(train_split_percentage * count) if not is_streaming else 0
-    if compute_eval:
-        count = count - train_size
-
-    runner: Runner = Runner(
-        model_family,
-        ds,
-        data_producer_threads=data_producer_threads,
-        batchsize=batchsize,
-        compute_eval=compute_eval,
-        num_queries=count,
-    )
-
-    def issue_queries(query_samples) -> None:  # pyre-ignore [2]
-        if compute_eval:
-            for sample in query_samples:
-                sample.index = sample.index + train_size
-        runner.enqueue(query_samples, time.time())
-
-    def load_query_samples(query_ids: List[int]) -> None:
-        if compute_eval:
-            query_ids = [q + train_size for q in query_ids]
-        ds.load_query_samples(query_ids)
-
-    def flush_queries() -> None:
-        pass
-
-    if scenario == lg.TestScenario.Server:
-        # inference benchmark warmup
-        if is_streaming:
-            ds.init_sut()
-            warmup_count: int = sum(
-                ds.get_num_unique_requests(  # pyre-ignore [16]
-                    warmup_ratio=warmup_ratio
-                )
-            )
-        else:
-            warmup_count: int = int(count * warmup_ratio)
-        runner.reset_states(num_queries=warmup_count)
-        final_results = {
-            "runtime": model_family.name(),
-            "version": model_family.version(),
-            "time": int(time.time()),
-            "scenario": str(scenario),
-        }
-        settings.min_query_count = warmup_count
-        settings.max_query_count = warmup_count
-        sut = lg.ConstructSUT(issue_queries, flush_queries)
-        qsl = lg.ConstructQSL(
-            warmup_count,
-            warmup_count,
-            load_query_samples,
-            ds.unload_query_samples,
-        )
-        with profiler_or_nullcontext(enabled=output_trace, with_stack=False):
-            logger.info(f"starting warmup {scenario} with {warmup_count} queries")
-            lg.StartTest(sut, qsl, settings)
-            lg.DestroyQSL(qsl)
-            lg.DestroySUT(sut)
-
-    # official run
-    if is_streaming:
-        ds.init_sut()
-    final_results = {
-        "runtime": model_family.name(),
-        "version": model_family.version(),
-        "time": int(time.time()),
-        "scenario": str(scenario),
-    }
-    query_size: int = get_num_queries(
-        input_size=num_queries,
-        one_pass_size=count,
-        scenario_name=scenario_name,
-        offline_target_qps=settings.offline_expected_qps,
-        target_duration=settings.min_duration_ms,
-    )
-    settings.min_query_count = query_size
-    settings.max_query_count = query_size
-    runner.reset_states(num_queries=query_size if not compute_eval else count)
-    sut = lg.ConstructSUT(issue_queries, flush_queries)
-    qsl = lg.ConstructQSL(
-        count,
-        count,
-        load_query_samples,
-        ds.unload_query_samples,
-    )
-    with profiler_or_nullcontext(enabled=output_trace, with_stack=False):
-        logger.info(
-            f"starting {scenario} with {query_size} queries and {query_size // count} repeats"
-        )
-        lg.StartTest(sut, qsl, settings)
-        runner.finish()
-        final_results["took"] = time.time() - ds.last_loaded
-        lg.DestroyQSL(qsl)
-        lg.DestroySUT(sut)
-
-    add_results(
-        final_results,
-        runner.result_timing,
-        runner.result_batches,
-    )
-    # If multiple subprocesses are running the model send a signal to stop them
-    if int(os.environ.get("WORLD_SIZE", 1)) > 1:
-        model_family.predict(None)
-
-
-def main() -> None:
-    set_verbose_level(1)
-    args = get_args()
-    logger.info(args)
-    gin_path = f"{os.path.dirname(__file__)}/gin/{SUPPORTED_CONFIGS[args.dataset]}"
-    gin.parse_config_file(gin_path)
-    run(dataset=args.dataset)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf b/recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf
deleted file mode 100644
index a2b4f6fff..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/mlperf.conf
+++ /dev/null
@@ -1,98 +0,0 @@
-# The format of this config file is 'key = value'.
-# The key has the format 'model.scenario.key'. Value is mostly int64_t.
-# Model maybe '*' as wildcard. In that case the value applies to all models.
-# All times are in milli seconds
-
-# Set performance_sample_count for each model.
-# User can optionally set this to higher values in user.conf.
-resnet50.*.performance_sample_count_override = 1024
-ssd-mobilenet.*.performance_sample_count_override = 256
-retinanet.*.performance_sample_count_override = 64
-bert.*.performance_sample_count_override = 10833
-dlrm.*.performance_sample_count_override = 204800
-dlrm-v2.*.performance_sample_count_override = 204800
-rnnt.*.performance_sample_count_override = 2513
-gptj.*.performance_sample_count_override = 13368
-llama2-70b.*.performance_sample_count_override = 24576
-stable-diffusion-xl.*.performance_sample_count_override = 5000
-# set to 0 to let entire sample set to be performance sample
-3d-unet.*.performance_sample_count_override = 0
-
-# Set seeds. The seeds will be distributed two weeks before the submission.
-*.*.qsl_rng_seed = 3066443479025735752
-*.*.sample_index_rng_seed = 10688027786191513374
-*.*.schedule_rng_seed = 14962580496156340209
-# Set seeds for TEST_05. The seeds will be distributed two weeks before the submission.
-*.*.test05_qsl_rng_seed = 16799458546791641818
-*.*.test05_sample_index_rng_seed = 5453809927556429288
-*.*.test05_schedule_rng_seed = 5435552105434836064
-
-
-*.SingleStream.target_latency_percentile = 90
-*.SingleStream.min_duration = 600000
-
-*.MultiStream.target_latency_percentile = 99
-*.MultiStream.samples_per_query = 8
-*.MultiStream.min_duration = 600000
-*.MultiStream.min_query_count = 662
-retinanet.MultiStream.target_latency = 528
-
-# 3D-UNet uses equal issue mode because it has non-uniform inputs
-3d-unet.*.sample_concatenate_permutation = 1
-
-# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
-gptj.*.sample_concatenate_permutation = 1
-llama2-70b.*.sample_concatenate_permutation = 1
-mixtral-8x7b.*.sample_concatenate_permutation = 1
-
-*.Server.target_latency = 10
-*.Server.target_latency_percentile = 99
-*.Server.target_duration = 0
-*.Server.min_duration = 600000
-resnet50.Server.target_latency = 15
-retinanet.Server.target_latency = 100
-bert.Server.target_latency = 130
-dlrm.Server.target_latency = 60
-dlrm-v2.Server.target_latency = 60
-rnnt.Server.target_latency = 1000
-gptj.Server.target_latency = 20000
-stable-diffusion-xl.Server.target_latency = 20000
-# Llama2-70b benchmarks measures token latencies
-llama2-70b.*.use_token_latencies = 1
-mixtral-8x7b.*.use_token_latencies = 1
-# gptj benchmark infers token latencies
-gptj.*.infer_token_latencies = 1
-gptj.*.token_latency_scaling_factor = 69
-# Only ttft and tpot are tracked for the llama2-70b & mixtral-8x7B benchmark therefore target_latency = 0
-llama2-70b.Server.target_latency = 0
-llama2-70b.Server.ttft_latency = 2000
-llama2-70b.Server.tpot_latency = 200
-
-mixtral-8x7b.Server.target_latency = 0
-mixtral-8x7b.Server.ttft_latency = 2000
-mixtral-8x7b.Server.tpot_latency = 200
-
-*.Offline.target_latency_percentile = 90
-*.Offline.min_duration = 600000
-
-# In Offline scenario, we always have one query. But LoadGen maps this to
-# min_sample_count internally in Offline scenario. If the dataset size is larger
-# than 24576 we limit the min_query_count to 24576 and otherwise we use
-# the dataset size as the limit
-
-resnet50.Offline.min_query_count = 24576
-retinanet.Offline.min_query_count = 24576
-dlrm-v2.Offline.min_query_count = 24576
-bert.Offline.min_query_count = 10833
-gptj.Offline.min_query_count = 13368
-rnnt.Offline.min_query_count = 2513
-3d-unet.Offline.min_query_count = 43
-stable-diffusion-xl.Offline.min_query_count = 5000
-llama2-70b.Offline.min_query_count = 24576
-mixtral-8x7b.Offline.min_query_count = 15000
-
-# These fields should be defined and overridden by user.conf.
-*.SingleStream.target_latency = 10
-*.MultiStream.target_latency = 80
-*.Server.target_qps = 1.0
-*.Offline.target_qps = 1.0
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py
deleted file mode 100644
index 1c8bcd237..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/model_family.py
+++ /dev/null
@@ -1,705 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-"""
-model_family for dlrm_v3.
-"""
-
-import copy
-import functools
-import logging
-import os
-import time
-import uuid
-from threading import Event
-from typing import Dict, List, Optional, Tuple, Union
-
-import torch
-import torch.multiprocessing as mp
-import torchrec
-from generative_recommenders.dlrm_v3.checkpoint import (
-    load_nonsparse_checkpoint,
-    load_sparse_checkpoint,
-)
-from generative_recommenders.dlrm_v3.configs import HASH_SIZE_1B
-from generative_recommenders.dlrm_v3.datasets.dataset import Samples
-from generative_recommenders.dlrm_v3.inference.inference_modules import (
-    get_hstu_model,
-    HSTUSparseInferenceModule,
-    move_sparse_output_to_device,
-    set_is_inference,
-)
-from generative_recommenders.dlrm_v3.utils import Profiler
-from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig, SequenceEmbedding
-from pyre_extensions import none_throws
-from torch import quantization as quant
-from torchrec.distributed.quant_embedding import QuantEmbeddingCollection
-from torchrec.modules.embedding_configs import EmbeddingConfig, QuantConfig
-from torchrec.sparse.jagged_tensor import JaggedTensor, KeyedJaggedTensor
-from torchrec.sparse.tensor_dict import maybe_td_to_kjt
-from torchrec.test_utils import get_free_port
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-class HSTUModelFamily:
-    """
-    High-level interface for the HSTU model family.
-
-    Manages both sparse (embedding) and dense (transformer) components of the
-    HSTU model, supporting distributed inference across multiple GPUs.
-
-    Args:
-        hstu_config: Configuration object for the HSTU model.
-        table_config: Dictionary of embedding table configurations.
-        output_trace: Whether to enable profiling trace output.
-        sparse_quant: Whether to quantize sparse embeddings.
-        compute_eval: Whether to compute evaluation metrics (includes labels).
-    """
-
-    def __init__(
-        self,
-        hstu_config: DlrmHSTUConfig,
-        table_config: Dict[str, EmbeddingConfig],
-        output_trace: bool = False,
-        sparse_quant: bool = False,
-        compute_eval: bool = False,
-    ) -> None:
-        self.hstu_config = hstu_config
-        self.table_config = table_config
-        self.sparse: ModelFamilySparseDist = ModelFamilySparseDist(
-            hstu_config=hstu_config,
-            table_config=table_config,
-            quant=sparse_quant,
-        )
-
-        assert torch.cuda.is_available(), "CUDA is required for this benchmark."
-        ngpus = torch.cuda.device_count()
-        self.world_size = int(os.environ.get("WORLD_SIZE", str(ngpus)))
-        logger.warning(f"Using {self.world_size} GPU(s)...")
-        dense_model_family_clazz = (
-            ModelFamilyDenseDist
-            if self.world_size > 1
-            else ModelFamilyDenseSingleWorker
-        )
-        self.dense: Union[ModelFamilyDenseDist, ModelFamilyDenseSingleWorker] = (
-            dense_model_family_clazz(
-                hstu_config=hstu_config,
-                table_config=table_config,
-                output_trace=output_trace,
-                compute_eval=compute_eval,
-            )
-        )
-
-    def version(self) -> str:
-        """Return the PyTorch version string."""
-        return torch.__version__
-
-    def name(self) -> str:
-        """Return the model family name identifier."""
-        return "model-family-hstu"
-
-    def load(self, model_path: str) -> None:
-        """
-        Load model checkpoints from disk.
-
-        Args:
-            model_path: Base path to the model checkpoint directory.
-        """
-        self.sparse.load(model_path=model_path)
-        self.dense.load(model_path=model_path)
-
-    def predict(
-        self, samples: Optional[Samples]
-    ) -> Optional[
-        Tuple[
-            torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], float, float
-        ]
-    ]:
-        """
-        Run inference on a batch of samples.
-
-        Processes samples through sparse embeddings, then dense forward pass.
-
-        Args:
-            samples: Input samples containing features. If None, signals shutdown.
-
-        Returns:
-            Tuple of (predictions, labels, weights, sparse_time, dense_time) or None.
-        """
-        with torch.no_grad():
-            if samples is None:
-                self.dense.predict(None, None, 0, None, 0, None)
-                return None
-            (
-                seq_embeddings,
-                payload_features,
-                max_uih_len,
-                uih_seq_lengths,
-                max_num_candidates,
-                num_candidates,
-                dt_sparse,
-            ) = self.sparse.predict(samples)
-            out = self.dense.predict(
-                seq_embeddings,
-                payload_features,
-                max_uih_len,
-                uih_seq_lengths,
-                max_num_candidates,
-                num_candidates,
-            )
-            (  # pyre-ignore [23]
-                mt_target_preds,
-                mt_target_labels,
-                mt_target_weights,
-                dt_dense,
-            ) = out
-            return (
-                mt_target_preds,
-                mt_target_labels,
-                mt_target_weights,
-                dt_sparse,
-                dt_dense,
-            )
-
-
-def ec_patched_forward_wo_embedding_copy(
-    ec_module: torchrec.EmbeddingCollection,
-    features: KeyedJaggedTensor,  # can also take TensorDict as input
-) -> Dict[str, JaggedTensor]:
-    """
-    Run the EmbeddingBagCollection forward pass. This method takes in a `KeyedJaggedTensor`
-    and returns a `Dict[str, JaggedTensor]`, which is the result of the individual embeddings for each feature.
-
-    Args:
-        features (KeyedJaggedTensor): KJT of form [F X B X L].
-
-    Returns:
-        Dict[str, JaggedTensor]
-    """
-    features = maybe_td_to_kjt(features, None)
-    feature_embeddings: Dict[str, JaggedTensor] = {}
-    jt_dict: Dict[str, JaggedTensor] = features.to_dict()
-    for i, emb_module in enumerate(ec_module.embeddings.values()):
-        feature_names = ec_module._feature_names[i]
-        embedding_names = ec_module._embedding_names_by_table[i]
-        for j, embedding_name in enumerate(embedding_names):
-            feature_name = feature_names[j]
-            f = jt_dict[feature_name]
-            indices = torch.clamp(f.values(), min=0, max=HASH_SIZE_1B - 1)
-            lookup = emb_module(
-                input=indices
-            )  # remove the dtype cast at https://github.com/meta-pytorch/torchrec/blob/0a2cebd5472a7edc5072b3c912ad8aaa4179b9d9/torchrec/modules/embedding_modules.py#L486
-            feature_embeddings[embedding_name] = JaggedTensor(
-                values=lookup,
-                lengths=f.lengths(),
-                weights=f.values() if ec_module._need_indices else None,
-            )
-    return feature_embeddings
-
-
-class ModelFamilySparseDist:
-    """
-    Sparse Arch module manager.
-
-    Handles loading and inference of sparse embedding lookups, optionally
-    with quantization for memory efficiency.
-
-    Args:
-        hstu_config: HSTU model configuration.
-        table_config: Embedding table configurations.
-        quant: Whether to apply dynamic quantization to embeddings.
-    """
-
-    def __init__(
-        self,
-        hstu_config: DlrmHSTUConfig,
-        table_config: Dict[str, EmbeddingConfig],
-        quant: bool = False,
-    ) -> None:
-        super(ModelFamilySparseDist, self).__init__()
-        self.hstu_config = hstu_config
-        self.table_config = table_config
-        self.module: Optional[torch.nn.Module] = None
-        self.quant: bool = quant
-
-    def load(self, model_path: str) -> None:
-        """
-        Load sparse model checkpoint and optionally apply quantization.
-
-        Args:
-            model_path: Path to the model checkpoint directory.
-        """
-        logger.warning(f"Loading sparse module from {model_path}")
-
-        sparse_arch: HSTUSparseInferenceModule = HSTUSparseInferenceModule(
-            table_config=self.table_config,
-            hstu_config=self.hstu_config,
-        )
-        load_sparse_checkpoint(model=sparse_arch._hstu_model, path=model_path)
-        sparse_arch.eval()
-        if self.quant:
-            self.module = quant.quantize_dynamic(
-                sparse_arch,
-                qconfig_spec={
-                    torchrec.EmbeddingCollection: QuantConfig(
-                        activation=quant.PlaceholderObserver.with_args(
-                            dtype=torch.float
-                        ),
-                        weight=quant.PlaceholderObserver.with_args(dtype=torch.int8),
-                    ),
-                },
-                mapping={
-                    torchrec.EmbeddingCollection: QuantEmbeddingCollection,
-                },
-                inplace=False,
-            )
-        else:
-            sparse_arch._hstu_model._embedding_collection.forward = (  # pyre-ignore[8]
-                functools.partial(
-                    ec_patched_forward_wo_embedding_copy,
-                    sparse_arch._hstu_model._embedding_collection,
-                )
-            )
-            self.module = sparse_arch
-        logger.warning(f"sparse module is {self.module}")
-
-    def predict(
-        self, samples: Samples
-    ) -> Tuple[
-        Dict[str, SequenceEmbedding],
-        Dict[str, torch.Tensor],
-        int,
-        torch.Tensor,
-        int,
-        torch.Tensor,
-        float,
-    ]:
-        """
-        Run sparse forward pass (embedding lookups).
-
-        Args:
-            samples: Input samples with feature tensors.
-
-        Returns:
-            Tuple of (seq_embeddings, payload_features, max_uih_len, uih_seq_lengths,
-            max_num_candidates, num_candidates, elapsed_time).
-        """
-        with torch.profiler.record_function("sparse forward"):
-            module: torch.nn.Module = none_throws(self.module)
-            assert self.module is not None
-            uih_features = samples.uih_features_kjt
-            candidates_features = samples.candidates_features_kjt
-            t0: float = time.time()
-            (
-                seq_embeddings,
-                payload_features,
-                max_uih_len,
-                uih_seq_lengths,
-                max_num_candidates,
-                num_candidates,
-            ) = module(
-                uih_features=uih_features,
-                candidates_features=candidates_features,
-            )
-            dt_sparse: float = time.time() - t0
-            return (
-                seq_embeddings,
-                payload_features,
-                max_uih_len,
-                uih_seq_lengths,
-                max_num_candidates,
-                num_candidates,
-                dt_sparse,
-            )
-
-
-class ModelFamilyDenseDist:
-    """
-    Distributed dense module manager for multi-GPU inference.
-
-    Spawns worker processes for each GPU to run dense forward passes in parallel,
-    with samples distributed via inter-process queues.
-
-    Args:
-        hstu_config: HSTU model configuration.
-        table_config: Embedding table configurations.
-        output_trace: Whether to enable profiling traces.
-        compute_eval: Whether to compute evaluation metrics.
-    """
-
-    def __init__(
-        self,
-        hstu_config: DlrmHSTUConfig,
-        table_config: Dict[str, EmbeddingConfig],
-        output_trace: bool = False,
-        compute_eval: bool = False,
-    ) -> None:
-        super(ModelFamilyDenseDist, self).__init__()
-        self.hstu_config = hstu_config
-        self.table_config = table_config
-        self.output_trace = output_trace
-        self.compute_eval = compute_eval
-
-        ngpus = torch.cuda.device_count()
-        self.world_size = int(os.environ.get("WORLD_SIZE", str(ngpus)))
-        self.rank = 0
-        os.environ["MASTER_ADDR"] = "localhost"
-        os.environ["MASTER_PORT"] = str(get_free_port())
-        self.dist_backend = "nccl"
-
-        ctx = mp.get_context("spawn")
-        self.samples_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)]
-        self.result_q: List[mp.Queue] = [ctx.Queue() for _ in range(self.world_size)]
-
-    def load(self, model_path: str) -> None:
-        """
-        Load dense model and spawn worker processes for distributed inference.
-
-        Args:
-            model_path: Path to the model checkpoint directory.
-        """
-        logger.warning(f"Loading dense module from {model_path}")
-
-        ctx = mp.get_context("spawn")
-        processes = []
-        for rank in range(self.world_size):
-            p = ctx.Process(
-                target=self.distributed_setup,
-                args=(
-                    rank,
-                    self.world_size,
-                    model_path,
-                ),
-            )
-            p.start()
-            processes.append(p)
-
-    def distributed_setup(self, rank: int, world_size: int, model_path: str) -> None:
-        """
-        Initialize and run a dense worker process.
-
-        Each worker loads the model, processes samples from its queue, and
-        returns results.
-
-        Args:
-            rank: Process rank (GPU index).
-            world_size: Total number of worker processes.
-            model_path: Path to model checkpoint.
-        """
-        nprocs_per_rank = 16
-        start_core: int = nprocs_per_rank * rank
-        cores: set[int] = set([start_core + i for i in range(nprocs_per_rank)])
-        os.sched_setaffinity(0, cores)
-        set_is_inference(is_inference=not self.compute_eval)
-        model = get_hstu_model(
-            table_config=self.table_config,
-            hstu_config=self.hstu_config,
-            table_device="cpu",
-            max_hash_size=100,
-            is_dense=True,
-        ).to(torch.bfloat16)
-        model.set_training_dtype(torch.bfloat16)
-        device = torch.device(f"cuda:{rank}")
-        torch.cuda.set_device(f"cuda:{rank}")
-        load_nonsparse_checkpoint(
-            model=model, device=device, optimizer=None, path=model_path
-        )
-        model = model.to(device)
-        model.eval()
-        profiler = Profiler(rank) if self.output_trace else None
-
-        with torch.no_grad():
-            while True:
-                item = self.samples_q[rank].get()
-                # If -1 is received terminate all subprocesses
-                if item == -1:
-                    break
-                if self.output_trace:
-                    assert profiler is not None
-                    profiler.step()
-                with torch.profiler.record_function("get_item_from_queue"):
-                    # Copy here to release data in the producer to avoid invalid cuda caching allocator release.
-                    item = copy.deepcopy(item)
-                    (
-                        id,
-                        seq_embeddings,
-                        payload_features,
-                        max_uih_len,
-                        uih_seq_lengths,
-                        max_num_candidates,
-                        num_candidates,
-                    ) = item
-                    assert seq_embeddings is not None
-                with torch.profiler.record_function("dense forward"):
-                    (
-                        _,
-                        _,
-                        _,
-                        mt_target_preds,
-                        mt_target_labels,
-                        mt_target_weights,
-                    ) = model.main_forward(
-                        seq_embeddings=seq_embeddings,
-                        payload_features=payload_features,
-                        max_uih_len=max_uih_len,
-                        uih_seq_lengths=uih_seq_lengths,
-                        max_num_candidates=max_num_candidates,
-                        num_candidates=num_candidates,
-                    )
-                    # mt_target_preds = torch.empty(1, 2048 * 20).to(device="cpu")
-                    # mt_target_labels = None
-                    # mt_target_weights = None
-                    assert mt_target_preds is not None
-                    mt_target_preds = mt_target_preds.detach().to(device="cpu")
-                    if mt_target_labels is not None:
-                        mt_target_labels = mt_target_labels.detach().to(device="cpu")
-                    if mt_target_weights is not None:
-                        mt_target_weights = mt_target_weights.detach().to(device="cpu")
-                    self.result_q[rank].put(
-                        (id, mt_target_preds, mt_target_labels, mt_target_weights)
-                    )
-
-    def capture_output(
-        self, id: uuid.UUID, rank: int
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
-        """
-        Retrieve inference results from a worker process.
-
-        Args:
-            id: Unique identifier for the request.
-            rank: Worker rank to retrieve from.
-
-        Returns:
-            Tuple of (predictions, labels, weights).
-        """
-        while True:
-            recv_id, preds, labels, weights = self.result_q[rank].get()
-            assert recv_id == id
-            return preds, labels, weights
-
-    def get_rank(self) -> int:
-        """
-        Get the next worker rank for load balancing.
-
-        Returns:
-            Rank index, cycling through available workers.
-        """
-        rank = self.rank
-        self.rank = (self.rank + 1) % self.world_size
-        return rank
-
-    def predict(
-        self,
-        seq_embeddings: Optional[Dict[str, SequenceEmbedding]],
-        payload_features: Optional[Dict[str, torch.Tensor]],
-        max_uih_len: int,
-        uih_seq_lengths: Optional[torch.Tensor],
-        max_num_candidates: int,
-        num_candidates: Optional[torch.Tensor],
-    ) -> Optional[
-        Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor], float]
-    ]:
-        """
-        Run distributed dense forward pass.
-
-        Dispatches work to a worker process and collects results.
-
-        Args:
-            seq_embeddings: Sequence embeddings from sparse module.
-            payload_features: Additional feature tensors.
-            max_uih_len: Maximum UIH sequence length.
-            uih_seq_lengths: Per-sample UIH lengths.
-            max_num_candidates: Maximum candidates per sample.
-            num_candidates: Per-sample candidate counts.
-
-        Returns:
-            Tuple of (predictions, labels, weights, elapsed_time) or None if shutdown.
-        """
-        id = uuid.uuid4()
-        # If none is received terminate all subprocesses
-        if seq_embeddings is None:
-            for rank in range(self.world_size):
-                self.samples_q[rank].put(-1)
-            return None
-        rank = self.get_rank()
-        device = torch.device(f"cuda:{rank}")
-        assert (
-            payload_features is not None
-            and num_candidates is not None
-            and uih_seq_lengths is not None
-        )
-        t0: float = time.time()
-        seq_embeddings, payload_features, uih_seq_lengths, num_candidates = (
-            move_sparse_output_to_device(
-                seq_embeddings=seq_embeddings,
-                payload_features=payload_features,
-                uih_seq_lengths=uih_seq_lengths,
-                num_candidates=num_candidates,
-                device=device,
-            )
-        )
-        self.samples_q[rank].put(
-            (
-                id,
-                seq_embeddings,
-                payload_features,
-                max_uih_len,
-                uih_seq_lengths,
-                max_num_candidates,
-                num_candidates,
-            )
-        )
-        (mt_target_preds, mt_target_labels, mt_target_weights) = self.capture_output(
-            id, rank
-        )
-        dt_dense = time.time() - t0
-        return (
-            mt_target_preds,
-            mt_target_labels,
-            mt_target_weights,
-            dt_dense,
-        )
-
-
-class ModelFamilyDenseSingleWorker:
-    """
-    Single-worker dense module manager for single-GPU inference.
-
-    Simpler alternative to ModelFamilyDenseDist for single-GPU setups.
-
-    Args:
-        hstu_config: HSTU model configuration.
-        table_config: Embedding table configurations.
-        output_trace: Whether to enable profiling traces.
-        compute_eval: Whether to compute evaluation metrics.
-    """
-
-    def __init__(
-        self,
-        hstu_config: DlrmHSTUConfig,
-        table_config: Dict[str, EmbeddingConfig],
-        output_trace: bool = False,
-        compute_eval: bool = False,
-    ) -> None:
-        self.model: Optional[torch.nn.Module] = None
-        self.hstu_config = hstu_config
-        self.table_config = table_config
-        self.output_trace = output_trace
-        self.device: torch.device = torch.device("cuda:0")
-        torch.cuda.set_device(self.device)
-        self.profiler: Optional[Profiler] = (
-            Profiler(rank=0) if self.output_trace else None
-        )
-
-    def load(self, model_path: str) -> None:
-        """
-        Load dense model for single-GPU inference.
-
-        Args:
-            model_path: Path to the model checkpoint directory.
-        """
-        logger.warning(f"Loading dense module from {model_path}")
-        self.model = (
-            get_hstu_model(
-                table_config=self.table_config,
-                hstu_config=self.hstu_config,
-                table_device="cpu",
-                is_dense=True,
-            )
-            .to(self.device)
-            .to(torch.bfloat16)
-        )
-        self.model.set_training_dtype(torch.bfloat16)
-        load_nonsparse_checkpoint(
-            model=self.model, device=self.device, optimizer=None, path=model_path
-        )
-        assert self.model is not None
-        self.model.eval()
-
-    def predict(
-        self,
-        seq_embeddings: Optional[Dict[str, SequenceEmbedding]],
-        payload_features: Optional[Dict[str, torch.Tensor]],
-        max_uih_len: int,
-        uih_seq_lengths: Optional[torch.Tensor],
-        max_num_candidates: int,
-        num_candidates: Optional[torch.Tensor],
-    ) -> Optional[
-        Tuple[
-            torch.Tensor,
-            Optional[torch.Tensor],
-            Optional[torch.Tensor],
-            float,
-        ]
-    ]:
-        """
-        Run dense forward pass on single GPU.
-
-        Args:
-            seq_embeddings: Sequence embeddings from sparse module.
-            payload_features: Additional feature tensors.
-            max_uih_len: Maximum UIH sequence length.
-            uih_seq_lengths: Per-sample UIH lengths.
-            max_num_candidates: Maximum candidates per sample.
-            num_candidates: Per-sample candidate counts.
-
-        Returns:
-            Tuple of (predictions, labels, weights, elapsed_time).
-        """
-        if self.output_trace:
-            assert self.profiler is not None
-            self.profiler.step()
-        assert (
-            payload_features is not None
-            and uih_seq_lengths is not None
-            and num_candidates is not None
-            and seq_embeddings is not None
-        )
-        t0: float = time.time()
-        with torch.profiler.record_function("dense forward"):
-            seq_embeddings, payload_features, uih_seq_lengths, num_candidates = (
-                move_sparse_output_to_device(
-                    seq_embeddings=seq_embeddings,
-                    payload_features=payload_features,
-                    uih_seq_lengths=uih_seq_lengths,
-                    num_candidates=num_candidates,
-                    device=self.device,
-                )
-            )
-            assert self.model is not None
-            (
-                _,
-                _,
-                _,
-                mt_target_preds,
-                mt_target_labels,
-                mt_target_weights,
-            ) = self.model.main_forward(  # pyre-ignore [29]
-                seq_embeddings=seq_embeddings,
-                payload_features=payload_features,
-                max_uih_len=max_uih_len,
-                uih_seq_lengths=uih_seq_lengths,
-                max_num_candidates=max_num_candidates,
-                num_candidates=num_candidates,
-            )
-            assert mt_target_preds is not None
-            mt_target_preds = mt_target_preds.detach().to(device="cpu")
-            if mt_target_labels is not None:
-                mt_target_labels = mt_target_labels.detach().to(device="cpu")
-            if mt_target_weights is not None:
-                mt_target_weights = mt_target_weights.detach().to(device="cpu")
-            dt_dense: float = time.time() - t0
-            return mt_target_preds, mt_target_labels, mt_target_weights, dt_dense
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py
deleted file mode 100644
index e3ec10415..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/sparse_predict_module.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-
-"""
-TorchScript-friendly wrapper for the HSTU sparse path (CPU embedding lookup).
-
-``HSTUSparseScriptModule`` wraps :class:`HSTUSparseInferenceModule` and
-flattens the ``Dict[str, SequenceEmbedding]`` output into the parallel
-value/length dicts defined in :mod:`ts_types` so the boundary is composed
-entirely of TorchScript-supported types.
-"""
-
-from typing import Dict, Tuple
-
-import torch
-from generative_recommenders.dlrm_v3.inference.inference_modules import (
-    _NoCopyEmbeddingCollection,
-    HSTUSparseInferenceModule,
-)
-from generative_recommenders.dlrm_v3.inference.ts_types import (
-    flatten_seq_embeddings,
-    SeqEmbLengths,
-    SeqEmbValues,
-)
-from generative_recommenders.modules.dlrm_hstu import DlrmHSTUConfig
-from torchrec.modules.embedding_configs import EmbeddingConfig
-from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
-
-
-class HSTUSparseScriptModule(torch.nn.Module):
-    """Script-friendly sparse module.
-
-    ``forward`` returns 5 tensors / dicts (no Python ``int`` scalars):
-
-    1. ``seq_emb_values``   ``Dict[str, Tensor]`` -- jagged embedding values.
-    2. ``seq_emb_lengths``  ``Dict[str, Tensor]`` -- per-feature lengths.
-    3. ``payload_features`` ``Dict[str, Tensor]`` -- side features.
-    4. ``uih_seq_lengths``  ``Tensor[B]``        -- UIH lengths.
-    5. ``num_candidates``   ``Tensor[B]``        -- candidate counts.
-
-    The dense module (or the C++ glue) recovers the ``int`` ``max_uih_len`` /
-    ``max_num_candidates`` values from these tensors via ``.max().item()``.
-    """
-
-    def __init__(
-        self,
-        table_config: Dict[str, EmbeddingConfig],
-        hstu_config: DlrmHSTUConfig,
-        use_no_copy_embedding_collection: bool = True,
-    ) -> None:
-        super().__init__()
-        self._sparse: HSTUSparseInferenceModule = HSTUSparseInferenceModule(
-            table_config=table_config,
-            hstu_config=hstu_config,
-        )
-        if use_no_copy_embedding_collection:
-            # Re-class the existing EmbeddingCollection so TorchScript picks up
-            # the no-copy ``forward`` override (matches the eager-only
-            # ``ec_patched_forward_wo_embedding_copy`` monkey-patch).
-            self._sparse._hstu_model._embedding_collection.__class__ = (
-                _NoCopyEmbeddingCollection
-            )
-
-    def forward(
-        self,
-        uih_features: KeyedJaggedTensor,
-        candidates_features: KeyedJaggedTensor,
-    ) -> Tuple[
-        SeqEmbValues,
-        SeqEmbLengths,
-        Dict[str, torch.Tensor],
-        torch.Tensor,
-        torch.Tensor,
-    ]:
-        (
-            seq_embeddings,
-            payload_features,
-            _max_uih_len,
-            uih_seq_lengths,
-            _max_num_candidates,
-            num_candidates,
-        ) = self._sparse(
-            uih_features=uih_features,
-            candidates_features=candidates_features,
-        )
-        seq_emb_values, seq_emb_lengths = flatten_seq_embeddings(seq_embeddings)
-        return (
-            seq_emb_values,
-            seq_emb_lengths,
-            payload_features,
-            uih_seq_lengths,
-            num_candidates,
-        )
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py
deleted file mode 100644
index 948f10618..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/inference_test.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-
-from generative_recommenders.common import gpu_unavailable
-from generative_recommenders.dlrm_v3.inference.main import main
-from hypothesis import given, settings, strategies as st, Verbosity
-
-
-class DLRMV3InferenceTest(unittest.TestCase):
-    @unittest.skipIf(*gpu_unavailable)
-    @given(
-        world_size=st.sampled_from([1]),
-    )
-    @settings(
-        verbosity=Verbosity.verbose,
-        max_examples=1,
-        deadline=None,
-    )
-    def test_e2e(self, world_size: int) -> None:
-        os.environ["WORLD_SIZE"] = str(world_size)
-        main()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py
deleted file mode 100644
index 34d0388ea..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/tests/test_scripted_parity.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-
-"""
-Numerical parity test: eager HSTU vs traced (sparse + dense) on a synthetic
-batch.
-
-The production deployment path (see ``end_to_end_test.py``) uses
-``torch.jit.trace``, not ``torch.jit.script``, for the HSTU sparse/dense
-wrappers. Tracing records the actual tensor ops executed during a forward
-pass and ignores source-level dispatch logic (HammerKernel enum,
-``is_fx_tracing()``, ``torch.autocast``, IntEnum branches) that scripting
-cannot compile. This unit test mirrors that path.
-
-Tolerances are deliberately loose because the traced path replaces the
-Triton fused kernels with PyTorch fallbacks and skips ``torch.autocast`` in
-the user-forward block; both can perturb low-order bits in bf16.
-"""
-
-import unittest
-from typing import Dict, List, Tuple
-
-import torch
-from generative_recommenders.common import gpu_unavailable, HammerKernel
-from generative_recommenders.dlrm_v3.configs import (
-    get_embedding_table_config,
-    get_hstu_configs,
-)
-from generative_recommenders.dlrm_v3.datasets.dataset import get_random_data
-from generative_recommenders.dlrm_v3.inference.dense_predict_module import (
-    HSTUDenseScriptModule,
-)
-from generative_recommenders.dlrm_v3.inference.sparse_predict_module import (
-    HSTUSparseScriptModule,
-)
-from generative_recommenders.dlrm_v3.inference.ts_types import (
-    SeqEmbLengths,
-    SeqEmbValues,
-)
-from torchrec.sparse.jagged_tensor import KeyedJaggedTensor
-
-
-_DATASET = "kuairand-1k"
-
-
-def _move_dense_inputs(
-    seq_emb_values: Dict[str, torch.Tensor],
-    seq_emb_lengths: Dict[str, torch.Tensor],
-    payload_features: Dict[str, torch.Tensor],
-    uih_seq_lengths: torch.Tensor,
-    num_candidates: torch.Tensor,
-    device: torch.device,
-) -> Tuple[
-    Dict[str, torch.Tensor],
-    Dict[str, torch.Tensor],
-    Dict[str, torch.Tensor],
-    torch.Tensor,
-    torch.Tensor,
-]:
-    """C++-side ``move_sparse_output_to_device`` analog for the test."""
-    return (
-        {k: v.to(device).to(torch.bfloat16) for k, v in seq_emb_values.items()},
-        {k: v.to(device) for k, v in seq_emb_lengths.items()},
-        {k: v.to(device) for k, v in payload_features.items()},
-        uih_seq_lengths.to(device),
-        num_candidates.to(device),
-    )
-
-
-class _SparseTraceShim(torch.nn.Module):
-    """Adapter that takes raw tensors and rebuilds the KJTs inside forward.
-
-    ``torch.jit.trace`` does not accept ``KeyedJaggedTensor`` (or any
-    non-Tensor / non-collection-of-Tensor type) as a top-level forward
-    input, so we make the traced boundary tensor-only and bake the
-    ``List[str]`` of feature keys in as module attributes.
-    """
-
-    def __init__(
-        self,
-        sparse_module: HSTUSparseScriptModule,
-        uih_keys: List[str],
-        candidates_keys: List[str],
-    ) -> None:
-        super().__init__()
-        self._sparse_module: HSTUSparseScriptModule = sparse_module
-        self._uih_keys: List[str] = uih_keys
-        self._candidates_keys: List[str] = candidates_keys
-
-    def forward(
-        self,
-        uih_lengths: torch.Tensor,
-        uih_values: torch.Tensor,
-        candidates_lengths: torch.Tensor,
-        candidates_values: torch.Tensor,
-    ) -> Tuple[
-        SeqEmbValues,
-        SeqEmbLengths,
-        Dict[str, torch.Tensor],
-        torch.Tensor,
-        torch.Tensor,
-    ]:
-        uih_kjt = KeyedJaggedTensor(
-            keys=self._uih_keys,
-            lengths=uih_lengths,
-            values=uih_values,
-        )
-        candidates_kjt = KeyedJaggedTensor(
-            keys=self._candidates_keys,
-            lengths=candidates_lengths,
-            values=candidates_values,
-        )
-        return self._sparse_module(
-            uih_features=uih_kjt, candidates_features=candidates_kjt
-        )
-
-
-class HSTUScriptedParityTest(unittest.TestCase):
-    @unittest.skipIf(*gpu_unavailable)
-    def test_scripted_matches_eager(self) -> None:
-        torch.manual_seed(0)
-        device = torch.device("cuda:0")
-        torch.cuda.set_device(device)
-
-        hstu_config = get_hstu_configs(_DATASET)
-        table_config = get_embedding_table_config(_DATASET)
-
-        # Some embedding tables in kuairand-1k are tiny (e.g.
-        # user_active_degree has num_embeddings=8). Clamp the random value
-        # range so every index stays in range for every table; otherwise the
-        # default value_bound=1000 triggers an out-of-range embedding lookup.
-        min_rows = min(t.num_embeddings for t in table_config.values())
-        value_bound = max(2, min_rows)
-
-        uih_kjt, candidates_kjt = get_random_data(
-            contexual_features=list(
-                hstu_config.contextual_feature_to_max_length.keys()
-            ),
-            hstu_uih_keys=hstu_config.hstu_uih_feature_names,
-            hstu_candidates_keys=hstu_config.hstu_candidate_feature_names,
-            uih_max_seq_len=128,
-            max_num_candidates=hstu_config.max_num_candidates_inference,
-            value_bound=value_bound,
-        )
-
-        sparse_module = HSTUSparseScriptModule(
-            table_config=table_config,
-            hstu_config=hstu_config,
-            use_no_copy_embedding_collection=True,
-        ).eval()
-        dense_module = (
-            HSTUDenseScriptModule(
-                hstu_config=hstu_config,
-                table_config=table_config,
-            )
-            .to(torch.bfloat16)
-            .to(device)
-            .eval()
-        )
-
-        # Pin the HammerKernel to PyTorch on both wrappers. The Triton
-        # kernels use Python-level dispatch (autotune, constexpr arguments)
-        # that interacts badly with torch.jit.trace's recording pass. The
-        # eager reference run uses the same setting so the comparison is
-        # apples-to-apples.
-        sparse_module._sparse._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH)
-        dense_module._hstu_model.set_hammer_kernel(HammerKernel.PYTORCH)
-
-        # === Eager reference path ===
-        with torch.no_grad():
-            sparse_out_e = sparse_module(
-                uih_features=uih_kjt, candidates_features=candidates_kjt
-            )
-            dense_inputs_e = _move_dense_inputs(*sparse_out_e, device=device)
-            preds_eager = dense_module(*dense_inputs_e)
-
-        # === Traced path ===
-        # Sparse is traced via a raw-tensor shim because KJT is not a valid
-        # traced input. Dense is traced directly with the eager sparse
-        # output as the example.
-        sparse_shim = _SparseTraceShim(
-            sparse_module=sparse_module,
-            uih_keys=list(uih_kjt.keys()),
-            candidates_keys=list(candidates_kjt.keys()),
-        )
-        traced_sparse = torch.jit.trace(
-            sparse_shim,
-            example_inputs=(
-                uih_kjt.lengths(),
-                uih_kjt.values(),
-                candidates_kjt.lengths(),
-                candidates_kjt.values(),
-            ),
-            strict=False,
-            check_trace=False,
-        )
-        traced_dense = torch.jit.trace(
-            dense_module,
-            example_inputs=tuple(dense_inputs_e),
-            strict=False,
-            check_trace=False,
-        )
-
-        with torch.no_grad():
-            sparse_out_t = traced_sparse(
-                uih_kjt.lengths(),
-                uih_kjt.values(),
-                candidates_kjt.lengths(),
-                candidates_kjt.values(),
-            )
-            dense_inputs_t = _move_dense_inputs(*sparse_out_t, device=device)
-            preds_traced = traced_dense(*dense_inputs_t)
-
-        torch.testing.assert_close(
-            preds_eager.float(),
-            preds_traced.float(),
-            atol=1e-2,
-            rtol=1e-2,
-        )
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format
deleted file mode 100644
index f08c9c2c8..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/.clang-format
+++ /dev/null
@@ -1,2 +0,0 @@
-BasedOnStyle: Google
-Standard: Cpp11
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt
deleted file mode 100644
index 4fec0e44f..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/CMakeLists.txt
+++ /dev/null
@@ -1,113 +0,0 @@
-cmake_minimum_required(VERSION 3.12)
-
-project(mlperf_loadgen)
-
-# Read the version file
-file(READ "${CMAKE_SOURCE_DIR}/VERSION.txt" VERSION_CONTENTS)
-
-# Extract the major, minor, and patch versions from the VERSION file (assuming "MAJOR.MINOR.PATCH" format)
-string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)" VERSION_MATCH ${VERSION_CONTENTS})
-
-# Set the variables for the major, minor, and patch versions
-set(mlperf_loadgen_VERSION_MAJOR "${CMAKE_MATCH_1}")
-set(mlperf_loadgen_VERSION_MINOR "${CMAKE_MATCH_2}")
-set(mlperf_loadgen_VERSION_PATCH "${CMAKE_MATCH_3}")
-
-# Check if the version format was parsed correctly
-if(NOT DEFINED mlperf_loadgen_VERSION_MAJOR OR NOT DEFINED mlperf_loadgen_VERSION_MINOR OR NOT DEFINED mlperf_loadgen_VERSION_PATCH)
-    message(FATAL_ERROR "Version format in VERSION.txt is incorrect. Expected format: MAJOR.MINOR.PATCH")
-endif()
-
-# Print out the version
-message("mlperf_loadgen v${mlperf_loadgen_VERSION_MAJOR}.${mlperf_loadgen_VERSION_MINOR}.${mlperf_loadgen_VERSION_PATCH}")
-
-# Set build options. NB: CXX_STANDARD is supported since CMake 3.1.
-if (NOT MSVC)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -W -Wall")
-endif()
-# Extra build options can be specified by setting the MLPERF_LOADGEN_CXX_FLAGS variable
-if (MLPERF_LOADGEN_CXX_FLAGS)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${MLPERF_LOADGEN_CXX_FLAGS}")
-endif()
-message(STATUS "Using C++ compiler flags: ${CMAKE_CXX_FLAGS}")
-set(CMAKE_CXX_STANDARD "14")
-message(STATUS "Using C++ standard: ${CMAKE_CXX_STANDARD}")
-message(STATUS "Using static linker flags: ${CMAKE_STATIC_LINKER_FLAGS}")
-message(STATUS "Using shared linker flags: ${CMAKE_SHARED_LINKER_FLAGS}")
-
-# Output directory for libraries.
-set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
-message(STATUS "Using output path: ${LIBRARY_OUTPUT_PATH}")
-
-# Detect Python to use for generating source file with version info.
-# NB: PythonInterp has been deprecated since CMake 3.12
-# but it works with earlier versions of CMake.
-find_package(PythonInterp)
-message(STATUS "Using Python interpreter: ${PYTHON_EXECUTABLE}")
-
-# Specify the source and destination files
-set(CONF_FILE "mlperf.conf")
-set(HEADER_FILE "mlperf_conf.h")
-
-# Read the content of the configuration file
-file(READ ${CONF_FILE} CONF_CONTENTS)
-
-# Escape all double quotes and backslashes
-string(REPLACE "\\" "\\\\" CONF_CONTENTS "${CONF_CONTENTS}")
-string(REPLACE "\"" "\\\"" CONF_CONTENTS "${CONF_CONTENTS}")
-
-# Handle new lines
-string(REPLACE "\n" "\\n\"\n\"" CONF_CONTENTS "${CONF_CONTENTS}")
-
-# Wrap the content in a C++ string declaration
-set(FORMATTED_CONTENT "const char* mlperf_conf =\n\"${CONF_CONTENTS}\";\n")
-
-# Write the formatted content to the header file
-file(WRITE ${HEADER_FILE} "${FORMATTED_CONTENT}")
-
-message(STATUS "Output config:  ${CMAKE_BINARY_DIR}/mlperf_conf.h")
-
-# Generate source file with version info.
-execute_process(COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/version_generator.py ${CMAKE_BINARY_DIR}/version_generated.cc ${CMAKE_CURRENT_SOURCE_DIR})
-
-# Add source files.
-set(SOURCE
-  ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/bindings/c_api.cc
-  ${CMAKE_CURRENT_SOURCE_DIR}/early_stopping.cc  
-  ${CMAKE_CURRENT_SOURCE_DIR}/issue_query_controller.cc
-  ${CMAKE_CURRENT_SOURCE_DIR}/loadgen.cc
-  ${CMAKE_CURRENT_SOURCE_DIR}/logging.cc
-  ${CMAKE_CURRENT_SOURCE_DIR}/logging.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.cc
-  ${CMAKE_CURRENT_SOURCE_DIR}/test_settings_internal.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/utils.cc
-  ${CMAKE_CURRENT_SOURCE_DIR}/utils.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/results.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/results.cc
-  ${CMAKE_CURRENT_SOURCE_DIR}/version.cc
-  ${CMAKE_CURRENT_SOURCE_DIR}/version.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/mlperf_conf.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/VERSION.txt
-  ${CMAKE_BINARY_DIR}/version_generated.cc
-)
-
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-
-add_library(mlperf_loadgen STATIC ${SOURCE})
-target_link_libraries(mlperf_loadgen)
-
-if(WIN32)
-set (LIBS "")
-else()
-set (LIBS pthread)
-endif()
-
-add_executable(benchmark benchmark/repro.cpp)
-target_link_libraries(benchmark PUBLIC mlperf_loadgen ${LIBS})
-
-# Install library and headers.
-install(TARGETS mlperf_loadgen
-	DESTINATION ${CMAKE_INSTALL_PREFIX}/lib)
-install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/
-	DESTINATION ${CMAKE_INSTALL_PREFIX}/include FILES_MATCHING PATTERN "*.h")
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in
deleted file mode 100644
index 152b53111..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/MANIFEST.in
+++ /dev/null
@@ -1,2 +0,0 @@
-include VERSION.txt
-include mlperf.conf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md
deleted file mode 100644
index 212c8a53c..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README.md
+++ /dev/null
@@ -1,223 +0,0 @@
-# Overview {#mainpage}
-
-## Introduction
-
-* The LoadGen is a *reusable* module that *efficiently* and *fairly* measures
-  the performance of inference systems.
-* It generates traffic for scenarios as formulated by a diverse set of experts
-  in the [MLCommons working group](https://mlcommons.org/).
-* The scenarios emulate the workloads seen in mobile devices,
-  autonomous vehicles, robotics, and cloud-based setups.
-* Although the LoadGen is not model or dataset aware, its strength is in its
-  reusability with logic that is.
-
-## Integration Example and Flow
-The following is an diagram of how the LoadGen can be integrated into an
-inference system, resembling how some of the MLPerf reference models are
-implemented.
-<div style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
-<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/loadgen_integration_diagram.svg" width="500px" style="padding: 20px">
-<ol style="padding: 20px">
-<li>Benchmark knows the model, dataset, and preprocessing.</li>
-<li>Benchmark hands dataset sample IDs to LoadGen.</li>
-<li>LoadGen starts generating queries of sample IDs.</li>
-<li>Benchmark creates requests to backend.</li>
-<li>Result is post processed and forwarded to LoadGen.</li>
-<li>LoadGen outputs logs for analysis.<br>
-</ol>
-</div>
-
-## Useful Links
-* [FAQ](README_FAQ.md)
-* [LoadGen Build Instructions](README_BUILD.md)
-* [LoadGen API](loadgen.h)
-* [Test Settings](test_settings.h) -
-  A good description of available scenarios, modes, and knobs.
-* [MLPerf Inference Code](https://github.com/mlcommons/inference) -
-  Includes source for the LoadGen and reference models that use the LoadGen.
-* [MLPerf Inference Rules](https://github.com/mlcommons/inference_policies) -
-  Any mismatch with this is a bug in the LoadGen.
-
-## Scope of the LoadGen's Responsibilities
-
-### In Scope
-* **Provide a reusable** C++ library with python bindings.
-* **Implement** the traffic patterns of the MLPerf Inference scenarios and
-  modes.
-* **Record** all traffic generated and received for later analysis and
-  verification.
-* **Summarize** the results and whether performance constraints were met.
-* **Target high-performance** systems with efficient multi-thread friendly
-  logging utilities.
-* **Generate trust** via a shared, well-tested, and community-hardened
-  code base.
-
-### Out of Scope
-The LoadGen is:
-* **NOT** aware of the ML model it is running against.
-* **NOT** aware of the data formats of the model's inputs and outputs.
-* **NOT** aware of how to score the accuracy of a model's outputs.
-* **NOT** aware of MLPerf rules regarding scenario-specific constraints.
-
-Limitting the scope of the LoadGen in this way keeps it reusable across
-different models and datasets without modification. Using composition and
-dependency injection, the user can define their own model, datasets, and
-metrics.
-
-Additionally, not hardcoding MLPerf-specific test constraints, like test
-duration and performance targets, allows users to use the LoadGen unmodified
-for custom testing and continuous integration purposes.
-
-## Submission Considerations
-
-### Upstream all local modifications
-* As a rule, no local modifications to the LoadGen's C++ library are allowed
-for submission.
-* Please upstream early and often to keep the playing field level.
-
-### Choose your TestSettings carefully!
-* Since the LoadGen is oblivious to the model, it can't enforce the MLPerf
-requirements for submission. *e.g.:* target percentiles and latencies.
-* For verification, the values in TestSettings are logged.
-* To help make sure your settings are spec compliant, use
-TestSettings::FromConfig in conjunction with the relevant config file provided
-with the reference models.
-
-## Responsibilities of a LoadGen User
-
-### Implement the Interfaces
-* Implement the SystemUnderTest and QuerySampleLibrary interfaces and pass
-  them to the StartTest function.
-* Call QuerySampleComplete for every sample received by
-  SystemUnderTest::IssueQuery.
-
-### Assess Accuracy
-* Process the *mlperf_log_accuracy.json* output by the LoadGen to determine
-  the accuracy of your system.
-* For the official models, Python scripts will be provided by the MLPerf model
-  owners for you to do this automatically.
-
-For templates of how to do the above in detail, refer to code for the demos,
-tests, and reference models.
-
-
-## LoadGen over the Network
-
-For reference, on a high level a submission looks like this:
-
-<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
-<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_submission.png" width="300px" style="padding: 20px">
-</div>
-
-The LoadGen implementation is common to all submissions, while the QSL (“Query Sample Library”) and SUT (“System Under Test”) are implemented by submitters. QSL is responsible for loading the data and includes untimed preprocessing.
-
-A submission over the network introduces a new component “QDL” (query dispatch library) that is added to the system as presented in the following diagram:
-
-<div align="center" style="display:flex; flex-flow:row wrap; justify-content: space-evenly;">
-<img src="https://raw.githubusercontent.com/mlcommons/inference/master/loadgen/diagram_network_submission.png" width="300px" style="padding: 20px">
-</div>
-
-QDL is a proxy for a load-balancer, that dispatches queries to SUT over a physical network, receives the responses and passes them back to LoadGen.  It is implemented by the submitter. The interface of the QDL is the same as the API to SUT. 
-
-In scenarios using QDL, data may be compressed in QSL at the choice of the submitter in order to reduce network transmission time. Decompression is part of the timed processing in SUT. A set of approved standard compression schemes will be specified for each benchmark; additional compression schemes must be approved in advance by the Working Group.
-
-All communication between LoadGen/QSL and SUT is via QDL, and all communication between QDL and SUT must pass over a physical network.
-
-QDL implements the protocol to transmit queries over the network and receive responses. It also implements decompression of any response returned by the SUT, where compression of responses is allowed. Performing any part of the timed preprocessing or inference in QDL is specifically disallowed. Currently no batching is allowed in QDL, although this may be revisited in future.
-
-The MLperf over the Network will run in Server mode and Offline mode. All LoadGen modes are expected to work as is with insignificant changes. These include running the test in performance mode, accuracy mode, find peak performance mode and compliance mode. The same applies for power measurements.
-
-### QDL details
-The Query Dispatch Library is implemented by the submitter and interfaces with LoadGen using the same SUT API. All MLPerf Inference SUTs implement the `mlperf::SystemUnderTest` class which is defined in system_under_test.h. The QDL implements `mlperf::QueryDispatchLibrary` class which inherits the `mlperf::SystemUnderTest` class and has the same API and support all existing `mlperf::SystemUnderTest` methods. It has a separate header file query_dispatch_library.h. Using sut with `mlperf::SystemUnderTest` class in LoadGen StartTest is natively upcasting `mlperf::QueryDispatchLibrary` class.
-
-#### QDL Query issue and response over the network
-
-The QDL gets the queries from the LoadGen through 
-```CPP
-void IssueQuery(const std::vector<QuerySample>& samples)
-```
-
-The QDL dispatches the queries to the SUT over the physical media. The exact method and implementation for it are submitter specific and would not be specified at MLCommons. Submitter implementation includes all methods required to serialize the query, load balance, drive it to the Operating system and network interface card and send to the SUT.
-
-The QDL receives the query responses over the network from the SUT. The exact method and implementation for it are submitter specific and would not be specified at MLCommons. The submitter implementation includes all methods required to receive the network data from the Network Interface card, go through the Operating system, deserialize the query response, and provide it back to the LoadGen through query completion by:
-
-```CPP
-struct QuerySampleResponse {
-  ResponseId id;
-  uintptr_t data;
-  size_t size;
-};
-void QuerySamplesComplete(QuerySampleResponse* responses, 
-                          size_t response_count);
-
-```
-
-#### QDL Additional Methods
-
-In addition to that the QDL needs to implement the following methods that are provided by the SUT interface to the LoadGen:
-```CPP
-const std::string& Name();
-```
-The `Name` function returns a known string for over the Network SUTs to identify it as over the network benchmark.
-```CPP
-void FlushQueries();
-```
-
-It is not specified here how the QDL would query and configure the SUT to execute the above methods. The QDL responds to the LoadGen after receiving its own response from the SUT.
-
-### Example
-
-Refer to [LON demo](demos/lon) for a reference example illustrating usage of Loadgen over the network.
-
-## Find Peak Performance Mode
-
-The Find Peak Performance mode can be used to find the optimal queries per second (QPS) for the server scenario.
-
-### Setup
-
-You can setup loadgen to run this mode by setting the `mode` variable in the `test_settings` used to run the test. Using the Python API:
-
-```python
-settings = mlperf_loadgen.TestSettings()
-settings.server_target_qps = 100
-settings.scenario = mlperf_loadgen.TestScenario.Server
-settings.mode = mlperf_loadgen.TestMode.FindPeakPerformance
-...
-
-mlperf_loadgen.StartTest(sut, qsl, settings)
-```
-
-Using the C/C++ API:
-```CPP
-mlperf::TestSettings settings;
-setting.server_target_qps = 100;
-settings.scenario = mlperf::TestScenario::Server;
-settings.mode = mlperf::TestMode::FindPeakPerformance;
-mlperf::LogSettings log_settings;
-/*
-Construct QSL and SUT
-*/
-mlperf::StartTest(&sut, &qsl, settings, log_settings);
-```
-
-**Note:** Make sure you are setting the TestScenario to server and you are providing an initial target QPS.
-
-### Description
-
-The Find Peak Performance mode works by finding a lower and upper boundary for the optimal QPS. Then performing a binary search between the lower and upper bound to find the optimal QPS.
-
-#### Finding lower and upper boundary
-
-LoadGen begins by running performance mode at the specified target QPS. If the test passes, this value is used as the lower bound; otherwise, an error is raised. The algorithm then guesses the upper bound as twice the target QPS. 
-
-Then LoadGen will run performance mode using the upper bound guess. If the test is successful, both the lower bound and upper bound will be doubled. This repeats until the upper bound guess fails the test.
-
-```
-[initial_target_qps, 2*initial_target_qps] -> [2*initial_target_qps, 4*initial_target_qps] -> [4*initial_target_qps, 8*initial_target_qps]...
-```
-
-Finally, the final lower bound and upper bound are set to their current values. This process assures that the lower bound passes the performance mode, but the upper bound doesn’t.
-
-#### Binary Search
-
-Once the lower and upper bounds are set, binary search can be performed over the range `[lower, upper]`` to find the optimal QPS. If a given QPS fails in performance mode, the optimal value lies below it; if it passes, the optimal is higher.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md
deleted file mode 100644
index 499cc360a..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_BUILD.md
+++ /dev/null
@@ -1,47 +0,0 @@
-# Building the LoadGen {#ReadmeBuild}
-
-## Prerequisites
-
-    sudo apt-get install libglib2.0-dev python-pip python3-pip
-    pip2 install absl-py numpy
-    pip3 install absl-py numpy
-
-## Quick Start
-### Installation - Python
-
-    pip install absl-py numpy
-    git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
-    cd mlperf_inference/loadgen
-    CFLAGS="-std=c++14 -O3" python -m pip install .
-
-This will fetch the loadgen source, build and install the loadgen as a python module, and run a simple end-to-end demo.
-
-Alternatively, we provide wheels for several python versions and operating system that can be installed using pip directly.
-
-    pip install mlperf-loadgen
-
-**NOTE:** Take into account that we only update the published wheels after an official release, they may not include the latest changes.
-
-### Testing your Installation
-The following command will run a simple end-to-end demo:
-
-    python mlperf_inference/loadgen/demos/py_demo_single_stream.py
-
-A summary of the test results can be found in the *"mlperf_log_summary.txt"* logfile.
-
-For a timeline visualization of what happened during the test, open the *"mlperf_log_trace.json"* file in Chrome:
-* Type “chrome://tracing” in the address bar, then drag-n-drop the json.
-* This may be useful for SUT performance tuning and understanding + debugging the loadgen.
-
-### Installation - C++
-To build the loadgen as a C++ library, rather than a python module:
-
-    git clone https://github.com/mlcommons/inference.git mlperf_inference
-    cd mlperf_inference
-    mkdir loadgen/build/ && cd loadgen/build/
-    cmake .. && cmake --build .
-    cp libmlperf_loadgen.a ..
-
-## Quick start: Loadgen Over the Network
-
-Refer to [LON demo](demos/lon/README.md) for a basic example.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md
deleted file mode 100644
index ab4e0c75d..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/README_FAQ.md
+++ /dev/null
@@ -1,78 +0,0 @@
-# LoadGen FAQ {#ReadmeFAQ}
-
-## Q: The LoadGen does not match the MLPerf specification. Who is right?
-**A:**
-The MLPerf spec is *always* right.
-Please file a LoadGen bug so it may be resolved.
-
-## Q: How can I file a bug?
-**A:**
-On GitHub: https://github.com/mlcommons/inference/issues/new
-
-## Q: Can I make local modifications to the LoadGen for submission?
-**A:**
-No. To keep the playing field level, please upstream any local
-modificiations you need to make. Ideally upstream such changes behind a runtime
-flag or via an abstract interface the client can implement. This will help
-with testability.
-
-## Q: Where can I find the results of a test?
-**A:**
-By default, the loadgen will output an *mlperf_log_summary.txt* file
-that summarizes the target metrics and constraints of the test, along with
-other stats about the run.
-
-*Note:* LogSettings also has a flag to forward the results to stdout and
-there's an outstanding TODO to make this more programmable.
-
-## Q: The reference implementation for \<*some_model*\> prints out results of its own. Are those for submission?
-**A:**
-They are not. The LoadGen results are the ground truth for submission
-results since they will work even for systems that forgo the python bindings.
-If you notice a bug in the LoadGen's results, please file a bug or submit a
-patch.
-
-## Q: I'm getting linker errors for LoadgenVersion definitions. Where is *version_generated.cc*?
-**A:**
-If you have a custom build setup, make sure you run the *version_generator.py*
-script, which will create the cc file you are looking for. The official build
-files that come with the LoadGen do this for you out of the box.
-
-## Q: What is this *version_generator.py* script?
-**A:**
-The LoadGen records git stats (if available) and the SHA1 of all its
-source files (always) at build time for verification purposes. This is easy
-to circumvent, but try your best to run *version_generator.py* correctly;
-ideally integrated with your build system if you have a custom build.
-The intention is more to help with debugging efforts and detect accidental
-version missmatches than to detect bad actors.
-
-## Q: How do I view the *mlperf_log_trace.json* file?
-**A:**
-This file uses the [Trace Event Format]
-(https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit)
-to record a timeline of all the threads involved.
-You can view the file by typing [chrome://tracing](chrome://tracing) into
-Chrome's address bar and dragging the json file there.
-This file zips well and you can drag the zip file directly into
-[chrome://tracing](chrome://tracing) too.
-Please include zipped traces (and the other logs) when filing bug reports.
-
-## Q: Why is the code littered with so many lambdas? My eyes hurt.
-**A:**
-Lambdas are a convenient and efficient way to ship arbitrary data + deferred
-logic over to the logging thread without much boilerplate.
-Much of the loadgen is built on top of the logging utilities.
-Thus the lambdas. (Sorry about the eyes.)
-
-## Q: What C++ version does the LoadGen target?
-**A:**
-It currently targets and requires C++14. It should compile with recent
-versions of clang, gcc, and msvc.
-
-## Q: What dependencies does the LoadGen code have?
-**A:**
-The C++ code has no external dependencies. The loadgen itself, logging
-utilities, and unit test utilities are built solely on the C++ Standard Library.
-The python bindings, however, do require
-[pybind11](https://github.com/pybind/pybind11).
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt
deleted file mode 100644
index ac14c3dfa..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/VERSION.txt
+++ /dev/null
@@ -1 +0,0 @@
-5.1.1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore
deleted file mode 100644
index e792c8e55..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-loadgen_build
-build
\ No newline at end of file
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md
deleted file mode 100644
index 24e872983..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/README.md
+++ /dev/null
@@ -1,10 +0,0 @@
-Note: please install jemalloc first. See: http://jemalloc.net/
-Command: bash run.sh <target_qps> <0=Basic,1=Queue> <numCompleteThreads> <maxSizeInComplete> <server_coalesce_queries=0or1>
-
-Experiments:
-- On Intel(R) Xeon(R) CPU E5-1650 v4 @ 3.60GHz
-- Basic SUT : 500-600k i/s
-- Basic SUT + jemalloc: 800-900k i/s (`bash run.sh 800000 0`)
-- Queued SUT (2 complete threads) + jemalloc: 1.2-1.3M i/s (`bash run.sh 1200000 1 2 2048`)
-- Queued SUT (2 complete threads) + jemalloc + server_coalesce_queries: 1.4-1.5M is/ (`bash run.sh 1400000 1 2 512 1`)
-- Basic SUT + jemalloc + server_coalesce_queries + 4 IssueQueryThreads: 2.4-2.5M is/ (`bash run.sh 2400000 0 2 512 1 4`)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp
deleted file mode 100644
index 44ff53efa..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/repro.cpp
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
- * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <cassert>
-#include <condition_variable>
-#include <deque>
-#include <iostream>
-#include <map>
-#include <mutex>
-#include <thread>
-#include <vector>
-
-#include "loadgen.h"
-#include "query_sample_library.h"
-#include "system_under_test.h"
-#include "test_settings.h"
-
-class QSL : public mlperf::QuerySampleLibrary {
- public:
-  ~QSL() override{};
-  const std::string& Name() override { return mName; }
-  size_t TotalSampleCount() override { return 1000000; }
-  size_t PerformanceSampleCount() override { return TotalSampleCount(); }
-  void LoadSamplesToRam(const std::vector<mlperf::QuerySampleIndex>&) override {
-  }
-  void UnloadSamplesFromRam(
-      const std::vector<mlperf::QuerySampleIndex>&) override {}
-
- private:
-  std::string mName{"Dummy QSL"};
-};
-
-class BasicSUT : public mlperf::SystemUnderTest {
- public:
-  BasicSUT() {
-    // Start with some large value so that we don't reallocate memory.
-    initResponse(10000);
-  }
-  ~BasicSUT() override {}
-  const std::string& Name() override { return mName; }
-  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
-    size_t n = samples.size();
-    if (n > mResponses.size()) {
-      std::cerr << "Warning: reallocating response buffer in BasicSUT. Maybe "
-                   "you should initResponse with larger value!?"
-                << std::endl;
-      initResponse(samples.size());
-    }
-    for (size_t i = 0; i < n; i++) {
-      mResponses[i].id = samples[i].id;
-    }
-    mlperf::QuerySamplesComplete(mResponses.data(), n);
-  }
-  void FlushQueries() override {}
-
- private:
-  void initResponse(int size) {
-    mResponses.resize(size,
-                      {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
-  }
-  int mBuf{0};
-  std::string mName{"BasicSUT"};
-  std::vector<mlperf::QuerySampleResponse> mResponses;
-};
-
-class QueueSUT : public mlperf::SystemUnderTest {
- public:
-  QueueSUT(int numCompleteThreads, int maxSize) {
-    // Each thread handle at most maxSize at a time.
-    std::cout << "QueueSUT: maxSize = " << maxSize << std::endl;
-    initResponse(numCompleteThreads, maxSize);
-    // Launch complete threads
-    for (int i = 0; i < numCompleteThreads; i++) {
-      mThreads.emplace_back(&QueueSUT::CompleteThread, this, i);
-    }
-  }
-  ~QueueSUT() override {
-    {
-      std::unique_lock<std::mutex> lck(mMtx);
-      mDone = true;
-      mCondVar.notify_all();
-    }
-    for (auto& thread : mThreads) {
-      thread.join();
-    }
-  }
-  const std::string& Name() override { return mName; }
-  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
-    std::unique_lock<std::mutex> lck(mMtx);
-    for (const auto& sample : samples) {
-      mIdQueue.push_back(sample.id);
-    }
-    // Let some worker thread to consume tasks
-    mCondVar.notify_one();
-  }
-  void FlushQueries() override {}
-
- private:
-  void CompleteThread(int threadIdx) {
-    auto& responses = mResponses[threadIdx];
-    size_t maxSize{responses.size()};
-    size_t actualSize{0};
-    while (true) {
-      {
-        std::unique_lock<std::mutex> lck(mMtx);
-        mCondVar.wait(lck, [&]() { return !mIdQueue.empty() || mDone; });
-
-        if (mDone) {
-          break;
-        }
-
-        actualSize = std::min(maxSize, mIdQueue.size());
-        for (size_t i = 0; i < actualSize; i++) {
-          responses[i].id = mIdQueue.front();
-          mIdQueue.pop_front();
-        }
-        mCondVar.notify_one();
-      }
-      mlperf::QuerySamplesComplete(responses.data(), actualSize);
-    }
-  }
-  void initResponse(int numCompleteThreads, int size) {
-    mResponses.resize(numCompleteThreads);
-    for (auto& responses : mResponses) {
-      responses.resize(size,
-                       {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
-    }
-  }
-  int mBuf{0};
-  std::string mName{"QueueSUT"};
-  std::vector<std::vector<mlperf::QuerySampleResponse>> mResponses;
-  std::vector<std::thread> mThreads;
-  std::deque<mlperf::ResponseId> mIdQueue;
-  std::mutex mMtx;
-  std::condition_variable mCondVar;
-  bool mDone{false};
-};
-
-class MultiBasicSUT : public mlperf::SystemUnderTest {
- public:
-  MultiBasicSUT(int numThreads)
-      : mNumThreads(numThreads), mResponses(numThreads) {
-    // Start with some large value so that we don't reallocate memory.
-    initResponse(10000);
-    for (int i = 0; i < mNumThreads; ++i) {
-      mThreads.emplace_back(&MultiBasicSUT::startIssueThread, this, i);
-    }
-  }
-  ~MultiBasicSUT() override {
-    for (auto& thread : mThreads) {
-      thread.join();
-    }
-  }
-  const std::string& Name() override { return mName; }
-  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
-    int thread_idx = mThreadMap[std::this_thread::get_id()];
-    size_t n = samples.size();
-    auto& reponses = mResponses[thread_idx];
-    if (n > reponses.size()) {
-      std::cout
-          << "Warning: reallocating response buffer in MultiBasicSUT. Maybe "
-             "you should initResponse with larger value!?"
-          << std::endl;
-      initResponse(samples.size());
-    }
-    for (size_t i = 0; i < n; i++) {
-      reponses[i].id = samples[i].id;
-    }
-    mlperf::QuerySamplesComplete(reponses.data(), n);
-  }
-  void FlushQueries() override {}
-
- private:
-  void initResponse(int size) {
-    for (auto& responses : mResponses) {
-      responses.resize(size,
-                       {0, reinterpret_cast<uintptr_t>(&mBuf), sizeof(int)});
-    }
-  }
-  void startIssueThread(int thread_idx) {
-    {
-      std::lock_guard<std::mutex> lock(mMtx);
-      mThreadMap[std::this_thread::get_id()] = thread_idx;
-    }
-    mlperf::RegisterIssueQueryThread();
-  }
-  int mBuf{0};
-  int mNumThreads{0};
-  std::string mName{"MultiBasicSUT"};
-  std::vector<std::vector<mlperf::QuerySampleResponse>> mResponses;
-  std::mutex mMtx;
-  std::vector<std::thread> mThreads;
-  std::map<std::thread::id, int> mThreadMap;
-};
-
-int main(int argc, char** argv) {
-  assert(argc >= 2 && "Need to pass in at least one argument: target_qps");
-  int target_qps = std::stoi(argv[1]);
-  std::cout << "target_qps = " << target_qps << std::endl;
-
-  bool useQueue{false};
-  int numCompleteThreads{4};
-  int maxSize{1};
-  bool server_coalesce_queries{false};
-  int num_issue_threads{0};
-  if (argc >= 3) {
-    useQueue = std::stoi(argv[2]) != 0;
-  }
-  if (argc >= 4) {
-    numCompleteThreads = std::stoi(argv[3]);
-  }
-  if (argc >= 5) {
-    maxSize = std::stoi(argv[4]);
-  }
-  if (argc >= 6) {
-    server_coalesce_queries = std::stoi(argv[5]) != 0;
-  }
-  if (argc >= 7) {
-    num_issue_threads = std::stoi(argv[6]);
-  }
-
-  QSL qsl;
-  std::unique_ptr<mlperf::SystemUnderTest> sut;
-
-  // Configure the test settings
-  mlperf::TestSettings testSettings;
-  testSettings.scenario = mlperf::TestScenario::Server;
-  testSettings.mode = mlperf::TestMode::PerformanceOnly;
-  testSettings.server_target_qps = target_qps;
-  testSettings.server_target_latency_ns = 10000000;  // 10ms
-  testSettings.server_target_latency_percentile = 0.99;
-  testSettings.min_duration_ms = 60000;
-  testSettings.min_query_count = 270000;
-  testSettings.server_coalesce_queries = server_coalesce_queries;
-  std::cout << "testSettings.server_coalesce_queries = "
-            << (server_coalesce_queries ? "True" : "False") << std::endl;
-  testSettings.server_num_issue_query_threads = num_issue_threads;
-  std::cout << "num_issue_threads = " << num_issue_threads << std::endl;
-
-  // Configure the logging settings
-  mlperf::LogSettings logSettings;
-  logSettings.log_output.outdir = "build";
-  logSettings.log_output.prefix = "mlperf_log_";
-  logSettings.log_output.suffix = "";
-  logSettings.log_output.prefix_with_datetime = false;
-  logSettings.log_output.copy_detail_to_stdout = false;
-  logSettings.log_output.copy_summary_to_stdout = true;
-  logSettings.log_mode = mlperf::LoggingMode::AsyncPoll;
-  logSettings.log_mode_async_poll_interval_ms = 1000;
-  logSettings.enable_trace = false;
-
-  // Choose SUT
-  if (num_issue_threads == 0) {
-    if (useQueue) {
-      std::cout << "Using QueueSUT with " << numCompleteThreads
-                << " complete threads" << std::endl;
-      sut.reset(new QueueSUT(numCompleteThreads, maxSize));
-    } else {
-      std::cout << "Using BasicSUT" << std::endl;
-      sut.reset(new BasicSUT());
-    }
-  } else {
-    if (useQueue) {
-      std::cout << "Using MultiQueueSUT with " << numCompleteThreads
-                << " complete threads" << std::endl;
-      std::cerr << "!!!! MultiQueueSUT is NOT implemented yet !!!!"
-                << std::endl;
-      return 1;
-      // sut.reset(new MultiQueueSUT(num_issue_threads, numCompleteThreads,
-      // maxSize));
-    } else {
-      std::cout << "Using MultiBasicSUT" << std::endl;
-      sut.reset(new MultiBasicSUT(num_issue_threads));
-    }
-  }
-
-  // Start test
-  std::cout << "Start test..." << std::endl;
-  mlperf::StartTest(sut.get(), &qsl, testSettings, logSettings);
-  std::cout << "Test done. Clean up SUT..." << std::endl;
-  sut.reset();
-  std::cout << "Done!" << std::endl;
-  return 0;
-}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh
deleted file mode 100644
index 62559c1a8..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/bash
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-echo "Building loadgen..."
-if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi;
-cd loadgen_build && cmake ../.. && make -j && cd ..
-echo "Building test program..."
-if [ ! -e build ]; then mkdir build; fi;
-g++ --std=c++11 -O3 -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \
-LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 build/repro.exe $1 $2 $3 $4 $5 $6
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh
deleted file mode 100644
index ba63727c8..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/benchmark/run_debug.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/bash
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-echo "Building loadgen in Debug mode..."
-if [ ! -e loadgen_build ]; then mkdir loadgen_build; fi;
-cd loadgen_build && cmake -DCMAKE_BUILD_TYPE=Debug ../.. && make -j && cd ..
-echo "Building test program in Debug mode..."
-if [ ! -e build ]; then mkdir build; fi;
-g++ --std=c++11 -O0 -g -I.. -o build/repro.exe repro.cpp -Lloadgen_build -lmlperf_loadgen -lpthread && \
-gdb --args build/repro.exe $1 $2 $3 $4 $5 $6
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc
deleted file mode 100644
index 0248a1c16..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.cc
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "c_api.h"
-
-#include <string>
-
-#include "../loadgen.h"
-#include "../query_sample.h"
-#include "../query_sample_library.h"
-#include "../system_under_test.h"
-#include "../test_settings.h"
-
-namespace mlperf {
-namespace c {
-namespace {
-
-// Forwards SystemUnderTest calls to relevant callbacks.
-class SystemUnderTestTrampoline : public SystemUnderTest {
- public:
-  SystemUnderTestTrampoline(ClientData client_data, std::string name,
-                            IssueQueryCallback issue_cb,
-                            FlushQueriesCallback flush_queries_cb)
-      : client_data_(client_data),
-        name_(std::move(name)),
-        issue_cb_(issue_cb),
-        flush_queries_cb_(flush_queries_cb) {}
-  ~SystemUnderTestTrampoline() override = default;
-
-  const std::string& Name() override { return name_; }
-
-  void IssueQuery(const std::vector<QuerySample>& samples) override {
-    (*issue_cb_)(client_data_, samples.data(), samples.size());
-  }
-
-  void FlushQueries() override { (*flush_queries_cb_)(); }
-
- private:
-  ClientData client_data_;
-  std::string name_;
-  IssueQueryCallback issue_cb_;
-  FlushQueriesCallback flush_queries_cb_;
-};
-
-}  // namespace
-
-void* ConstructSUT(ClientData client_data, const char* name, size_t name_length,
-                   IssueQueryCallback issue_cb,
-                   FlushQueriesCallback flush_queries_cb) {
-  SystemUnderTestTrampoline* sut = new SystemUnderTestTrampoline(
-      client_data, std::string(name, name_length), issue_cb, flush_queries_cb);
-  return reinterpret_cast<void*>(sut);
-}
-
-void DestroySUT(void* sut) {
-  SystemUnderTestTrampoline* sut_cast =
-      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
-  delete sut_cast;
-}
-
-namespace {
-
-// Forwards QuerySampleLibrary calls to relevant callbacks.
-class QuerySampleLibraryTrampoline : public QuerySampleLibrary {
- public:
-  QuerySampleLibraryTrampoline(
-      ClientData client_data, std::string name, size_t total_sample_count,
-      size_t performance_sample_count,
-      LoadSamplesToRamCallback load_samples_to_ram_cb,
-      UnloadSamplesFromRamCallback unload_samples_from_ram_cb)
-      : client_data_(client_data),
-        name_(std::move(name)),
-        total_sample_count_(total_sample_count),
-        performance_sample_count_(performance_sample_count),
-        load_samples_to_ram_cb_(load_samples_to_ram_cb),
-        unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {}
-  ~QuerySampleLibraryTrampoline() override = default;
-
-  const std::string& Name() override { return name_; }
-  size_t TotalSampleCount() override { return total_sample_count_; }
-  size_t PerformanceSampleCount() override { return performance_sample_count_; }
-
-  void LoadSamplesToRam(const std::vector<QuerySampleIndex>& samples) override {
-    (*load_samples_to_ram_cb_)(client_data_, samples.data(), samples.size());
-  }
-  void UnloadSamplesFromRam(
-      const std::vector<QuerySampleIndex>& samples) override {
-    (*unload_samples_from_ram_cb_)(client_data_, samples.data(),
-                                   samples.size());
-  }
-
- private:
-  ClientData client_data_;
-  std::string name_;
-  size_t total_sample_count_;
-  size_t performance_sample_count_;
-  LoadSamplesToRamCallback load_samples_to_ram_cb_;
-  UnloadSamplesFromRamCallback unload_samples_from_ram_cb_;
-};
-
-}  // namespace
-
-void* ConstructQSL(ClientData client_data, const char* name, size_t name_length,
-                   size_t total_sample_count, size_t performance_sample_count,
-                   LoadSamplesToRamCallback load_samples_to_ram_cb,
-                   UnloadSamplesFromRamCallback unload_samples_from_ram_cb) {
-  QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline(
-      client_data, std::string(name, name_length), total_sample_count,
-      performance_sample_count, load_samples_to_ram_cb,
-      unload_samples_from_ram_cb);
-  return reinterpret_cast<void*>(qsl);
-}
-
-void DestroyQSL(void* qsl) {
-  QuerySampleLibraryTrampoline* qsl_cast =
-      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
-  delete qsl_cast;
-}
-
-// mlperf::c::StartTest just forwards to mlperf::StartTest after doing the
-// proper cast.
-void StartTest(void* sut, void* qsl, const TestSettings& settings,
-               const std::string& audit_config_filename = "audit.config") {
-  SystemUnderTestTrampoline* sut_cast =
-      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
-  QuerySampleLibraryTrampoline* qsl_cast =
-      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
-  LogSettings default_log_settings;
-  mlperf::StartTest(sut_cast, qsl_cast, settings, default_log_settings,
-                    audit_config_filename);
-}
-
-void QuerySamplesComplete(QuerySampleResponse* responses,
-                          size_t response_count) {
-  mlperf::QuerySamplesComplete(responses, response_count);
-}
-
-void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses,
-                                    size_t response_count,
-                                    ResponseCallback response_cb,
-                                    ClientData client_data) {
-  mlperf::QuerySamplesComplete(
-      responses, response_count,
-      [client_data, response_cb](QuerySampleResponse* response) {
-        response_cb(client_data, response);
-      });
-}
-
-void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count) {
-  mlperf::FirstTokenComplete(responses, response_count);
-}
-
-void FirstTokenCompleteResponseCb(QuerySampleResponse* responses,
-                                  size_t response_count,
-                                  ResponseCallback response_cb,
-                                  ClientData client_data) {
-  mlperf::FirstTokenComplete(
-      responses, response_count,
-      [client_data, response_cb](QuerySampleResponse* response) {
-        response_cb(client_data, response);
-      });
-}
-
-void RegisterIssueQueryThread() { mlperf::RegisterIssueQueryThread(); }
-
-}  // namespace c
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h
deleted file mode 100644
index 0ee44fb71..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/c_api.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief A C API wrapping the C++ loadgen. Not tested. Needs work.
-/// \details The C API allows a C or Python client to easily create
-/// a SystemUnderTest without having to expose the SystemUnderTest class
-/// directly.
-/// ConstructSUT works with a bunch of function poitners instead that are
-/// called from an underlying trampoline class.
-
-#ifndef SYSTEM_UNDER_TEST_C_API_H_
-#define SYSTEM_UNDER_TEST_C_API_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "../query_sample.h"
-#include "../test_settings.h"
-
-namespace mlperf {
-
-namespace c {
-
-/// \brief Optional opaque client data that creators of SUTs and QSLs can have
-/// the loadgen pass back to their callback invocations.
-/// Helps avoids global variables.
-typedef uintptr_t ClientData;
-
-typedef void (*IssueQueryCallback)(ClientData, const QuerySample*, size_t);
-typedef void (*FlushQueriesCallback)();
-typedef void (*ResponseCallback)(ClientData, QuerySampleResponse*);
-
-/// \brief SUT calls this function to report query result back to loadgen
-void QuerySamplesComplete(QuerySampleResponse* responses,
-                          size_t response_count);
-
-void QuerySamplesCompleteResponseCb(QuerySampleResponse* responses,
-                                    size_t response_count,
-                                    ResponseCallback response_cb,
-                                    ClientData client_data);
-
-void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count);
-
-void FirstTokenCompleteResponseCb(QuerySampleResponse* responses,
-                                  size_t response_count,
-                                  ResponseCallback response_cb,
-                                  ClientData client_data);
-
-/// \brief Create an opaque SUT pointer based on C callbacks.
-void* ConstructSUT(ClientData client_data, const char* name, size_t name_length,
-                   IssueQueryCallback issue_cb,
-                   FlushQueriesCallback flush_queries_cb);
-/// \brief Destroys the SUT created by ConstructSUT.
-void DestroySUT(void* sut);
-
-typedef void (*LoadSamplesToRamCallback)(ClientData, const QuerySampleIndex*,
-                                         size_t);
-typedef void (*UnloadSamplesFromRamCallback)(ClientData,
-                                             const QuerySampleIndex*, size_t);
-
-/// \brief Create an opaque QSL pointer based on C callbacks.
-void* ConstructQSL(ClientData client_data, const char* name, size_t name_length,
-                   size_t total_sample_count, size_t performance_sample_count,
-                   LoadSamplesToRamCallback load_samples_to_ram_cb,
-                   UnloadSamplesFromRamCallback unload_samples_from_ram_cb);
-/// \brief Destroys the QSL created by ConsructQSL.
-void DestroyQSL(void* qsl);
-
-/// \brief Run tests on a SUT created by ConstructSUT().
-/// \details This is the C entry point. See mlperf::StartTest for the C++ entry
-/// point.
-void StartTest(void* sut, void* qsl, const TestSettings& settings,
-               const std::string& audit_config_filename);
-
-///
-/// \brief Register a thread for query issuing in Server scenario.
-/// \details This is the C entry point. See mlperf::RegisterIssueQueryThread for
-/// the C++ entry point.
-///
-void RegisterIssueQueryThread();
-
-}  // namespace c
-}  // namespace mlperf
-
-#endif  // SYSTEM_UNDER_TEST_C_API_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc
deleted file mode 100644
index 96396dab9..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/bindings/python_api.cc
+++ /dev/null
@@ -1,484 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Python bindings for the loadgen using pybind11.
-
-#ifndef PYTHON_BINDINGS_H
-#define PYTHON_BINDINGS_H
-
-#include <functional>
-
-#include "../loadgen.h"
-#include "../query_dispatch_library.h"
-#include "../query_sample.h"
-#include "../query_sample_library.h"
-#include "../system_under_test.h"
-#include "../test_settings.h"
-#include "pybind11/functional.h"
-#include "pybind11/pybind11.h"
-#include "pybind11/stl.h"
-#include "pybind11/stl_bind.h"
-
-namespace mlperf {
-
-namespace {
-
-using IssueQueryCallback = std::function<void(std::vector<QuerySample>)>;
-using FastIssueQueriesCallback =
-    std::function<void(std::vector<ResponseId>, std::vector<QuerySampleIndex>)>;
-using FlushQueriesCallback = std::function<void()>;
-using NameCallback = std::function<std::string()>;
-
-// Forwards SystemUnderTest calls to relevant callbacks.
-class SystemUnderTestTrampoline : public SystemUnderTest {
- public:
-  SystemUnderTestTrampoline(std::string name, IssueQueryCallback issue_cb,
-                            FlushQueriesCallback flush_queries_cb)
-      : name_(std::move(name)),
-        issue_cb_(issue_cb),
-        flush_queries_cb_(flush_queries_cb) {}
-  ~SystemUnderTestTrampoline() override = default;
-
-  const std::string& Name() override { return name_; }
-
-  void IssueQuery(const std::vector<QuerySample>& samples) override {
-    pybind11::gil_scoped_acquire gil_acquirer;
-    issue_cb_(samples);
-  }
-
-  void FlushQueries() override { flush_queries_cb_(); }
-
- protected:
-  std::string name_;
-  IssueQueryCallback issue_cb_;
-  FlushQueriesCallback flush_queries_cb_;
-};
-
-class FastSystemUnderTestTrampoline : public SystemUnderTestTrampoline {
- public:
-  FastSystemUnderTestTrampoline(std::string name,
-                                FastIssueQueriesCallback fast_issue_cb,
-                                FlushQueriesCallback flush_queries_cb)
-      : SystemUnderTestTrampoline(name, nullptr, flush_queries_cb),
-        fast_issue_cb_(fast_issue_cb) {}
-  ~FastSystemUnderTestTrampoline() override = default;
-
-  void IssueQuery(const std::vector<QuerySample>& samples) override {
-    pybind11::gil_scoped_acquire gil_acquirer;
-    std::vector<ResponseId> responseIds;
-    std::vector<QuerySampleIndex> querySampleIndices;
-    for (auto& s : samples) {
-      responseIds.push_back(s.id);
-      querySampleIndices.push_back(s.index);
-    }
-    fast_issue_cb_(responseIds, querySampleIndices);
-  }
-
- private:
-  FastIssueQueriesCallback fast_issue_cb_;
-};
-
-using LoadSamplesToRamCallback =
-    std::function<void(std::vector<QuerySampleIndex>)>;
-using UnloadSamplesFromRamCallback =
-    std::function<void(std::vector<QuerySampleIndex>)>;
-
-// Forwards QuerySampleLibrary calls to relevant callbacks.
-class QuerySampleLibraryTrampoline : public QuerySampleLibrary {
- public:
-  QuerySampleLibraryTrampoline(
-      std::string name, size_t total_sample_count,
-      size_t performance_sample_count,
-      LoadSamplesToRamCallback load_samples_to_ram_cb,
-      UnloadSamplesFromRamCallback unload_samples_from_ram_cb)
-      : name_(std::move(name)),
-        total_sample_count_(total_sample_count),
-        performance_sample_count_(performance_sample_count),
-        load_samples_to_ram_cb_(load_samples_to_ram_cb),
-        unload_samples_from_ram_cb_(unload_samples_from_ram_cb) {}
-  ~QuerySampleLibraryTrampoline() override = default;
-
-  const std::string& Name() override { return name_; }
-  size_t TotalSampleCount() { return total_sample_count_; }
-  size_t PerformanceSampleCount() { return performance_sample_count_; }
-
-  void LoadSamplesToRam(const std::vector<QuerySampleIndex>& samples) override {
-    pybind11::gil_scoped_acquire gil_acquirer;
-    load_samples_to_ram_cb_(samples);
-  }
-  void UnloadSamplesFromRam(
-      const std::vector<QuerySampleIndex>& samples) override {
-    pybind11::gil_scoped_acquire gil_acquirer;
-    unload_samples_from_ram_cb_(samples);
-  }
-
- private:
-  std::string name_;
-  size_t total_sample_count_;
-  size_t performance_sample_count_;
-  LoadSamplesToRamCallback load_samples_to_ram_cb_;
-  UnloadSamplesFromRamCallback unload_samples_from_ram_cb_;
-};
-
-// A QDL that allows defining callbacks for
-// IssueQuery, FlushQueries, and Name methods.
-class QueryDispatchLibraryTrampoline : public QueryDispatchLibrary {
- public:
-  QueryDispatchLibraryTrampoline(IssueQueryCallback issue_query_callback,
-                                 FlushQueriesCallback flush_queries_callback,
-                                 NameCallback name_callback)
-      : issue_query_callback_(issue_query_callback),
-        flush_queries_callback_(flush_queries_callback),
-        name_callback_(name_callback) {}
-
-  // Returns the name of the SUT. Name shall be returned over the network
-  // TODO: other bindings should also be fixed eventually to be used over the
-  // network
-  const std::string& Name() override {
-    static std::string name;  // HACK: avoid returning a reference to temporary.
-    pybind11::gil_scoped_acquire gil_acquirer;
-    name = name_callback_();  // name_callback_() shall returned name over the
-                              // network.
-    return name;
-  }
-
-  void IssueQuery(const std::vector<QuerySample>& samples) override {
-    pybind11::gil_scoped_acquire gil_acquirer;
-    issue_query_callback_(samples);
-  }
-
-  void FlushQueries() override { flush_queries_callback_(); }
-
- protected:
-  IssueQueryCallback issue_query_callback_;
-  FlushQueriesCallback flush_queries_callback_;
-  NameCallback name_callback_;
-};
-
-}  // namespace
-
-/// \brief Python bindings.
-namespace py {
-
-uintptr_t ConstructSUT(IssueQueryCallback issue_cb,
-                       FlushQueriesCallback flush_queries_cb) {
-  SystemUnderTestTrampoline* sut =
-      new SystemUnderTestTrampoline("PySUT", issue_cb, flush_queries_cb);
-  return reinterpret_cast<uintptr_t>(sut);
-}
-
-void DestroySUT(uintptr_t sut) {
-  SystemUnderTestTrampoline* sut_cast =
-      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
-  delete sut_cast;
-}
-
-uintptr_t ConstructFastSUT(FastIssueQueriesCallback fast_issue_cb,
-                           FlushQueriesCallback flush_queries_cb) {
-  FastSystemUnderTestTrampoline* sut = new FastSystemUnderTestTrampoline(
-      "PyFastSUT", fast_issue_cb, flush_queries_cb);
-  return reinterpret_cast<uintptr_t>(sut);
-}
-
-void DestroyFastSUT(uintptr_t sut) {
-  FastSystemUnderTestTrampoline* sut_cast =
-      reinterpret_cast<FastSystemUnderTestTrampoline*>(sut);
-  delete sut_cast;
-}
-
-uintptr_t ConstructQSL(
-    size_t total_sample_count, size_t performance_sample_count,
-    LoadSamplesToRamCallback load_samples_to_ram_cb,
-    UnloadSamplesFromRamCallback unload_samples_from_ram_cb) {
-  QuerySampleLibraryTrampoline* qsl = new QuerySampleLibraryTrampoline(
-      "PyQSL", total_sample_count, performance_sample_count,
-      load_samples_to_ram_cb, unload_samples_from_ram_cb);
-  return reinterpret_cast<uintptr_t>(qsl);
-}
-
-void DestroyQSL(uintptr_t qsl) {
-  QuerySampleLibraryTrampoline* qsl_cast =
-      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
-  delete qsl_cast;
-}
-
-uintptr_t ConstructQDL(IssueQueryCallback issue_cb,
-                       FlushQueriesCallback flush_queries_cb,
-                       NameCallback name_callback) {
-  QueryDispatchLibraryTrampoline* qdl = new QueryDispatchLibraryTrampoline(
-      issue_cb, flush_queries_cb, name_callback);
-  return reinterpret_cast<uintptr_t>(qdl);
-}
-
-void DestroyQDL(uintptr_t qdl) {
-  QueryDispatchLibraryTrampoline* qdl_cast =
-      reinterpret_cast<QueryDispatchLibraryTrampoline*>(qdl);
-  delete qdl_cast;
-}
-
-void StartTest(uintptr_t sut, uintptr_t qsl, mlperf::TestSettings test_settings,
-               const std::string& audit_config_filename) {
-  pybind11::gil_scoped_release gil_releaser;
-  SystemUnderTestTrampoline* sut_cast =
-      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
-  QuerySampleLibraryTrampoline* qsl_cast =
-      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
-  LogSettings default_log_settings;
-  mlperf::StartTest(sut_cast, qsl_cast, test_settings, default_log_settings,
-                    audit_config_filename);
-}
-
-void StartTestWithLogSettings(uintptr_t sut, uintptr_t qsl,
-                              mlperf::TestSettings test_settings,
-                              mlperf::LogSettings log_settings,
-                              const std::string& audit_config_filename) {
-  pybind11::gil_scoped_release gil_releaser;
-  SystemUnderTestTrampoline* sut_cast =
-      reinterpret_cast<SystemUnderTestTrampoline*>(sut);
-  QuerySampleLibraryTrampoline* qsl_cast =
-      reinterpret_cast<QuerySampleLibraryTrampoline*>(qsl);
-  mlperf::StartTest(sut_cast, qsl_cast, test_settings, log_settings,
-                    audit_config_filename);
-}
-
-using ResponseCallback = std::function<void(QuerySampleResponse*)>;
-
-/// TODO: Get rid of copies.
-void QuerySamplesComplete(std::vector<QuerySampleResponse> responses,
-                          ResponseCallback response_cb = {}) {
-  pybind11::gil_scoped_release gil_releaser;
-  mlperf::QuerySamplesComplete(responses.data(), responses.size(), response_cb);
-}
-
-void FirstTokenComplete(std::vector<QuerySampleResponse> responses,
-                        ResponseCallback response_cb = {}) {
-  pybind11::gil_scoped_release gil_releaser;
-  mlperf::FirstTokenComplete(responses.data(), responses.size(), response_cb);
-}
-
-PYBIND11_MODULE(mlperf_loadgen, m) {
-  m.doc() = "MLPerf Inference load generator.";
-
-  pybind11::enum_<TestScenario>(m, "TestScenario")
-      .value("SingleStream", TestScenario::SingleStream)
-      .value("MultiStream", TestScenario::MultiStream)
-      .value("Server", TestScenario::Server)
-      .value("Offline", TestScenario::Offline);
-
-  pybind11::enum_<TestMode>(m, "TestMode")
-      .value("SubmissionRun", TestMode::SubmissionRun)
-      .value("AccuracyOnly", TestMode::AccuracyOnly)
-      .value("PerformanceOnly", TestMode::PerformanceOnly)
-      .value("FindPeakPerformance", TestMode::FindPeakPerformance);
-
-  pybind11::class_<TestSettings>(m, "TestSettings")
-      .def(pybind11::init<>())
-      .def_readwrite("scenario", &TestSettings::scenario)
-      .def_readwrite("mode", &TestSettings::mode)
-      .def_readwrite("single_stream_expected_latency_ns",
-                     &TestSettings::single_stream_expected_latency_ns)
-      .def_readwrite("single_stream_target_latency_percentile",
-                     &TestSettings::single_stream_target_latency_percentile)
-      .def_readwrite("multi_stream_expected_latency_ns",
-                     &TestSettings::multi_stream_expected_latency_ns)
-      .def_readwrite("multi_stream_target_latency_percentile",
-                     &TestSettings::multi_stream_target_latency_percentile)
-      .def_readwrite("multi_stream_samples_per_query",
-                     &TestSettings::multi_stream_samples_per_query)
-      .def_readwrite("server_target_qps", &TestSettings::server_target_qps)
-      .def_readwrite("server_target_latency_ns",
-                     &TestSettings::server_target_latency_ns)
-      .def_readwrite("server_target_latency_percentile",
-                     &TestSettings::server_target_latency_percentile)
-      .def_readwrite("server_coalesce_queries",
-                     &TestSettings::server_coalesce_queries)
-      .def_readwrite("server_find_peak_qps_decimals_of_precision",
-                     &TestSettings::server_find_peak_qps_decimals_of_precision)
-      .def_readwrite("server_find_peak_qps_boundary_step_size",
-                     &TestSettings::server_find_peak_qps_boundary_step_size)
-      .def_readwrite("server_max_async_queries",
-                     &TestSettings::server_max_async_queries)
-      .def_readwrite("server_num_issue_query_threads",
-                     &TestSettings::server_num_issue_query_threads)
-      .def_readwrite("offline_expected_qps",
-                     &TestSettings::offline_expected_qps)
-      .def_readwrite("min_duration_ms", &TestSettings::min_duration_ms)
-      .def_readwrite("max_duration_ms", &TestSettings::max_duration_ms)
-      .def_readwrite("min_query_count", &TestSettings::min_query_count)
-      .def_readwrite("max_query_count", &TestSettings::max_query_count)
-      .def_readwrite("qsl_rng_seed", &TestSettings::qsl_rng_seed)
-      .def_readwrite("sample_index_rng_seed",
-                     &TestSettings::sample_index_rng_seed)
-      .def_readwrite("schedule_rng_seed", &TestSettings::schedule_rng_seed)
-      .def_readwrite("accuracy_log_rng_seed",
-                     &TestSettings::accuracy_log_rng_seed)
-      .def_readwrite("accuracy_log_probability",
-                     &TestSettings::accuracy_log_probability)
-      .def_readwrite("print_timestamps", &TestSettings::print_timestamps)
-      .def_readwrite("performance_issue_unique",
-                     &TestSettings::performance_issue_unique)
-      .def_readwrite("performance_issue_same",
-                     &TestSettings::performance_issue_same)
-      .def_readwrite("performance_issue_same_index",
-                     &TestSettings::performance_issue_same_index)
-      .def_readwrite("performance_sample_count_override",
-                     &TestSettings::performance_sample_count_override)
-      .def_readwrite("test05", &TestSettings::test05)
-      .def_readwrite("test05_qsl_rng_seed", &TestSettings::test05_qsl_rng_seed)
-      .def_readwrite("test05_sample_index_rng_seed",
-                     &TestSettings::test05_sample_index_rng_seed)
-      .def_readwrite("test05_schedule_rng_seed",
-                     &TestSettings::test05_schedule_rng_seed)
-      .def_readwrite("use_token_latencies", &TestSettings::use_token_latencies)
-      .def_readwrite("ttft_latency", &TestSettings::server_ttft_latency)
-      .def_readwrite("tpot_latency", &TestSettings::server_tpot_latency)
-      .def_readwrite("infer_token_latencies",
-                     &TestSettings::infer_token_latencies)
-      .def_readwrite("token_latency_scaling_factor",
-                     &TestSettings::token_latency_scaling_factor)
-      .def("FromConfig", &TestSettings::FromConfig, pybind11::arg("path"),
-           pybind11::arg("model"), pybind11::arg("scenario"),
-           pybind11::arg("conf_type") = 1,
-           "This function configures settings from the given user "
-           "configuration file, model, and scenario. The conf_type flag "
-           "should be set to 1 for loading user.conf or else only the default "
-           "mlperf_conf file "
-           "will be loaded by the loadgen.");
-
-  pybind11::enum_<LoggingMode>(m, "LoggingMode")
-      .value("AsyncPoll", LoggingMode::AsyncPoll)
-      .value("EndOfTestOnly", LoggingMode::EndOfTestOnly)
-      .value("Synchronous", LoggingMode::Synchronous);
-
-  pybind11::class_<LogOutputSettings>(m, "LogOutputSettings")
-      .def(pybind11::init<>())
-      .def_readwrite("outdir", &LogOutputSettings::outdir)
-      .def_readwrite("prefix", &LogOutputSettings::prefix)
-      .def_readwrite("suffix", &LogOutputSettings::suffix)
-      .def_readwrite("prefix_with_datetime",
-                     &LogOutputSettings::prefix_with_datetime)
-      .def_readwrite("copy_detail_to_stdout",
-                     &LogOutputSettings::copy_detail_to_stdout)
-      .def_readwrite("copy_summary_to_stdout",
-                     &LogOutputSettings::copy_summary_to_stdout);
-
-  pybind11::class_<LogSettings>(m, "LogSettings")
-      .def(pybind11::init<>())
-      .def_readwrite("log_output", &LogSettings::log_output)
-      .def_readwrite("log_mode", &LogSettings::log_mode)
-      .def_readwrite("log_mode_async_poll_interval_ms",
-                     &LogSettings::log_mode_async_poll_interval_ms)
-      .def_readwrite("enable_trace", &LogSettings::enable_trace);
-
-  pybind11::class_<QuerySample>(m, "QuerySample")
-      .def(pybind11::init<>())
-      .def(pybind11::init<ResponseId, QuerySampleIndex>())
-      .def_readwrite("id", &QuerySample::id)
-      .def_readwrite("index", &QuerySample::index)
-      .def(pybind11::pickle(
-          [](const QuerySample& qs) {  // __getstate__
-            /*Return a tuple that fully encodes state of object*/
-            return pybind11::make_tuple(qs.id, qs.index);
-          },
-          [](pybind11::tuple t) {  // __setstate__
-            if (t.size() != 2)
-              throw std::runtime_error("Invalid state for QuerySample");
-            /* Create a new C++ instance*/
-            QuerySample q;
-            q.id = t[0].cast<uintptr_t>();
-            q.index = t[1].cast<size_t>();
-            return q;
-          }));
-
-  pybind11::class_<QuerySampleResponse>(m, "QuerySampleResponse")
-      .def(pybind11::init<>())
-      .def(pybind11::init<ResponseId, uintptr_t, size_t>())
-      .def(pybind11::init<ResponseId, uintptr_t, size_t, int64_t>())
-      .def_readwrite("id", &QuerySampleResponse::id)
-      .def_readwrite("data", &QuerySampleResponse::data)
-      .def_readwrite("size", &QuerySampleResponse::size)
-      .def_readwrite("n_tokens", &QuerySampleResponse::n_tokens)
-      .def(pybind11::pickle(
-          [](const QuerySampleResponse& qsr) {  // __getstate__
-            /* Return a tuple that fully encodes state of object*/
-            return pybind11::make_tuple(qsr.id, qsr.data, qsr.size);
-          },
-          [](pybind11::tuple t) {  // __setstate__
-            if ((t.size() != 3) || (t.size() != 4))
-              throw std::runtime_error("Invalid state for QuerySampleResponse");
-            /* Create a new C++ instance*/
-            QuerySampleResponse q;
-            q.id = t[0].cast<uintptr_t>();
-            q.data = t[1].cast<uintptr_t>();
-            q.size = t[2].cast<size_t>();
-            if (t.size() == 4) {
-              q.n_tokens = t[3].cast<int64_t>();
-            } else {
-              q.n_tokens = 0;
-            }
-            return q;
-          }));
-
-  // TODO: Use PYBIND11_MAKE_OPAQUE for the following vector types.
-  pybind11::bind_vector<std::vector<QuerySample>>(m, "VectorQuerySample");
-  pybind11::bind_vector<std::vector<QuerySampleResponse>>(
-      m, "VectorQuerySampleResponse");
-
-  m.def("ConstructSUT", &py::ConstructSUT, "Construct the system under test.");
-  m.def("DestroySUT", &py::DestroySUT,
-        "Destroy the object created by ConstructSUT.");
-
-  m.def("ConstructFastSUT", &py::ConstructFastSUT,
-        "Construct the system under test, fast issue query");
-  m.def("DestroyFastSUT", &py::DestroyFastSUT,
-        "Destroy the object created by ConstructFastSUT.");
-
-  m.def("ConstructQSL", &py::ConstructQSL,
-        "Construct the query sample library.");
-  m.def("DestroyQSL", &py::DestroyQSL,
-        "Destroy the object created by ConstructQSL.");
-
-  m.def("ConstructQDL", &py::ConstructQDL,
-        "Construct the query sample library, communicating with the SUT over "
-        "the network.");
-  m.def("DestroyQDL", &py::DestroyQDL,
-        "Destroy the object created by ConstructQDL.");
-
-  m.def("StartTest", &py::StartTest,
-        "Run tests on a SUT created by ConstructSUT() with the provided QSL. "
-        "Uses default log settings.",
-        pybind11::arg("sut"), pybind11::arg("qsl"),
-        pybind11::arg("test_settings"),
-        pybind11::arg("audit_config_filename") = "audit.config");
-  m.def("StartTestWithLogSettings", &py::StartTestWithLogSettings,
-        "Run tests on a SUT created by ConstructSUT() with the provided QSL. "
-        "Accepts custom log settings.",
-        pybind11::arg("sut"), pybind11::arg("qsl"),
-        pybind11::arg("test_settings"), pybind11::arg("log_settings"),
-        pybind11::arg("audit_config_filename") = "audit.config");
-  m.def("QuerySamplesComplete", &py::QuerySamplesComplete,
-        "Called by the SUT to indicate that samples from some combination of"
-        "IssueQuery calls have finished.",
-        pybind11::arg("responses"),
-        pybind11::arg("response_cb") = ResponseCallback{});
-  m.def("FirstTokenComplete", &py::FirstTokenComplete,
-        "Called by the SUT to indicate that tokens from some combination of"
-        "IssueQuery calls have finished.",
-        pybind11::arg("responses"),
-        pybind11::arg("response_cb") = ResponseCallback{});
-}
-
-}  // namespace py
-}  // namespace mlperf
-
-#endif  // PYTHON_BINDINGS_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md
deleted file mode 100644
index f46e22a65..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/README.md
+++ /dev/null
@@ -1,67 +0,0 @@
-# Demo
-
-## Loadgen Over the Network
-
-### Overview
-
-
-This folder provides a demo implementation for LoadGen over the network.\
-Two sides are implemented:
-
-1. The SUT side which is implemented in [sut_over_network_demo.py](sut_over_network_demo.py). Each Node should run it for multiple Nodes operation.
-2. The LoadGen node running the LoadGen, QSL and QDL instances, implemented in [py_demo_server_lon.py](py_demo_server_lon.py)
-
-The demo SUT is implemented with a Flask server. the LON node implements a Flask client for network operation.
-
-The test runs in MLPerf Server mode. the SUT is not implementing a benchmark but contains dummy interface to preprocessing, postprocessing and  model calling functions.
-
-### Setup
-
-Install python packages:
-
-```sh
-pip install absl-py numpy wheel flask requests
-```
-
-Clone:
-
-```sh
-git clone --recurse-submodules https://github.com/mlcommons/inference.git mlperf_inference
-```
-
-Build:
-
-```sh
-cd mlperf_inference/loadgen
-CFLAGS="-std=c++14 -O3" python setup.py bdist_wheel
-cd ..; pip install --force-reinstall loadgen/dist/`ls -r loadgen/dist/ | head -n1` ; cd -
-```
-
-### Run the demo (single machine)
-
-Start the demo SUT server (run this at a separate terminal):
-
-```sh
-python demos/lon/sut_over_network_demo.py --port 8000
-```
-
-Start the test:
-
-```sh
-python demos/lon/py_demo_server_lon.py --sut_server http://localhost:8000
-```
-
-### Run the demo (over the network)
-
-To run over a network - simply run the demo SUT over on a different machine. For multiple Nodes run the demo SUT on each machine specifying the node number.\
-
-```sh
-python demos/lon/sut_over_network_demo.py --port 8000 --node N1
-```
-
-Then, when running the client, replace `localhost` with the correct IP.
-
-
-```sh
-python demos/lon/py_demo_server_lon.py --sut_server IP1:8000,IP2:8000,IP3:8000
-```
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py
deleted file mode 100644
index 1248215db..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/py_demo_server_lon.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""
-Python demo showing how to use the MLPerf Inference LoadGen over the Network bindings.
-This programs runs in the LON Node side.
-It runs the demo in MLPerf server mode over the network.
-It communicates over the network with a Network SUT node,
-which is running the Network SUT demo based on a flask server, implemented in SUT_over_network.py
-"""
-
-import threading
-import requests
-import array
-import time
-
-from absl import app
-from absl import flags
-import mlperf_loadgen
-
-FLAGS = flags.FLAGS
-
-flags.DEFINE_list(
-    "sut_server", "http://localhost:8000", "Address of the server(s) under test."
-)
-
-
-class QSL:
-    """Demo QuerySampleLibrary with dummy features."""
-
-    def __init__(self, total_sample_count, performance_sample_count):
-        self.eval_features = {
-            i: f"what_is_my_dummy_feature_{i}?" for i in range(total_sample_count)
-        }
-        self.qsl = mlperf_loadgen.ConstructQSL(
-            total_sample_count,
-            performance_sample_count,
-            self.load_samples_to_ram,
-            self.unload_samples_from_ram,
-        )
-
-    def get_features(self, sample_id):
-        """Returns the feature for a given sample id."""
-        return self.eval_features[sample_id]
-
-    def load_samples_to_ram(self, query_samples):
-        """Loads the features for the given query samples into RAM."""
-        # Current implementation is not using this functionality.
-        del query_samples
-        return
-
-    def unload_samples_from_ram(self, query_samples):
-        """Unloads the features for the given query samples from RAM."""
-        # Current implementation is not using this functionality.
-        del query_samples
-        return
-
-    def __del__(self):
-        mlperf_loadgen.DestroyQSL(self.qsl)
-
-
-class QDL:
-    """QDL acting as a proxy to the SUT.
-    This QDL communicates with the SUT via HTTP.
-    It uses two endpoints to communicate with the SUT:
-    - /predict/ : Send a query to the SUT and get a response.
-    - /getname/ : Get the name of the SUT. Send a getname to the SUT and get a response.
-    """
-
-    def __init__(self, qsl: QSL, sut_server_addr: list):
-        """
-        Constructor for the QDL.
-        Args:
-            qsl: The QSL to use.
-            sut_server_addr: A list of addresses of the SUT.
-        """
-        self.qsl = qsl
-
-        # Construct QDL from the python binding
-        self.qdl = mlperf_loadgen.ConstructQDL(
-            self.issue_query, self.flush_queries, self.client_get_name
-        )
-        self.sut_server_addr = sut_server_addr
-        self.num_nodes = len(sut_server_addr)
-
-        # For round robin between the SUTs:
-        self.next_sut_id = 0
-        self.lock = threading.Lock()
-
-    def issue_query(self, query_samples):
-        """Process the query to send to the SUT"""
-        threading.Thread(
-            target=self.process_query_async,
-            args=[query_samples]).start()
-
-    def flush_queries(self):
-        """Flush the queries. Dummy implementation."""
-        pass
-
-    def process_query_async(self, query_samples):
-        """
-        This function is called by the Loadgen in a separate thread.
-        It is responsible for
-            1. Creating a query for the SUT, by reading the features from the QSL.
-            2. Sending the query to the SUT.
-            3. Waiting for the response from the SUT.
-            4. Deserializing the response.
-            5. Calling mlperf_loadgen.QuerySamplesComplete(query_samples, response)
-        Args:
-            query_samples: A list of QuerySample objects.
-        """
-        responses = []
-        for s in query_samples:
-            # Overall process:
-            # QDL builds a real-world query and sends to SUT --> SUT processes --> SUT sends back to QDL
-            # Read features from the QSL
-            features = self.qsl.get_features(s.index)
-
-            time.sleep(0.001)  # Ensure a maximal rate of queries to the SUT
-
-            # Send the query to SUT in round robin
-            # Wait for a response
-            sut_result = self.client_predict(features, s.index)
-            response_array = array.array("B", sut_result.encode("utf-8"))
-            bi = response_array.buffer_info()
-            responses.append(
-                mlperf_loadgen.QuerySampleResponse(
-                    s.id, bi[0], bi[1]))
-        mlperf_loadgen.QuerySamplesComplete(responses)
-
-    def get_sut_id_round_robin(self):
-        """Get the SUT id in round robin."""
-        with self.lock:
-            res = self.next_sut_id
-            self.next_sut_id = (self.next_sut_id + 1) % self.num_nodes
-        return res
-
-    def client_predict(self, query, id):
-        """Serialize the query, send it to the SUT in round robin, and return the deserialized response."""
-        url = "{}/predict/".format(
-            self.sut_server_addr[self.get_sut_id_round_robin()])
-        response = requests.post(url, json={"query": query, id: id})
-        return response.json()["result"]
-
-    def client_get_name(self):
-        """Get the name of the SUT from ALL the SUTS."""
-        if len(self.sut_server_addr) == 1:
-            return requests.post(
-                f"{self.sut_server_addr[0]}/getname/").json()["name"]
-
-        sut_names = [
-            requests.post(f"{addr}/getname/").json()["name"]
-            for addr in self.sut_server_addr
-        ]
-        return "Multi-node SUT: " + ", ".join(sut_names)
-
-    def __del__(self):
-        mlperf_loadgen.DestroyQDL(self.qdl)
-
-
-def main(argv):
-    del argv
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.Server
-    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    settings.server_target_qps = 100
-    settings.server_target_latency_ns = 100000000
-    settings.min_query_count = 100
-    settings.min_duration_ms = 10000
-
-    # QDL and QSL
-    qsl = QSL(1024, 128)
-    qdl = QDL(qsl, sut_server_addr=FLAGS.sut_server)
-
-    mlperf_loadgen.StartTest(qdl.qdl, qsl.qsl, settings)
-
-
-if __name__ == "__main__":
-    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py
deleted file mode 100644
index 55e5e038d..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/lon/sut_over_network_demo.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-
-"""
-Python demo showing how to use the MLPerf Inference load generator bindings over the network.
-This part of the demo runs the "demo SUT" which is connected over the network to the LON node.
-A corresponding "demo LON node" with the demo test is implemented in py_demo_server_lon.py.
-
-The SUT is implemented using a Flask server, with dummy implementation of the inference processing.
-Two endpoints are exposed:
-- /predict/ : Receives a query (e.g., a text) runs inference, and returns a prediction.
-- /getname/ : Get the name of the SUT.
-
-The current implementation is a dummy implementation, which does not use
-a real DNN model, batching, or pre/postprocessing code,
-but rather just returns subset of the input query as a response,
-Yet, it illustrates the basic structure of a SUT server.
-"""
-
-import argparse
-from flask import Flask, request, jsonify
-
-
-app = Flask(__name__)
-
-
-node = ""
-
-
-def preprocess(query):
-    """[SUT Node] A dummy preprocess."""
-    # Here may come for example batching, tokenization, resizing,
-    # normalization, etc.
-    response = query
-    return response
-
-
-def dnn_model(query):
-    """[SUT Node] A dummy DNN model."""
-    # Here may come for example a call to a dnn model such as resnet, bert,
-    # etc.
-    response = query
-    return response
-
-
-def postprocess(query):
-    """[SUT Node] A dummy postprocess."""
-    # Here may come for example a postprocessing call, e.g., NMS,
-    # detokenization, etc.
-    response = query
-    return response
-
-
-@app.route("/predict/", methods=["POST"])
-def predict():
-    """Receives a query (e.g., a text) runs inference, and returns a prediction."""
-    query = request.get_json(force=True)["query"]
-    result = postprocess(dnn_model(preprocess(query)))
-    return jsonify(result=result)
-
-
-@app.route("/getname/", methods=["POST", "GET"])
-def getname():
-    """Returns the name of the SUT."""
-    return jsonify(name=f"Demo SUT (Network SUT) node" +
-                   (" " + node) if node else "")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--node", type=str, default="")
-    args = parser.parse_args()
-    node = args.node
-    app.run(debug=False, port=args.port)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py
deleted file mode 100644
index f6082cad6..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_multi_stream.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import threading
-import time
-
-from absl import app
-import mlperf_loadgen
-
-from datetime import datetime
-
-# Global var
-NUM_AGENTS = 8
-LOOPBACK_LATENCY_S = 0.001
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-# Processes queries in NUM_AGENTS slices that complete at different times.
-def process_query_async(query_samples, i_slice):
-    time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1))
-    responses = []
-    samples_to_complete = query_samples[i_slice: len(
-        query_samples): NUM_AGENTS]
-    for j, s in enumerate(samples_to_complete):
-        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
-    mlperf_loadgen.QuerySamplesComplete(responses)
-
-
-def issue_query(query_samples):
-    for i in range(8):
-        threading.Thread(
-            target=process_query_async, args=(
-                query_samples, i)).start()
-
-
-def flush_queries():
-    pass
-
-
-def main(argv):
-    del argv
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.MultiStream
-    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    settings.multi_stream_expected_latency_ns = 8000000
-    settings.multi_stream_samples_per_query = 8
-    settings.min_query_count = 100
-    settings.min_duration_ms = 10000
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py
deleted file mode 100644
index 909585edc..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_offline.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import threading
-import time
-
-from absl import app
-import mlperf_loadgen
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-# Processes queries in 3 slices that complete at different times.
-def process_query_async(query_samples, i_slice):
-    time.sleep(3 * (i_slice + 1))
-    responses = []
-    samples_to_complete = query_samples[i_slice: len(query_samples): 3]
-    for s in samples_to_complete:
-        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
-    mlperf_loadgen.QuerySamplesComplete(responses)
-
-
-def issue_query(query_samples):
-    threading.Thread(
-        target=process_query_async, args=(
-            query_samples, 0)).start()
-    threading.Thread(
-        target=process_query_async, args=(
-            query_samples, 1)).start()
-    threading.Thread(
-        target=process_query_async, args=(
-            query_samples, 2)).start()
-
-
-def flush_queries():
-    pass
-
-
-def main(argv):
-    del argv
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.Offline
-    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    settings.offline_expected_qps = 1000
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py
deleted file mode 100644
index 8b6f2b826..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_server.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import threading
-import time
-
-from absl import app
-import mlperf_loadgen
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-def process_query_async(query_samples):
-    time.sleep(0.001)
-    responses = []
-    for s in query_samples:
-        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
-    mlperf_loadgen.QuerySamplesComplete(responses)
-
-
-def issue_query(query_samples):
-    threading.Thread(target=process_query_async, args=[query_samples]).start()
-
-
-def flush_queries():
-    pass
-
-
-def main(argv):
-    del argv
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.Server
-    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    settings.server_target_qps = 100
-    settings.server_target_latency_ns = 100000000
-    settings.min_query_count = 100
-    settings.min_duration_ms = 10000
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py
deleted file mode 100644
index 8806271bd..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/py_demo_single_stream.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import array
-import threading
-import time
-
-from absl import app
-import mlperf_loadgen
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-def process_query_async(query_samples):
-    """Processes the list of queries."""
-    time.sleep(0.001)
-    responses = []
-    response_array = array.array(
-        "f", [0, 1, 7, 8, 15, 16, 31, 32, 63, 64, 127, 128, 254, 255]
-    )
-    response_info = response_array.buffer_info()
-    response_data = response_info[0]
-    response_size = response_info[1] * response_array.itemsize
-    for s in query_samples:
-        responses.append(
-            mlperf_loadgen.QuerySampleResponse(
-                s.id, response_data, response_size)
-        )
-    mlperf_loadgen.QuerySamplesComplete(responses)
-
-
-def issue_query(query_samples):
-    threading.Thread(target=process_query_async, args=[query_samples]).start()
-
-
-def flush_queries():
-    pass
-
-
-def main(argv):
-    del argv
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
-    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    settings.single_stream_expected_latency_ns = 1000000
-    settings.min_query_count = 100
-    settings.min_duration_ms = 10000
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py
deleted file mode 100644
index e4b083853..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_multi_stream.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import argparse
-import threading
-import time
-import numpy as np
-import array
-
-import mlperf_loadgen
-
-from datetime import datetime
-
-# Global var
-NUM_AGENTS = 8
-LOOPBACK_LATENCY_S = 0.001
-
-
-def f(x, y):
-    return 4 + 3 * x * y + x**3 + y**2
-
-
-def create_responses(n, m, mod=4):
-    r = []
-    for i in range(n):
-        r.append([f(i, j) for j in range(m + (i % mod))])
-    return r
-
-
-responses = create_responses(1024, 20)
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-# Processes queries in NUM_AGENTS slices that complete at different times.
-def process_query_async(query_samples, i_slice):
-    time.sleep(LOOPBACK_LATENCY_S * (i_slice + 1))
-    query_responses = []
-    samples_to_complete = query_samples[i_slice: len(
-        query_samples): NUM_AGENTS]
-    for j, s in enumerate(samples_to_complete):
-        response_array = np.array(responses[s.index], np.int32)
-        token = response_array[0]
-        time.sleep(0.0002)
-        response_token = array.array("B", token.tobytes())
-        response_token_info = response_token.buffer_info()
-        response_token_data = response_token_info[0]
-        response_token_size = response_token_info[1] * response_token.itemsize
-        mlperf_loadgen.FirstTokenComplete(
-            [
-                mlperf_loadgen.QuerySampleResponse(
-                    s.id, response_token_data, response_token_size
-                )
-            ]
-        )
-        time.sleep(0.02)
-        n_tokens = len(response_array)
-        response_array = array.array("B", response_array.tobytes())
-        response_info = response_array.buffer_info()
-        response_data = response_info[0]
-        response_size = response_info[1] * response_array.itemsize
-        query_responses.append(
-            mlperf_loadgen.QuerySampleResponse(
-                s.id, response_data, response_size, n_tokens
-            )
-        )
-    mlperf_loadgen.QuerySamplesComplete(query_responses)
-
-
-def issue_query(query_samples):
-    for i in range(8):
-        threading.Thread(
-            target=process_query_async, args=(
-                query_samples, i)).start()
-
-
-def flush_queries():
-    pass
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--mode", choices=["performance", "accuracy"], default="performance"
-    )
-    parser.add_argument("--expected-latency", type=int, default=8000000)
-    parser.add_argument("--samples-per-query", type=int, default=8)
-    parser.add_argument("--min-query-count", type=int, default=100)
-    parser.add_argument("--min-duration-ms", type=int, default=30000)
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.MultiStream
-    if args.mode == "performance":
-        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    else:
-        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
-    settings.multi_stream_expected_latency_ns = args.expected_latency
-    settings.multi_stream_samples_per_query = args.samples_per_query
-    settings.min_query_count = args.min_query_count
-    settings.min_duration_ms = args.min_duration_ms
-    settings.use_token_latencies = True
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py
deleted file mode 100644
index 2e190cdd5..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import argparse
-import threading
-import time
-import numpy as np
-import array
-
-import mlperf_loadgen
-
-
-def f(x, y):
-    return 4 + 3 * x * y + x**3 + y**2
-
-
-def create_responses(n, m, mod=4):
-    r = []
-    for i in range(n):
-        r.append([f(i, j) for j in range(m + (i % mod))])
-    return r
-
-
-responses = create_responses(1024, 20)
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-# Processes queries in 3 slices that complete at different times.
-def process_query_async(query_samples, i_slice):
-    time.sleep(3 * (i_slice + 1))
-    query_responses = []
-    samples_to_complete = query_samples[i_slice: len(query_samples): 3]
-    for s in samples_to_complete:
-        response_array = np.array(responses[s.index], np.int32)
-        token = response_array[0]
-        time.sleep(0.0002)
-        response_token = array.array("B", token.tobytes())
-        response_token_info = response_token.buffer_info()
-        response_token_data = response_token_info[0]
-        response_token_size = response_token_info[1] * response_token.itemsize
-        # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)])
-        time.sleep(0.02)
-        n_tokens = len(response_array)
-        response_array = array.array("B", response_array.tobytes())
-        response_info = response_array.buffer_info()
-        response_data = response_info[0]
-        response_size = response_info[1] * response_array.itemsize
-        query_responses.append(
-            mlperf_loadgen.QuerySampleResponse(
-                s.id, response_data, response_size, n_tokens
-            )
-        )
-    mlperf_loadgen.QuerySamplesComplete(query_responses)
-
-
-def issue_query(query_samples):
-    threading.Thread(
-        target=process_query_async, args=(
-            query_samples, 0)).start()
-    threading.Thread(
-        target=process_query_async, args=(
-            query_samples, 1)).start()
-    threading.Thread(
-        target=process_query_async, args=(
-            query_samples, 2)).start()
-
-
-def flush_queries():
-    pass
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--mode", choices=["performance", "accuracy"], default="performance"
-    )
-    parser.add_argument("--expected-qps", type=int, default=1000)
-    parser.add_argument("--min-duration-ms", type=int, default=30000)
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.Offline
-    if args.mode == "performance":
-        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    else:
-        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
-    settings.offline_expected_qps = args.expected_qps
-    settings.min_duration_ms = args.min_duration_ms
-    settings.use_token_latencies = True
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py
deleted file mode 100644
index 9325b8410..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_offline_inferred.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import argparse
-import threading
-import time
-import numpy as np
-import array
-
-import mlperf_loadgen
-
-
-def f(x, y):
-    return 4 + 3 * x * y + x**3 + y**2
-
-
-def create_responses(n, m, mod=4):
-    r = []
-    for i in range(n):
-        r.append([f(i, j) for j in range(m + (i % mod))])
-    return r
-
-
-responses = create_responses(1024, 20, mod=3)
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-# Processes queries in 3 slices that complete at different times.
-def process_query_async(query_samples, i_slice):
-    time.sleep(3 * (i_slice + 1))
-    query_responses = []
-    samples_to_complete = query_samples[i_slice: len(query_samples): 3]
-    for s in samples_to_complete:
-        response_array = np.array(responses[s.index], np.int32)
-        token = response_array[0]
-        time.sleep(0.0002)
-        response_token = array.array("B", token.tobytes())
-        response_token_info = response_token.buffer_info()
-        response_token_data = response_token_info[0]
-        response_token_size = response_token_info[1] * response_token.itemsize
-        # mlperf_loadgen.FirstTokenComplete([mlperf_loadgen.QuerySampleResponse(s.id, response_token_data, response_token_size)])
-        time.sleep(0.02)
-        n_tokens = len(response_array)
-        response_array = array.array("B", response_array.tobytes())
-        response_info = response_array.buffer_info()
-        response_data = response_info[0]
-        response_size = response_info[1] * response_array.itemsize
-        query_responses.append(
-            mlperf_loadgen.QuerySampleResponse(
-                s.id, response_data, response_size)
-        )
-    mlperf_loadgen.QuerySamplesComplete(query_responses)
-
-
-def issue_query(query_samples):
-    threading.Thread(
-        target=process_query_async, args=(
-            query_samples, 0)).start()
-    threading.Thread(
-        target=process_query_async, args=(
-            query_samples, 1)).start()
-    threading.Thread(
-        target=process_query_async, args=(
-            query_samples, 2)).start()
-
-
-def flush_queries():
-    pass
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--mode", choices=["performance", "accuracy"], default="performance"
-    )
-    parser.add_argument("--expected-qps", type=int, default=1000)
-    parser.add_argument("--min-duration-ms", type=int, default=30000)
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.Offline
-    if args.mode == "performance":
-        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    else:
-        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
-    settings.offline_expected_qps = args.expected_qps
-    settings.min_duration_ms = args.min_duration_ms
-    settings.infer_token_latencies = 1
-    settings.token_latency_scaling_factor = 21
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py
deleted file mode 100644
index b564543cd..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import argparse
-import array
-import threading
-import time
-import numpy as np
-
-from absl import app
-import mlperf_loadgen
-
-
-def f(x, y):
-    return 4 + 3 * x * y + x**3 + y**2
-
-
-def create_responses(n, m, mod=4):
-    r = []
-    for i in range(n):
-        r.append([f(i, j) for j in range(m + (i % mod))])
-    return r
-
-
-responses = create_responses(1024, 20)
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-def process_query_async(query_samples):
-    """Processes the list of queries."""
-    query_responses = []
-    for s in query_samples:
-        response_array = np.array(responses[s.index], np.int32)
-        token = response_array[0]
-        time.sleep(0.0002)
-        response_token = array.array("B", token.tobytes())
-        response_token_info = response_token.buffer_info()
-        response_token_data = response_token_info[0]
-        response_token_size = response_token_info[1] * response_token.itemsize
-        mlperf_loadgen.FirstTokenComplete(
-            [
-                mlperf_loadgen.QuerySampleResponse(
-                    s.id, response_token_data, response_token_size
-                )
-            ]
-        )
-        time.sleep(0.02)
-        n_tokens = len(response_array)
-        response_array = array.array("B", response_array.tobytes())
-        response_info = response_array.buffer_info()
-        response_data = response_info[0]
-        response_size = response_info[1] * response_array.itemsize
-        # print(f"Reported size python: {n_tokens}")
-        query_responses.append(
-            mlperf_loadgen.QuerySampleResponse(
-                s.id, response_data, response_size, n_tokens
-            )
-        )
-    mlperf_loadgen.QuerySamplesComplete(query_responses)
-
-
-def issue_query(query_samples):
-    threading.Thread(target=process_query_async, args=[query_samples]).start()
-
-
-def flush_queries():
-    pass
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--mode", choices=["performance", "accuracy"], default="performance"
-    )
-    parser.add_argument("--target-qps", type=int, default=100)
-    parser.add_argument("--target-latency-ns", type=int, default=100000000)
-    parser.add_argument("--min-query-count", type=int, default=100)
-    parser.add_argument("--min-duration-ms", type=int, default=30000)
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.Server
-    if args.mode == "performance":
-        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    else:
-        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
-    settings.server_target_qps = args.target_qps
-    settings.server_target_latency_ns = args.target_latency_ns
-    settings.min_query_count = args.min_query_count
-    settings.min_duration_ms = args.min_duration_ms
-    settings.use_token_latencies = True
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py
deleted file mode 100644
index 76461a75d..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_server_inferred.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import argparse
-import array
-import threading
-import time
-import numpy as np
-
-from absl import app
-import mlperf_loadgen
-
-
-def f(x, y):
-    return 4 + 3 * x * y + x**3 + y**2
-
-
-def create_responses(n, m, mod=4):
-    r = []
-    for i in range(n):
-        r.append([f(i, j) for j in range(m + (i % mod))])
-    return r
-
-
-responses = create_responses(1024, 20, mod=3)
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-def process_query_async(query_samples):
-    """Processes the list of queries."""
-    query_responses = []
-    for s in query_samples:
-        response_array = np.array(responses[s.index], np.int32)
-        token = response_array[0]
-        time.sleep(0.0002)
-        response_token = array.array("B", token.tobytes())
-        response_token_info = response_token.buffer_info()
-        response_token_data = response_token_info[0]
-        response_token_size = response_token_info[1] * response_token.itemsize
-        time.sleep(0.02)
-        n_tokens = len(response_array)
-        response_array = array.array("B", response_array.tobytes())
-        response_info = response_array.buffer_info()
-        response_data = response_info[0]
-        response_size = response_info[1] * response_array.itemsize
-        # print(f"Reported size python: {n_tokens}")
-        query_responses.append(
-            mlperf_loadgen.QuerySampleResponse(
-                s.id, response_data, response_size)
-        )
-    mlperf_loadgen.QuerySamplesComplete(query_responses)
-
-
-def issue_query(query_samples):
-    threading.Thread(target=process_query_async, args=[query_samples]).start()
-
-
-def flush_queries():
-    pass
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--mode", choices=["performance", "accuracy"], default="performance"
-    )
-    parser.add_argument("--target-qps", type=int, default=100)
-    parser.add_argument("--target-latency-ns", type=int, default=100000000)
-    parser.add_argument("--min-query-count", type=int, default=100)
-    parser.add_argument("--min-duration-ms", type=int, default=30000)
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.Server
-    if args.mode == "performance":
-        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    else:
-        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
-    settings.server_target_qps = args.target_qps
-    settings.server_target_latency_ns = args.target_latency_ns
-    settings.min_query_count = args.min_query_count
-    settings.min_duration_ms = args.min_duration_ms
-    settings.infer_token_latencies = 1
-    settings.token_latency_scaling_factor = 21
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py
deleted file mode 100644
index ca8d84591..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/demos/token_metrics/py_demo_single_stream.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python demo showing how to use the MLPerf Inference load generator bindings.
-"""
-
-from __future__ import print_function
-
-import argparse
-import array
-import threading
-import time
-import numpy as np
-
-from absl import app
-import mlperf_loadgen
-
-
-def f(x, y):
-    return 4 + 3 * x * y + x**3 + y**2
-
-
-def create_responses(n, m, mod=4):
-    r = []
-    for i in range(n):
-        r.append([f(i, j) for j in range(m + (i % mod))])
-    return r
-
-
-responses = create_responses(1024, 20)
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-def process_query_async(query_samples):
-    """Processes the list of queries."""
-    query_responses = []
-    for s in query_samples:
-        response_array = np.array(responses[s.index], np.int32)
-        time.sleep(0.0002)
-        token = response_array[:1]
-        response_token = array.array("B", token.tobytes())
-        response_token_info = response_token.buffer_info()
-        response_token_data = response_token_info[0]
-        response_token_size = response_token_info[1] * response_token.itemsize
-        mlperf_loadgen.FirstTokenComplete(
-            [
-                mlperf_loadgen.QuerySampleResponse(
-                    s.id, response_token_data, response_token_size
-                )
-            ]
-        )
-        time.sleep(0.02)
-        n_tokens = len(response_array)
-        response_array = array.array("B", response_array.tobytes())
-        response_info = response_array.buffer_info()
-        response_data = response_info[0]
-        response_size = response_info[1] * response_array.itemsize
-        query_responses.append(
-            mlperf_loadgen.QuerySampleResponse(
-                s.id, response_data, response_size, n_tokens
-            )
-        )
-    mlperf_loadgen.QuerySamplesComplete(query_responses)
-
-
-def issue_query(query_samples):
-    threading.Thread(target=process_query_async, args=[query_samples]).start()
-
-
-def flush_queries():
-    pass
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--mode", choices=["performance", "accuracy"], default="performance"
-    )
-    parser.add_argument("--expected-latency", type=int, default=2050000)
-    parser.add_argument("--min-query-count", type=int, default=100)
-    parser.add_argument("--min-duration-ms", type=int, default=30000)
-    return parser.parse_args()
-
-
-def main():
-    args = get_args()
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
-    if args.mode == "performance":
-        settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-    else:
-        settings.mode = mlperf_loadgen.TestMode.AccuracyOnly
-    settings.single_stream_expected_latency_ns = args.expected_latency
-    settings.min_query_count = args.min_query_count
-    settings.min_duration_ms = args.min_duration_ms
-    settings.use_token_latencies = True
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024, 128, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_network_submission.png
deleted file mode 100644
index 35663b97fe3ad1453c431cbf5c6155ab37c78851..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 51192
zcmeFYby$?o_cu<ngs{K@OD`phf|N+t(jhFMu!wYb%hI)UhlJ7~AxJ8Mba$voOLs`O
zz_a>(f4<-E_5A++{qsD}Ue|ri+%t2|nK}2FIdkSU8wpoeyhlh&h=GA|PgzM$3j+hQ
z3IhWx34n(#sqKi1!oVQ5wUL#DE6d74;I2-VHue@67)p_^b@36}{Z!fdNeKyKSb0hW
z&8**|Q-CjUFt?0Z*@95yW-xelK{R27fn0Td8B0_}MIhG$)Al;(SqlQLIU#QVl3#nO
zBt7Z3=Cg6VK9VC~b*s(qwoi=l#_we$8|e=$h|x<(Y&^iEsOX)5vIZuWb1t@0V8>cN
zTX}uGHHJmWU1L)t=7+qO9y-o9ZogZjNQ@vWn1O16!pS@<YqE317%hn$iBuTU&o;S>
zDV0D-#;z|~U`Ur8NLWA8^PzJQxzN2Cww_w%7UDqK*MqMuv2$HWw=+VU?E4B12~2hQ
zjM(s`zY{D=XD1AXBgh)H{cl0I6@gPEGpdlVLyVjuC*j(=o*a>((KSm_GYi9ze5waY
z%18AAx1-;FJN1tylU`N@#Xd+;=y`Y)F4**1j9n$D6zCZ(U}j4BDXd<jOLu|+BZVbJ
z@jmCCrdf8(y&Zfou<@uNPZLNdOaEvjI#Op#pjK{)K4QsMrZ}2VI1)%F-GqP;sluo2
zU#myzUcz>23hh%>0#Y9`oRl4$N@u_A(mHG<BNTq%uU4Rxo{q(bYgz`=bS3Vh1vd7%
zQsh5-QV@;WO=qSn6*nYnrR+Mya*>NjVtP1~6xLg|t9oo9dugg4%Ay`a$ixM_ukc_e
zBx~6Pe<i%ZI1SfS=Nm9yI+a!;g~$j<TQtPr^9abnUV^-0!(~1kXb_f9qAsrK<;L+M
zBW%X)Vyx-F`SDV`0ppQ6yHqwiXN2mro=EC7=iV;Pm3G}g>o&Z+K!VjOi(pO&e#Fd|
zoyH}eH6`IiEY3bWti|$g#KTH7^kkh@SWv70X*P_SB`@ZX0xUS-G$;JiK(Rm{Ej>4V
zW+G8GER!@}n(;R`Qzkp(St@b1bnGV6d_&ctag$dSXU|=;ztZ*d&fquJuN5R$6H7(R
zm5Euy$XO$o<b37;q+K_M))&NK!}reH{lbqfY&DQ7)*Rw2FFD+Sjv7>ys;$wkT$_Qv
zm)^fH_tz(y!*lgsW=3L|hYty33t^4a<DYbPgydRjlvtE4>pTm+WH?{ozgS;@PbV7m
zL-lg#)#|SOi^4*}Hh81+Cl)3YY3$aWo2NTj7p|=WwEZM^3tB#RQJIBT>FnaXBC+e(
z#-tF76Evo-E+7<d`jSmtQNiqV7;?A@WBe-h+vT|=<?ED&iTYO!{#tRAfFf$>6GopX
z^RQ<r++4;ri<2^Ks}Ww0)YXJ^g@iCb3$$xHJKSambiuqM0YA8T@gc@NZEe!ubqzqi
z8;qiapYO6NMQ7D7-UySnO?!wpe1w;FQj%i%s#<qsSpw98*sDS=>oIv2NrPp;U~E$v
z7A5Rt<JY0iS*q=id5MPb*>j1@G3?s$s&MV`8`~wFf#V_fov|`8soN}^8RkOEiYc0K
zBo-Mq6@e(!7*NzSBAt06f^|9U0urPICXel6wNb)Lij5!>;V04yOkpldxYzfjkoJ;B
zBsf!ggn6&eVU_t?Xp@X&K3GNGLz7}0k0J6Zm+LX%Q#p?O>CI;!0R|C|WFL+_aOZgu
zo-NIjJNcF5gncF&Q;osQcs*TUJ*2uC!NoAxQ8-0k4tm)8zUoCe+3l!qHAy+$bs&X{
z=tx)A5@B3dvNNM3WljiBx6oqaq3CarKV?9~`JxfvaWG$d;-carP>Wz1WrHGziI{LK
z@h&}{Q_Vto%CRKMd}R0TY))4gVjf|8#O}!Qi{lB!Y-D_7GaGq-!m1J>q&um;5KD`|
zM#6??OL&X0G<m$ZsKT;ny@;wJwc@_cL!F1Zvjwlz_M&D4KKa*BO~2VyjAK9U7w8-8
z7wd~%Rb5qgm6CsJs!EW{rsWDf=A=w{F<>(wF`)0^;X&kKbn3Fra_Y5Bxs5+Q`d;eU
zo%D}9QTX&Y!7~vL8V~AE8QyQNQWXlO$AWFPK4Q)AzYyGG+r#{z5mK>~-~M&cZs<qJ
zc**zr{6m%Ga+?qR-!cpJ3iT?yl#Y`uKU3)qDnx(i{_&>Zd8vooPLWRK69b==SLLr-
z&b0O<W;Q<N>?F9`yLY%>h!T0!?PQL7sr7EAx<AQMG?HGBk$Sp6&b~df?dtC4KIneF
zRkwAvxwYxF$-eV+vTR)J{bYL^_z5uyYkYO4OM*+tq2_{I&O4qlqkN;X0QtM-c0h<R
zc92YIyMM?I9@D*yFq5!`u!r|{?(NepDUc|v#1zpuJP~ymmw<Vq?f~-<-^lP4=lB$2
z8Ddsqt!SNuTCC^nOl`?cSorjuT)!30rkK#$zcKsLbCPb8M)Q+vOwGKmXSaTusE9GS
zd&o?OSA)-ID0Y~2cx`=lXm;qD+nzg#D~kI}f9`W3;tFA@S5a+Wy<5#+OKT@M_`YX5
zPH=jr_l0%y$d?~i&9NVzMh&yP^?tiihgU~G%Q8zl>+|YyBc!qP)$>=AyOC#-XANg>
zcX`BeNwP^`;VR*(aU}(51${2bW{ni>{gcYO&(_?(XKW9-K60;c@!#xlj(o-Ws?^g<
z@SCmqRo`n?eMNtlQ}<!FpG)4JDI+C(1yPEEao#KIi#={-?v3l}Zls<$JM)XTGo4>$
z7ptdNQaOIQm2a?&QjLxp&Kqh~zNmcU^l+beKXzWo&)sj?H}xv>l>H*?V&v4}=w@+a
zO?T0AY+ob5>h9&8#O<i!Y#de0Jl~Z|pge|<vA(n-M1Yso+0Ixo!3!u<@?cZ5%7Rx4
zX>}lwD)BJ<Q`+f+E<rh3SK(EO*?5V!Kk1`F6;P>Rb)j1*_D~0a573b;o57G`o??lH
zPat0;2}ncPz<+eV7uC7)%+0$88J{7s{tP7efZ0}dyudo#+Qxka;l7(*UDi7vT^&c(
zJKh_NP~Y92x^}5^(O&UqP^708s&JkAexZ3-yR1*KNsvr&#@3`tto$+?Db)W+IeR>*
zJVTthLG;p3#d_d<O5Av2X{|O-A<etUZ?xZ(xB9olzWels#~O*XxCL*Zk_d$E&!}fg
z9?r29D0?NxsZV4MW%)uw5)zZDIaVlyQuVv{hF8rUhjy2$y>^g1m+ij=e?O85G=9CP
zrz1V_zI<aEbGody5hc^%YVtXPdj{g2MUhd<zTxL+|Eppys41oC=F;|(=2Edsbou3B
zDQA&(k%mp0u!;&BgZ52PMb->!Y|^h3=9<WwFf0RPNh|bo@*N}q0Y)J7ri8e?X7aXX
zzIT7;8F5apHt(s;KgiFqTo&*W(6`JOeIvEAWIwU%;Fchg;r3$v#fX9MXE(#Hrk}^0
z$8H;UI=e5L-d|MWauU%}zv0UKd~irgCN-X7ouZ`b_~^*!qXl0J;$yu-<aN@iYD>zI
z>S$U+>gzPW7X$j<x4oKDI=_Xhxa{<1T`Sk#>H>8mbmtH>A6?HM>^+w+W2|Is{Ni}e
z?djH=)(Le(pNhEx-$Bs<(aSG!QJYa=r#d^j<#!)t>X*DNl-6!zd+5^aeOn`2w+7Z1
zhd=47)ulLG)<6GvI#Czl;PL#)^R?N?S)$KUKNFh@PC0UN-h0ySam`+Or6F~De_i}6
ze&1YHb`IXxGYpyNv-@(UKIP+OwAm8&Vq?bPn&x_tBoO-uo5z3T;<Bf`B!oNMm*mH-
z>CIqI=Sk-~52c?mU$VVT{1jzh$X+M%DZZe+p(DN9KH90ce_S+B;H{X(R6wtfv|mJi
zJ)E~xb~}&UrHI$R|9MU#!~1Ba)TMYvT`^12=l<`ZejS~i!&*$Mw<DKXVy^QWzBjm)
zL>|-<JdC%)$DcbL3>`euns@|Xo;A-+Nq)JRdY$`_qaw^u!Y$Tquj4d*XHO`}aqf%A
zl4URV?T3xe$BwNn+Q{O&UrjzJpLge<ex*tA`om8fZ}zXnnndFyiT#j=njHm=@m}#p
zl8F+R;|&L#dp4H>wN^hye`X7LgnAnI^WDmCsP+%u{z{VicsF%1@aqRp0N$NfHOHso
z<4-52ZExQ>-COu|HMmllI$uAJzkjrF+RC#huseIxz9GBQn-V{IRqV&&_x{5EuHjI1
zJk1CD5*wSmB1H@dYBQFtt-(On5@Ecc#c)T4<;LjF<Kp9L&Ku#AU1%V@jOal;7`7(G
z!acQbF@-VS4vY8Q^XVp&<OApa`qlZAy@tzEh@F~p9tH5j#&&v&0TIMH&=chyYqc?6
zJSG9}I$*phez2cQ$lM{;Ie~%u#Szknu@HIWfAutO;ar+52>+txjymgULX4K<Mm9S5
zh_XN^TRwe?@c^9%VBlcVV&I~4nCM3mlkUIr3Ybs~?7!t$7#QI;7&!m7QAekLehKL3
z56wR*c2XDyKKhLu{UCF({?i(>Di`}dd8{OK9fpjytg<pX)i!gruyAy<c5?qnD_e#x
zAaGXFbHl)(V*m4CDr>PFp!rYOJVUr6o~ntOIXOHsF?TYxc!YFt{=)}D94U&<I#{@y
zK#&ghj&7nz36{StMA7*_#XKyKzfIijBv=qn;SgCTR}09KN4$@CStJP|5Qw;|xuvL<
zoWj54=r;)#Yj<~NQ63&oPtQl50*{<rt$6rEL_~Oa`FZ&HxzR1S-Mk##O_1D<Zmj<>
z`41mC3pX=Y8)tVLCr8L1z9yzl9_|t>EPn+3*YBTxS|DxyTau&Ozr#WgkmpYg58oqR
zp8w*FCKdlvDhjtjTG;E!**KtehL$1uL{M1#Z~OmG&3{Y$AC!pyrhF{G|34}JN6r6E
z`P|LIRo2M?Ez(``zdzT%#Q(GMUqW%7KO_Gin)pY}e@oGNmLwGC`LAy#Nf?Zr{E7aI
z^fq!D&(JB_$o~8=|JVlfpY%`OeF|F7_XGn&8beu5<{1)mx7pL(pf~ZCexEUhG7tw4
zD5A)lOwO*~K9FQFow^2{uCb5bJB+cK|3Yz>qi5G)ke%VN;FnD^y8_X#qi35{(W!A|
zW53U9MUJM0q+_rg|EMI<7sY752gh+9X)j&J(wOq`)D8IA*z$gR+mC<5hKwgJkaZCt
zj1?yx#hkE&lWNMkzATFMw-Ah_ZY)VgkEz&3=8U67j>%D#LjQz%2L1A<j9_TvKa?sg
zi#VxCo$D~Me+Y$xF6sYiM5d&Sli8-_I`z`}FBf`D4&EH>e?@Xe22vnYNZu0d{gIDm
zhO_SWM--Yngq_S8OSRv`Esx<J@tJMDB>yx56tbcF#VKO{=;J>f52Q#R{ihLqP>Ng|
z0T+AtaOkVQeMB=k7Wq56K-PpLfN&7|*Q&$Wd;<f6Z(qNfXjxcUsajeVnVXxlC8nft
zsBE&{tV;L|4h+=z8yXImdU&+u)z*IR5nq(yX=7ugKA=%|gM3mLc=yy2J*XseceuHO
zgM;hu3l|ah%>>SKPVJ(;i^COm9v%%y?Gk(>TjF7CY^>|{IH!%RZ4w!)Lf>kCvc*t3
zUuT}r%`a?W-ig!GQ!RacJLJdDpJz{jIx;drAZqH?b0ORD6936P4pFSFV6=*oVji!0
ztPjylw+6H}U9P{id|jzmt6({jX*0k;PDg@!lpp`&#}C)tDLzgvu9P2Tx~g7Ym(H7p
zIeq}fSF*ApOI22*j7y{}W+ZT4ov5fN^Q)6Be!B^-3^C6NGCg(mIIl9lC(9hdST=v9
z5xXC#&HJ4jc&#QX#<WS^9>xzItqnf)ilrCfS7vJ#=Zj;MctH2WJ|RoYb6}}6{9Spu
z+OMW#^$a1W`L*YnEV{Cb*Jpdl4zqOxW(K;t96$;@!T=N@b4_Y0)is@vV<s1(N}7uM
z1sf$LrIwaf5{rZfA45<(+M?_a_<rdUp}xZ{d0H;=ea{;g1hKAXCR^@L8hB=TaiCJd
zWJQn%xlO;11{F*jJ@->>z8CfDwj5EiZZD0swp=+cwqeq(Od9$Oi|<WL=&*7)x6y^R
z0V-j?^gla(I)yh?l~sIE;kAYcj2sG9w<MgcYz6++Zw<xN5gScpI~gmRgZ*5+CZq3x
z%yf%i;|I5=S)$KV9~s#01YTZVDySvU?wE?b+O0ZZDeUz+pZBQ@)z{;pf5g1jE;DQQ
zl=`n(6yis<!Bb@Pl7_)Ga~C9KS>*~zI2}wmE2{O`I~}J>jmDS91ENoz^pck5vM0ae
zIXr*<yr0ggiTUjl2XgVMtuG@K`$6&s8jUiJ+h!7x`+)ZD^AJ&C1ueNvsSWK%6L({d
zh=S#pGs`~Ci$3$(Forw>^UX4pw~^Nx<z;ce6$fj{tYhO-$=^$)g9EQ0a;MN5DFOtS
zKin=;w^m68Q*5NArVea=E!aC5n+)`9%EaX9iEKF#=^OBRNpS_*Uh=u^_|)VYBN=|%
zGuZDF7k+k0zhGD*M9&1h3>H6$KF>4yxX<nfM9<HpYDOrQtW9mUg0SoUe6yCeHY<ll
z_G63ew{IVs^~MQWh}e8%V;-!wI%ArUV&)RZNa8LDUnQTgfP>8M)jaAvt+g6WC7`(P
zu5F*GXoT2d5eb@HU$<;btz*XC4(lt)VSA0llD*BIJkEfDU$5eUgjJHbBx{ZlQY$`$
zQRFmAQMq@DarD*YiY+T|vTrs7*cW>|km7y)SM&?Dmfh!zQRE!sitFY|6}g60kq=}l
z(hR+oZ!dH1rpXbKXVvljpC%3M7^k`-DH;t%Xs046I0|cOw6guLY*=5@Io8Wi@fg~t
zhYNm98155h6><$0B4kJ^FoY!-<N3F{iTPc9H0gX8dZxr>Qo%cal6?B43074wisGT!
zzaYQrPOqs@#-y%z$9)hIlNf&(mlj_goyId?8<Q5_GB%351b@Cf-k25UWlH?6dc}-h
zK-tVBfjD8G=(c4t&;lQ_p~!=M;aot{DBvIy3rr7Jl@3>dR%+>C5;1%ph)YPq9;;PI
zl0;b1uuhvHU=_!y(3g0oDQ*r<Dhw>fjnw3D$SsHGkB20!idT^0P*o^SlbNn~H$4W4
zOiF9@4GNCq6C~OC4|0D-<bYK`41~!$h5<JoW%%T4KA;C1ZW1>}pKM=qN&^7P>t75S
zU-g$&w%BfbR|Fk=@z_zjgzbJLqH(@CoiuV<CAfbg^<FySW02x$<=s_fQJ>V+*Z6@S
zx<(EjLHGQ2w=@IE_Cln(r0+E6PjH>^-jVbyK1Ch{CV`RK)P>kyiE9ncOE3XMeQGsH
z|KoAeUoShq(8+ZnWfQ=73B>iryL0~6dtnS7+W72X@tDcuQ7NZ|1>fW0SKJ=FOuTFf
z@vB)7rb+zEzPeC)8*Kh%u&cA=(N~RCv!=^z$>dKeVr(Ko${Mm<wp_ob4rI{&LAi$?
zI2z5w8;Pw7wg~3GUSYbkjsm}nbyd*JJrKrUhWKUCF=EM*6@WGEGf7=c4$Zg=%9$ln
zz^A!G%oHGk0h{R>i?d!@lCCxvz+#3~cC~KGqbP>uJU=DSnk<VVyrvH$h+D6wD^6B?
zRPuV)XOb1$XQ?Fos*Ql9eK1#2DeMz7|H&qk>&n^~=}i8Z<WYV}0|`&yZ3Uvbqau7P
zk#y~1L~Orr-~U|f7>w{Fygq=xPXG>N-pams8ISeuF5$&983NXYtJ9q|e5s2dPWK*+
zcf;U6j)hlFr{`e;%Mt9!$5<|3rg-+QgAO8|^!(!Z-+Y#!o@5bB^^>(8Iw#+FXJ$O-
zj!q&&euS+0BC0qvT=tqR6SA`*c`ou+AV}H<yT)upCL1V)b8<NvFux93L1BjaB0Gi(
z=2vAEk`M>HHp!BvU0s8{^}T~a2C}1II8eYX-n+Sw3GJPg`Pb7_jiH!In3XHn3wJH_
za8>%9w{-ORGM6%jh=Eaw^Gv-SM6<4;*E_u$M0Eniz;^51b?eho1D9^fqL&26OCotg
zSnV>wP==9)or*>^Of!e?w*j{s0WYq{HLepVm>Ic#55E<9OO4gA=#h(p@Q%nXxk&7>
zN-;_+F>QE@%Yq(l<SxP^n6AdcS4Z!Dk3Ox2IXku9)SWVk9D)w(st%aJAQ2qZI~yX4
zem)QfAsu^k_(><z?N|!`<XHG8yGdHi+_^)gyMu74bh1ElT&WuD(lROBP!gG70e%?O
z2?hB(N#4-$1blh*RzAjqp$<{1y+9@p8QRv%VW*gKfFf)aB~&W3*ew!sN8NfM-x)my
z#W}lJiGNFPE(WUt^CO=mP*(a{4*8!HFzHSHYQ6KP<5^vyC+yuK>KdAI-F>_8yJ#4{
z%5s?WGF_~%_ocL?N|SHGncw-(^K9RQZ3+!>lN(vdJ}E;<02Aq(NhIu3)J^Nhzvdlw
z<w`&d3(5r0Ez9KD6I@Cg0&#Uj2<hJiB`JEruoC9dT29!t+mO^yB5??pe;?*rRB#*i
zvRJtTAW&CQmKQMx6>s9`@P{&rQ^CxUoQOWC34jD^S=fXf#IFLtl0D2zSn%HAW(BOa
zuEdF{-8xGYQUKC*w>z<e+aN%3v8(uw!b(*7rW7FsmIA@DM4tVYlnT9T(cK~jBlM*>
zn>Z2Z(UQ;=faV*Ac+=t%EFH+x6-YgigLN24AzW~fh`)k8y(utHxf;lU9my=lHe}aS
zjOma3!aNk_EA-PnxnI_ZnfD++izCPP+k6NvBTwP5Z%7-MCjxIY{sswr!jDE@7|o87
zbAERLC~DSUY#TRm#&j0MOWIa0cAn;3ba4#nn=&#p`N_S4HPaUd^ii+NCXf=B(iH$Y
z%(9Kkd39tY4-_QV+2rW^q8nD(Z~i(vzSfsmveo48O`37u@f&>b;xYm8MukeLKt#o3
zjqa`{i!g4tD{luP@Iu3>gJGSGCi`Sv!5(oPOZ$P|3aD3oEGS<1Zy<1hN|niV70g^F
zb@N?HFqC?x$PZ=+B;K(@#z5IB6!o-tRy#!c?oKNMegYoddZ$Q5#lC_sLp=e+A`Mib
z`0cs8t*0NHGQvSHjgO>!n)rsDP65~YcISVV-q=k5H_Uk9dj7n;vd<&yS7*VmMVB$F
z5zGy4{krsCIjQyrDO}=qiW(g3s^Ia%-2+iDb{qwHND9PKTsq>-xi9`#8Db!}S+^(j
zPT#5dM-QtZ$vPI@iU=ty%9$M}q$9koEwO>Z8I`+ySQJkxagtzySGv4p0$$?#M$L=Q
zqeQq*7=q)~e)MOm#$aU7(X;L(?JN_mqI`kuD=6Qb->V$Mizb<T*2}##u4s4I@+ct3
z|Ifa>Ki%k2P#y-O&n}<#dm#2wDv$3HQIYVn6S5-$s?rw>XBYtD#(X0%q_vYl8!kv3
z+k8c|+!REajZFw9CGPGIvjd6e34-_sh~(v!0Ns=el2NpzB)G&hC@whwMQCUT@zKh3
z9`A}OKre3!Q5g(n=50~~&|}G#l6!YzhZt*4$lzTC`a;zK)DLfG7jEmwaBjLnWSPHv
zw)7H(pj{IAO|PBI;djaGuQ@l<IW(HRgEXg%-K5!zYQgu8q|1?QO=0JY7(Gb(7!ErG
zG6aZacCdeUyB|iKr|wLgZR$IV%;@V+7nlGtufPHzygfl@rFVlyw>1p$t&+&ha+jrM
zikeQd?gI>-Dv0@zPXvBJxLbPs-67MR$trRC5yNWN^+ks`Owy2swMlL}ye)$?L0nj-
zid6g{kmkkV<n{H5!e8IUgd523bBKN4#kgDQ_d#b-dc&orW%*MwtQa1)7r9+F;r7Dw
z1z`9%)U4dS2?NAr(vc80as>{6Qbgbfv--+XG2n#3x@tR?6*a@)<duh^SMXct{l2It
zhwj$Ymzuugp*1^uWZEA`J%C`pz<LPp1RUN61be{Y-l*(lA`~&+c&8T-vM1X`PsE4|
zzTI26{jd})j*{kF3_h1<g3p&Te3Q&peMXQ?Jx{rpTfQRE$@>V|2gGHhxap#kSHHT<
z81?O$$c50sLhdf&@2qi9#Ps1)34$#sFI*EXz?T1Mnb9X8Gm_6qI9#-pX>NsRHH>)i
zYFt}`COhCVEx;y>pvDxZYF)+XcA4k4W0e)ve*t0Y3Qrtkx@^O{43+^A)52o=&ZkHH
z&#*3+El$t3!v97AUhv1zHPRy}C+O>Fz^$9MFVsjpZ61Ijul$O3@AqKq?`Se`st_X>
zh3bIm<;hjwGP5?(0Pr+0<oozis84Z7W85IT;&c`5Fpg(G1R9uu7^zb8v;)*wGMGIS
zQ{8|IQyc{(5h=)&c_<RSdh_^4Fp~ptmV3U-RQs>8GVzIAkaS1m<AnvBdj@b!+5i&$
zdnv$dy<TZBW1u7>m7Ov80q~xR2#{+)Ean~_P7NPb0@%)Ai1Z3uGwA}xbhpHG+uny8
zL(uSzT73F~JR-5Wi=tb=@<Yrh5G>ax%rYo(K4%uD@%z-oke|c9u%8~tbBsHhTD|J6
z6qCT+r38K50E>9i7fvScN|<e$;yd&f!i1DIKE%1+(g?Wv-&6$x4&t*{Ee7F|uA4G~
z9lnw<f(M95W5&pFLkY-WTIDy3%rg93GX)UM2&hIAZ-*C@pUV>Df6!KfkHuOz{vEp!
z9F%wBe>rGGiwUy?B9LcHr=c(?@p6bXqF0u4850*P<fV1UC^8(0kRTdJl!_o8y|)%?
z+TAIqiH~)CjovSu1Bll&@*HOXjNtl{UTju+fFbe|@-gZqflGcBGw!YTLnN=cyeZiE
z9vfao0t83Kk4;kA^w7>DuR?lioV16zmYhGdEg_VRoVWh^SEIumI_?RkFvov%cjk0w
z3FfOzCmO<r1tf4Y4<Sh-s|k*lp39iV$RkvV9scZlSee|ys{`Ax+LxcoR?#0X18~sc
z9~L=*fAdQ$)*%#r#|&VAJhc55I+43f#0X-;2)daJxJlDnV-u5>V-qqythpbAA7TZQ
zK%w9Vzn3uchTy(X9uN}_o=L(@M8`XSkboTx()tkho#HGCsF!KwfVQwF08JV7k7NOU
zO|Id>daWVS!qkI|SSAjku^)kSHF?IeN?h5HK6M;BapsN;^I4hszIc`%H!v|)2=6A(
z&Ha8HF3k(5=M_oAiB8*buheZd8Q7dC=n#k0{AR?n#ms~J4BKj(<t4QdOgP=8w=)QG
zFv1*h3q%0?f=|&#pv}4@{iQtK1jl%lceFy=$>WQ6FsSJ}gsCvZ7wDjI2_WSs>FX87
zy-b)T7|K1lV?le)XbdYT=_wm;xgki*PJ8YI+AvQjBoW04ebt0r?on<46EzPFWy4F+
zoRR{aZRFPLeGkM?st9ZYkiuS~qTo><$+#>g>Zf{9EbHPa-*bNV=S-af>2_pM_=jp2
z*$L!JLfdU`tQ^8W+JFQMarVYECY<2jKnk3JQ{YN>MKQxw%m!vqo}4U#lpIEulnL7w
zS_O(jLF_on`~-g8`fHcLsJ6B+=P6?#7!~pvWz|u8uN^1Ystg7;>fuXSWQNg8T~Cj4
zX(vd^7@HrP6egf#vtk5S<dPJXUgR-uxAhr|ZwfdBKlG3W-Y{1|NcR#7JLfv=tajUT
zTv#O-GY{)7)l7)n6L-LLF{q0|+L0|zCNq?8CQ-FWP*74D6m2#6V>BIR5(F`H=|I{j
zf;E+hiq9SA(0d)yP!J%HE~i8C0U&UUoJtC<9cG>;5P%ISih6Zm3>QZdLghhRI1xfU
zv?K!wE90bOn!@T|B%>&z>B(6n7zXL(qkqJvb%~PeNHMZZuI%Mi36lgZMJ{C};`W_>
z#P?d^26muO^^92Tv%qep0Ba_!5Wtoc+f|v-%|M0KhwzmXvVGjp$WIzf^^6^`CllN7
z8mJe*k{3YyE}j~%|LC!IM<nX!fwAWA7&5zr6AF=#4LK$1cna)5McqV&xX+y~#F)ec
zJ-G1{ESNkKKOnM7BpvN5W|Ch?da-3*5(b5iapRGj4I#{k;T*@P-OdXy!WQIsAO1R6
zsV{uDuXpM^C7w@Ca$Vd*ENGg0{Z1aTh(m=r0%r_vLwr=!OH3ths0s@SmaUdyX{OX_
z(qjS;(yk5NLcTR?;z5Z!gfbQBfYQh$oPoE67!h%&_%mVBRZ}8&+BsJ#oML_VZ~-=X
zZ5;(cfL3N!oUrE_H!tI@fK99&`cSnI`LD~GRqd3a5gjYu;u0gW{jzQ5pT1HtDk_oo
zYMb6f!a3)u5i$cy%Jfi)BwpAx6t09xO16(N$f&jEBmQ(X<XK`Y{2@+MerQ`8Y3HKT
z?YdKpoCpunxeueKPg)<Tfik{M4vnBD3aYJ)vSU`Ol7H2caQp3%t1IzQ&}QQ6>L?CE
zkhsT^1AjJh7f5dwdl>YYQYm}|1i)z$;Xn!x;m6>v>p9t_nv8s9TnRrsW_`LIh+x*=
z?hN4|0K^1SNB+)4Pf8lQVj0P+VBgFT4)skM`-8EP_y+WD?C}v_#p3r_T7!g9qwf^x
zqg^%{&lDj>Oe#@A5+|HW4AZF5;2q9K^91O?A)S|1g@_#+bFn&i3L*s{e3(7@F_bbU
zU%I`w(gqz|fN-L$@V)Y%g^(*Y;5lJg#BJbW=7A3B6W;RD52W!@L`CmtzWT^ivy7Ct
zNi1()7SEGz=@gJ@kbMRfZVCCFkQ2<0K2>o=K2CUiU3KtLIPFaH4OFtCS2bj_KKtw<
zhQG+DofQ**X3-arVh#oz(=A+tkx|s(V_{Q+1mt3lx@(P%C~x-5#v~n_C6-j1jIV;f
z#orMvP4K>op`0gR!^{ieu=7d~zQJ*FE`XEPgo%QAoP&lsYq1kE{Bbl=zobh6>D}@^
zlTVN^J0<x-ba+v|eC}Yynbml5V*Yi^qePHgL%Q#f$7%uNYJroCB;eJTiVvrCWHS#=
z@4iv}Ri3w_7VH63h)V|~7eFZVq@N1Oi#UKPb6J2^qu+3~KP(Kxf+em*O22?If><U*
z2}zG+8Va1Nd=Gg66ea`(ZHK8<!JS42sRxj%IUA;=TqYT1iljon6n5NhIiK+$t_0u_
za9fs-6&nYa?`uFrjPv~^esj4*eEyXe=JJPO9C2fKlLfya_*bOaNSzbxwJ0lP{S&6S
z191pW9~O4-v9f-6mF5tT?FM$JWlcTJ3xw!xuO#Y?f6pNY;8sT6F3D>$@$=s%+?yv@
z#drdBZ&K?^J$!d}h+AHY-N`EXctAXt<RGsSWxJ9LwT|ZtyRc5KJMf2mDS_n#++8oc
z4If3q%8~UzxD$IH{`5+7_!!z71jqErv<t%|BjOwAR?TjaE;v}|!vV}loYs74W+0bq
z$`h3)DO`4WB|Ikrt=vTuxW-gvs9?;p99aPdGgb<-3MLk;x&SVWk2@?%!xjOPzu$+=
z9xv3#q(bMV$q)5T5!!QOT)^L<V~?fxErKxZX(~d*R)bKb;gaD}0RBD{F%V!#W**iN
z{!+6A8F)C00q=Q9cSV{Jgs+kSV=z{@Yv=C6@^WR^V1o0+wd7_pJI;_~<Ko1|EOs%?
zbMpr(2dHdnBUbVywq+Eq`oUw2r@_?qFp?OW9KQR@s%^CSZSeLTk||AmKMTK@+ap>)
z!Wr0XHFw`PUTE|}j?z37#S2KMIC<;kUd>i;J3wu|`&5!Q@AkHjn!S;U?oZ%zua)m<
zKch@4kg+hLH>|_dFG{dz@h0~-GmrQ)ajq2JRhrg2D7)DHA84fwAJ#^&i`~9XeeW?A
z5X{KB#7oWzMU5bm2@F|uq`KrINHwsJShy7GSmslx(Rhmz58AhUvujGO`;4mT%QSD#
zxmOFMjlL?sshWTR;;SG#hCuqIhuMJkFZbjWB8H>UfJz}X`V(yl&MVLAPXruPDaApu
zay+0)(Vw8V$k7+~Oymf6U~G8_K=gr!oxb58sHXW(bX<9;el?%O)+S1w2jiavJPcz4
zPILn(Yn~|FXP-tTr2XxV#-HxE%_Y}OG7q8;p!(@iRQ-@HY*y^}Q(pcF0L>AYi+``_
z4;JP92T{si`PKXv$APT<L4z)tPyT^`60*^Xd$&>VQ~4JhQu0Ex)$#{4|AQ+Pa{L*9
z$gg?3e{d_tKe!bWZ2upyYU>Z?<Q>;|{ujmyN<pIzTx;QDV*fy3|38YAZUw|J(XMWA
zb;J`CaXkCCcl1HGkc6x(2dRLr_^%|EThu{*e^8I3s5|+&cF@4izc7}P1dg*izkt9V
z^>^J5FWA@C)<R|?{mG_=g%<1?!NG?c-T#7eWSfx~nxgbsQBi~q`z_bz{YmT+8Z=|0
zH!M|lmCZT>0|VyWQIuz*x#UQTe^Izjte`h<Xs7F43Q|&1`fkn_cm`9s>Bjv}quVAb
z4D4dO_nXg2L<>l4otj<$!SI4)WQ~v4hmXc&NtMy)k@!Iyc8%}lk&In*R8|`vrD5gV
zpWXzW{5g#B>zVP{{sZr|RiTa79&xYOv*-WS{$HYv^7!gsySpi8bx*d&bVYjm&y1LK
zFHQcLmXtxn{R0C@b4@ROo~|1T2c7bX{!o0smgTYYZhO(h#>pZ1QN+-n1xXmI6C)@l
zCdRRO(G3mOs=aw4KT+NcyZ<y}ZVy@G=Zv*uCRb-a!zFcn7*gZ%6AyiVAhF^sPO{>l
z^RI;$La~4^EW@}Anqj~0u5`_(2iBB-l5tc?hFN_Ni#=K-4dA+dvY(>twNePNlVj4o
z__u=;K@=aXRu@ZOWa#l3F<z`snqO|e{tx9!Bi*Xv!W~oTf|t?+X!NymLG)F92kBq@
zJK+t0UQ&~zlA$7H`BHn$Zs97d98vqARBRb%qW1tjQp^M<^t=u-2|+JKCveNm@8o__
zeKViz-sR`Rel?#P>eZMl8x@moyKq#=gib#6M7L!_PaZCBUUZ~zcv$dvSZu`Tb<KJ+
z>;IMfS=j%N(#H3=wvotNF;Dd`yOpZ^Cq_D!spgjEa;&dS6}QYb8XoP~g`LF)oFtxi
zHlq$>Pg~1!e#V+$E9lW=*aT@y1u6G`H+9QZJt5(}^pNwoEuOCz4U*Sr{m5E15%)^u
z$2T?({ShUzW1IfLkwlvnRSWyuP3uCf-OY@;TGx{k7m?Zd2VQz2{Gy&HP#jkRnTQwX
zM6xQy=YleA@Ct=!onSHDs=J+A>w`WW*84`)ob-GzRC+-)hBsfvG*nb@-puM3-VPDi
z*xK=3Dy*%m9&Wnu^K(dXt~IB_)s3lT@1Ob2kWFn()M~lCmwB%~(cj)f%h^ZAsjF_l
zmfZXL?<jXh9uG{++_*09($HiH!Bi9|X*J{NEaiT>A!bsVARlC7J!&pD88O4t*M>8w
z&n1t4B+wL0snO`O5OJsrSxnYC-ermc9zV`WGB^zn)m8c0$H)H}8P!~*&LfAOLW&Rf
z0K0|WMru(vDuzCB3_9-wRV4ZC-VK3`sA8W*F@zQ_h}3Au?c7si8YGM|A3^o=8#=8n
zy?h}$v${Hce|pkcZoXpXQSWHB);mERh0?XBB^Rd5<4xuE>{p3eqTWqI58jy1Z#(zc
z)SrL&<zDdYJ*MvDxV1rdpuV_ESPBSK^lTD6Z^O~uh^b;;{nVJJQa8zv0{isEDd)4!
zI!Gf<!4?qLlYgq|54#IBUT$@<E~B>-Wkwn#|ENX4`5mTws8DJ&D+aciO|HhtnP}a>
z-tgLHuG&Hc9}3j!0^L&P&>YBua~{`Ci60wkG@*F#UB!9w^rW7XwFS<?oAo<4?}aIr
z(mD?gZYX7hg=?1z2g%2$QcPixA(Y-={31=lah{&sF%^vD1d)@yS06Vb4@#UP50Fr1
zO4EWZ%UR!<W~)ETs>Y#X9#+tgXMg_k4an5a{}89=&D4PMvBgY%WL*!g=BW-qVBNfX
zqMh*ZD&h30j&yp9mT5oV*;()%p0~J9T<^7OmwEecWKwZ)AU2<&#7<M@Xj;7Pxr@j)
z&MT45!=TEaO<uL)Cu6cn$;qsLPP(~_T7F|s{>UN9YLkQnb%UhSEU)q|PCu2Y&NPY=
zA7>8P*PdubQ4iV@=_R}z^Z_~eSXu3QKQByxOlsj9Vpkrr&mAmz&v5LJglfRBs7gY|
z9;2q-;!jMTexz1K#FVP1n2GcCUIn!5Z(M6IK|eB?kBBaWOIt5}0qafQy}Ad6NL6N<
z)!Rr+Q*X5zj1LA#Nbs89mlLl<zK`KI9r(5Oo~V`Z;y^{_+f;h@Rg7=7OB|W0aLyrM
zf2H?3uclr{CIE?Or4-sGWwSU|p7mqW$ik-KQxoBMl_@f5!_Ulg)a4vMoCTNah2~`;
znC5fb=tJrWDtp%FF4ZI)q|B>>17pL7Bh!^lSc^Bhwqq^4Epu{;4;|dZCrEo9KPld;
z*P%7f%bL3B{4G*k-?N(cv|`I)KJ?72a{LE7!xg^dN%8Fo!Su6QMcYyO;^=SRv7`GL
zIyK%_?<DcjO-u07wM5@;;GAI+#BddxS*gE>_=PxDR;9Pz-yNGJ4b9!*d~+%!GieAg
zNW77wdIp8oe>W$1(C@q#SzG*0ulC){=tF+Tso%RV^^-GrT{G{egXjocWbxOeQ<+_E
zz3-7}DJJ{>(jp(!95pGg@T#VAzc6o3GHnFKf?2i4u_H;Dld?{4Ppj>@L8f&BapmTA
ziOotz5Y{Av`&2OjJYtpanulQ?A7hh*-$hb5%uxBs2E~jpl=?9}?B2L$s%eF!uh1@w
z{hTqMC-=kqSj+Ii6(%67e<JL$n)EDDXrO|l9gjS2KcKB<V@H|{90>tqqHl;e{Afu8
zcGi1t3MU`%)t0>Md04mjy)bWJTf5hO_?~?`*1I93jJp}Ab@^*(m~!GWtUd{rg&#L*
z-TWcy5Fiji>+v(rynw1(F0aZ365>8E6&a;kH=L%BC9r6{0S#On^nLjC$#N~Uo!sK-
zI`<KomSpLafIkqwCOJbz>V9m7G4nIY?+Y)6O(lMQv)_pQb}4SNi#msuF~3b?Bj|7+
z7jSr4Iwr*L!>k<@y89_$yw?{>uW9TdEnGgzXeVmp2ELpb$B;9uupp^<rB^<gve{JN
zS0XUfs6B(<vs^xMd=^LYR%(u7PH1h0^vbvDu*8CnRS{D4jc;WSyi>^L`qi;$VC3bB
z<jQNl=&z4;@7JN1f|Y%oSCKeWAXvyevb<n#^6l|h<O`WCV;h+9Tp)C=RE1WcCr$qd
zM=ngL!zChLr{7BdC#FGsS)IElsK}sFh|zO2!d~kO(e4#x^7?yM&WS3WI!>1+NiCG;
z1XD_LZ#+VaaDZtVE(zt2*eHL%SrH^xM43v9KBcX{O*aToI@>x8cW>U68?<Sx)|9m8
z3=AxND8(@ef)W~=^t)&9dY6CtUJ`Y_cvw)Et;7J$Kc|Ej#b{3x%<A1m7o{Hs2g0-Q
zPefA~38<iKAUu`UGc~R~hP4nDSn0^K->Dke9Lb~SO!73(y~)9=ZZ|`esX<{{IBD~Q
z{9u`EU4xj>)B+z$t6|geCw#ndu<`c?@E!|X3Ulu1GL;9=#;bLTC)rT>I_r<`O@BBd
z7I>95i6UX;y~C0@YMmZ+I$l-ZYSwt&A(X7J=f9wv>6%I0Ycum&2A#|_@2*bx#Cq!p
zO2l_xJ%D$MS*S(+Xhtrp2|gkGut&nKx%xr#=)L$C4U)|(9tBdQfD!K#g>6K^2{o0*
zq!+(tuWjb3XZQ$Dohh%v5Zy!qM3M0L)0KJ1aLxzO_zWw%`#j^ful2tHk0Y)~1t$i|
z@P+h!+&I*#eN2PTlI_3xdGFH;8#?d{mN1W7;z6=sq|%rqdvLp2nq+v7R)5kLXCr3X
zF5Djyi6Fj&@chCg{viLRy!|0gzi%!MR>)%eO2l&YVV`3si$*NgRcoPp-I<f`XD?%F
zjsq$5R`ejN?NMS|Fun_d^2f-P>>IW*dm(%HMcc5*92_dvl2fqNSZHOIwd^=vw($y~
z?DY9Qt+9-$P13gdCeB*ykyzg3@^o$$RVnOE9u6va5N;9i+~>ml1ACsc?8-go8^H4`
zP`THLi0gYpryeysy6#B)9<S=19rN(})wvOUT5=7wq=fH6ew*7d>b`@?{MxcL&`lZ9
zGNpBxd{*jdWpnlZvBLJJzR1@xH3~Ah2h8$2tq&e0DfN_52h~g$7sj%LUL3wS<WI!>
zmY+KZ5!G5ud^mAeLnN*z*3cW!r+x<`$5&{7-(Mc|`i*A72STwH9M!jVy`{b~VO3$S
zN-p=w<Sw5Qx+-Q^^?~^<QS7zPr@y7Wczx=wUwfJ(2kc`KV<aVTiWEH<fEU^oaXpha
z%r{g`9S%u5cS>T7;!t5~>UnVKCV{fe&LY=&>rE4r<<;ZOIPa1@Fy~eAeVt~dkI8WU
zwKmmsD#3_MQAn@d@ND3P=wr25-^HjrT*)Ek9<}g!@p{56r^Pm~z$iRF<~qEH=aY9k
z@u=PBqEfT2y&o|e;_GV`ZZXozDQy@9>T%6YP6cGguV?mM+1b|-S+wxk*B9wlCte>1
zI>kE@+8S4GMM{tH=`G`O%-z6R_qn2b3D@z}=|43@O?G*mSd|YDOn=9#(UyAmthxT(
z113D0Fx}VlbYRVwA&Izp<R(zKUIMdT6gtFF5)Lw83Cg2C-x{t}bej8l`n<>;o>2Q5
z0zXmngNgZ^^cYmy%t4>^nWfI7YB|*S-_>b+Q0HfEAZzfNEMb=_C>c@;q$6Ekxtb((
zOuYZ8qxxlOjwnCBz|f8?U)3n?yU(^qShc<c0y4nyXsmE!IQ*rUXU8*EUU6WPLPfZ=
zW^mbf=hA(JAX!F4J$+gEPpdqoK57~`M#0e$_s6;o#B2MKX|!uepj>mas7rdR66{P9
zHU4di)JcF}9+S+m^Nr<jhq=_NK0x~I$a9u-gU5QxU*@)Z#ilC3%`PS1Dsp5OPK9qa
z?dYU6utJoo7dhUW7FPo8Kj&ljr03-xYITO4Z728I({=m`>s;q9IAC@b=i%k*n@o5=
z;nlR2(p0p9Gfj@gHrIe$=E^|MC!ue}wC2t9V8_#0_#9^nD)`4Ru-$gdd;Xv+eKJZP
ztS|p9asWn$q}hMA$E!j2O<Hr5j|;?ey6O5cPV_YC)ef@CA^66Aa{A2z=&e3-raUx%
zh*?G)J-lOc6_ZF#w+KppW3^W$GCH|=3p<vNgz~$SSHaW1CwEk@>GsJ?*_-;#8$nD>
z8NTd;m$nc-jr3wxk8B{GMO;fwKml3zu-hcma%FF(dB#ZmWlsnjT^&*yj-0By2aB(q
z^p~x$*k)LYq`(YxZi~8Xe!3yVxMT0&lr-|yG?s+dzra-i>_V0Y>3SM8rChM#kNj*~
z?Bz`C+u(h+`1B&-%a%SrB}difS7)q7Ijrb+8^8E>#+&f(edf~a?ztD}nwKl58K|Fv
z0k=oEqo6uFvHX@@Q0)lleXe6NTQb*9iZ`-+`D6Mia&Yg;GAzYNprou`H5mnwyY{eQ
z?Ns$I0_sPcZ!Jm}5W@_%i8=O;{GsP{<N))I8RdgEMA|f>cD5qEUrL9n?030Sx_qV3
z1*gGzp!5XOw29qgpuj42P@eIWzq8T@BhZ5#hr!ovfV3pGpx*Fe8E|=x!gb}fGy1|W
zeZH&>otG&_3HgAo>J8y37*g!Qsz`^KM2sETQaXzP!4TIFFTs7^hGhEVf%svMfr?Ji
zCv`I>QTD<8)ZfangG{V*<)Rg0Uca_2sZy6yO62O+Z15_w_q3G?<`_Xv(WaQn{6t~u
z+0B)rhLO%c6)it#VZAji3{1lgTH;=5H|65Z8SoGnPU8i4g}0;lYh%y1`=vJ*aWOMB
zlX~d9T^1VG+gCVc0K0Z#?^h0L$N_Hmc)+Gza44-{59pPCns?(DMjXy2KV;*FS?(W~
zT@6E4J%^O1K+Rns-B+B6{d!VEv8w4Nh!UznF+IiQJv7S(r$+>=ihOah_a$SH>!otQ
z`;R<eTQJE`4yVfR1oMM7fPt%FY~5%&Y*+WGzI|a#`B2uL3fxPt;R+HQ%5Sj2!AQ?Y
zl}J2M+;A6R>a6jVdNrhNlfupQeVFPN$N4%~kZQAFIJ|`5NmnmAhQsvyV;`N3UM>rG
z&*wQjzS||v3%}#n;MSM`EJueT#viO(b`Gp7V+fU45Pvnv$ab*%B*^MmSFK_@7r|Uh
z7}|F4HuZsclgPHE1Oe{I@b9PxBvdPUWs5#s3)y?MBPZW(?@_Jf^l1y9;BlZLVC?4I
zw2X`tgjS9O8qm#0QbHt*kR3@BzGgWP5uQZU)j^A6QsDZWtDNCv4iZ%y_4?_8h-dH_
z6RuJT`X<nxV<mc7Su1KNi#1+xFoM}apK<x6klO_U1x>O`z9qN8QOE`$hf0~1jLA{2
zu%Vb)1}Aca03A+IDPVV5+8MsLmX+b7l_^b=wI*?;$tw(FCZ7;Hv_5nNe3HSLNvW%y
z6X&gx&VU)0t5#3b%hTuBW5?BP8CG(b8>AaSJ`XjstPg8vP_0gtqAwwyO{!&o(xxu!
z73&AYc%AJ2`Yf0^NhJV61ZWTOWiAiO2!ai@@;i-Tc(9%X2F4+Q#`z~2nH&>~H)Mnr
z6MTM-lcGeOl%<-OEo|Cj;SW=c+k>E37bfLj?@tQ!9wG=DlBPPEI%>PGqj<`#$hK!=
ztxDt+tPgbdgpNTggs|dK;Nl0j%4Z!Dn_XVwB^;L4zjS<ehHFFX5$Ib88TRtTi4Q3`
zl!#d&)8dtZp%e(|{BvE7DpfY#l>OSq$qXylRJ8mKjdv%7e>6|mJU6IVXr}JV4b{vd
zUS?h%P#ly(I}nMG$_V!DWqGoLhel`F>MsFqY7fS`?H$5_gd2A*`(l=w1?JXt7V;Oo
zExGj_3YHxY5O1Ig;x!v*<)3k#U_2<xqB75LmIxl`!6|K0M=<+GEKM0>C=76r>uvl#
zdT3{Pi!wpZmzjusA5&f~jKz*vXjIQ3lG9k}h)|^#PD$6L4iIb^yA{3X_f<0EH8k^`
zMJ-Q88-ny@F0&jLGR1xrXbSf>Mo|YUDx<&i7qXi!6(bRwY%hgF*B4K+&*JTy1-{J7
z0nVGNpAokONu>y<AC`B|1IlG6UKBwU5_;+Hb($Q&+empVbCM*M)RICRwdfB^WHk@X
zb$>E@*@u&-dz(Qm0W<{>pI#8ti0d`2vIu%39$KiaM1Lxnzi%F)DWJN6n`f|2XNO=@
z9lMX})aHh~+_sP${(A5@TSPBlLQ3$Nb;#p0QHgjnun+109B+(4PWu=e-uxvS541N?
zau{T#Yz^9vxyCPueFbnZrBB*)@7#U<Dxix_I;c&UbU)b|SBOGReqn3n$M-a-axZ{u
zC<LjNgudv8ID!7%6I4$etd0n&nQ&`{M#+mRH$N!vzrd1Vknos;DWaAEd2<bmw)(Pd
zgyN+*@)r@I=H+wR?@Qi1zf4q*BbVbNS1B;mww)mv-IPrfZ{FdKK9sdCdiIPDx0}u!
za?%kSzs%;_F>?<mEc9l4L@4$d;JHjkbAS9st==RQeR;wbwFKc^^)Vy>?Ywb!b@WZb
z1&ns5x!IyO>yOI~@7clTK5x<j2=TGZFd2D>2Is_?{Bm7=32<PREFSt)*>xJJh-X_{
zK{5y?b0&l}6Df#?25To)nosngqmNL&I)93QgM5M&m+8o~3`2oHC8gN63t%4=tM@5v
z`k;+g&?LyE2eQNiW^1QlKE$1ayZ|tT+;3UTl6W<b%^DfvK!MCeO0OSwdF4~xk3QBZ
z2^Wbx#L-0mGoXp(Eh)cYaW?O9iFAJ!kQ2chf6(PMb8Tx7ViDKmg)9pt*L&EBXbbE}
z=)925-V*I8r+I@}GAVG*n+u3Pf1C^M#S=W(oV0t>-ACW(F#lR1yL=;2B-(S-x1+ZB
z*9AEs<`aKMZH#6Mx#%nqQH*N`h}i)z%r_lhT#=v76L8Ub{hf_T)Y&toIQeq0Ix3+h
z5MZ}yjd8JupM5x;Zu94M7M0=E=I=5tQ1`juhIStF!w&Ab+}-vL(JoU92Eu5DZY-}V
zM=}5xG|<cTGT*w2MbrL}9cm1Ez${&Ez{(2%UOume8>!u%OT@$RR|j9vFamLT9-=`{
z$ZB*=fk6$RQ?k-o!jXvPHT21CqHjAbqmJtXGQ<$}&=c&yPkF$OLZcJXWtN}x9pU8=
z_bTI?2ig>lrgjf#Yzw<auxs;Jc%Mk24KlSyeS%H@L0=}@Tfr7<r=dmZUoN!YdRSS?
zC(H93C#oPXVyI)7V+dj%{=)5OWHXU=N9rsdnHYqak;pouex(ZmDR-L8Jxwu|DD_UH
zTQ?ApT?j=s?GB=xp(T%5ai-xZ(`DE>zRRKF{r|<@Uk1e$bZww$m`rejCAe#Fhu{R4
z-~k4=!GZ;M3-0a~g1bX-cXx;2?iT!R-uL^eZr%Iy-cxl>o%179X3w<j)vH%`uYT6E
zn81#fV4w9|^pkwuD6GrKhmZ$L#eHORUQaJWQ$ckdz#}NpiExC0fV}-sQ-+|2J2p4|
zVF0cL9Uf^q@Y+U>o^x|R=!erDO1}Y%6i)X!DK=IcDe3HclD8$xKOfQaaMlEBMyy(G
z8c)6~UJKU-k@oK17Jw8&SE?SSUBm0=yMAo0luXex&JMz#Mq7SFBIRr$2F+Zfbw(P)
z+zNZ6F@@lw!rv)~I-@fc{1(2!PziAU(BIKo{trX>tdd&bjQ_z;@@*fl6hCOHRzUl0
zmmvzcS6pkhcMrg}Wzep(RXtJ_!gaxF#eBlijS+zGdU&3?4y6DnKbD9NBqYQqUspY4
z4$)pBCeG@%B2wMKKofcnf6Ij=p1kv(jFYTYR`4+7{D)!lC%%C$yaWCMs6K4`4L&L|
zQ-CD~8vFn^G{RQ*Y_L_V*Xt?zSVO){tDK0u7wORI6ZFggm1JG=erZW0?_6ggG!RDe
z0!h5X@)E%CZ|>vy6gruK^UM>mFWWyy{(x3utbLd5>>zPhnTuMe4_1WRDPc&Q$BO?)
z0_Ecgb~^=Qg|}eNQ)1w7p{i>`Rue?CR>wMHaIv9%=vd2z3$7))QO03nGJ9~Sp;7Ni
z1wJ5fz1KktVUo`z?sY}oWwwzLq+0kb+CnTD5cOWwQ*w#QLNY-AQMpbX^@|J1Zt(-h
zf(V0GiPZ-@aBnQ*s-|fDVH4ADI|eC=68Y$E#e5>1RHzh|PkTiH!pCTLp}lvM?Y9(u
z-HB+bNb-AnUMOu%y}irpLVC^w6q{&D#%c5Hqp${HxXrR*urXTj*%)$E``!vT;4l4g
zmM9dbXa|U--98k|&3K)&&V$YXC~@E?rJK!9W0D93;h>Mr&<uFnMad8aYW4ih_#{wp
zjHDm+Wkf*d`EqhG)Is=%C>-+^Nm^pur-gsA!svEEt>@1?E0S{ku`b%B7ib!OX9LDi
z%Y<=@vGEE^wFyqC8yMcOKdHv$3FoxDw;wK3uUlH-!_G(->T*u21A>tg-WASaz#5fE
zW>Y7>J&oBM`1b0JCqG6&#t1sLcJWed=IV!UsK^k8_j{sQ#_{Wwd|%Pyp|bUf8j+%X
zq$$|mfxld(y1GJRz`$^1t6}BPas>$~t~RUT03nP0)AGNx>f3pA7b$*13{pvNcykQM
zpx@1WklBOlbQP2s&+M#59)c93KaqKee|(Q|Swq77f!5aMvU*fV(s(BB-HudcyX;fx
z;`6iNjQF&lE<9!Q0jlDJn5PL+Xc{xM&U}2z>Q@l>5T4`>V~NAB<pEA#dqwCc+L8xg
zJSsU-loJqc8!<3-zSmQG76gLcQ#dhW3x7&hnB_oAa-}GFK*sWkh6ll*mw8%^@sMZS
z;R#c48x~PHN6nZRI(W7U*$<l|qc)T6<XK9p=8n43M}uR$n6*eSCBs-Ui*~afNzG!c
znBCh0xcl9`#}R03-@l8@*n0ngZ53~ctX~8AqSX)Hde(nHc)&s3=kwPh{bfGw>-U*-
z>g^K+^K25$az3Aw-`5E|l*m{#5kDWdlpO44Aq*3eN+_+I-b8lvEHIrVshA`8^uwu(
zqP})0gU?9%EjC~`;vs^w>_qVRpJ+b8%*rWfK)fufC=8>*hnM%9h5Y_ijGve799|y{
zO+0WWp#56s{xGOh%hy082J(p#A4TcS$+J`nQxyJG%(|!+ir>?551Pxihfj|32-rsP
z**3%34GQ2qcIB7*&ayx0YzFB@`y>yu-rF}wX_--hiyMkI-`~vVHpx#qPMAX4yNEyp
zh4;D0K`L?nfR?~Eh)S0(<{k$&`izDn4r@4oWB7@A$GQKL7X~3o5!T4OUq1^LB(#V}
zh1&2^{LnKGF*)BbfXj!P6^951KZGU3d6sN|4@YTOyK~z4k}7*t@V>+SON0o|>TtOc
z9tH(38fvsBbD;ClRA}QE@+sU;eK|B93ishfEO}&~ACf2Bzn{7ler_>0I}1xVgFi}1
zJ}QAOO%m}wf7p4aUIyTh>oF+X&tNpeh{Gr(6Yq-mu@c4f^nR8sw^69A`}fpk$7v*3
zADMAg#>-$bcD!)=h@lECqT`q&9z}P8$m7O~vWPqNAjXQ63Ej*~v#yXeVT88aEcSy<
zW5!ug*iKwD#I@+!9-ZXM?<{4isDZVt^ajG@_(e_&D59SO#GV%JOWGleSKQTc!AqDw
z`HHb<-9rBC!y~MBpk4fhg`$2W6__k<=->_v6SyIdzF+x8;VeDXsP4-GQWop-WZ{#e
z-||4=z#E+K&?)+(pm9YU`37X@fpm&!LH{-9$*}1S7;CW)(NjBfy{}<9_z}|Zi4TeR
z;I`!92mjocRp+dJ2}qED!HBPVvp-Sqn=x=2Gq@8^>k;qU=ki@TMAR@+YvgyxVTZ;g
z6)1o7!r|sKg&$78KcKM~zWEYcgL%``<2Fda^n>V7XZYQ{R4;y{4H;M~_o+Mf9X7K6
zOsu^?!Q{>Wy2Jgny^P%)w=XG+M_QS$SwXzL3nC&`Ijpa*hl9IU3nhd7nxjzT9cBul
z%taU82x50Z*7K}m1~<--o*!Biz9C|6ct>aUu;eGQul>!xGNHSbr7$JTXP+KBIh=pe
znd^o*oP92&zPb9qQFQfBb%j%y!kdEJEhH?0|1toi&+66n2!q{u=*JOyMYIuo<7b?#
zh9+o;m0|CTy_WZ+uS<abCXD?|&k|LhsWr;aFW>1~HH0STyf2W}e45RFUVTo)5qZ2g
zUu_;|oTfn<pod^jdgTJxMUxFKXG1>)b_I&^ydc9b*ty|wejgoG;}PZMs9QLTD&W5^
z@^FzI5`UtyqTr;z#0_Qh6DkNyebdu-_bH4_g(?a`8nhjB(MxN5?1xN;)g-IGhw^o=
zN7%=;IyUcX%coWsN@dyg&3F#e04!A&)M!e8Z!a|8xv$J$tTN`jj0BCd4Fk3tX~RZ>
zg80|gdBuTZNeDs+%Egy;yUVJ~pdYAf*%S(B7N57jl+3oeVtrDX1t|_Yd!nJtO>+8(
z9zkMwEB>8#N^Z21XTP9f4k8`7E%i(9#i3EG!a#l+S}%O(?+dGJXA^EX4{g?~|B4WY
zvh{aujU%E0J^k$C+_-@GH+}6OfuAT(CL#<^q%x>)19soQ(s%gT<gQ&|m#4ataEHAQ
zBmKZ|i9wD45={>9a3D?g;SSaByuHfbO+nW|f&D0Nqc0m$!AA7q9aC6kw`sent91q~
zXF_w<rB8#jbA$q+dAx;cgxXdm#|lK6Cnsdx2TmLVb$|@#Jrw09uMW=oRB{S;C_$8f
zs3=Ue)uqHKIUi#)A5pCF7t?qo+=1UYgrR7APrg|ncTi#9Qos?lj1jL$mfJ>^e9><s
zT_t(??aiB1ITx1S6E^+z@4I{EO~`!B%Vjq$e)kKo@O8-Yf;PnsQL(V7Q|T33pJ?d6
zZ;Jih{*@yFA51U!;Vl=YbPz~N$XS`A160iN>{OtDg9Z-{qV80tRiH&9;X)Vm5bkGj
z_FJxi64_n~lR5Lo=?zRWihiwvrMW}H%EHHBQb>b_qR(!MOXyd1e4qraS8&2%K?;}0
z``;uRpo`QTb!1rv1@@_<lOYkoO9$DVvF{5a`um|AZ11mrBFf0{5r*}RGA8DuxF<MN
zD?>NmOLUNXc`Fl#G|Ln6;coMz{_8|O&h8f3doLCQ61B<70b(R6Yt~AR4HcsNlnC;J
zL<p~TMXNFVFmPu1y64J{t0{j9e1Z4-NXpt-g+_1qwt)n5U6qgifV7!Qu3X;(u`0KF
z$2``>z`|-*JrM2d+4xTav}Yo1E+eg=PiQf{Y`@1oG$U8tM`3aYVVsSdZjni&ZBY>P
z6C6lgEmMbaz`nEdKM&*j)J|wv@&yH7I4FsE7&j=h0coHId#mP~%TDB|uB6{xyF(PA
zCL1J>Jg2Y4`)i!{?Z7OXqrhkVr0X<6pMPEa1CJd+<im2oBVNc8kf6i|j4l|L1}_$&
zn5@@pa^XdENYWo?h9jGG@dNqwWDo!QBW^2k=?aQMU;Gq&zUg;1KYU|(UY@i44MH%7
z2u~5spZsDzap6$-_k>LT@@&Rn#jt_SgNMmoKZ|>;i+i>`C0ri&VT~$|W6@6G6ZH)3
z&R0KFm0HDNg&_MItCr1%Hm8MJIO~0SH%kJE%eRK)@-=wx{`IF81(OAI3%`g6n#S@#
zrv9$C<+aC2(y;stwwxAsxjjlu&v|+G6IK}+mJiB|g2<gVcgaCf`D;(E_=wK~rb>ut
z>v!369MaO*zBqhV1al_I;{2&aAy?KQ8sCDsUuTYoz$UH%#nuiqm9M{hHwotLd0z%I
zu{=qZXGJ)=Tpo=%D`XStsi!!vb>i-k^-CDLX(A{h%;hMOdoIrqJrr-8=&-(j0P^Mg
zsf*bzzWy4vAtC1i+k3yX{kN2VN4Y7=6r6Eb`bf)30tv@oHg-{;`ZfcXX{N&ohMgoo
zJ698guAe_YewqFE@YC*7R@-_;j@(HovsV;$c_bR`rQcW-0j!q0Tje6gZ5t+p{l-i}
zZ{i2?OEzSJ_qtN#sB7K0cLUl%6d5eskI~~OG&eub*zO9(wW~a;MdJF3>ebyG7$LOr
z8lTepdZx|PcZ$14V;$%ZG1PvClrT;~s<eu^N}(JUMIxVO(uRhwtUmU4f45_U$g$ly
z5I|(TGL5(@0Q2}9*HHq&Fj}-jn0Mf<pMc9C7}&%YAgHY8a@1zlMv;Y{nLA`+Cx`hx
zxCSHy5DKl5N{|G<wo8NzT${NEtV~qYiGYxZu!wLEt#76Uw22MS)1QP(-k#0VRS~;r
z2wM>Ljp8{by|xM!du<i^pH}N63>Pd-=*=TOn6tJ^3g~HvZ6UFupASPw>P4eGsRWTV
z+nWs|gg~!N!LI!072VjO_0cE`(|AK~$dM*kvDobFk7cUiXWo*>Mhv$+Z=axO{&_7U
z{8}d9wG6S~f6$vko&S0ADyaYGNyPt2r6E9xaMUOGFKhbb^Cvp$;g~AL&1|<liT{G6
z6~RSvGeb)fbut6ZW*0No9@?rUI`Px|3;*8;8+V0mS#Ho$vsv}yQ3srn^|l<%C?Pp{
zwJqF7lK;W>ijw}wYZ3dN8ZlJTbGaRgy38b#$Rrc@>FN6%LJ4Y8o%I@H<l_$df`&Q`
zq2=-9!puV7N6-2H0Dq@NN!ZAhtE#G?iaiDm4;k_Uo&HF%jpTBA9?0?lTg}m8gYv;)
zq*P+eTtk_iht_)Fe_ucr%*x72LCz);Lj7T=`R+2uw9_x=BnT6tgc^yo`(Fm3A1nYA
zoG~EG>-4{$qox6*IOX-(pCA7NcE%O~5cCH9|8D>1vs5^M!TkS21!Ln#PAALU>*dwe
z$~-(gU%iixESA!(Qwbchv#}-gK9i#DxVgDCIfU*W7#SOfjNNV#8&Owm5oeReN2UI0
zXizUIE;dRP5HFSqwir$0R?^X71PD~(o12?S<NNUKz_6Gw$h82aO<`eSbg5?j!IWbo
z^!f2lNmZ4KTq2?-u=~C7gU;<XKqKPl@((4o*mrkkdYS?t{!}XDx+K<h{=&n{vwcuJ
zJJ=}}3cbjY5;x!jC^Dx*$)gV7YiepF4GpuS=h(*hilYPHH{2{9LMMLv1Kd1h+2Y~0
z^7O%Q3kbj;lTTtu8FL-Ja7ItOr;-k`rD;RM!NKDTjGIV(gGcJs(8ZcUR;l2UU%&J|
z^<35w_`rz0@kOyd?2jhDxLG=czWDgs<~4}%Op_0d3`PyaQV$%c&^ffq+sRu9MH9~t
zrnSRjjl~t0=I_=T$>j1qEZa>^u2PzpwT{lVa+bM?ENY}cPtw$@gu3WLv6&JkCzX)O
zD^&mXpi6RqA#0TOk~!=3ZBOY0=B9BCp$VS~(_<M@SL1(1-`QTuat&g5&^Eal4DdF}
z=?(C5ijb*ujhR2Wa%(t!t-27GQj|&_PU4rBqE-7|T%42jT#&}2L%lGlMoEbYU7Rjt
zwO-{*{v}_0({n<KOKS~%gI-Wxj_pZncG~(@NyX!vtwdOiOK5_$oK*&az#m;X_GuXt
z&Dgkv4^7mZ-hWW-Rpj*EhMR^=JF$!vD_3nVrdlPsASLd(s%33`pM(`8SI*_S)+}!&
zk$8U`A7+=1CEdK!MVz3Hr%F-j`&CRATPC_g-jJnf)MWl-rMiy{1;+xDN-l<m&3Gv7
z?j!5>D;#c@Dn&=DC3+@-ha717;SXTYc0(~C7^tIqA+sZwlN{yQwbwf?9}h?PoO}Xg
zrHA5^E-2}Ehb9l?)A?vPRN!*!b}9&p`N@tMMp(%N<*DKmrNqSijc9gub~O5Jn73~l
z34MmX{GHyPsynH58`D{+X;5lpz3RhHEQu?W*7FzZ!Ll*6w2W@do~bf-%bh0X-^iHX
zek0I7$K!*-+CHG+(a*X3$1IlbZcBMGPmtZ?IJ|!7XU|Yh!5>lVG6@IFbbD9HQ;2r>
z(L$yA&CxQIo6~U=f|$SU1EG_oe_3cW0CdY-JLY)-z`k!ml{w*d35;#}o8l;hCy9*#
zm{_!J2Pyc_!J6ciZVKKB{z|JTifuURiX4K34(t#Q9Voule1)RqvgzSoOtRh1P-loy
zmHV{@4PEB4Y|#B;c9p{rEcYL;NP_b68IDSgLN!)=HrE<@?YeIX3^Q#;L-Y;VOa9Lt
z=#n6?hK5FblpJEl9KLlcO@8X=;B@jOz5QpS17ut_M=dTILn)$@V28Jq2!ep^S^tIX
zFWKaP^0zSm`Imki!++Sqin*MPRqKZ5YRe-%H`wsRwWejmw3q4!HzNh;o1CoTEmq)W
zb03T}*u#=C6V25%WD@D40Xha1<FSmD|CK=0T!%=YQFQ={uIIJNzyL92z57)x&l=rm
z;9zxAFfQ5qkr7$ji*6fj%q21={@b{``OH{et2XWM+T2qkYR9H=L1Y>rP(}pN>4Lp?
zXQbuf24MkWGFI2WUPOD{%Q-NAPI*x&@1Ya>nYIyO2<5R!SCAqfKXRZ*g)YZ_Lv&?p
z*~%Q6eHHvJ2X8EiMFbHEsY!d67}%i*{gU3j1w=y$&{-v{G&zMrjm;-}g_FyZ149!^
zqBeq!R=SbV15?NYbFVF814bhUpIe$JxTuF_-4o4g^tT7&>u#2gg<qI+0Nag1vpleb
z!*wH1^jKJPpfTSk=#jZHe?Doo)+j~f@$bukvdFc}?Mg9e7v*{2Qwup@@P#x=uB{fT
zdW3^ZEiF?xQfch4zV#F0*(9-f*>K$jf9A_W4{X7+={Gv<Z(uaDH2;VZAJQAih0%0e
zx5T$Gj#8WT@)yi`l|*e63J}?F(=$%LA=LwlxN&U!80KW&jE){wRDjSstv>l9k-g^)
z`uuo6%5Aribn;e{blV8eW?VGyD@tbMxM=RihCwp7U7iF1tf{+zDTkaM@rMR|$8^kj
zW_*QHISSD5E!>y{!N~!`h5|0O&A;Cg1Y{jw|9+va0RPPVD$DCbIc55X-WOP*=Jm+=
z*spngP*6st)&NA!Bx*F$XYW_>kR%}hI2?MPztqZ8fL&w9V;s^0c=R~u;ZbFufYJ4}
zo2=geCtQhaf!>n<!~GWC7*uQvk|FswFkOQQVGR!$Vx51jh_h2fxearE{BPP-DWJAb
zXbs+PgMgOb{7h~yN^8e}_hjWb1}A*YUavyt1S;|kKO3Ju1#_~0ruLu%<RmD{`1cJE
zw8iPIM^!=lf3;W8rhBCXTwS#ioSd6+kfW_v9R$c)<L#vrmK{*r`j?-u6P+wp=p&ym
zz5qu>f!hW7p$)On8u{H4-Q?utm|VfDL;#@Ne7gZpfe$*rd|Ga>tIDlXBAUHhwTZrK
zO}{H^6Us=Rpvg38WtzP!n+t~S5saPPWl+350(igPvP>IH&&){Wu6C<PL`W(t$4<}A
z4rH`E<ovO(dKNz3g_b=}ySgtFwkbS3tUYSs9TXXLG&kJK)NF>msjjYue*NO}HG@wb
zi>5F>F0Na5=akSVN67oG6<9?H3I)ha#xlZ3Zu#AEOg8|W?5TY#u}SM#CY{(OQf_v(
zNL6)}(R4wyz*?-qH?db*Ly`cEFks#e#mWnq2^Q;UYSLLArz<Hii({$MCu*KFYZ{Me
zkIUxpH`NVIMGDe3Ien9sa7(SJON-yhV~fd5q>iPEe`O5Zu3NH-U8pv3Y8n;%3>xrm
zFVk+?I`7Mjl1=3ts0@JJ-#dz%$nNX^*(0-^$1C76ao1|HtGa(+kytm2#ddXJYxins
z)axvgUkSiiO-9q8dpe7QKww~=2r*v;3O?5$Kt+~YdYKk8$EyCvxq*IsZX;&Ws-K<6
zI_k;INopqRNl8<gQUX}eZ~FCddGmy3wJOCE7_@1D=(X5a-nswjnUL&=;zEVKuf<}s
zde`<61v#BW0=+KfB^F~euvW;d|2w(l9KO9@#N%Fa7yoWwW?jBa;liC<^?im`y$oFE
zd~Oi=nZo%;dOHT+o8a%Qxw&F^`1mvL*2p4@sb?o=Db=*yvWK-Hq+*!T(5-&OnCNJI
zul{~KVF`(*={dsql_ff(mkdH%=NF=62%ipr`4WfwABA#UvqQb$gV_2*cr9cfHj_BN
z@86+Av!xdsq2-dQC-4pDH<3fjccBz&+3@OkcwtN>vFCHk%M>Vm2hh12hKpXAF)2^0
zyydJ*kAWCOlx%EiPfyQBqi4?`Aea*0pglI1d*dz&;^0urXlflVq9U=im0ZnQR{^0-
zD2=(GnSyo)MaKZ4iCngu=Z00*w{o?jr~?DNt^>1L;gcI=$vj3?!zmn^A)(aH7!p;;
zWkWVFAb<`N3$$Dh&{ST}@|c<O(fIcmDG4|P4A1_!`}cdpYA))p{f$NSH#NdI2K>39
z!p&<|$1C_`9+wzO`?$nJd>0!pXDI@}yaaa%&}c+lbzkN6@>tGsl#9UK3@nwmQPj=^
zGK8_3?0ju_zCVQahT$ee``7OPJ3vRU@SgkJNsFyQcpc043T=;LSyqQ$i`2gU(+st$
z$c={!Zl9J&F^$%yBx+9AVP~1#vC&b*i$%I{l<Ucv30gz($T*tVp^?iJPKTl}Lf$&f
z0v3iZS91BtpplUgC0$)6x0|DQhtp;{hLmP@6HkA1h}1p>$Na_0rF8Pd6@B*yyrPbe
zwfWSJjfE<zB@NQI&Cr1altXsRVp%K(Exm^}T3W0DQTasTWtj6txQ{+hc+&3Ace}Yo
zjpE{km8zUZjd&JA(5*cX$85v^<4D~#NBI}RpD=~X{1q@hEZ21V`@w{K-zdZLM@=t!
zxi1@~sT2B3lG*HEWE#qxJ`R17oj==v&!GiT%cY@7TGYE5EIFM_J`HfQs&So~n02qW
zuddkE?Z=h7<hK9i@yTr`LCFp3z$~8bj>yk+kz3?g*i11OD9M-;reUI+YB9(TR3fk*
zLdW?4hU^|px9Ba^S;k5bxrl!MerR+*+)yWIk#D>Z!9=jR0MIWywyPe1IV+s=+$}pW
z0eZ|NjRLsgx2Yw`0X>av=2v@)F{!Dk7ap(SyQH!ivX6Yx#;2_w_i+&#<h4M|TQZAf
ziV1#RJG%`?#^;MII&j|8`+-KHogq=U9?X9|tt_s`wfw`EUhB{JtqaQgt+yJ*nDgG9
z`|;dBq%Z&&z(L0_AMcYnR%SvTvq*8(6%!L{9iw{(JBdL54t7Ay{=ZbPf6#z{IV6^I
z>sRtx;3TT_2Kf8|hXQ-z^;ilc@mKiu?AYDiX7k_QOTqx3mGq*tME@u16$zL(O=y@e
z|A}7(;-23DE|LH3?`wsOfJf*^`0FG7|NQ=gP&-0N<QGJILYM1Tav0#qdj!YlBD=kD
z$r~?WeFl|}zdHs(0tpJZUB7W!%>QcB(#jrtcT|;r@1eC37{~&8{9yt_Jo3*w-(742
zN4=EE7JfXgHr_yI&j;&YRBO|fcj?hIM%zXPmrd*qyY-k5V!Hy{Lsgf5>w!jxv&S)6
zAwym+e^A>4?(~T`re_u?L$Mg5#yoe}ZrbO*om|XbUN%kcI?<wTdJklLzgz-(o3&PD
z6#96Sl{qxg2&-OW?;rP-{&W*jkS`6rptv+)F#J^7Oez|n=E{tv{P>Y(W#x9qsb%z(
zQlL%Ql^IUxA9EY^!DB1eyPen0__mr&8Q%@~3dXBygC73npHAsoYA*tg0O!x-6^4gY
zE?4VtluVjizy^*^CXrEU|HKL~z(jn4ZKsWWyH3)mJ8$6ITKXP!61M)+xi3zj!?(ZK
zd4V<4_-%&qiUZ(6J3|hy%m@__gB~x|DrIFQyPdYECD2Qi_#c92=S*|wfEdavyi@zb
zQUG%l#0x|=Y)ZreB*YaI#3b0KRsV5njs3@|m9q8fD!6;o;GF0W_ypM9flJYU|0XD2
z1H=d5`g@w_0d@GjZU?vv`y#2h0UISHAjG(81RS}sP%?o60-o#xuVLyJFt+E6NI(;&
z|6bj6=h;#bKz%W&QK3%iKoeqJy8y%>l33Ojs4vFdTdR5KHMSE^;MEuSXc`0T-V(On
zZf$5|0ZmMt0e6D`x9oDPoh{%`**`dl86A~#<DdKZF#2l<ZD6=RDr}hkHE?)%`0K#m
z>SwW!2Zsj-(>zUEX|0ZC_*p-U#KrTV#`|k`u5Pn|U;FLk4i(Lg9>+Zk`gDQa&3(o@
zeNU(K%r)&@dpQ>3<QN{@WVd8aVuuK}wz0AE)$hh~PBl*R9+OMwXWD-x{^+t^w<2VA
zou3Up);*j&K6SM8@=3rlkpV+fO>&G-lAW#nO7l#Oho4io6GR}W%9Jmk88e%J)ih9T
zXA7M!m{5YSe~})d8gTX?<Y*MKwYzi`!|G{Y!!6x3b%>vyn;AUc?8|A`4_mof?qB|u
zT^oTwnX9D5s`w&ZI{FVWbu=wuF?+`LVhh2{+PwObT0Wi8us0%9kl}a@&}Mm<{;bkc
z6}b%4xNM#_{D_%>{aET;v0XlBTzovk9l$s}Z?kD+px*F7_ESQoArd8%V4G%-z7DZ<
zK_WkIJyLlK7I_!)z{t?lL!MACRzy9wK%-vR)?SNW2bQi}CJzmV#QP=wzT0>240A0e
zKhWT?Q9akxcmN<7{wp&Xz2&Ioc3D|jM>bVL9+D;}i})guhij_x`qQ!fk<8RYrp+rV
z`b<`pol#v1<@~Xj;9MhC4eOd!zTcyx@69YNIarrf8>DWI7bs^`;I`v1&<DYwq0v!J
z;j}!5ku-k!9n%&x^+yyPU8iq<S{o0dVsv8(bsc(UlmuM9Uj^kYC{kCf!tsfJ))xPU
z^F_k`{mG&#P2l^J*!x8dwu+=kZtewE>4A5f7B|c9SUq+ulG%+?Ff_78mEoy(5d0sE
z)=ttd9@!#?d_0;)%n$cVS=t(myZaTDAP^|Dn;^5`8rzf?O|zK`w$RG;*0M?W2VAt_
ze#nfY=PAETxixSx+sA8(P}0yK6)`in5DsZLCoEL7F3fHj=_Dk--s!sB$U8Q*=A!h@
z&vVLfk-5ZXVA)s~CuTFY;PN~FlVtDaXf@S%`w2aR1@%207Z=yuL3fim_mV{hG0{w#
z3lj9E*&i4a?2nor67PxQ>ermbZcbNd^5xTGq@=aDmTmI^e^tGv+x~M0Ldh-e_~N30
z_*B;0hcFW6@gR^8MkTy(*~o50#CtO{Gft*Vw9P-~VyteXpkxlGjFI!sbIcMSG=uWx
ztBA*gEaJgcZQii00wimcZ2Au;sdA;*B8<G$bI9%cfQP9{#T+^~jVo@2V;%?W!v^V?
z_{iZ@<bqJLPekTcasoDn6|^<lfEZ&`FAV@THD7tJv5-qN*L1iVr*;kd-NIDi)Hvkp
z1is1rFt;mtB-a+ajSyAfHSAKGQ{vWcWZZUf8St+>F5bK>#umk?yF@`-Y24L`&W8Vb
zp1ev-s`A{~S3MmGn~4K8f*Da~4dLnetmCQ5;S#oGKofdUlU!xL{qZ6?^)w2%dS@{_
z%>SXYTK)iU%$R#&{?nAnNOa`d<vSm~mgY#{Jo^Cga~c-md+x%Mov(Oia5*Vl(@c85
z37?JRTPVrzDG8_4h4ga|9!1G`EZy4fu}hj_>Pm-XQYanA=5W<rrS;b9x&COORy%L6
znSq9(w!#sfyUKI9(C#rVfX2M&z2;t(EJ~O9f&Yr*Ts}ZGz+nu=NXYL#NnA_nfb$tq
z6gipYJ>@hUm6N=Dy#l6wg?nI8=lBF(^_gT$qpiKK#p2BcJYpm%15?IHM#-6iol)jf
zcIueqa&6`hs+ZG{Oap$zP9FsbTmRrFtGwl9@AXALljm>J=<nA>SV<pcAbF<3!CTF{
zKIyfay!F=6Zy#=@OI^LDo#yy&2RCx*DCD~m{8$0)rDl>yZ*MgGz(?8qlKAqkhPjV9
zC+GvcPL*)pXp+(4A5mQ9=0%RqcK5R4(&#6=lQJH6p#%F9Ifb81H@%uhiaoY8$q-{5
zWj-JDN=PB;u?rHGjQSlEmvFWuZch?o5PrOcK&gnLZng4Q=HfiX;i`~z`4CcER(lt*
zC8xL2SH4S4U6B!F|B~hio}KIrpZRp!`qaqxIQ0TJ_x{R?m9nf?uA`KC9dfvwH{{a?
zZ$?CVY5Xw`i6P3dyMSh^7YIJAdui;QV`etUtLom7h8}Ku^dySQ;&o)Z?H+S&-3jth
zWq-?Zzmd_6{a9SiT3&kJPfA_k+|GEhicrGKZ2ZeC$hC{tdVtCEk)JZ>X{Uurvr@e5
zuD`Ud_hTFE$1>yeu!448>ozL+r;|D3EsuZqsqBM4YUxWL#;_1RpY(CJy@W{5&Yjy5
z?n??kJQ0tQ@-Gp*b(VGa3f)nQ&Hf*4$97K>#ia8};@1cGuan&LVCpl<<@ET`_n*0e
z2z2KzmW3v_CV$lm1v_?eufz1rny6qS!E2I~H*@ORY~&4wmjMnJIjeuE@gfhyTfPBD
zQp+sSst=^S4%V$?s845Y?3cx|0*<%4(0Gh_=<J{?6g6XpgPGn?(jq&+rHVbR`bK2V
z9Ns2Ey-8vk!x5GKm)AJ}q(7ODdJ1t8HGK?TdzS9<w8WwzIg)j5RG9liQjy_b0A9Z+
zP30N{j`jxVfg8cO;yQCH&Z(44+K4{rwtn6H-77h_!+B1t3E|R3n;b*u%9%}TCtNW+
z?+@^jFd<_T&M(!k)9mVo5eRR;E~(e?S_lf_V6M7v{qe#T7o?gjHor7Nh)SR{pWWuJ
zz|co1%}#FQ6M$xYw@1xD0rFYTdfK2~s~+yaw|+=kDVSon5D^C;68fnSV>OFYTv175
zoUBZ#*!+c`P8r7E^-*|zdVuVBZoqr}fnY=%m$z1*IJg}5>~bvCO1A!+u}(G_YYvgG
z+w(L!nrh`wF)Bd!Db?gW&^Tt1#Y>^iREi9CiEzT6ZNI;07ha)YnVBQ1J@kAUkF;kf
zl@Yj0W*RK2t?UP90hnj+S$|;ounjO?WPxdGc04Bdl;qDjnkK-&HbeU~pD)04$x$<b
z?E*tUG)0{0B3!qR5+M=LXFs;vMsMf9@P;nI`Q_B}Ix3@Hb8fPCpUkPjNL7}eNev@a
z56mI194CSOp+5+nd_>x)gD~`CD7*aJDukVycL;&rv*_meRcRTGf3=)fQKBEu<RJe>
z%xpNNQ>wI3DPIz3)Mco%d>hlLNj4Bfk^2z$hUJX8ig{deV~@$FG7ueOvk|WC#cMmP
ztbWD*K^x1J`bv_j0vIjD|M$_d=&NJ6Xws$YD9GiOmV!QaQYK^wC}Sr*D9<O>o|Mj`
z=rkx=|G56!`gkokcT_}obfMSL+H`^<lT95a>}YtGQ|QQunEMU8W#P6bZ!bC}++T%Y
zNg2z{0Oq3$mVuXeSL|ok-ye|ahjeCw-spG9yWg*Rnv@HFqVN<lVB&l7J63yK%zR=(
zik+I}ZFF>}0rzs?n25W-oW9ILqqA*iIEW{}QN`?8hPnPL>Fq*=w*a9EFXQ?Wz-(a|
zkH5_&_)-}?1Rmkj`h_$-xAG&E!3s{{w~iEdFAwFNLCfk^?sn`Jz{aAB_gmdO5go(N
z{p{-IWHoC&7EiO;_)Dt9zY6X+WabQZX1PqyH_6dtsFmj*z!8dR?Z^QFFLtOUyrRPm
zCK=A(qzaT4?TKfBX}>)x?>S2bUKu-jQqB%Qxjis8f<H>`Bc5^LQEhz;w;n=VVeMRH
zn7j~#_t77J`up+_k$0q~xzQKGVeZg%fv4$Y2Rihi|2K<pv@!pK0+=ur{avWf?ZWm(
z2e{E1U+X@~&+o|>P60cOiIFr&TzSh?vp{^dR_pU|TV<5wAEMCMGqM4)lK9H~SPv+N
zv}ZoMa&!eE5q7Fj{yyxOikgn){$=$pV;n+H58mK#obSV}Gd#;1yz>L_EFpo6%p3)|
zPI92#4pZC`aOApCa}cb)mB-P8b1;V+kffl~S3nomD)j3zev~~eYl4EZb5T;hUHHB{
za&f!nNGI$Q4SuXkv@PXF?*_IYm47v<D?$ysqJiN!Zft)_)$QhEX8nz|w_1M4F%&W~
z^h5(yiT?e_NR$)<kw*>VQ6R`cj-?&TGt!aY&%mi_mP_Z+KgGw7loIi1F{Z0CmG*3P
z(I;*^5m#o1(pUXCHrMlr1g?9X!iL|vT{zv(7ckuWg;%qney!GDD4)s17@AzspYS)y
z_*d$pckA`U6xKfccB;XPh6v{t<Yl3Gm*J|1<85wFjN|v`8=JHIKL0w{@jTiF)byE?
zK#CVcUUTv4T)i<Xpwz8~-(@I>N@drKQjqZ)M4xTHLYF^fuwBBNuV*0Wbj6mRqx#lA
z)gl!hX+$9Wy)xe_c4wI94Zdu><-)H43Qg!(gk@dy{b-Y*d8){OIFNrq+W~awCYtuT
z-U^Q`(QzGeh8r3uRT4+jcRo|lTrZ#YKbMueU%pJBZD=%QDE!SugIE|k1fp-;KA$?V
zvnuAJN2W3ubWrVMQb+LmI1qPQWdfCT*mMNekTX}k)pfo>=)g22#yXVGe5_ghdl1~S
z_7@V4hT;E1_jB(QV!r{!$REYqxM*Yd$UI#6seH>Q#<2RC9EzT7TZqu^f^D2$@CEb-
z(WuOzhp$z{j!%n4nAO~=OI3%G$MkzH&d7f@AhtcS7eHIz%hM0+aV(T%32>rc#)XY}
zo}1rzYi01gw0@a+c#Popy)@z&Uriw7;ToD2YC821j7J<1?uM_1hRO5`Bcs08Yx$Q&
z{8BGQW%V$>QYRR*OMWr_{kFX^QvVUy4>+?kx@S6qdr?MZ`!Oq|{4*h(>H_8R?RsAp
zGGNQ0Vx;>UM+;Qsl?1ZQ@ajL02p^uvoqsg3`~HZdAEy`3kZiBU<nFzP5gh8Y+jUML
zh&=Im1i{=8{2Y`$JxE!;y`E5nivyc1U?R?s59a~rhYVBge+z(Nr6LWDqR3XW&wj@7
z7+%08`WH|#O+mgxhkr{m%+$HeSI<lryVj^|J2Y9V7LLqUz9iaj0Ob!e!Kp$GlgQH^
z6yG8F7=UNz)?$1j@Vv*w5WIgf<^e?4ebqXV`2ScqApIE270<FO6?cEE=tsxVHnda3
zoOAip;xeOuooQ<EMJ1YaqrZnvv98CZ9CX!9V4@ZZ8~%H|dP{Q!#m2~3E>n+)cO`Ud
zf$;5`5t57l`fMP-_Wti?G`h~h?=iW@27!yf+S&#r?4cMVx)Drr-(dv$Uk?9W_bYZo
zxMe204I>|g!z+yM#%sA2yYkiHQWf|n7)jy+JNzqWmX|Feg*R{8N!k-L9(?(MT^w=f
z)q+0-kTu~>hO~mY&E@y7W_cTi+Yj$mVhVl_>04P0Gm<ZUQTdpVTb8hwEihJOum34P
zVbn=(=R7@H?sX%Aq<{8CJ1A%ut*Shf87Xw#YdnAObQ<oT?Pk?^65L5ufY@h06a0&U
zPo16?H(L{mO^yds$1@im=L`AthN1a=Ou7#K$G01|(fy2DGy;R#2MZ4Ks3^iQd|BQ-
zb{W$xqT702$dP4;U~#hkZz%QdU+4PgnO8IwiRIPUujjH=ZXo?^{GJoD2qg~|z*Z+I
zMIyuBLUYF?EXZntqjG^$@GL~fZJ!${caD&s)+I&1|8%m}bb5V6O&8<$`?*#7)Z>?w
zdc9~|>vOPp0YO*f-B{O~&T=jiS@^a~Wzi<bZ;Ku^snjcn-F>l0=C*10FU-0E)mR>v
z$Ha-rjROt!oqY8{y>6F}oqYz^j@m_@fCGr%7J#d*!%?H|Y0MY9zP%(*@Z%<-$ms3x
zbwT2IoGZfBI@;c7ew#A6nj?Qq5)QE|iqlHD29h;eddd2`e$^T>bd*eAt6G0CIX&{s
zF8RsqX*v~>ccECMkVxz29f$R!h5cOI8u@#L8aO=VN>`+gMJEm)A;nbqE4!$yy^n@~
zs7%|NCX{)Q;?l<B=zR*}5c_?dv*Rtd)D(2=Wao&@Zq)=xFqLfGNmESew7FP=zG_$5
z$bYN^pBTj{Zvcl1Dzer}O$qzRE4lLv^SHRY1(%NK2X)vb3w+oh(OiE1rvGH3j%Uwo
zeRiIx+0v@jIw@Zpgm8Iw8d(H<BTr*8AiQblERawB#imS*T#E4+b5b08|4FQ4uaZts
z9ODb|oTd?uJ*IE4DvdQVL2w4fE$04cMOu4*jDNG~MPq3NziPBT<%~h6Q>ky4){(6M
z2MZC>jlnmY(J&kF?w?26;^KN~_2`rC-_4!Ap0z&|a|GFsH~LE=s9Y>4hx3W*wc4jf
zw6lIT-t-6J9arW|I`_xqdX5SpEM2T`&TCAge@d`f-l|H%<Z+Ugs?2ZMC-!Lmh9%R-
zUCjA=H+~qJK)0OP0iwzd4R0H|r;Aqq(@;#OTG@w`R@r5K5;TIhk1oCC_CB?5*69~c
z7q_1dL~Q}yX5#>2Q8Cc<Gf=AZOArQAxn6qq!!c>cj#U6DsF-29<Q<xry&%5}nIn`6
zy!!N2p5ANLtg*iVdtE$?@q{L3a&*2zUpWk{(2;$rr$Rho-P~k7=8nrm5sNHlrxHeq
z5&vu@2wh!UUS^m&Og8;x<KS1K)yPj=<*E0fEf0iZG>!8u`=y&vC$qj<uk)c<&ftSs
zpgBr^=+Mj_w_lpSsCi30x(=q#osxwHXz0XDKV7Hd+YL?iCj``I{gK<x({){yI2R<L
zAwzvw*P;CJ@R5HjzhKo!_n9Aht&4>oW0M6(HxB3JT=N)t%*R<g)4Suua??lPjKLrI
zc)!CXroCYXcG!)jyaJGde}Fv?b+l^7lcH5`JwL^!qZlOR$F3?w?DWmW4S+AHi$fq+
zI>eL@F2=(&Mq5cs55MLqCN);JHWvtH`4~KOl_Cl1{lJ4nif*+<#$GC4fHP$!j<IV@
zEx<$M;5M~0TF};vJr)%0-;0fPBpvKQ{g5-{x_*5$k9#}F$@Uxsg`Vc`@2Kgz6QKx_
z;kXB!j=-Nk>teSsrRE#5trD1g{~sMmXlI46l4X-U2C3|mghnCtQkYdF&!zSR`rp+M
z{tz@@7h8>V5|bj}#MeRh=x}{N?M5!gl0ocHqV$topbfIHef)maTOq#i*UzrVC<k#T
z{aU=ZM$g3h!^KFEe|2(oNw4{oitWg()}fc%lHq!AL<HpXXWqo(>sE0$S@4bwJ2`9E
zuS={VDzlBo{hc&_WbW+Fv-OQfRiu5JWUmR~bsT#%Vpu}aFM&|$GKZu}u`2W|rw~~;
z-)_ogJ~GrM$6F0WbMugfus>B}w%2|R;ZN=hH!CPIpBmC-XCCDxuENFj?4AFT)xeif
zeSKL9-L~b9lg>@5;r#p=C!}X1r7U+JAuP#bW~dQ&EZy{|4|ndTi$Sk^<rM2IiI~&E
zo`q{CqXj3e?dC_tq`2@9qap8k5+UIrNHrgP{}mZ{fZ((%ru*Ud2%?P_?F3WXzppnq
z!GvkD%NZ{mRKd<E-gEP_t5{Qcm9~|6sro_f?ldM+gnJ~o0rf{4XUBJUNqdzwf1r)?
z2CyF+=k<4`)6dB~b1!73tq+_9%y`C<RXSW3CnEgEHb$}%SKGUOR`48IB>o_ofYBkx
zt91WC9pfP!_con+@-GT9A97209peqkEd#zw|J%#lr7JMtqx(w_?k2`}wl$zP7PUNG
zKp!~na5{{y6&C{6al&J&cAJya_|c1xrfUpKy3Tv_)~frrs(^P6z2S%Z@^|RLH>oRA
zDfM07S^4LeA^CWbL@?;{i+ZE{qA@HuqKM<Cv@RTui}#Da?$+9XVusY9%kv3~4fvpT
z4D6_cL?jsWB-BixDv9b~auSe?mXL-HBUn^Vz@3bln0SOL!W*962FA>#0S(RP?ku?7
z-JL!4Gl;CB`(4iVwy+S;|Np1L|KDY(<^@(|3<S`YfxK-@0ds3S7>=jMmtI-M_;@~{
z*}Gi>>!@9u$EE8nPHPBos6+*aYY+VQPaWq!=B^&b_4l2$k?cNVMS6NYCVIB%!rg0g
zJ~6~g173Ls<oE9b3@ZO1e&TziI{+vhGx)KkMMU%szVjR{U%SswH*BxB$+4kCwvQ}G
z|J%RtuV@)3TorI-KU!PUool;0YOvGqzjAX|y4Rljsg>K4y{7iH&Fw(6O-zpt{yA`B
zf|-wxKwMnBbEDG$0$|eCF;lFdui!Lcb93|gdTU)A931$T7PrIuUFo(%4fo>Kf38#3
z=<|rQ3WeV~dt?%Yjf|;c=33n7+=Zr4!rL1iNGk^FE%p(Ci|)FY>*HvdHhc~3m+Q!4
zt|!lF@C`MPy7c9RN`hbgfAf_dr#mGP_QcnZ4V!-W0s|vW68y`%3LF!x)85P3%Q;V#
zg3LL@xVmRD*Yt2IFt+p8+9@*<bqS0g9jhMG!AT}f0!Ac9pp3&DB+25-E??_>S2~e?
zvY*V$=27f5!nT~OIir%lI>pl_Ff+5F68L*WsjhmIXQ(%V(EK$YWH>$-V{ToDm=IW0
z2pl4*x7zG-^Vb(u!_u;J#de5+(`KWavqymv#$37eyVmP(5JiMQsyJ%qzsN{HWN270
z2oT8gv?=g~=r3k4g`U4nEi5ev_)Z@^%G-ZiK3x3&;6m-$kf%*ZaZyniv1Pmr(6@%x
z*lnPFT!f>wwl#sfEzEkgNFx3cyWLS-O@(Vdpj+Dm_*G&+e`OJlmK&ykJLA9iILvz9
zgdSHDmW!Eq)|w!Uin{IMSwJYZ!rp~lD#fY`=H)Fr^!GsSM^lji<Oy~IxyLCCHVQq4
ztUM3k-DyMKY^dn!S_7FF>vW$l5=U=<#D^U%f<<|G;$*6dtB=bNg!x_it5o<Pp<m#e
zlNIx&dh2>1-{9g~RKCCBF2}WVi*Yo{CGwd9vq07~Aqk?J3JN{_2eoWW81x|cqxs4%
zS>je8Y3WTrdE{p*HEoCZbh}Y*E4)gl)76b{K7imq?y)D+gUF&@Rq*iWxPFZjY_V8l
zW;J;B6N$0HZj679N<PEF7Zzo3_Sg5_pWa}4Y^x}AKv0%adH}!m%~AE(B9J~)9|@b$
z9>i%pjFaf2Z96~-NT;IiNXN1s9T;hRAC+pL#2raHDB>Lw);y34?4ZoYa<OIrceY$t
z&<W)zgs!Y&B3r_oR6<}megF{xLZwv%^h!Yrj8+~H09#yKTpdW{+FvTXTx|mIZPr_@
zEYo(Imo@ah>LZYT6{Awx)(Tn-F6ktovIFFwBLBDD=WU(at(9h*XG<y%1!k(CSF5=<
zjVNomVz&4K@MWpn?TM<U=F(%YN_7{4WIGRVPxY#erSKK*&p;1Zs~D>)dqT0702%By
zZZ6IN&OxnNG0xdyH7169YdVP<#U_jjPEfHi&>l}lJAll#Tx<2{rN+mdH;m-p@w}aD
z{g<trFE^VjgMGV?c-<C4LIP6^&ov;4j^<;FM)G&X>*cAe@UXmXwKp7JEy8$491hSj
z2Q+5~IuP0TGcY*EW6cb1ao)9*+O-V87|{U=rqz%=(<|Ao5w3s|<PHA~5fE>#bCHXq
zbbCslR4t%WAfOeb_TWlV$#w(gt71TN=7E~@@6<q834$DWK-*xT8t~_j5DJ)Lq5J_T
z@FW3fw>C^u=QhTBdxot_1`xv$P{>GPF$+byo##Uv4>k<CsXS2bCTHcwL#ll-qtX=(
z2E88vXkLy$kqS_0BT%6+(ENqG5GcWm*r-wgl;EFm5CZg(9r%%0(<%HWrJe-I1U8@v
zSs1UAd|??8hbg7$VvueHX}43jYyx;8`wuWW#&>63s7)v3t@qzIa}yS66W^5gxjdft
ztrY*TYS_lQ4LGhNkX5--Y7j9aO!Ioaa~x*rn?GCcDz#o~HJedkd^IB@WbLWR0BCLh
zKrWfJrzN;SowrDd5y4`qZsawgtS=mD-Ro4N`kFMJ=g2A3&e(LsdV4c{fA6{}0n9dU
z_&DG8K4LG6!&x8;Eealczw6DB`hSP0pscKJG@ZBUwz0M^OuHI<G(eqc{}~Sefe~8*
zEt4s{2hpjO)g<bAE^s)k11aXlvqXj_N*4D>K=3$%eS}VF>5hwLjz^V!b%m58fC|+L
zg@<~>e@JLQTBtrgX*zY>Y`dNX6f;Fi{|g+?4+O}sh?xq}Ul{O`?X(>-ceRS31l_fM
zR&@9bU^Y5q*YCff)Uymxd#!{57)Z(qMRhzt<ang<z>^wW;J>L^45D2a6~ceieLwiP
z1H#{)Vv0A22bU=EQSyqB2mu58CVeqNB*wdz+`@qpME2t|48r_6-<29C@RddLH%&kz
z!d^APPrW^lLhqe@V*owKg_HzlD3qFZ>OBI)=w4PeR0s|D$(KKsU9hAuhs;+85Wt3m
z@(h1*0FO(v@;!iwq9G;(nx%F3UsDIE{u2>;Bl>F-crBg?4|sl=cHy(%y(FNI(ttJ2
ziWmxBz^F+3ENmPkgbqy4kiF}RaG;Pt!gndKC<!_6R^oa^o;W~1^*>()0&4CCyjLyt
z@!5ThxBsqkxFnc1z#Cj*pcOA8LaEna9~y??V}KUr{jqDzl0zB+q8Y8y1PoUp(6_>%
zZ=^neEMX!-01aMD0E$`O1*|rL>FrU?-N~X10U;3Z=7YY*daHQHLVv^i0bRr@GFdXw
z0%pW|i4IUo8mKzn`*(SYEbxLB--<o~ieLqlv&Cdkb;GDIxYNx3=%a;h8thHTW8Z&p
zcz6uhAXOC=quSb9RRDBaY%`E_0|?Zdc=X_PkpcUZfE>^MepzaAD)#^&MXxx=!|B4<
zaALt#AoZx)ZwHT$Jb)1Ph-UwZlJtSA;Bk`YPF9*M0gbH#GP=5L$E%e9CeHD0J8`K%
zG250;I(b$Vcv(v^)ni~*=3tnXd2_I{<1EY-sT5lPNvS!3C=1P2_XZSP)^4lDeX3Mu
zec|uAMQB7oE6xHlL88A5;MvvMZ;qEtc%4pa8tir#mF4-j1)Po-xARa>|7^Zb<#Vy^
z2~!V9hy*0)PCpX^jEXqO4AfggDri1eR_nCt(r2Wn^>ztJchCgOX}_oN3j02~nvx%z
zuPzD{0)+Y#n*$cTK;gfI7Sy!0)A{eVVi}QmFNfJy>^Ex6rb>XN%ki+Zei29<Oc&=p
zY=r{!&wS5J7qDZh=#rSss`#vye++sMmuk#nF-zD+KX*AE&Fjnao%=Z)FRB3Ahx+Yi
z6r*DBCbI<FCxmJ|&ihUbAHZf+?Pj=*0z$B$PPdN%(rr%yR@vlmuDrhKv~?+58y6x*
z3@T;Sc~##<bzp=zz$q9NmP>UDlesdMuQ_Qu;E4ZxHJ{9pay%%^U$p5XG?7W5kM^#G
zbtid{ZU@-_ykVet$zPS;Hp;B>3wncRKmz1y4)fVR)Rs{m>40!JozL{RhI=3O@gS70
zZP$QV(il@<0E|&hl#n4Q&vu;ha)?M3)5*1g;ouLz9pTuCKn)qo;9qeHzm4_gruToy
z1eg3^obwhV5d=4*ResCBk9oK~wFYuXWTf&y3m*ZMJC2cItOW9g6C2m8MKU>{95J)|
z-;01$KRKCKkVP?pbb_2kr*ji^zDp1~#5Jvjy0!dS72D|<W9cKXOIr67Y>o3K49HXP
zl59uZwn9+_Ux;~IgwFW!t(~RWjaI(eMWaiPhZ~2n&jkFgHExr9^68P*7!{wxH=Y?`
z=*mtxY|;0AfU1p0s+vw3hh%Z96gImt<!iq9!s5<#qLpt(3a+-Wih*X=%La2{faR`H
zOgvbK1^&#rfcLa{%1HYwan-lyLAr8#B(6sL{SBQ&r4T>X3Z-u$x9Q*#XMwpOF%mAY
z^I$KKO2ZmRlm4Sz6qn67%&m$ql*?Jl2#Cp&(xlX*to(2yg8*6ADuW3X@`OKA8Z^mp
z>E%B?d%PU02U5k)Pe>43LUpB5!Sum$HiHy;J?FLJw4es^c1$kkMOZE-yQk2ZfFSGz
zwMi-%ncvE3tOyVr_tNMPq1U}^$Db;dx4^g~W;$ZQpzpy0Rv_Y6Mw<X7ka|3r9s>ql
z5}4;QFyEpK5FlR&O7%m8B!N<-F4fzHK&iC!n-4%K5}?$3P-e9ir5?9k&@ehsiW(?o
z^34wqqk_y?-SC4?J776McBV*y<S^!>pO2}4O@j?k5n-QzJYY;Owon$xfJgK|MaKGw
ztue1_g}FD`|IBAg;HvU}x1l_7z<d=s-_5f_fLLjfiU8&;1=teLYANZ@#d`Z6>xRpL
ze^LP3ipkPsI7*nIu&=zYOZDrzF5yN(`~)<%4yR7~|7!0$qnc{kb|FSc1cVT20@90s
z(xeEQfKsKYC`DQj5R@Y2p$LTDJA%@s2^M-sN(e=|6qQ~=6-263frK->&-<S5d)7MV
z|5;~!YvoUpy=P`mxyyB3GaE!Gr&jb#p<e-*?ry~){C_+8l`*^dmws0BdE>y|UI*bx
zk3Z5G695&V)Q*FLpT@v*ly9A@s)mX@=znI+O(zBl#`k>@y9PI)F!%`4f;d|65Z(vb
zxkVs8s-*nh1Pn_Bj13slArQfX&aa3&Yo-oiY>1RnK)m;c(9qDF+D@4`z-o<w?EwGs
zs=nZ@xkaamNjJw^mADkpQb~z5;I9X!L6CXVYmeE`V5HcjsJOWA^feIHA<)9c#`sJU
zFXqoyun6dXn*cnR-uA@9-D=+xpI=k;HQ=?by_b9r%nOQ&q~Hd~fpeQx&_XipVLIik
zuW&oC3+D6{=JjrC<))3EV=F@iYoC_hROH6Ub;C#BswW6x!^)e^f&shYmv3_Fcard@
z_8X|<tG)j?yAy4}7DMd;ZPXvjdhniKyY9rxYqoKoJ!e7ZT%F|P0e8HF9VoM%E~U(q
z<E6b}qvtF+ZtX1Aqp?0aJo3K^D_*+k8)ktD{N?mo>HXtir5<Mx7F!e}xO|<=^?XDJ
zh@H`-t=P3mmwuY3wHGxcfuBu<-x&f$gMb#8wY6X-)6x+$->DzHDHiYEczz^K5r`S5
zri%S}x`>TC7!^0w;`66|{SM~T@mO}z%LE^QRb&K0;kDCOKd<#l_S-#87%vF`g$s?o
zcp=t^!%bhY|Dl@(tCS*-sQ|{}lYrBWgPB0!1}$U=-W3j(4z`<XqbQw6=D$Ev@T`Qs
z^nudHI3+(Xm)cUC!<3wRbPIZ2;3>zea0LyxBRe%>0&E{hyTv<N7H5I?BfnyQ|7bCc
zj8s@$ER6!Mo!)!U^S6%=CyU)lk#qf6RyfAtp}zZ&AF{~lID;pzITv4D=f1J-U~^Tq
z{#mXC`~Z1DJIKuQXWj#g|J(_2BVhN(fJ>OV7Sk7$VDq8S7~0t~*RNtJbkArJ$ywn8
ztgLYeaiyRn|GXOHb%YwjOPFKn*4>8zU9KZ0V5j&nKV(f2ucLot%fB}G?$E{v0*2Z}
zQf~69xJ7%^_r8DE1m<OUOg9*Uau88iW#`ou@q?&9>XsCks+(RAdNKBMYasj>F0LI)
zfk8pE!C2|FEh7szAbE2~DrMb2?dBmDhG#X<!dvVD70R5UU<O?O>rN-XI{wyPaXV<m
z>DtCMP!ZCgg`1&%WMK-YE3iA{K8;i;Fa*~VAE*NNb{*XICHt+M6%Qx5FzX#DUPv%7
zK*Gmd12{5}ne-SnhAVV}Yq-2*LM;;f76&d%s6&NLEy*o*{SQwxPFqoK1pTcF@&R-v
z)IJ_6Vzd9uk$a#R+_V;9l%bCA_M+ZQrJHvB{7xFN3H3111n7a+nW~ekZ8~s+Fw-wv
z$-J0F*Yg_>({6fI7sXCTJ}~p`JQlv)JrVh+-~uEQmtEUUah`4S=h5Ebe#wPzMwaiO
zh03Fqx*DJp9^EP4@!4h)DFwp$HbxN-LSbF|$DET&h9oipF&dg(Y0sUfZeJeik0p!e
z28M8vQ;j$NVNHh?HlSveeQ0kS#Mw_a45`G*PsMDYOB>kVS4D7_KArb0NZT&nMdM0L
z+}EU}uWmB6-@yy5Y!-T3@L?8rXD64VEB6WLp*`g!{ME)ePuA<6-Y?|u+A=855YM`-
zeNbJmxth`I&_I$)6Z0`v9?KjufE(yN{zwo4OG$55{;@TD6rpyG7z`B|BR(8t`<1;Q
zB*fsKf4VTAe0xa`H_&(=TptVmt)VZqL<wpA%abIQ89-hm3}%M-FkR<g`|)kxdwQA|
zp392*OkWy>hm>3;*KjL^iloHe(FG%YpUg_+wVxG~(ZdyPP#y?w@_vwd3k(xP(4q_~
z!c&Sb(7?ZB7a`a9k^A)mXxIe>mPkGfi~m_u6UggT4BSA1<5fPGoq1&41|sA=Fp<p_
z&>Bi`)1UX(g24nnd$%3<HWiG&ev%)bJZN06YTjiq?3eu{7&Y(`5UK_cTaPvW&m;U_
zM#1U&*;SA=>H%nImy-4o_I4wP)JL<IJDxv$_)zbmiM`aIj;odc$Oqa#e*Bo5DP;C}
zt(8PGNUBsNj*mYoC@%g}sQm#*ZG33iWHB_M7jx=%?pa7lNevQ0?M_J8zT*2wd018X
z9kY46&wc&&S`#(&JzyS~Cj4XaEYg<-#xJi<)pu0ekp}d9O211NngIyPzUpSC_gowG
zy`PhxB6nAZdSc{pO*G7Z<z&b%SpJ$?VivfTSg|&W`=VE5@MC<e-a1y8L;P7GyO(iO
zTfF^vZS%?xoUJ^X)OmQxE1jL-n?)0KIJ05+yy@w$kz(!1rVlOiN3^Ab^Za9YB!v+G
zd$<RS>_0xEBY+t;CHRkN30ht6?)!M6+OFD4W4gwD+Ow6h$;_H{xVS17J-%L^xgyBA
zGF(*8?&XbU_5Aul+PT+59{bx++GF|haFLgL(jw+?d`Q^7o%P#lAYWt-)_oKn7&@*m
z`lmWW!rOALlkrNjb(7~RImlTuGQ3w{U`3}#XUdLAV{4l)9e&oaC#CP7pUBjVE-cEP
zy}N@={Hbh+(Mn|S<l21|8qH>vWbBp~og&udxw~RvQuo)cvZBu&@DxJ-cnTw7jOMoZ
z!&<l92sIDWGOI{JZ29{lhm~UMe$(MbFWQ2&$uTX;^@*B()L7LwE1)A0jE*WV7_<-=
zcOo9j_vn`#<EK3qWvVR)1Kww>N%PXzYT&SWG#uCEoGht8K%vS}+aX<OWYMY;6;Yla
zE-g;qNj%ycFKF@_v~t~|D4lN=w7t!n@_5zh`=Hxn-q$z1c4PEiQbt&;-~3Xl@rtz(
zsmh=grd^{221E592E)nrY?SRoW3N=1qYS}se^QDJs(HK14=2>$QCB!2`%94_=?iWC
zzaD{MtmfMgEv?C+w(ZHMEdmL}=qUW%#Aia?4f>41t5lCZc|YZq6&=kv>soy?Rh!qS
zel^0@MCH}B$`=*ZV`uhBXqktybCu%K(kouhyf;j8o;Kx0GS#h$;ZA;Nv91ySZ0CqL
zRUscr+zyKP&~h+bpp&Vlq~rSSrKZ2L$(02yV7W~GVYwzBk<ZHw?Fr6%Ze+0RwcTzC
zYr`aExHs$D^&W(4Q`fdD54R*vy#XN^v2Lguvu>AU?o?!ZmzP=Jd*g|KEUYc{VNZ_`
z277pk)ymt*PRG*Pdp>nkUeeS(K!Y*F5&P4YP@0hQv*Tb`Dpg{2tUk7$6zoy9%=>?C
zibZ@|of`EM<&nlWF~y3mKgYi~95>UNNVirRFWDT5;jon!^+gf+aDvy;m7}$79oBEo
zg!-S-GE*Iub9GV$-9ngXNulRG-HSarK0fx5jC~brMm6ID328|`{}68z6aDzS^!{Oo
zYxMA)wFZJ)eMTkq)KOMDgu`_yheL;f@Y^x9`E7TUm8b`eev`bDGPG{YYqI_|hpCCP
z*zNxP1Oose${^rOlK$aP9E$wb#wsSRbbGk}_{+;~<>ubKO5hh=L`&iEf@KPUpIL?m
zD_IMUfVn!t`=HOd&)|I~*sqjX?-B-8_bdBM>STL*ur!*G*K2+N!`goN4@*?UAm)2A
zG8gQ+H|L`BX>mh4F^3`S3?+?Q&krwEDlX4m6#wDRHI&!>Pg|4R&kT0P`VpcrAbk1@
zlXB|I{K1m@FV{3Td@gyzVQT!LwB(};;$A<f(^!ktCV~C<%U3_vCePgL7uxaej^x1M
zF2{lc24%jfJ{aBAF>&wSk0AiEx2`w6^m~!RzI_=iVc=N*n~wy>NZ#FNb-npf{j=br
zFh(`+TJwzhj)=?cft0zbH-@&=KHMIBU|#uAv~J|%i63%x!JNmgQ~lj2yv)`Lc<S>&
zs0ya&-G59|<E!M`Q|sYxI?JRyxjPP*L*^JbM9QQFbD~^r_38yIH<K?jVmH0pd<Jvh
zv}d9|8{nM|6Dy`9+=>&59&I&m=yRzQoH6p`7=6EtDuriGQ|8-ZrX{}|ff?cQ&rV)s
zEDapg`eJEPpA!`|6f$X{>{h)!zihpvTJ5^c?_RML!#=T!ClLp8-)dJ=;eNZyU-8^Q
zj&yZS8cdUXc5DPMX-V-unkKCYm$)yg-=0_b@nL`RXHY_QyH@J)2QiiY+ags@S+JLd
z7@ab0b#6L0x&E|to8KK73)-5$qffms@nZYO*!K^*rzzNz-sE^WA-I7M_d_*qkoV>!
zG!|At+AZE5Uzp9diHH;V`2E}buV1tmV_(j3HErjHuJ^^PpGmt;tXjUzuBzOBu|epO
zo3HkcBOxJ&A$nr9D#<MB%}enSk4Nos%j#@|#zP9wdbh`4rS}`%SCh~uP1R1si~Eyc
zYew#>!|xPn%nY2*J;Co)8#lRd^kT(xK1^ozfasM|gbrSnzyGwzZ)>uZVes2hjZz9j
zKl?%#Yu=knu>(%A%>A57$v<#1-%LFYohFb|Om2HGGDYhJywbJ~uL(F8=!v-Jxb5x|
zV&ANae^}u3VB|qpST|bjlbTNE*VWU!hhIovrY|X**Kya$6RpaH7G7Ym!sLDMCKW+f
zT^J9G{>~U*mulFa?M2_5zS$;|%4n(BuAlR+BUA3UQfGr0(fEr>vSH@?mlP}7X?OiZ
z@gf)6>BX^Nq|Qlz*TRJoG*>P+6;EK@-j50U<F}XjCbs(@d+)FPbnDMRm)Q?m<o2EX
z9?ALH>f$5%wj+tH5n9O02%1ply$6~gdEqdR4}@}8vWIt2MGU3UgHpofm-Eju4S#WE
z8U37O3Vg2Vpd_?jCH8g1^BV{5{cm(~qJG+Y9VDS+ixccCDw;gZc}=b5nzOH#Jf0nk
zc&>RAy}Rs`h!se^m40op&_<n9w*E?FYp-@VgeGkITP$h@x|ueZpwM^Fa((@Jq^jzE
z?yGTLOx}CJGs0Sn+ovN_G4T8fz!hxCD}ziP31cR51bVkC_uX@knoR$E*}ke#xBM$f
zMQ85d7B6Qe>P`e}+KX86=hJikxEsdqef5%SKRY<0iZj`xruSV_knfuZsjhwsX+PtO
zcnbu_qXrj#=S4Ox>Is)G;M^+*<|~r8Rdqwta^H=MMmXfWL27Z~7#jP|4mPYD8s+Zw
zk#>kCgV-l|(r|;zLiD9yAD~Y3AbZ!Th{F&NN;4{GjOJBAeq7?Ka*e!_j_&Z;pQ~`_
zx?xBwBUBNT!J5+5LdV~>G9T++`?U27heOosiRaF`K}^aJ>1Vw`YL1@u9a`}ZuN0<P
zH2M5cH)6k;@rL!yi%3af#;%rU@*^r5TeQj}lY5_JWVX*S=}B=XHdw6YM2bdFb4c@h
z{;}LSOwb;5uA>K;A2e}UxagM1m(RY!<lDSE04oBmGq-hqH++tn#>FHun5{%<V=I+i
zi+Ck+I={hp!?eaPM~8rmCu3iGQ{5cN<F2&tR%BM+PW&R@TD$H_Qf5-!pk-Z$cCxOV
zeX>{hzHyl^<tUeL|4a_wmNs2($6Y5;mi<Kg*tYG~=wCF0mZ^h8CDC#=vC^I2F9;}|
zHec+xMz!Uv_r8c~xqSYm7fA;3G9cX8|7W=I2^tl8P;B_A#UocjhDWk0Z26j!FfB2W
zgXO-fYqaai1KWmowl=&MVIARv-??;s6OW?3I){~Xb`4k>CGGxv{k;Ce=Jgy$#I*2_
z6{63sxvSPY!EY-U!xzc+`Ja?m`kbK{yEj;1{8Cz}#G$Xgn%^^Fr+E32ltUp(_z_Ll
zeG)eSGc7m%k?PDj!wecUPSh@&GI!NC(v?pQe!Je8{4AGAZ!i4Pg1pV<U3sp<>*9yA
zW%t$7e0`Y<*kzPbCf>;?Y(y%Rb;I=u7O|1}j4#-R?(X+0=d-3&TyKtszI+$5d0%^#
z8RZPpduNwD+Nv0kGFAK|WvZQov|pPo2B6ePajeOc`OM(XYZh9G*sT_g{G(&qkq<Ac
zv4f3P>rXEYMoNB?t`<#AEqoqUUT5>`f|Z#`y=&}r@}#^cpGL`Y{xXkQ*4TrI%AOus
z!v`%@*H<}(R~f7H{}roX|LKn$$a_W`KQ2XqJa3HaAMN>{EcE}cL>gK8fp+u_ad&T5
zqomxNN7dR+ap*R8spTNVYwv?<q1V#I`mIA-`{M&AE}D(7vfebA{(S9>X8>fE28b3<
zH>)ofxvi#>uQl-CJwd`%&L&gdUYpsWc3<jfq!$tq!}U6Z@7M=i#t^V^t7?%$q`Enf
zVzi0?PFas;N3LUX<JIv2ZqG>+0_4IaI@4yAoo~EX^s6!8tVfU1=@r)VuWHUz>4CHL
z3JMDI^V?NoFIo|ZeOA&dmGKN#zIp(XF%19UQve{3N5LCm86c5-rt$h}6@YCYfuru&
zU;3~&zgIJR@Gu2r)e~hMdsT?mP(uRF+<E~(?Cew5nG61nQPaTyoWdDXm}m~bwa>cc
zy8ofM&st?eMFicSiE$5P1E<Iy(d3b#fsa^ni8g?aM*u9WsPp60OYje0*|G*8s2Bli
zQbm^P_VhX^uxIS44*2a<d~kj#yJ<WbTpvwd@B$%72mbX>&j`)Fga18qb_}Ruz<t59
z9T6<Sr;H}wZDr1VQ0OQ3dj=GD@GZaJsqjG9)d8v(ze6<&fImO&kvQe7Xz<|Uu)@?U
z03_5)jl?kk!d@G|JQ*o_Hm4}E@XS;F)B3?>=UcPU<Ef|Zg%@BEg6F}%PTm=j@ptgf
zvlJ{`5Jgb`+1d7PmLO=Mvp1iHGUo=U|Hb}Y%$_>nAl`?7`qe=F{+p4b00R#*9Z6Qs
zvOldqq~MW#04NwPcK8VJwvwRaCzj*c|BVu!10^|KJZ$^|`eB@{OpeaK1=PmSKJ<tA
zJye9lCys%e4gzk2nHFY=gZ}R{lz0erK79}u%HSt&pd!~|pWXNm^?{26)W^A>ZZwae
z<3XTWt<sN!`Jf%C+_@V7rY;8@u@ztbVG)>-whX8>pfRBX_-17DK#4sNE)4HQ4F$q3
z00^s4@J(NrEZ|<8f_mxv$v_DZ$H#m$iV!{Z>6b;^anFIb^ga@h{RQZrPY=$}`=1BJ
z0M7gKssTh#v&r^5_W&b!E_=m?_R^;V!+VhdP$rguMCpNP4jBPZbXd-39MI}BE}*1-
z!{r*c(*TCDbLAHF9KhoA=v0@ilIcsq;pT2(3X8}%zlS(l;b&0T=*|5}i=(}@3gKe#
z=A7t@5AW^u7=TecrZcE@%)WWJoZG&%^Yfc7O@Z76`wvwQ6rrh{V(#n4fW$nq`;saJ
zggIirfQkqywOe(nbKpge*0~m3F{%50kS=Zi$a(SogMWk9A_k@S3~?Z?83WRmqU>z0
z4|S)ov{IHj1pGMsL04dYd#i)vdHEaBIKLP4hjs{mu(C#Tlq2l`u&p8?N^Xl#S>pvg
zjClX?qcb@D$STR?b+^*Lia#v~h(*Ahd97N~$&UbH5<33}peq2<4}fm)iBkpNZg?t7
zhx?tj^DP*;Jns8v|Gx6ZsfJ-txEx3UKtY_ZfG=W*7Xp`o&q6%-7Hb->by~{*CioTq
zzB2Z7+U`Wy0TvEasm2FV3+1=x{=V|X>FDI@TPB>A5~d&o<{U^rPFn^R&Z*$wQX_ZT
z-wi)S+3zDh1pIyFCGhR{GQ${n*3*Uqbp90nzr}-(l=)*m|M|tSO%Dh82G7Y);sScx
zx=iHR>Fn9#FDA(){+DdXKYk<$Q0YThxA*2N$6}D(-CZ}pP3*UB*a3;adPkEpsg!+3
zl?&8$?&sby%|%6Ux>GEY0m7gN<YMiOrVVVvhkms{@^rME8*8^Q-8eijfB{EI)d2RW
z&vR!vTeFZY%V?BB0D0_zp3T%j{b^%I0(#gTH>;Ej;F<MM!gXy~mBjceI#05Cu&f;)
z!)_aZ1E3lY<|3<Imb6&`k(%M$eOcOOfb$oq+%Tl(#r6}UQjGr<(}q#h!OR(OPE8tB
z@+nb-U9&Z-08FYu(p3{<1rd$VXS_&qrDuR(t9Dy^wDj(ku>M8Vyu1=k{H;>iOF+<I
z{V&%2(#>%SYUUYeH1d-D=TqY0O~1?aIejzi5Q(&LGA->eX+t>vA+ttHnxVC834#wd
z`JR-I*4qqn9}!N7W?sbI5qm(JRa^Bi3g`h&H=no}imn8bw+<`w40-pf8otkXS^Z<t
zq|xCbBkNPy-Dz>B$_ICmxL__*6gUn<`^J`%yPfx<uvwYYeEV2g$E9E1Nkytd=TFH^
zs3?V4NHx7!eqORrx|Ho};?6sC)jq+OpvA%+W$9sQ`YzL_F2k_}o~fmN*017m$P95_
za<i^ZE{R=IDl8VrEauz7JuNNLJvje@jn6a(X2h8|HlMx9v(2q99+;H<AF>CCh}0n}
zn-R~ULL3L)EQHU#-q^Ot&txTyd#S`$^l{*+dJeZ+^nC2aQpQr>H2h_BN$x=Yb3)DS
z39vddtso>XR3lAv(>5Wgyk~zFJtkusO8^H0HntNaRm@%kunJ8mIe9Bm<lEFl2=yTj
zPx@f7V1{l38gb0`WZ!q<C7*^}XB?Nb&oMZZ#U;QAsH|!e=)7fsF*({T@m<qa>g?;p
zJ2Vm9fbQcr!sl)5>(Or1)|Z;urtS1O-y;I6Z~MM|1BWa21>@_dds43H($!ajn)onW
z9^!uD*Z43%U)4}g(c((S=(!o2@D$}^Elxp&KIrIo*A#V%Cs5I!3>Z3fx=KM;(I?yJ
zDHKpteai4!|J1%KIv#6wlw-1IUPg)D(eSFA0Gd2~nV2-(hxxT!9lM3p^F;c}29vEJ
zm&7CGb2}X&>^rRiFWYrfPy^p+S=Syj&8;%!ZL&wQF30x2Rd`+HJMwe~-A}nb+E0k0
zs>e3r99&R_h5kg%wQ113)fOG+j}Awg95mO?bD$<oHXe&Oe&#UdvGCjiRH)O@OcVMr
z7wxQzSLML&ASN+AJG^_3g?kNZ0~;$R4#ru$$0(=2FCtlI7x%5riaW~gQJQVB5K5Cc
zj^~NLjv}cHHC9_0S6iB**IF54bPt7E_{oI>l7ul)T5#qO85k#&6T-|&{>Jy<RL{^H
z%c*#mD{an>_&ztrnLNaI5-Pfa-~`s)VPu#=-*^_n`z-{jKx#(D*@U8CIi%1H!QAs}
z`|DFj6+jol{r>aXwqneTcDq7cb|wycREJm3;vwU5&<q{a@iyLR{iqfoPCUdf@0(9T
z4G;6vu`C?lZ0%N;-c7^4f6p!^jE>YOpM-^OaD+saJUA(}v~xlMZ{lwMYA#T$?P!-M
zLuzv<_QC;CpBvB#f5NT<T~HF%oYEq!X8zNu6xFT&=6-ECI8bvaBhIn#F30TVyruWn
zJk$vBsJ*MKrR)N_44<^+H0wK_Wj4w~cR=r-MXN1mL+Q2^EwN{XW~6pvYtyf9L81qw
z=a&+iSSOaSog}-_4)NP2s}6P?Bo4X1-lus68>O2};_mt+`328)o#Rl|g|wq*fYye&
z;A^e+tr-E2P1+=d+hn|@o3!bezkYWKOc+{cEGnJhEzmD<o0GyG{*?v}kECkRls^^g
zIppzV9&3pi)cy4#t>y8>$ZG87J5dOOf8<bn0>73s6<kn2A)dkG2&M-Wz*#e}KbCV_
zHDHx`mRLWng0Z+5g=1z4z<a>1MFonNs`;-55)GVGSsu3v-8l>(U-Oz(n*1~S93n#g
zia0BF?c9@5IL9it!6E5sBngf~wKkjT9eh3t<(UsayH&hQXfkhaPxfxwTuSA(n2{I7
z7e5X^dC`nNIoUNkS%h4V^hlnwIu3I(#%!LIwvZp`8QtU%fJX&7(NA9X1r&Aqmvptw
z4AD?YNIUZ%7HPX=3l*bP8H;x6Q=#mykt3!-1^ig7h2Ok6;PV|A7Lsr~eEaN2M_)wm
z__MI^wEP|_@fj-1V=taJ=Ma9}h-IT3<*1>ZIG7!^Xhr24fgeu1m(~K&kw&u9%Sid#
z)29q^O+fo_q939GUaQICG^|CY^4&X&o9vK-G+R-5IBr`~6^<0B^KB?Zc`wAPrh6a+
zVRO(kjAT(jgk>WCX|sjOgzRK$!)GagiHNABqYXeSwq)8PMVgMLHslaIM%<2cYK>SJ
zKWjBzTjx34X0T{&6s=-*kr{DCzZj)R?FIW9g>2@TK}LPbclnu@B$urW(<^$ksJCR!
z%5oMmx0aKR*Y@>(EXnP>{Y7R1D5{M)xBcT8l5xHPoUB9(O5bm2NJvqViCc<=5<Aor
z=%Zt3{h1KKwAj{jC>Tvn>xC=aI;8@F0`{hH(;o9Q0vKVvLdTkWVm?aLWmz<$Eu$R<
z$F!4ktR0XsYH8F~8&cZ3kIkB+I-Xoy`@kv3uP6$6f;@Q2YKGdxpIHZE9HH|R882Y5
zFK`2PeUCVnNle)m-EF+N(;RW9?Z+#@`XKRL*{e^H^$pmAvN9~&{7&lwA;{0QR^QPM
z0qO=UTj*#<w@GIFVU%c}KqY|>ll`LdR!0Iq#`5l@Ks(xvWR8@DolAe{;-=-d*qgEj
zRNQXliDaMA&q!rFK468WQ*AC~`r?du6|(leY4;?F)>vm^hf&9yw`;=45L;_JDq+#+
z`6ZMT<Sh4W;H5PS4*#HFN*J#_a`S+T6R`jvh<cUPZzz#+PIwLG4OQ>UH@vJ4fGs)5
zCAXJ|@`iN3S|Cy!5oHTeVPRxpa^n<=3Y1%oC2NN0IdZacmhvd!d{^K4+F7hRD(#f9
zaJ8sNm*01|Jp+YE7tXNBI%csl9V5^OoOG+L+|){8p6aZ-2=PPjA1T(x7=a3|tO`-?
zu7l%P>uy52!Zk*<)fr337U#D4<N;M$m)de9lI0jK-2Nx}xp%7*|4b{)A2TF$V=NTO
zei$rnTn0zqB<_53IIfv;ORJ{{)nbdatK4p(60Nm}O`s{RF?IPOwZ0HujSyS4ICeVf
z>?@7ko8NP4Fy|n=IHD6>o%hJ4KEhDAGqyPPavgI4z$mWKNYO$ADb}6oxpG-1=eh6E
zL<kIl{}ZaQiQS)apOU(oUt9Z$!O2`GVkVl*iSbzHWJky0;~MSBChZjIyTwZMhwBn#
z5PI_X-eBcRFlOtO%2*f}j8<*qPB}W&;ylDCY_jrXAwj3kPpQf5c#g@0t^*ao-{RNF
z#R;SHZ?{(4@%J;Q`j&1yF-!d>OL;Hk3RD^z^;r<eW2%285r2kO!_qa220MUa=R{U~
z&Qxywc`nUfDA$^W=$uQ>m6}=!2>T-R(cb>Zxvp#Qm{HnX;JqiUjAf=ha(iNI0{yh<
z@{d8N<dP{Hp5?teq+j3vgDnJVzdFae0Mvxey03O3?Zc?C;@S|&$9RF9pac51@M*hg
z9m|gg8J8cg&KNX554$qoMnE5p^|MI^U22ChK*AIpTY|DAv6SylRh;2MEqBZg{FD{b
z?+&8IxDDy|I8$NYfzEG3+n|)y?B&ndT;PDeze>3BRp5x{!0Bi+uqIY>a-zd&c5QyY
z7V9`8ix;ST7^DT!gNo|onZH4!@LCjj76I!9s6*!1fk-4?VXp;knNJVL8_%oJixpqt
zLZw&nh*Z7+TG~3lz|sN6HOXkD0Uj}DH?7jgyR0nK{!B;7sXQ~J8A`O{o>Tt%*-|Gz
zflXtu0phCPASFZv&yVCnE@-lfD+?m*imeW)qpCY6VTGFF%|ZBX670f$y%UAbz@Dy?
zrLm@=xJvu&8D6oMR|e7CL4MqomX1)*!zZcd@?mEbS(<Sx9}A2_G_owsq>GohP?o6g
z0=7Z4y2+KZ?}k(5F2Gkcv%j7f;M!DC=(K_FjJ6)o(K5F(hLA4u`&bws#1BOo8?~sy
zDCchFcjMBd#I%n)ze}`ql?5o=ixpw^$FS{Ca!3dml%nS`HxgZP{Acd9lQdPGzqW3L
z;LRx?zZg_z`h+}d7c5FRvUs0}jba+kTjlxHg6eKmu;9S8qhii@w`?en@?w-XozF&z
zYvJi=^U2`GzDG5_zxveX72~s1ulv9Lz!AV|T@78~CFi;O(vWTd_B(xS+valL?|D}L
z#Ghyh!Qxl(h~qDW<LI8dbQkolbC*KzuktV-`l-spBAfcd3kN8*#>1aOl~yFII#{G3
z$HWa^f{D&8{Bq_mbEq|L>5;BCPw5mmhp<nteXjo){uai1)DlGfDuHaJO(sCzwX2g(
zaHW+Y{>eq;tYtS1ZYvs3t07^Yf4|GY_jsP89tHadC05dq=a?M_m(J)0g=NLEA8+1N
zWJ9#7S|(fG4Lie{LX{$|8)Phh%V0+9$pwN<k|u4ob-JYjOzu{=f+Kfx7i9}w6XWuP
z!rSXyS(oN>efLxa&70vgBo(nwI?>b-wgH=4d-3tJl&cnp;y5QrAQmRK_d=fT;d2h^
zWB7U0l{paq<&3z<{<ucYBZgo`ls1umHM#i<qo@NintmvdZH=&fg;L++=5%b=^y9bT
zy67{Mw9zVho>Q1JPNJ$^`9%ON({>kXnd6AGQbOt;3&nfJTisb9kk$y3DU$X)e`#JR
zdn=FQ!uc3VHJ>A&%em1^OMW{G)(0Eal9^iQo<GthMVcMASUFmTn&LYhVDDEo^i-a-
zedqq>xJ5&**hYu!TVP?;4YZUFqtty6m7ytu<C23>RkvOU@%=5}J0}~bPQQ>KEuj!_
zGt+L=4=wA2&paML%_|5ZF8Rok&$R#59u5=pk+qPK7Q-D>U49;j=JFRu-BzO$62-9T
zgk6Ft=1&@wA{b-T(R`ZdgK#9ITsY<(a7fdCk}lhhThGfLjQA2qH20heOVKH{Ok%6^
z7dcz>x>4x90W@%SUbVN{llU>PHehLunCyIZaydTzOg$KYUA&R)PaTCHF($V_OE5yH
zn&Zj(5{<`yTR`|n0_flYaPmw3T%(EQvD9lECtDm-J8ax|o%@)jixiLVP&k3?vcAG`
zTtHVz2_c3P@cS&&jSy1`%DVGJkw2i+@k>bI`tkB1*EbUpIxgLa=P)Mj7J)g9A5wso
z=m^Eav%bs*nv8}LqjEpkO6(}7!6>=a@>SXgA7JynyF6#qfAr!e5284a62dc&7X)IE
zEy_kPCtVg!0XoZ9qu`x2IW2G+mWwz`Hcj%cZr(f^dFQ!9*u%Qajz62#R?~?x+8n%#
z5F6@wz$+l2sejVpd)#3yAn~P)7X6qbMKr1bYiy>BF_b2Z;%bYyPOg$ujny+;tBwJ=
z?6ca@z7+Yn&ttNRnYJvzl^VQPh_jXp&kDwG63s}9W|z21p>@80l6;+-lPX?7`UY(3
z!h1h{Z{OkZXYx0%i=Sa)65DH9ultpH1!Ji*X{}7r?kZU>8h|pyc+Wq&P(X&DP<qYz
zWsXssx-$ZnninKGP0@rrE@O>RAF-t4E$1tL<Rc(&b@&Pyc%M}o!_fd=NdHSLIzsE1
zbJ7xHsiryc$1c7saa{JEV%#jX?%p=UU$oKBsfCW<9nu+28)+)*MrU^%UV`vCRH~&j
z@TM_>Mo~xkB$F&d{KO^M4W8ZZL>S6G<{-7AVvIyFdC=+yMqEAu<Wz$N46^y43yMn|
z%hD@61J35ORZ$l|J@|=slxS4GyOh8I4Vh6D+bvjn35^OySW;+uadDDDD}EiQOr%?F
ztDo!ynm|3qmwKnIF@{iTC(IVvx990qlF@2@u~cUblb9|*uK6>}dhwmBb9~IhSs@$J
zFUygsj_92C{+fRt-e#<8`Sf~<i2oZ~FH5$(;*3Ak@gL+F?{ON4Fwed{qo}|vdk<3=
z|Cy$+jh8jn%p!-zj(4KP;c!cU8dk*wgHr3klq|?|X?!+?1t?%YlbVjLM$V6Mt+@n{
zSwk5WJ4%dMkCuE%)-rq%SmV%zAbtmq^x5ULR%yEhCBI)q(<o@+W&gZrZo5-S`6o#G
z=sidXGw6xPUr=|*GL6jSOu%`F@+O9-l*bwB93VI12aC}>Bdv-*uDU$FIdYZmP9HvH
z`W7`TEZrR0dp%G96VDZI=(sn1k=(++`!?;(g>q#(mG{Is%0vD*<h3sTPaKi9+E7(y
zu_r+xKlPxoA9Rbm+TJN3ob!9%_b}6;^Mz(vnCjX?xpmVty<T?BP>wI`LD$VJ2<<FF
z+<x{jO5LV|v$MBE=*~f-Y_Z<IWdzP`eHMyBI?V?81uJUGxUfgt>ciA;w5rpKwfi^G
z-Eb)Jqw!IeHSfd~KCxm;e=GJ-t;sTGw4?0D+I9giHtl;LPYRpvE6dPQJgTXeV2z;{
zYsk`q$|WnP!kNh_n>qbnK6RXZb5xN0*kUDwtPhRI|5{7Yk%Ddx#{52AVpp?dRPnHN
zkyuqh_$^BsQaBfkfm>xJ2kh4ygXtO>+aC3~<K|awqWuuH2-qq(uDi5rrqfuq{@rt>
zmt;LeRT1ZzK~>93mT!Q|HivKokvl0m&fMm2qxp(Tcw850eX&Jc9Y3R^cW1bzN*;Q)
zdG;dv7*%s75zDHX*Ai=+Gb!!ATwdF(!^CY2LdndY2g$D^t-p(foJpZ^li2MDheS0F
zaK)fn)sbSPE=LhDhvSGaaYiKu^Vw8pH}O|63u#~O+x$ZZN-YziYA#ws1lq9JnNB#e
zp#j^C?@jdo)OzRSmkorS4+AxWaFa*d-t_T|A=9Z=q_dQZ&eGXja_{a`NPaZeTX!8C
z3V~FN#0TBwX4KSidZHz*ny<Ra%SlfS_egI<wsql`Yos6{n~p`{PPpUWJ;lBd9wN^;
zjCqWXn7~KRmLkM^n|C?kp<^a1c_tQvr=_|^2Mr4J%I)0o9$tQ~x;dx2!pvj9bH`Ew
zc7s@&b*6Z;<GeQ+ZCeCc42`AtabLKCHv{)va~EvBP1G(5Y19?REPzvR>^eS(0WT$Y
z43nDQgbcEg=c4UY?tYr`n-4<LLJ`?ZvdwO7;n8~^BC(p_70P9{-KgUqYx8f({C5Dk
z!HZeX`g$VwhV0w*Nsd52$H6p7eUkY$ZMuTe6MV|m5qi>&&dK+T`1T_H+uV4U+^b_}
zn!`Q_UY$eoUT+03+KD@pogqVeB@I(MJO(NS5wzZ!4E%h7%euqT5PBcNM?iQdJIqC8
zJ{5+XQ)Do2K?RW?8KC-DT)cS%k?#=#kT}|6=v{~uM1=z-X0gAI+rK$-{+p9yHg@&x
zCI7Ue$xg93*0rywTS0N0FPBtxUmVBSd_@<WD@(!OIf;T(tOnktzXorZEbYBTEnQna
z9^=H#X`Ixm5+F6-!k@p0>HaS0eP=3$u11O^@1IhQ;Lv`O+}%pKDe#R)gAe&u6bfTS
zxE*rowlT2@-(Wezm0xu5{Sq~Tw>_L=>vB#&Z9H7kRD!Q9jfn9Vo&9B&%u+LQa$Yq5
zk1Wn36-8a^)4la@g(SY(n&foEta15>S;yk2bK%pINHUkXK<uPW`_|nyEfxbr<C|7-
z7sj64oX%Ud()bj#b>1=0kHXiNMFM2Vnfz}X&#@}{OFj!g8j|@ZJ{5$>_{Z*j8cZhq
zq11`v5<9$nD}cM1it8;J!k2UF1bQ=oo&43_G0V*qsyVexr=zR5gE^knTpFdQnSR;Y
z5XAujGWzr}@E%_BdS-cDBh74t0sLhcvKrwHFNQvE6U8AE;Q3LZ!Q_>JFa#%yJn`Fk
z@;JGj*toJmbU<^tGn8B<eX%0`X<T!2D5{jJFO*O=75QU8P$SpZ?s*@@bGzHJ7<Ikg
zGBPviV|++aWw?n*-;7f5Otk1ICh7#CB*Eq}c%IPFq0Y+2vl<I?%m{Z5O0d;ApfL9r
zUEI7{)l3~yI*U@)R>1Ua^*G()*rMUUaCA`n_*v=han+7?sKeGA(=pEG5@cj&Mo-^Y
zdaat)?A|mW{&BaDO(2Xp`ofNyP-=+$w|h`UlHy@jn=1k$$Q>L;$99pR22Sy&^|Yjb
z(nD#tvasbaAA!w5lwvQXRGfH5*j>6wxES<70Fql3tCV0DEL|2z<JEQ0fc<{ODzku%
z82eK;;L;x54oqz|)+CJ5Ke0E&kD;dDnz5Agy)IL{!s;CB;;2rXr7tr*>_b1}2FJa3
zzR6qzVtkwu+CEG2qHL>F9Co|jbS&j^K~yM5fS`DyR+FqO?810(D!m77HKvn|f-X^F
zSM$X*>`e_i+h?Lg&BF#-qL61~d4kx^M7@gM8WeS9Ipew&yrln86xVu^PUdt2AAk(u
zCD*Bl-7*K;`CD=MtHnDTe3)13%?hgmbT1JH3Vq+-&;~%Iy5Zf&7W}ew>GQ1Cqfqvh
zuM~<5;0!{14=J4;iHDbx>LO^3E0u3T=*TlT@AyJ523*=5i&Y3wdoi1vjzY+-=+-1+
zr}Yl;6sw9N@V?dl{dDthDF0YI8?7;_BI_#laLZi7dwWkea}T3tgzAahr==yJZ+&J)
z`1A{0HO0{qP~J0IDu|_4k`v0O>4(ppd^?Qem`t}FB6(<j<A6bX%Sp<CKqYn5IyTXr
zTuLiH+nYRpON+3hk-7QqDiJULwDn}OO{TJow)$}X?G|p$-fW(9`;QTXF+xD<{3%r>
z2=>GpG_w`Yf?b|r3W9*}%J#TFe@bKd2Y~It485}#i%)s0AV4$z&rSc2cErI6RY^#_
Vh{?rhaPSA2y0W%X@hwci{{qr(<EsDw

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/diagram_submission.png
deleted file mode 100644
index 3ed6429c01294f35c4fe61df11c0c92001dbc462..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 36510
zcmeFXWmKF^vnWa+1b26bA-KB@4#C|acyJ#)gu$H<B#=P^ArRc%f?Ejg?(RNklK0);
z-s|2!cio@ooLQ^qX{oC2uI{d`uBs??Re20lQdBrNI1EJv8BI927d5a1Mt%jWF;$&9
zfP+JSXD=<St|%={t?uq(WAA7U2d5DANe@Y1Yk&}9l$@A24xg)l(n>cGlY(vd^2MGx
z9eoHG*Al2+nID5%W+GFWS3(<IR`#Ct_1n%mz;&Cxy2iAu3BK%>ih|@!;FkZ+!}e%4
zkL|M-x5o)O+~+{!D0-|Zcxp3a>bL}Ci^9UMJc?>B;9YYN6yA4j4bYc1G}ysee}gu+
zG{5+fYwV@#`sDH09*t>6z47AxoA-hrIX1SW7t!I`l9-YR;Uu+p*@_4h@IjP4O`62i
zJr4MSMo~U2u7$V)81wYK)iiDB?@2!meX>Ewal_ir2y1cd&p$(XtH)(V|4Q;V%DN;Z
zaVSC`r&%lL86UCi{Ve9ZGIjVFT=uYwU^TQiTWEM}%LdER+B7te@O84{c>~Y$*u<mD
zz}QEuyNZyw*C}$nEawq?EuTafltPNJePVbl-xAb@H>maKO;f<7(5A>UG9PJJLSiuv
zkchFH&l__!u*sy!*+*ldboY3wW!A_e*WO7L#h?mCVUtO==u@L9tIs)pQjOBP10GZr
zIHoEEr?OC7mYiNmLNa?a&)RWN1z!if$yZ2Ehv!0kTLRQ@NADrUZti!-%hTr1j{zU1
z(~uR5n&Pw*^qj%F$wVeou?#1N_mvzfUsy}uy)_D>RgFcZV#Q{Zdwmf4W!(*FBcjnf
z4e_n+1a^XCDydisni)1};V^|iJ2n%;H_$yjBF))gqo8auQBhSN+e=>@)K<hE%BrrH
zQ^ul=aO|oK5)cOFNM-HbD55Rbz8>a{PQCYb9j|iVqij}Ke_*CYikxpcXkO#kQV?8)
zXYPLmzgjwhKB7QEj?--m4}cGrq=&0o^Q8&RhgV0w%8sZVEaK@WC1)egN<xDGv#|0c
zDIeLWvKT0@Q_&%kal2H@jTL9+Exr}Zz0lSmg$JGP51;Km$%$_!6$@D@qSJ-r(nYPw
z_%9-3^*o)~-J*w&VBB;DMx5WiQv)g4F^SR|GkIb=s}T|?x5v1%?!JFq`)+6zWQ4Z(
z%H40B1_WmnF)WB606*G*blKe%nq#Z>&AMb=S3B&E;%0^Wc6&vAF3Ds7V31AzrtTrA
zFg!GThchN`dSzOk#9`aDb*`Ik<-s;sD^UC}zpcK9&@!Sz_wc1RrbE~M3p|l{K69eV
zd{ohvCJ1_28I8+X=-DQa@|VP;n~pfar<BI&2Jglo&3FRjLLvY^rGK<lxONH~t9jMx
zj8w;Fq%XVb8v#850XX~>(yfC7Hp^4851gaHQ*4|_)aJb%9g@HGOz;DCD1}k$p<l{{
z7gTRQ3u1N5d5JdGs26t=V8I6{+jV`hL4Fg$P!W39@PcC%>w^?AF~VCZS_OoU=AXh`
zzbJR|a-t0*G320^!Z~!lsz7u^YVH(w#hwghbcN4)LDXUGO0gJLQiRv?Qf!rCR~{P-
z9>*4b8<|cs9Z9zyeoGyqK#UvLLuaq>Dmg9^M~EBE;C%{BNg_r+e*x(oiO`2E$x)i4
zey2^EiLe$a@jPNBSuYK|$yXFn_c^S*s46l{d2_qkHOMBB?9wdbuRS>oBOsC-IWxa7
zFB#@zUc8~OG~Z6=*$%C&)Mup_>MEEeFU4o+`(9yKit{|CSBY6l_V6ChO?b5D%NlBY
z&qr5Eaf0knj-LXn&1b@o_(250kvFSm$h;r8I+IrASFtrw=D_w~T%eEz(;DX*@1pV-
zu#XIFl2i?YUw3P|+%U~3Jv)Ol?J*NS-a=GDR4YC1K;otXD)rCgh5~p^6nimyjy=IW
z)Z&kmMTKQHh1-RMWvOM1x-7aZdJFm9Z;qnpd1`~I2<JW@%EvQY4Dj?14T$u|Z7Of7
zx=YArzEwuaq1SW=Trd-)7!KMGiVYfhd3m9EnO(W<(_Z=R6YL{Rj(wNVhDuK53aige
zqG$_wk$4f+X82{^r^@BejeoG;tAU^AHsm{^KYH;)EwpScuk+Wc!|>F%$#1_K^3Ie#
zmfHUqn8+$HC@?7VRk%pDsV6iTl8gEAbLw-xPO+EFL7{Frzlnc}cd2*VwdRr7{7y~w
zL87OlXP4)#Fq&80LDr=2o4(ytPyR3RW|AvX5-KN?4Eyu@?w%f=L!LK#b$i#ld%M27
z3<oMRC6glGXF5BG`Oz`y5-PLY65T@2G*)D?zjBP5<(ZWP%R*Z_kweW9LZpg2gF+8p
zQDJ0+TZA`;vtS%xoRF=_Vajd97Lqve3p-7U0e!$w<mJc-93=TgF1a|0*o`<_QWt?X
zyG4h$?{F<>xeQ#~Ckhr)EJz(cTQ>DxrrW2HEU}Kiv8wAmY?wnUr2P1E*ix5Mjmv*H
zZiH@RYkOgMVfcZ~ku8}un(f+XvA#h6Uf;%`tkSXau#&r))PZm4d+&Zc-`sqkp<U}}
z)6{)y+z*xL5!y_@%$>Sdb>s`Q3#1GF-n`A!&Bfk2-ZO_$*W%ZW*O`YLA~~24Okjjk
zgmV11{IvXjw~v<1c%1_?iig@;p1(8phuzsd%iMx?yIiBZnZ1jBeEBBcS>5-4qBD{Y
za=Y>z@mN~(^GO-~#+4r}&lm5vvAx>sQR3OWt?Gg0lYOwf`aIwLOM10(ZX=ax$)j|K
zc8qXr%yijQv)r(p-G$`@{UmN#AkZ^#Js|Zy>x$tv{C4!p>HKMRXG?F@XZ%Dh*cNIG
z6?-0&Ux+7+UFN!XdoK$oU~VKSPtC(g=jvcCpXiG%@a^@kMujz}1jzQ3Csm9kqBia7
zbq}8msk`8&*g}F><`Q{ym>f7Yye@1H%n;^;?2qk?1EDa*TgF=>;o`{?O2#H3XyiU;
z>;rdiXnXkef)X;swzcv3Uemmjp3Jw4u(S8v(Dyt{uPo_Xj;V~t>6`5Pps#wkKl|WT
z=cct0L?KU3Bv9tQ`1@Alta{xDZx`hw-Zg!TGP<HM1SBxPt_Yb-F3k|7X%xN-RI(fV
zo)SNqR9vmaQ9$xFYJzk^ac^Kx<hOrcM4XvOo5zP8a59Pj<GgB?_}L<TzM}8Pc-85w
z;V%Jz(8Q$VN~R4wfmEZPM<bh7&clammA(fcj=RoBzDIVc_vW8g4Rj@^znAXJy_hSh
zZU#$rxm(mnvdvTbeZkA9X4naIc04Xy3~5PedAfUdM{+0MBfM^WR?J+eRj6j4Ca9!D
zPoecxSoUR}E-v{vg{CU1DjePf^sOCG{}D<ZtWT`3Z!jys<~yIeH~;(RZ;nyd^h&GV
z>b%puY@2l+UmhcyjIqxW2WyVghfW@eLKz;0+lHeig7qGzJuOQY%oiRzP~AhrmhZRa
zh|FlDM4wr+>QB$Ga3m&E>{1kzo!QULYOJ~1^lKWNq8^g3l-p9ymB-Q=Q$M8z8V(xy
zJ@;uy=spTousRqlxR-Bb>S60e>MiP%)VSZgKGKmbp)99tZgR%(P}%$3KCNo%U$&SZ
zFeE%EeAg5oy&D~VrF)=P3ayc9So5=1*m{oZB};P*Xpd^&8{A$UsWp02m*RBSpi^@-
zT^H%(rNghYwGg#{Rxh!X)RKS2l%4(EhxCYb;m$V=q}O+RyHxbOwWQ>R_{6|8bh_W6
z=~{Kx-`8xnE!=Qt-syqlVF>d*!Z`v*(CF=5Z|ApAwuk`Cspq#(L%rRX-Cw;FmSUSA
zeini9(uUFxNnG-Vq)%j6(Eam+GRBL-!F)gYG^%`ZBaq`N=-1h@jiSd*)FEDi5o7(L
zScc#Ee6d^6ysG>caev0g;Q?LUgR|-vwwa@MUqsxOcLJUe%h9}u#5gFQM=t8SolKp)
z(pot9jIUc4XT_VIW<TYyFqMUyih0C&9Ccl#9~=oJJ1;f~t=aUkJ^$FLzi@7E(*hMi
zk6Zk~{$FoukJBVLgVe8@pH3b`T7=`p(E~we8eRF#3BCzt;z?q6lZ~g$NA`C-)wWY(
zOArCCFdvg3u4maD<$<B+<79~%=<My_@f1h!E2wWJQ*F^j?d4TR=2sVtmE-%Njq=px
zhGnFa^OdW1jw7DKg{RIP>4UzMgt7agK-$3Xw~o-pGv&!Ne}p>(1l+O|5fFZdxpZ|E
z9H<%%&X5$&6BM2ktGA4Zgs8D>hJ<sgrtfP;j?V%2&Z0=Lw>tBMAYA5%Xg`MkPaJVB
z;+*5-ZWV?qRv!TdB7$Wwav%bNiwYbyAN;9-Fz0x?z4__|Ch?&Y+~=a#CpoAzT_WAn
zaEMLL)ctTPQRhMTD)B2fk~kqqw{1|OFDi+#n$9~A*yAJGT3^vdMFs9PERPKL@&zdz
zA}sd;mc(C>{U<N?0sx2bw;mo2F2Ww}<-ahhu<xH=A}syE`R9v}91e#BE8)TtC<p$(
zpkLJFAp9o}hPA;-X-O+8!oFIT?$*}M9(FFCk&G%tunH7c1p^N_I6{U$>4l;u{V9z8
zw7s^zr@qP?VM`Y$b_*+)x7O?+C)Yo8;6y>fu&k4{rv)|0$<f(E7$ipf7eW}8|5MFD
zOZ^wb(?N_@Uqzi-+Qr?PnxCDMos(7^m71Db)ZNNPSW`ysUvyYWjMmQ6(^Z&*!^g*m
z-G_(W#od;JOGrqFgOi(so0|;=!RF!X>}dgFbM~P7hsb}?k+Jr$bhmf)w0Ci){zKQ|
zt&5kZ7%lCej{fueXPnj``~U9A+2h}7!6wM@r-g%yos;7~v|*^Ce`<x*?LpR#1~T?e
zFrLBs5a$sT6#WbT|7!W~9{&fX{(ocg^6>vp%>U8y|HRbsuy&Vrae{T}DgNKf^)K@O
zZ2T8dl;h9L{|6`jVe{Ww7@x&aMLGVnX5y&2_Mi;dGLqZNsA<E#uub;&*9H3rz|x;D
zEUDV-#pDgc!AZg?%1CL0UK}pzC7SfP37jI(hW1C%NA_fS5E6W2723I-t8yF=(iYOY
z>%<Cj1yu|wVpg&Jau>248VpfN0VKa2rIvb$OPz$56jC1S!L@p}4yUg^yR{^Xb?`QA
z2g0+q!t-=;yeA~GZJxrYOq+;I4<98-%h`oP{1MFG#gM+F_7XQlh8hQeOrN_Yqwr6G
zm;hFaL*?40{@bSTGI`t`^Dkl|ax5yWBo{{PfeoYchn%9F5YoS3a)_`}J}_1&5=QY4
zE&7@)gnz;E;bA2WnqG35e+PkUIQpmKKVTXmlDPBVuq`oZ{}yGaQP_6=As*H7O_CbL
zNjs7YDe0e~CxTNw{=t$l*QQPc^S|y%5B+!C3cw+u|C-@zYy}`0!P{Y}f5#1*9{K;@
z>7l|SVNp6_k_)=t^4JWg+Z@hTj@TD{nZ6ZMhSH9wepw-KFR4<#yDC4dvSlKI&GTc-
zPUH`#ljYVCWscU#!^OtxxmpK3n(DpZMFJn0)SP?c$n;V+A}JOD$AWZLD{X-z$4kwb
zM+^07D>!pKQFxiZ)g)RoI4!y?-je;~+#JUZ`5=jFn5_zadblBC2r~}=DOcN!ykk`P
z?5XMhhU9zjP%7I<lh?s#l}vs<5${90!Z#Tctfnmu83Hb*8nGLF2@=IeHrD0jFHDIM
zz>EEZwg=NcMz%*XGsXO`QyROlGlV>Bc%0^IXqBO6aUb=T-NiGIF9l$$8Tgv2rz+s?
ze7mxb(t$GIv=eBva_zJ-Z5(j+lc~0kqwV(XML%6a%2<Aa_<X5Ja~(k0%6q}7o=?x%
zP1fUj&X(vWNRw=I&=d#0Kekd@6-)F~@}dV%jh98Y)NZ=m%nUg`q)U1ugG*^ta3gm6
z@nXz{y=ga(+lLrH@*zNWd`FrN4#rS%f(ef?v6;odo23y7pT%DRn{-sn1?QRVcbQ>U
zsy1oa0p|nT!O)xFOpd@k<q>Htu}R0phD!bOG!t)wi;<Pbk&zLBmBlQNQK2w(HJfIG
z85ZgHNOevNb>U}cX9>YAck?rC)1`*B-$8bfgGI;r_7<aALSe&Y?a%kyaV`OtH1(I0
zrX<&Q+EBVg@N^M3DLXJ98@gZq{5>&4jA(h5A<cNg7_=4^);}tGG2CnVSuBIKTpXUG
z<rH`}eB64oygg+0?2aM;@mcmA`GAOq>WiYdpeOCu(!l31&AUCZ*yw?lYvQxyHG)WF
ztbQDLCI3-O;iW1&Yzdf%#y;vUoFT^%EZ6n$8UOI0DZc;Wz-LssB($OxVD`99l)3WI
zy24^nlIh%3+;}x@lqt}%hA}enRaxXCifDe21i>2HmEK9tQuvC4#FMAmOzk~vMaLy-
zgXfE#rrgl$e$1Qx=vJGVzp7a{yf=~92OSW<dI+Eh<;;OExi1bpcz2;c1Db?hNI-ut
zdd--Oz;>qzFdFzvU>Rh|C0D5W#qhq_qC0%MxCrvD?fg?YKGur(0*1iic*{0lIdf0%
z!SQP*Vbhxjbk<tR4PG0GgTN~&lK2MHA@Is{Y)b7RsH)W6ZAmOSt?BqGu-Wv+)8ycD
z#8&U4Z4c~#PZ#A0Q|1~Bcc}##<`CxhU<|<Hidwxk>|wno;ju}{J}yCPW+b4lk+=<&
z08F<ePO{Z6P8fTP@Qvij#>6Yoo|gJ8TwN@_doqdsZnLT(dH)S_rshN%#mo17Z7??n
zv!z$_SX<%~jaO~V{gjc!v4HoHuPJ<3n}b?#*wW-$n<c?qXbZ*I!N)p>>zzu-4Asqn
zL`%P7=KUC6&{swNo0Z_;3z%%K0FzJs%naHQSuqHr<+;)oQpClPBh)mlb2PX+{$5h{
zt_}Lo-ro3lJ|q%h(pR!_UoJktAm8n<Dym(4eK0#Bv=R2n;dJ^WKdyaRK7Ri!OGwVS
zcJfRSOhcqU<NGap{#$`gwnld4l<#iS^W@mCZHqwh;COVHFuvDT?W5yGjUaHA(gRil
zp4ux?@SS7QxRZ#@Ib?guWY>kQb1=Z(4@dQB=#4Y&&sQV0Gp0sBP32TpW((FaJ?k9R
zf!DZ-i7$TwLlk%uLyCi-<#9^*kY7|*AIt%)Gnd&PG}`M#%b{(0_0RS(nsR`BnAq^v
zD8pynGFhMnZk8?IyQu_EA~6-Li+~H-PbD378mFJTdU@^(sURO+bFt<*U#*Ee9r;n}
zT8&eN?wHKjj@2woQ=OB2dw<nl5^z;w=JdR&_FRCSGEOsg_`@a8VUFzJyZqZ<MG{YY
z$@-TtsgTBbj`cK-#iwWLYh4UMkDHmjBw5}x>WcyFgG!{<Nb@vajY{9@aBHPKY>#D|
zS)m1wp*nPT`XKcEz}iMbA3?f6Fopb9yHF6T%?_Gj6smhL*pd6>@MGqpmVM^Z%*c!K
zv*or6g&B#}GtXPt)p4MuL6I=7TW)I7r5a>dqm!%Efh)o~{rx?&Yif(@dRG`)%-ci?
zra2h}B}N8B)nB)>2~v)%!8audVZmc!H@st_7jJ0Dyc}lMi#C&zdk~ZeLo~6KbiQ4?
z17uKTbs@r28r94kt(Oyd4VPvj2@=mBMeiK_h3JH?U*Tt`+%>Yyv=TIePZzo004hg6
zj2o*0)P8+Ot3+;81Xb!OtW7^RWmjoO6p%m7Fx>(m^1ypGFq6MJ)nqX>0ty)?0i`GY
zhb~!i)MT-P(&|AE$PP+&Bz&#Ubz1OU+DJE+gz^5mTi`=|u>XME*KbLZ&BQTW)Lxe6
z=({{y<%uDU>y%c^Z=&9>nU{+1^D@<J8C2;eyoI0!-`i&)_omk>d%M>5*_I_Xd#^lS
z*9#=a_seH5grPGuQ}YQ9jHlQ;94ET@Qzj2}_#%%9V;w<t8;MtJB_8Hp6@QfaLbOd|
zEXp+LtZF|aW*|rT_l~LFz#$F0bAevFFf2jx>Li)nB-qp?3?M1!UwdewZ6^<Hc_{$(
zTmXc`TIQ}Ih?y!n`a}IpSF*foe}cR1o)&rv9>Y$4kW*tqguJ3dOtMcl_M92?%8ZCX
zjafrdA)R}I5mAu_&EwoJfJtUsDEgNeKGnoIx-vP5hM|l9!ouJM@>FX|Ymnw}O19f>
z_9pV6&KmRH$;2$(W4Oz0IJfHHRI5T!a=oAURk0eu+uz<4)jLzxEx;Ouq`abUaTqk9
z(F^3b4+73(4r#295)4mqE4Oi52J|EO#BZdmptlK7eTD4B#282I^b$R>O<-@(z6g3v
zLS*d<JE!bz#lx<=grSv_c%zvKgzweHNA)f`UK?;3yX9j73>R3jO*TgA>-UP%YVcN_
zN&06CXr_5tryn6|w>poaim~~o^(4@mG9B;QYed$aN)Rhfg(`OH;#9V!Jn+atSz-XO
zm?VY{lj^nuX;4(g{kXPwU({-LHU{5azQZcndXwhwg+|7jt8<X9X=|c+Ey^8}wQhYj
z7@ohld|4<!(6xr4@v4QIrVn~w4t;$?i!%7M=rP7dg_hZOmJPkjmc>z5!q4;vFb;w>
zr(R8%@A+Sgv~7q)cNJpeFah&dnQ=B{e0TH3H6-=S0=@P)4i+*5t-_6CBi3cA@ClRi
z0+NKjT8WA(K4}Ylbrg+E1M;b^Xk@WEdnvi?e9~+wqg!>kFLZfrn$2&1z5;z(`MQGs
z5|!jm`)8!!7c_7Te9X!7@d-Pmw8=UyXdN#b5V)nd-)`$mdmmZBtmmY0nZo~J6$PvC
zl+)ZEJ>F2gzqhl58n5S#oH%D;`dS2be~f!MA|*0$(%wg|P6DCi7$HOi4~_Of<D%P=
z7hMNHyKR~|yY7Z7d6Rj8pr_~42npj@ws5R<$;5pvb4zxT-+y?J=G}hK+*XoEGO@ju
zS+tFmr(J%R>+IB(J1Sk}G$w6%Lk<l6!j*yLMQO-#Z?UM{J+5(Mju9Je{d|b^M}|vd
z%3r>%0AGi31mprhgNOcYJN}<%T93VCbJZD<F)|c~iX53i(K7w=0QtMe9f_64X_qJ6
zGuO5Yk@jkk!Lg-~Kjec2*8x@+UR?_trm+@rO8)pbOkJAU?duc2t_13z1fKZEe0oz{
z^&RK4pZ+njo`k^|exdiGJlFF>JJi28#yt<e{l>q&9w0VYgi9*>oQoCs4Pa&!jvkw#
z3AFR+X%vh8II$sxLRUG<97qX=wex4*$|AiHr9taVL3xR<$F4_3PFObrVOk%WL2-Ml
zysUk9EWG^m{*@@=C_=GJXhZNRQg8*pYMC6dILPW-U_tD7Er(*anq2Q0YF>C_9Wm*?
zhl&X@dTs_9K5z2nFL~mN4vTm={8!O#)Yy<z|AF+KNJooka<JLJP4@F%cC<#v?}$dB
z91dg*Mg?gRQRk~&H14S~-_skIDu`j6ggtkbIAB2I;UOs&eB%?WcYjwc@lgF!`nOq?
zlfQEd|0|5pucOih!#QhfnLH=HWBy7KYabeLKKn*C;OKpcu#z^t-Y+H7uZwE;<b;yo
zz#c_c#p$n<?SQ<;UyZ@YVPF8xV5FuzKRy`Isk5rI{mQ8Q>Xb}AUe1otOUDXmoWh2v
zlB82Uz-LNQj))V$S#_e{j!zpFN^Ee29JkX&xB5$+9$!%QD@EYcSo>{EJAg(dl_xAx
zkeM}?dyMv_(#Nl=Qm;!rDRE1ha7n}DqS_voPT0Y68g!~&73*Oy*KEp_pW08++ST`S
zvzU)zx^O$Q(;gGIC)Q!gchFQm7CzHDm~RZzq6^&Fp(VR`|27Nx(?s~E>o}6R$4fP+
zr^ceIAcR&H$hlofP4gs)o5K6{H-A?DE8h$>WK6eHxe+{5CWKB?5_oE=O;hO#PFcB4
zOXJ?*YOaKu?tK2SetmyrjHH9U3F>I7=QS$!b7_20tqt7obppDrMo)K1RCh_5X>Fgo
zIg6<Z`Z&3XVhL=JH7wxLBSZx(&qUwE7L7_j`Spn`Yuww7o?*k-jx6o*vD1n&IZo(R
z`D+9(8xr5YEFXrkyvqYW9#JVoi~r~Y9Oz|Ay{5EP&}(@*-WW=$ThnvopBe-lh@9$Y
zO=(ODYdY2u=pbwS!fS(0WF3HFFoC}&{^^eRO`|)d>Ta^U)k%+7>4EnejYbzf9ir0O
z9?`@E(iL*9#(MknLA$Bw==Xw2Edr*WKJp-smLjvePiD+76R2Zv{>)Ly40eGAH)7NN
zEX71*-YakA56rzy4ab##?v2{4QPF6-8*yQ$wD&z7m*^@ymW4x3l+O=@ST%pS<}qp4
zTTHCrEOup^7-t$Zf*#t_xD~)NL4@Y0@y<i3(4Vhepmnhbou{h<pWR)E&L3uipJvWu
z#V;z_wv5`oP9}_-k5!z#tD}ORQE@=L9()TnDRB^-x^{Phj(3#&E2e@WeC26+-`ddu
zPBcO8*5A^ly(sIp3>Vxvgwo}KpL^+gP<+5PVw{|%c(0UAkgo0?g{pA=XgYTosp9~+
z=$P{&IKYH=ZsfajWb4DWYoIu3Zt=%@V3N8dtJfg*3-xd(wA6`$3IJXht&~k<lzX0e
zA|gZh_@RM@esn7yxHL}Uv0H)_LVr>iK5!UZ6XPGI#_Zl+a*`Ibmxk)5o__C$rHp(!
z6Ld7ABr*T=^>B&l(yCm=?pQDQN-st}j9yI5?OYu@&aDPzQGe~*N(-sr=|rpv)EDkP
zHZn{vNxB&n@4nTDDsuiD^t=<?hW*(;Y2F+aOIZTMcnt|K|Dh(z2kvc<bZzmVM-PUS
zi*ZsioinWbSuo~6pe8zkmCf+chtgq&rwF#t(1;CjFyms86Sx=l`mBp-%lLgtz<r&S
zNFCr{2I^HHao!xHcBL}dWAY`5fT`b3jnj4W7ufbFNp$Y%)V!9{kQ3~xf&$ijq(86I
zPg~|FaX$wP>vyC&(Q5>mRZ#l^Ti8*H0#|+^$GPW828?CqaE65dPRc%<f_=damz~Ra
zVVgo=M&9rO!0Wjv{Kj>XLEPLqx?J9*DC}I$amk-bQ4T}tPpVQ24jMk0-vA~uvbnX&
zI-rzICW*TNZpI$@sQEPaC!XxPVT(Mn+kj^F%|dEE(~XnI(G&bqY<SJksg4>?nN4Xf
zt*T8%ZFCNL!Nd*+|4$vU6jAmci*ke79{~58rY}G6Ce?U)o$u5lGtsJV@CjUI7Detl
z*y)PZinp_DV{-HG&(ae3mmTx8gAL;b9;^Bazj&XC(kc2!nnA%<zUs=|-LkyFm-4~d
zqJDj{V4e}W3u)+xbd?3V9aR4UTU=CVmUj7_J`FD<X9wqVqScbJ^9^4xRpZ}-eW(f^
z2YAQSANfvMv}ywL9htqAz%x=wb1ZoQO{fe?_Gg^fxLbDZa{6ewDAjZR(suZ<Cn-%g
zA>+?5@l^vb(G|hM6na;0^1W*svf>+?Lg?^xz`#4E)o?uva-QmTs5c8P-?)MT&4Bjp
z56$gHk=?TaiM^6oBTo<po`TpO5YH)4ov&uE&0C65R1<U_j@|0EApb!l^LFjSZU49^
z&?Z?TcF1%??P*etAi&q<x)l4f_!NCyh*cp~?by5&<NeSAk!IPekNilaZ@kbU0;V2O
zOyQ0r29&r42WVZt>c$&|%qev&`$TIJ`uda5;TOvCz6fK5v0QyT(>}1(PhtaZH0pH2
zo`)^`=YL-v8L+bm;{7yEFQ3JVh4HH*n)zL{X+v}#PU=4DmAXx-eOT8=w(X*49bkR{
z&b-m`d1^Pf_noa58uQZ`71RyIXwGt@^0W!`?g+?y{u00zO@^{d7Spa`my(*eCW&=m
zM@jnAMgq3gcnaR&eBMMDz|k4d1IiL6V?VZI6RJh-gt!(2&+r%_|6yua8WuP3UBv{Z
zV>7B_m_Zy)NJBzIkMz{@FVW`xVlLwTPLC??BouDV>_g3?h0G5knp0`sA@xGJ(EB%S
z{qrD}+(x8aFB08a_J_b@Tf1Vkz1<k>Vs(zLPX*G>_ZAe1QNEEW16%U2{e#xlml<)t
zYl&%(WkJi!EoFD4Aa!K+Og{H_qdb~A0ZXQNt!YK#&hH1Z9|v%0-}&R}bRKXxWKV2-
zq2`DVg%={swJ$oJd3rv925HQT3ZlmC&<@YB<7bP}JV-}kkWd?J9nkP9>6qb^6kuDH
z`WxZD#h}n#FQlf>GJkVw9M{aKvWpkSyik2iFwI2`?0LySl7P>SyVk0?Do<ClpQ~}0
z>;iTtc?I+~5uCbyZQ9GAoQT9mA`MNXNOtWoZI-yUl<=PA;ePhEKt3%4MfK{5QMn7^
z05>vp=%R2WpOdvgL8qK9`;_FGVFU@2%Vg5hfM25c$&>PwnQtlYzqLd7@Kn^Z+oG}Q
zb88udg@e^#qPekc9xRuke%iuo|M0v*MhX4^l412TMGKS3P*%lhT31xBYQZ<fV(?-G
zmVgy1&s_nOxvY?gVTs(Yq^i=CqVV)3rem~f(O>eO{g5%3)gkyyU8_!y9axW&G}HUc
zq#VG!#j9MEmo4-&=C2TBdbL6jzQ%BZ=#k&g3~E)n^lZ2~$JhqKi1%=a{5GY_0!b4v
z>DP;~%yta58DI#_<H^chlastahkwYASDxkhL>~eGzwlMHDjQQE4;cc=B=#1zb8eVq
zBj+}nt;B`CFo$##VWK1XyYOAw+c%lzMfyf24#2H(f%Z*F9S=l#b^C<-B^3ytuLtI7
z`YvInjlC~mXNKG_FR>%?qV+Pzu?acecRiSAaai8z$}&3k?{C#s%vy8O@}^b>a^$Kh
zB19$#z<3c#v88OG8QTJv%Q?18xHFzeR+ZBa(6XXdd=I~^qcN<I$_@2HRtd(b<|5`;
zHzp-)yZ*A0_mWxlB~AV>&{YQa_S`Lm`$wQ;m#6>)n?p)L{RtWCtP{`To0&T}dtYF*
z?1MhQ$w^rK2gRm>YGl|%;i@m^zO3$M#I3L3J%%lKjhqOmna#=amQmVN&?QQ$OMUgl
z8N~Y2d0`@|d+M7I*2LW?Kf9^lGIl4y&<Ev}$pKdy`K#pOO2AdnE38IpM$4(-n~Vrx
z&US92?hcJReiNgfm*Oi$tRD_Gab*sF?6PVGwpGU<bfD2SIA}v4C_k(Cmo=gIPr1&@
z9)p8xZNQ!P8e)Y$&WrNUt0i$HxyC>Gh4);*4zv2}eM#CB03yvx`H*u}{fU&9)@q6i
z@_G0MOZ=zIG7xgy9LKAKE-P!a5{)*K%zLkvE)9I9@(yu)Tt4qGBx_M%X1}&wvV5*;
zpe!$>o$fjY{5trID7X@^!%vCd{M?k#w=QlEO=-w$zfO5wd9M9V0efwiY3n;j3lPlU
zfM3Z4oIF^h;)q|L3a-jE54#2E4z~EYRisSqEO(|ve-w_zd+wF2{1_z$u!=?wh=>LF
z737xp&Z!<tSv|LrzDW|q<uSDiBfrAiKGwoomy6v0RmOEOye4O}o#q>v(~PRgz;blP
zJJSC7m}B8L<ZESzvRiwCFF@0rvMb3r(d<h^EJe?Vxn$L8DF-Ph=D|<57Z;NO>y5QH
z?#=g~XZ|%w@Ss*3W?oBxSS2oaB((Zv(%6jcckt<f`R|)164_S$sDK`3Mj_`4&xd2r
zrC@LDm$dF!KNXVHF-5Vc3KSGI(^Ev$Bo|s<Y|0hWNh>uSqiBSWcAhEd58_i;?=E}i
z3f67~fjg^nQkW%{)4K9kJGEolC@7`t?jt8bpCZ$CdNLh}3Y70?Su;F<uA|bG*96PC
z;>0YER|BAuodOBh0(D=M-VLU&#aNL>P#m~Y^U#1bM3%9K4q(Ihh5S&s<4yCksT-}2
zB_CEz_s0pjr-t^ow%PG%$Y|i-jOa9CUfPG)ZF!U%x)EF*i|Jx&L85}(^66DfS4iw-
z^*egSjvW%)1hF_YAGl{MsC$MYf0-Rktp~Z&;XQfEbfk~8-`kH#Z#axednH96!(ojX
zp0`$Pb@=Kp3}@5f{Z)yKba)B55`co=obIY(4H_dqOjrn2m$Gep7w0P97+jWWp%yR8
zHyZ7hn#x$^mXsLwE4-|%SoWpDNBv5j^;O&y)g_+kgZZNjZpxQ+AsS$hBu&*fif1`N
zldre{?%amU>?&Pw>gC7<&{n(_(li6q*r#i02{R3zmu<fq2%N{J^`esfL5%k^R^3<X
zel_T-n}j_jr;b?$)wmrHj2K@L&T_{Is3}&x>4@$$M;Ar+(ilYElu@GK80!|iOiI(N
zeEagWuD>~%ef{e&i;%-udMpXoUU}1N#pG7M*g3%wEU!I}FI1v(a!FjxP`eA))!BD-
z%y~4H%cf$kE5~Mms#G{7ZpmmQA2nbvN%fuFI~B+_4Lbf97kUztB}V7P#A=fD<zM@D
zRSoC}O<l883$4yiwP~2FLeSulRUa))(|oBe=45H-x$NP!8!C~(Vq}gJjb50%M)mTQ
ze53+&Ql5m4uO}yP>;YQ$p{U}Js!<h3Q#9!0H9j{-anNOUp7x}D7Nk7eFyVPO|7@48
zJaVes>`3z@AGubeRqRPOb|C!#w6t*pGtGC)KlFEj=D{!J^$(C12ga-SWehj_GcQD2
z9NbmXa9wWvS5Lgi=huEDqIBoK{i;U5fx0N4dHSy2x<%YeoP@*iyHDv@Z|T^53FP`w
zaQTsR_T(PA9qRi7x^E80<^7#}Kk%wZoVK>k7-puO^;;ID_p$u3-0BY#`2zc*rQv!`
zOu_Mve>{tIk5xDSx(p*=8qrYk|2KzT7IyAbcSio(&Hwlw+vFtSKhkG+rsC1^2Ez`P
zZtB0@Mj7*OIEmH=SXY>JFs!?wa!t@z)fvy(PX3e|hCklXKRt#@bu_B<36Y1MZRh+U
zQR9a_d63kCLi-;ds%sU_>G`X2S&!9~OrKXqs*&R@!hH{`EhS@Ptp5^;>KKsAY2#f`
zD+iG|Dgwz^HGR$g@qTtx{Bd2fa@x}U3o0E4bAhVUWnD4-GXiupm|?UV-h)5+Uyto1
zOcBE1>oN%bzXbjd<3SN??$KOz(Og@g|AwuHtA53(++Tk?u`hD0{bv+BMqaHVRW^wS
zUpN@#510${?mX?-QTK=O9QkZ;j!=5R^w<UOLn>?^mBYhZ_Qu+)lKC4}+nud%ry9A~
z6S5gzzP@~WssgD3U#}uy@lrKCNsrKBiB7(;>*X)vvk13R{3j|vFjg}Cegt9@>~W`~
z#}7R*k(e|lk$7m#ssekjwBGPhfuLprER=!Y8_jz^*jyLbKd}}a8vWg_-SHj0OADW^
zjiz~(*_(5GZ~TL58t>;F!dtPJCRUl1?QWI>)h?cIo@A?!qdtxNur-@zg>XS-T!;Me
z{sajSUm=&yH<jH@TqN*cup2J6+!O`1G+y5y-1`kH+QGCTF{x?~*hxCw|6|M!dE%Qf
z{%st1H8s+_d{uT{y^(6so<ccaAq~M`gpcY#mZnDeA#JW`N0GDn^{-I4Cx=~Gm@gP|
zz1}CM{bla12TT|Vzf1v7zfdw&`on;nu|LKVY;{~iB!DmT-#`U%`9Fjt|1Y8cJ$Mu+
z2NWUR5Cxr@S3_4rY@i`e){`5mWDBtr9)^G7J}~j+uD^zkbZ$$=-gJZzyTr~R_a#uw
zWD#zNeL~HkqQ0QLK?sS?;7sRc<RtqYE9UgZ5|Z_<0pnF}QcyyMdElW~MLGJ{!3=TY
zVp1BEAC|iDa=-;DJfEBA){*Nak<2&_CpLQilS`73$tW#M=F%kLFp-zUY1fGMC-cok
zu|5~0M<WexmYF<7BhH1Y>5EStfYv5kXq%0Ef4~Fe%Shc42Ve8e5yhke4l#nc;VPf)
zeyq*lZB(cqy9+4^vUX^KILKn5@a^R=gu69^-_4!J;_#N!O0IA%4Lr>Sm7~kmaAZ7*
zSr8Byb2j{)MVNYn=t0PLQ8ePn*!$AvOI#t)&1J5ZuD7|^pmpn~&BQFqh_x-`_i(1L
z2QS7&gQc(iNtw;lH5+0??ScTm$o1jt!F>>a!yTuPf}=CB*v_zUNd}+wyxYPcnH(m3
z9#P9RQRpzGznd%<+<@<Mjlsws9wTCL9GOt4{S`0D4@9{qFJ9dFvOxdODjZH|+(rwc
z4K@aV+R70OG!Ey~Vet|nF?=G20A7M<f0M+U=n;pZ548$r!J|4O$p*IZg%*@PtO|is
z+eB!XV~zph$WFGPVy*a+ir#GgvJ7sEw;t0qjUx;fX<6>!9d02MYXZp9Cvkw@=s1T=
zjvt_J7N@3O(|02i`cynjYEF!CnVhup>ELOax?c)*90Z8%Uwz*`4Q>yyP_a|ak&U(1
zHM%aZ<JzPPItL)((%mGV&U+Y53#gTuO)cI$;&Jsdj|i9qIK=w=>Zh1w*=;;M|4jx#
zYb-~78_W-k_y3G2rFkl-Yq9!<QiA#Tix#reG4u8uC0gdMQ;HKum&94T16_V9|MfC%
z4%6z*=<U35B#$l@s$!aL`kMH;449M}L~e!&D?EJ$-k#^%CITE&$oOKzZu0iw-bFmK
zhQD1`yr|Kd+*;p17R%?vt*4unmBpZkqqyvwC7+cX5kAJM)>oO!Q}nqMR5Q4t35l{C
z{?5SzfKs(NbxIK%M~*rApc5qlp)5CpsuRDl5*~LhWp9eG`I0_018@0h&4WZZT94|R
z04?rwP{B1?_Hu);Xbn@sey(q!xC^V-=l*AM5&rqO<rd?Ubj|v<$>H&djc@s2*RFM^
znKg(Kdaou#1T6O;bj0FC^>J&ltj~k<wAC-Z9;)+N<Z^5XTqj(5Mj)m7JJY3QHx=(%
zn`K%@u61g1h<-|MLTItCV-)dbFpuSyvTM3w2`%%3TWnv$s<yII2^x`Li5^AK&uVK3
zMe@fWg`o4S2mIM9*MrgBAMTvN^G}zKM;jK4`Rq`yD>dj+xtUU{d@#NFtxZc%UGHAo
za?70AaYzK$l&|Ii;3?W~(dewip|lLgX=WYbeVylbWpVOs$F#q@w9p(pcRw}K2N7+G
z(0$k`+IbB@=+jsTstqoTn5Ox2zo@@5{_gK?<n6>UK-7GF6ff&pK|&Ohsv}`JF{uDZ
zCuvv1{E9c!Si7pN^HIC5mvU^|WI>5=8KSUR5$V5H79|BNk{2r?!m==bldV}jUN3K_
zu^!P%8%6y&gV*`30jMoF=-_@kginp{cRr*nabvW#S$TSAn>eCvH%(TAq07mma2OEC
zr8|_J#x2x4z3fp?fjvIG+jmcqsZ4QPBxw~h@I6I8H*icVUMW!Znr1@7CR@B!h?ux6
z=FD-aHGj@p^lYWz3=b7z`{v8kn=iayu|vc$3$s{(3wR_|Ev6ay4}h85Ly6fTdOmMO
zxoRdg7n|@!+c)B^i3_6*K1SwEG`%|pUepvy`pKhvNgB40VU*-AxDBMghTNCtO3_3u
zfilRBHl5Y0z)_8avusN4&qBV2idGdnNJDH%zupZeTTJTU86BdbR3|GoCx+hF9SPI7
zcPxnlQSq#OdwD#Hh8+w0(!pEI?wfkJrI<Hrb*G!$dWjLVHO<yL@yXz72PmY9iBDqF
z)~P0@)%0S9oO?wrTd?@jKx~_1aXP}nCTNC>M(*PGti_K6+~$<etqm&h`t$+2EE$Jr
z2CdkNQq6DMx|O`E?^<mZ!nZfnmZtRmO=#r(%dYdxqIY1viU%Qj*kOd2L<GUCcW_(d
z45;tlCZj@)6A`~wTkq7_?7#u5^$ADQ%?RUUm6Ei>tdk7BO@xWHyD^2$Xwj!6B04~A
z<XBym%rz!?c)Co*l*L&$chSNuerdY12(+uGqz4&zrp)%tPEt~B<{tNT@nlf^G4A5C
zoX_JAs!jAXQA8+k?dQCqSG6N+RPx^mRB*V?zzdY(?oXR2l!@DD`~iy*43X7|<o=0$
zPNHQiZ~<i2dWjFz(M<vGK$aQ`0e=2QO2tJ0SOlpQjYn^%rh%Qpj{;pR_NW~z9MURk
zj#ubi%hfJsfW<%zHINoDy{n;MU=V%Mu>H2|Hmihtcl;0QfS;?gt*xu@K2|c~y<E^s
zgOTJ6TYO`dIcnm;+hUB8m^`N7p6A87jvH+DZCu<eqOokUC>6Z!$T|EfQ>dnur}KjT
zHWAQuEF%PJ@Oy_J&1TrCurF($L{ID_S*m*PE97yyCxD;uTnn4Di_H@Ck%as{Zg@b2
zc*akk`(ijznb?J`kuzS`#gyo2&UkyThEs+e(%eu<WDqBEfn*>?`7n=cEd>rCk!XDy
z3AIK)@OF%9UG=iScZ_Nzl(Sq#atP0Bv@<243LR)#gK`U+KZC3|xOqn;97cs{<Tq_8
zzew0xWRz!v_Dif)>QP2!*D7e{gZXhwusK+N9LVzQ5RjuO1YWD*8b(OMlt@V-&jFw;
zoaP!1=@ySz3baT4w7<Z1;KW=%Ev%6me?T^ZkX!ACpLYs>*!x*le5K4SyUt_uutpwM
z$3b^LK#KA<8*_nNZi7p(wsWUAsUoIm#TU2ix}t~)Vj`~!Z#Ju#v>jOz^=K#HhK?u^
zYM-F4vu`f;Z_iZcNwqc2@cRyBZbqf>bxo|Fk-Tgm!edL0Qe?y#k)cq}qEn@Av|m`!
zsB)#7F_z>IuW2Ol(qn>%D%E;$SGG`by`D}W$0jDn@_Yj_j8cqk)-Pk;U=0mc`Lv0l
z71+7|NYgAD@84nc1)m%%&Pz(_f_)UzK&yhMi;PxJ5r9moapr7Z!YKzLJI+o28bdZ?
zz_X2iN9)%R;5B8r<*THf4CK{{MK0*q0#j|ZncrvS71SU57S9n`@P)t3o2&G(<uV*D
z<|!JrFEe$usOC)J3S$RFP)|Ez+=Z?0n6T{s)^%Y=a*IG8NV@TLv+}1E=pCN_L6%uw
z8bCikJC!A{b7<q1cutaZZaOA(Z)#;On;01+%<evvH$+!*8$0o8ahSy*+|1xgfy=JU
z#8u<cC1?-2-H}Oi4IK9HPykG1NaA37{BBtl`ywuWiUzb5rDYDpGqj>#A{j92wLT_E
z^GgJ{+{booob!xfkQ~~Hn*!sVu$=rBo6E6tu3qG$(6poNu)p>%_jpV38TzCT+YJTD
zU~hT=UrMFTQ26#UpIYi=@ZexiVWLgMgar+4(v%b^Ww}hh0>5J<h9g1N$3{aJe{E2O
zBc+uMY~=<KagZ{IO(A0#MbjsfsT3vr8U~sp*-80XL%*NHnK;Zt21)STKvE_S6W|k=
zFPa6Y6*RN6TBfqaWUdRkT+fA#vq!SZ$M`w1P9V*Qo6MU}X*F!M=z!dnXfi9_Fuz>h
zDW%R--sXs__D9S7`jd{>EMBUgy5*s<FQ!jeB1P!uPls`rwej^3L>%>Py)0i-jiCFe
zTmJ9~;J%OCJTEXyypkx$@`z5|G<TeE==1pvI>(Wha=9a6#M{kU^70xK6;0%cpx?eT
zVWNcO7r!8BkM5BvOj4anPi7Pi$r}Y7Z+2sRB-gaQ-8&t28Qr{MKV}&eZ+RI_aQCUZ
z<?QKPWsJy|1C?7uLLnq8Y&-zk)1)(Eo84AVj1F{AQ0e~!i^bSP^}o9K;BU}mIrzAm
zv>Tj(MAJTHA$n%B>^jP4eUd0;u`HBb@3`SH(pr)Yxb?%lZn1b3wPS|f$hkUv;V{~i
z(?2F?7W*9hZmu=VbzpO`1Fv!f^JMAKk3VwXW9WqCoR2bmi44KVppM~?_$Cn+G&DNK
z*4|F6vAM2~0yHGxZg-f<|4Gaf6v*vj{yYJJ*qWV%mS2;+p(R1SjUv)ZRi`yLQ?hs&
z{3^G0`1JivDs_!_>V_{@`<f+&c+GW2RX8!#ar^P1;5Ob<sO%V-=r+EePTL#c!Fw^3
zBL#Io%AsSMyKp{+qwF0IT~<yD?wF}O3)w1ows0Mbxyk_MilH$<0hoMiTnYN{vv*?0
zmzs;-qEj85@&X<xchl!9^wQTRk+#Z>@|}BD9PYgzBmS>Y2M9Z=#*=7~{l~5k7;3#2
z4q?W?7v-oYXlJ@uaT74cs|!ZTq`wOSaQ6&#=VTE~pUfkhR90liCMI&`dg~V>$N8LZ
z#*`=t_Uf~qEAM1$RqPtvS`M;=(HDX!O;ZNuc((cUBHk95qvp!a*LX-SbTz-n*?Or)
ziKUh_k+1GsBYy3%><)`e_4OEI5B$DXLdP)Oe9oPFHE|u550Fqg*GCjGBYk<C^FtFw
z1^0zoTq*mPdft2Ca=<S0br9`gJx2;N!i!vu!WISyM%60G9!J<gq5zgEl$d7IBQPr_
zi_C?=9*=ZRBk!jO=7}sab`lR9Wz<eg1SLdw48hd+zFSM)PgLss;T?9U`$m^?%wufS
zO+t?EhgiZM$Cho+rBFUKkis^OP-0Y&DW%M+2<7J!Sz?faUU+9$PNgoy6l#32S9HSs
z>b;Ot5Y5Y;Bg$~Hv51h<aWu8G>lM&)?MN0UdMU*YQy|kjH~X4;PFKP_>&Xs%K%nm>
zMI~x0-JDUXv8gD4@=!}E<+@jfd)VPAl;#ot6`a~<q-n+)o^>bTiV&2;%cXR&cFt{Q
zkxR4J`_;jJEENBU6Yc)I$F69kL79`Ei~egIs3theZsxx~v~6TOyj^o*Su42MW{p&z
zTX#>GZNr|vRky;02fuNgHxeam5<ZluP&epjHH?OYiHZ}hYpqS`ABu`a&-pfa<Ok{U
z;i{V=bNf*vCn13+0opY3Rx&R{No3snc+6M^cT!KO!(<oALENSf;NFLywp0L%aTY_c
z+}<()YTS9QWa4;S3_C{G{!ysTihbar6fwt^Icua*@K~03vv~*Jn!V=v%$Gn7Atdmn
z-vdBJqhPGhS9k~MB=9&Y?{?Yu*9fArKN$a}Qi-9)9rGy6HU=l8=IaSn33ZYHxo~7W
z_;qgQ^ByN7sMJLq8ovF3`c`qnEa*4-I^pqp2I`(hj%Y+9PT}jSEh=!VS!Cj3Xl_3V
zyC|I8kVBfWfvY{<=MzxfsC*K|x+XCh&`%AqleX43cYMY0zPbX0bzdrW{;-Q@hC=T1
zaxKg3z8MhttBpzeLlT_)L-5S2XEfTwId=5q2s2P2F+mH-gHE}TpAMI~u{`pE&&g}B
z`^CC)KGZ`=5MMRh_uk>`2`!BJh=99r_==%=6hQOKx>63b;!s!4L0RTUf_ftc)iyRG
zlN%l7b}=K{>6T#{KHC2@Gv0pZ1fiP};PHG*k&ZV_1>}h!u^_qwTUaEha@oc(`RtOD
zW|d1kBy*oY5?Ylm!fzbfmK#o!C))Vp)gKz^bG;Op{zwh|Z}<lvD>Z_HSsMm$N`gJW
z3gA-^rB~j}_~v(NGD246fb#~$$|TYDMw?E=pIy>6rM74Z+<Rd$pK+BuDKaWd!{}>S
z(0@udGhR!w?jv>aNNctT$aoE{59K{0cYZ=#IcVljgdjq9V}>#fqymsJ3={-ipN^*z
zKGlW!fj`WnO*1yNmRlb{9K<h%TYoy_q#MlQvy81(%Ub+k0UwAZ@y~_Tr6}3myJW|$
z%LU=1{w$&D`i##F)B^Ah76l&1XeL>2!<DuY(iGL3fwV4t+|EFWx3F!&yW!NnZpTUa
zIdQV()F$>T?mQ39HV-C^O@l@bFn=nsqFiBazPl!SOJo4o%4sDgATY7H080@mg@vQz
z7AG)s{Lq36AKTDBA$sRb0UMQst{#%>YQp6hy`IqcUT}?t&17=5>TZXBC&JdDY*z2-
zQVSN_?P6rM9j<^-#iceM7Mn;o!J9TxUk`pfq&quKyy01^q&1{W^{wDawg6S$Ag!Hu
z6}_6{R8JJ)i0DHlld1832nV)6EGxKP;rtNl-e5amEW02dIWYt;UBy{k-ys{29{j+G
zCNzw=8QdiU@&wDgWRJdcTBJo7lmu=hu|@Iq=G(-k>n-DMjL^^mu3dhdz22Em2zr1$
zJZO^WYv`nkUskf>lWy6qwQuBQMt_hvKQ6>hzA#InV(5dH7bWyGd6P0VI*V1|3!OM#
zvMxu>P(;mf)h|Cz$_0FkLmcsMW=Te5y>+YH#qRd1%y90kN2|);)!)4lz0sdDlClH7
z?oZ{el{XK$0I{bxpF#n#hH0%$YT{l#C?tlnU0%o-0|A!<<>aUcqX2*Ml<(T--cr$9
zc2WS82ij{O_UDn{Xn%uCgj@|mU{s`1fAZI|Q+I$Ycw3z2TfK>^<=Hqf{8XY{{oM1D
zHPFZeXbWk-K17SDdbc8Q>hl2L{ItBp$b0f4ti-M5Y|#Og@W{3#jqQ+;wFoK#?8S9_
zWM{#j<3c`k>I}~_C0(nS36uXe!k>FqW$H6JB}-FpHlU3{SWGSO<SV_{_tBo{;%eiJ
z5gn4hNo$ImjQ#qtf5R-k(h=bCdwK3~K)#KpJ3xl~AoM+h6v|O6-8NbRxmz(5GRu-h
zE=SRbY|H%^tMU!x1D^d~?Y(7NTwBmK7zhLiu7Ti<ySq2;!6mr6yG!Hl4#C|C?(QxD
zg1ZL~KFvAjnV&E-?_3YxxVrY<y=vDgx$9olK|3Sn7ycb&<=5%mgt_j9F(in`B8U4w
z8u{@(LwG%Nfx&o?71=8IxLUCr38<YM$U$(`$nwX}M64r92$Ud`vo*XEIVfOB{pp4%
znKR-dM-Y)UD53^iA0d0JkEN4w|2a=1&e&_1rRwbdWC{!>jDT?(xX%NWB7omFGU|oA
zabVpE*G|W@KkdbCjH=HsB($;tDJo7|HEerQoOZw2)y>j+d`@95zM-V{<d+C344rn9
z)7^gvQcRi=MXeFkxH|(pAsi8HwDr~tvBE}O>8U7*+<l{+cMTqLd~FXp8$y1wnTid_
zMLDxqV0;8%+-^~v(-xE?J+aS`&+mfzVp8le2MBg@OMWsaj{!{iq^(LsZWp`Jb1!{p
z6o?@PL9shua=g=~=_CKp*;cOKJ4C$QH~E8tOKc)yi$#JBF_T*vlXFJjh+~Nk2>kIL
zY-HdNAAhd~YJ9KozAEm%{W;8e{2SM(iK<usyfE_%%!m`JCh5myBP0SbxTL{Un7a`1
zTmkKi<BE4ipFWEOwh-)DzPwipHtBvH2rpZikNex#FyGHiFjNu<00vL}YQ$|7m^z}a
zDN!@+R*G%oLt>aAc%zdQ&jnkJz;>YmEBumsmN?}+IfjWv1|xcpKO@HXAk_N8Q~Xx?
z3L4f8dNwoz!X;iK?BRF}AwTFCv|J|Gu|>aF4-|YuNgN3?0KpyX7{93(Ls*y^)eUXk
z_Ag-`NJZ+SzW1;V`X_8-<o}66iJ3U$s<qp2zyl_T4v)nH7?ceeeprbNy<$thfs5tv
zpfa{_rGQXc_Rrk`;%2X^3FBh8D<N>!LElV3Mgav_K?W^lfzxj86<VOXq#6+2u9R6F
zg}YdVk`XD$^|ieo`9i)yJZ2h%hnj9B12Ty_Sq&k4Q(VSX3K@uDEjL}LBDrxMq0P@c
zE7U?gn(Yn$2zO@v<u^w5QXbl;N)Z*f3C>OwPj}e2O5O&d&bxJ$wvM|CYv7Z|I))-#
z6mM2k9;@;96=aJqh6WN67AVZgc)hTw9;D(qtpd$#NG!O7y$3CXQPZbfUm2K_`K#t%
zD`0JLb03r>vMDlLz-p@FfzJ&R>aQ091GJh(qo29p{gG)^e5x{r)G(np;32Ly1uzNP
z`6#awVEVrq#-@!)^cI^SS_liEjur<tV>`UQ^)i-hZc~PZb_Yv5GA?<5z(;J_?k2Rh
z5#hOf{$BD#sF9H}NGlZRO%Ma?<?mKt_f~{0L+W7A(mVP!;$jWXWTV_=Oxn$I#h?(H
zdPJZs-gGK+z%89&3h)LCcVxPQrVJfIM(>HLrI>+V=)haM;IkBxG*HLPMUDtJ_MFQb
zlalCLPx^yMGnZ%jLJVCn2)t9mS|W+-C*s4!;VKr<SMSd@^{>MY@~MhBWjbhE2~s1%
zMMG`Qxs95(T9M|gHMd0;-v7lDFntPLS9C%kq=L=9Bv5;Lo)=3A?TNw5D)o{t|5?MD
zHqBVE?2Jp<So^8>=?9)$y3dt|`gUkcvF81>M{a+dycy${QB0k+MpJP~{y1Xsw7OR+
zQNw!^r0X#K?u<LM-7TU9N8}(pcOW4gG9u2ED%o`=>fopRGVS3Qdr=pOZdBB|Y^-pB
zB3((D$KETLvFF(40wPJz;Je*Xg}`PtPhP8#nd%5Dxe@%kJW-r=3{TsG0dD4tpuV$-
zblrBhE$dvxtzH))P&<QInxhe~#0V~IREoNAbRZe)CtT!4qhFsswI76#m=Af>r+nrJ
z_A?}W(R#0l6W%N0TOwva&9W@hDtx?*KghxXxF)P$`1w}hZ(?eLA<Xx}zSV+a2@w35
zxjN$=0d?9p05kT5)}`GtO$J&-c3mvAgu<Wad0k*yF35&jlK6V;+Ic5awB2DYB50aG
ztE~b=hng<>;6QmblP6-KDB*cg21@UBbaPlcHjtDq@1mW(I^kl?o+|F`y(5QZcr=#0
z2B;7HME+Lb)NT%<0EmdDDV-zu=}^|D>~U(sPCS(J0HB7p4{GCag`v*(VXekdB?a!(
z-Mw~)H&L1KnnMO_IMWEe22GXkZh7;r$M&Eb6u>S>VsBM0$n*lbV$-aIJv;6yVwoR^
z0n81$gLdcvSdfk4lnui*EjC-C1Am1AV6zVf1QU3T2;3IeQ;pDei-}kzi5C%VZS^-@
zRZfkW`t{Th#T`e|Ma*)1_h0Lpp3@~(o3c+mH3fk08V}>H^WN%{P9UQN)DZ7CaYfzW
zf3JnH&T2exJ95i;P5>thq3_88mHFvqI1xb%A0gH@Vgx-BbXUn~gvwasc`oe0@0R$z
z)_lsJ9)B@Vi5ezQCu?*`%d08lVO`Zp6F==;xF21G#teipx)}i3fx*C>yK7L*DXPS(
zxWo=n;ae)!Zr>KwVKs+;O(N>o0b<&H-UaX>tz5vt5M{$a?%^?4R(uxrdf`DAR``<J
z>cRAB$OyMl6iqwJK}ma9UY1-;Lw0oMaSV6V-e~<K4HVB;xJuF)kho?0WF4G}HtNyU
z9p^|B-U)2HZmj~=X&;$+Xfb^OZE4>sa7B_>Sk$cDkO2KkNU>dF<SQ>518>z}65o1p
zZ%JHIm<WoFd@Uh&&X)}ywUKV-rl6y7_ni*9c$3U8$^rooGN%LF8AI5a7)!C~<9)cj
z^<_hRc%~>^cH@1khOr|Pl^l_w07_nJ&mLZ{gNKSP4iV4Aus>B+^XTLa>f&r;$uKX&
z@MC?O>fiIokS0%x3o`P97Nv)D_Wr5Z?5I)F^*8W{1>w65QW>kB*8C=zd%$z_kf`2g
z0bou4{X5zA(C#n_xH9<&>VGsj1JH!0KQw#VUhu{AO48cd-gh#;+Wm0J#Jmbdx`?hP
zipxDYj!UnWI5TkvRCuZh=+J7~Lg+6%)>Eu(>BD(@phE|_I-i>VvZd4?<3tbg0`Xq>
z`e=ZDW;4LYbA(PC#lzu02p8){ev9KHkfcC@g{R_<Fd-Uj;(h!<$j?g&W9nYeE9hyM
zkOR0}yN;uxz))HTGidcYRzF^Q6z2Wt6{f9*i>STa2Q9i41Wot$R!I`)>mwuPF5?E!
z7`JnV9&a;hoeog)2B+L{{%LP)q1dGvoBmEQ{t=4EiFX%-Wvj7_U*|TQVPKH$aZq4{
zi24C4(9bQQ!+qY9CAvb>ae~)Ca3oRLVz*bwm)K#aDr9NHXDbSIR7ZW-UPT1njTbgl
zeDrTT`P7c>8-$8uK+m9h{S>(V2cYiX!KigShWD5tat0Ge{7U1_o;#WBXuXklQ+Bxs
zL(`NbfYWe4DB;eI`Kd|5yeJS;f3J3~>ocGRksI?W3GVg}mI|6-G%9;+K@*7neE!tC
zRall-(`3$x*XF%6=_cHFRTrz?!kbsW&ExrbC*ax5VI5=ZPaOzJN_Ken^cxgb!MC}T
zUND=qhP0zx!Gl>oqEKh903TM=qi!n}ZnlAIBH*A$0p)&xrWxd+-kO~Pw|{??d=HB{
z%4*2unISMs3&H}45KE-&7b;Y=!x4*tIH@qAYN9#(hh@-Pwa}%)Aq~>TIKO{YEqg(d
zf@ZpcD!(7-6OM(I9qqiM67B(nArJavEVvqRzYcE*ql+aBB7NC1-Vw2Y{rHKI$(9u4
zrWENpoZz+Y)pILK+@`AQ%|=R~iSqR!mgh<@_0NjC?xV)K2l{$Ji-gbmSm<W#=NR;d
z&pgz|or79uXF+$spg}9!heo%B<d$9*wu>Ng_h+9kayMV{CMb5zCg&f=y+`O@>)Kmr
zO8eW>F}+#3+{5<^YYLo5E=n=9H?<|3Oj#2<exf6AHNr*7p-`uMFhO3)W;dL@45!K4
zxVw)m4;GX@c-qVX=exv_V!(V>^*maEY{U~wMrcXSzU>cZ@qc96HAe%y$a!<b`v7ZS
z^-<v`x$t($u`bStut10=pSz3L;n#MXxXx7sFv8}|i75$EzXA5hb|%`6V>2w1BQ5XL
zN4Xk>lkG<w;NevSNJC+r6~!=;5X5igLYwq|fi}KIY!F2A3FJx^7FSS4{}i?VgTafg
zrN%28GC;*%AY*?Rp+=w}*skbF+M8cD3nC3l%q*@S66b>ZKH^l)z;Wj@RQ@GXmv^KL
zfkPZNg||IDEbpqS=WAY>XS|9&KYn%$M=|!s)|0{1phml4(${o{`G5iB{H9FT7k#|w
zTdOCYE!M@>E=Ze=k)Gp>UPH0jvu*y;(}j9bz5UpfLk}u?<2JfT0CX_pJwo77<8>9i
zL!IuPdp3<bqtPiM7vp&BH;)$$ZJ=9kz)GwW>Fz2$;Pc%L3T3>I)cFjd_w~BMRs9gO
z<#k5f0h^7)q}QE`H$m);vkvCN$`@5P`+bTBH`jcwv)={YTm-tZs<T5BfPw7x2`>F1
zT!6?K|8>u^;<%^h@ZztJsQPvnMX6njlND6^u2Xt*41}I`YN8TL&GS2FJa^V3o6&CQ
z(au|Xt+&#5`#&LFuhnJ~IMT5{%1_TEy7*F~UPB$B*h;l<-5ulA>p_wH^q~ePVYAp#
zAhEdldJXpNfd?2R_4vB30&|`rU~ZYXnTQxJ&8h*)pb`p-G3_M1)7F~P8MGpj?2xqn
zS=4e$O-K+FEfR>br#~-$^UUy1=5Lb9bj9l{wSnzfm~|p~xHM((!G)SQ1hFMID8<eb
z2$f8-0>$>AtV_VoA$+-d)=IrZ7IX%xX#!<lb#Tz}>w0;@_78JflWdv+65f6JUlquP
z{%n;<K?VVz42s*;CletGfjEzg9glM63O{<~>z4Jj?(L4V0gwl`cxU|9E|}JgMxW)F
z<ifAFIr|1fIpb!fhc=Y$0KUEo-SEZkKhI`K^kCvRzDy}IM*%x|tv@gQ9dS1R)!l=7
zG3or`YM2Yp#x>bGc~{}prg-r_J5??bIz)68etBWqIF0FT(stYKwS6>}pOTI?6~o;R
z;j*ljC@Oq%<2zRhe+EkI=f<|X522%dB}|1XBoG&IGv{@-{M5M*=b((|MSkMUl%u!v
z*PoF~xEec<_*zp0MelTPWS{V=(Z(*seTT50e7nce5u@K8yE*+yLiKy}z+e33Bp2Y_
zzF{d~W!m=s^6P}BI6=kr-NqBTxsU~KL-r6~Dd_iBF8$f964;f-4w{iRSi~*oGfT!z
z>xD~7!-elPP4Q`bXfx;<_m^Y%`*V{PB|_V+0FRX(%Xuo^y2-dlu|eVkF@SdISYJ-W
zjBwPTVM)lHSh7F#bcLN58Wc%MzHBy7AJGY5@fNI+80yWZrD7K8zrLyvC^3(j?i`AC
zqacilzAChnLk=Roxvu4`E<i@A`g}6nujEVjX!DGLEbM<3z+tTZ8f$<J3Cw%dT?^;O
zK*&daw2?`Sw<lqF%Nq9H+mH4w%XEtX*52t+_E?m<^3J+T@)?o;9ylfxu(7(V*<4o9
z8{707x+7w%W98V~l*Q_GK!8~XUDE8um=Smk6>wNtAsXiEEb0DI$s@G#J$|0{NgML7
z=2t_;7Q%Q48jme`IIJY!ReHpAU?&`TQva;Q^ZKLMJ*atP275${%8_d^Q=sN(f@&oP
zXV**{pT-Hsk#nV7nDUtFn%b`7#R6B*nRjIw@1>RiTlddNJHvj^Q)J3a@y40G=3JzH
zNPiF!rh0dsx9&q8@;@=H*Nl`EeYIfE{2Kt{TyGTY4|^HGrwH21fThl9r@a=TL}d&F
zB2$nT9hdpkfBda$d$Y1;zI3thEZEj+Dr4zZM|~```K;4X)g*n}Fi)b;7vK^j^05Gf
z<p$y7p~J_^0)~0yXw|a`$CNq8IH|C`figf5-B3OOsf$H7cN@MRWida^5(3VG{2?O_
zrIFzYG$ScOXSPFvT9Z=tX5clBQu^`KmptWmI~C1`1XSeq@U%|h%QAq3p@u{GZwK)n
zkqUn&eD|T-qp3Fn#GmO}fPPFH!yMWAyvlo{fo#a0gDR1_-XXi+4@%{3c4zwvL)OPW
z^;|wG9l$)4t~ReaeCxW1M5H=`Urr#LExn$ZYRuhgaVZ5E^F~r!OMG7loC$6z!q4H(
z`MGtUk9p4w_#A!N=5doBLehI$IpTu<lWr=_bRi#1uG~oSA?&OU@qOylOnHj^2^q2e
ztHI-@lh9g5*qslq^5YtqDumz1OU9sp!ixz<tfSF?l^8_zyu+YS&x?vtT}6s0f|V&i
z2#CCSEL2PJPBq9128_HDeqVyX<9<ST!Nrjv4p~ZBF)`d`RZ>i{HvyLEMGWZf6^(($
z)6>oLj{EtAeIp0|V7%R6jNJF9oH!gQSEPvE{5!3jc=VleBDPExhKCeA`kJss`Cwr3
z9L1n7sZZ-EcSq-D5|2rW$LY(?2xpMk6W=EZCB_Qo9OG|GgP9A)iQ09`^kKp->9pcc
z#=xkbiQeIV#tV{pq$x00HF!WNWMBg1<2%tG=^bSDW0qoApyf<+sT+2)7?!!iv6Tr{
zKTnHN+&ZWc!+*akK=z-z>Xd2Nu}RMoWVF*Aoz;;Vhwf!BQ)2ocS<b%`H;HH8zw(vh
zKb%qT=yx6Xz3X5^`X83`4cGgfAf>=}B4hZ6sXcuFW<@*0Qi64z*q-<gQ#$8;J`eGa
z|Gm-wgF5mdZKz)Jw!MB8cp5V~+ZoF^nQZ={!RRa}K6U-?%Pgjcv~;nuI`_8Ma9kvl
z&LDgWyJuh0@zvH1w10cK=V;pS@!UB8k1OCf?2P8ZHm-5KC&^v&z3`rZS%2f0Ku6}C
z9na#dOk;NG)eA}Zm!@wgMO5SXmTl6vel`)LroH$)?76<C<$cEH4Gww9(*&NoGy5BF
z4;$&?gdWYyU9P&0o8A7MSjsN+|B!dL#5Nmhi`jO_BW>wBwy*YAyPkM5r0F}1n%nGv
zXIXWiM)RG){iEV+8+Myt(<JphJed?wcI%n%R@NOVsyCkcsdOTnrI_pD`8r*r-t_+V
zo16yl#In@r;2#DtczP2ycz>V=(-QtSckEF7ebR#h^>XolClS!ah~UmZ#AE+6KkX3}
ze1T7*)ZM@IVv$V{m~zacKYaB+gVKCNHj~g+-oO(H-`<v=Eqp(GV0?Hl8qd7pT$PuS
z&KLbYkxqj^q@<)M5)uZnD5=YCgTlNGwQ~gVdfaPADYvR!Ki55cw>uVIRJ)u!oM$DQ
zYFW6@UGRp&YSDKZqk2qY1zGY<;FzS$1g}>&P*sMTm7S@QO1S4;u|%~OI_`fvcGlm&
zOfH+|cmv2;!}^qUbm)zZjq5Cz=&#TIu<M9Hg?>OKf)wdyf>cpaxq3XQhVP`Ip~2ty
zA_R1d#AG0a6!C*a@#pdT0C$GuH!h9>0T6@=?LH+D1ZH?gK0v+y_W`x}BW&#S`G?`$
z8(10d=%}cXKYwZv5Rk$TH|*^%(a_OJW@Z%7aBy&>28=aAyYYqe%5)Hb46J0Nq}Uyy
zp`j>VsX#&qG|j^%1wm<8z{$x;Lh_<&7AK$0aAk9W2-d;Al%_>^|IiT3jr({c>@U9$
zH`^;e`OQ^2N~)@oOG+gCS-w{ti)v|Qf(d@sC1Sr);y_sBfX!q7ySqCnPtUgb#f35d
zKp0~uCz-|M>ZI*$z{vM@bxArE14BdQ-27fB995P~RJ$WldHL#wQl0ry9XxKQZ+R-O
z7j3p}iOI=QVq(EKmG+5PkMTet{0sYM2{>(%`g+r;Q6aLr$?Gqq?8K5aciSGi)ua>h
ziM_phiZxE9y9axc4i1Wf*<&)Hf|MkiU(>R<998*woFsTWv(Pa4hEg{U%yMibeX_0A
zT2-!;DfFT#N`o9M!|j)fSAEQyW+J5yj{RH3OxS)pUmf{R85!{c7p;F&$0^J>Z2!`4
z;kT0&H{+`F((tdC)iHN*X&leOwUwb$-o{~rutgGr((>5)nzk@6b3Cv)9hz97NfO6x
zx~T4|rI>Iqh8`neB4%;DKf&T?L9C9Cb@T}mFL!M(&+cTn;%OT*EhU~FM}sTA%c*5t
z!8G>#I%{@qfqpH^nwg267RS7Fg=HcoiAGJzWFL4ZbK!skH*~zl&D>^pSvM1LGj269
zGLevb5~!%FL>9PICG{A9L8V?N5(unN@E%4GZ>}h1ftyG#YV8PU_4v^gJx)q3cJ95Y
zprO;Q7$2`lDt0iodHZW|mQ)w-S$!pAYV_}64ejV;;sh>-lu(bN&=dyx4tA-1gNx@`
zceyY;M2gODZsv&4=p<|8^VmFGkC|eVze<QdXC3YeGHq>>BcaW&riGD5ulB~AqPdbC
z$8#4d&Szt%e}DU}4Ko)t&g^()q@iD1U}{92HgJ4MBgJ=5F@IgFCJruU7G|jNttXg&
z@XZ~D!jU!7YBW^^tn0X@xPh##i*VD7g&4xa7a)uj8N`CccaM{#XT9zc@dR8X$i+e;
zsXA-ymZP}#cWVY16KUjt@Oz7CLdzc2+6hRvE8Inw<9BDCy1_%_Mkg;kPCh@h>gpUx
z?bp3xT3nqQ$1d}8qz%vP&AI2yv>pi8YK0doH0X|wjs{o9cA}o+U_p$Ka^fbn7GbIB
zE1HudJ@WTsK~?OMQmN%de+P;JQ|c*0PS}efF#$uQ4JhWJf?qzKnDZ;y4vvmemsZ=x
zZq3+hWa<);m5&+qC7*H2$6j2<$(qWIydtkBwH{NqIBm6)mKaKyOV(D6%sjDM#b##W
z#oS+t15id`Iq}m(!4uEtohqBed4lOyT6m}^79>fD>BSnYmT5(Ug0z~rFiTLl$a8}v
zgrQooO(!$syf1uzoJ<_=9xo6a5n@jt5|LP_QY97VcuIy#sk(a^8EXG*e_7Q%d=`qT
z8VFMcX3&&UjSLDI4MmGP$yIjmc)u^18M32esi5*KcwqbG-q=XS=;ULJL@9E_z@O<@
z`sVYv;1%?%lzSfrk|h|U>wVcelUr(yqSn~59@tbM%)57K5EYc3otq<}qpNgyxH86>
zrG!10WJ?R054%>{tA2g_uGsDnHFUsMQhHgb`y%q_K|ez7{CPypc8D{}Qd((hXrT2E
zH&xil953V$q3~A-!`i<xbXcvG62iiPk!FJa1Jd&bxgeLR{j)l=uMyDui8RQBtOPN#
z+&SdvJ9SV5F%|(`uB+_z{B#901SUZwxj~YF!g2Sf?y|uRrmfclYhrPHDjl7zig%s0
z;sy&>bx%oZ&AwR_v41n1VY>}3J2h#K=MDTPvuo8OyA2co^5<R}pCkCb)5<NXjq!N;
z(wlxgkr}e7(xgQ@Ad`}`ujS2i9K$YT3POS;ZJ0KX(4d2ZvsT?Q-Y1UkRtBr>Pb@22
z=>SmN3Qp2|(4aPNzHPwZ$9<jEa=Qi{mOHcQBpw^vtO07cEZBGlRMUc(<EBj@lugX(
zAmwgzCyi^T8P(fdy8wH}#*2$wc%Q*iA*zeq+$wtMczq&7z3u68xBeiLlpnFvrxpxd
z(kIIPl4d@_<G6Id?o$5PX<7?qI%>Fys=KSU206@B&^||h`wRdo^Y|m&M4s3}Ui|-F
z_WuEu3>D(z;KaT9L*Nj6jlaCKFD<Q1*f?LKLBpis(e)AySMykRZ|W?PT7TZNL}Own
zo#KD0NK8(YQdTbL@9&oY0t1$P*6oSih*4otp(6Qj?#IBnL1a{vw6%8S>Dp7#Oo*xX
zQ6Fw~SP4IGlVdSB*h&}`91B}jkd9|bd&9=vIb#Nb_JP~<Pv@tniTOj(l1h#dH}|*F
z_d)Qp6+>weSG#G|zQU@BRd?z;?=GNi<5{M4C7qOmLydX%%W!+Y$jH=GLU{P+k<rmB
zH$q=h$lOT3nUH+(!qQTWs@vOAeyzmCMb(kfk(A<``zmBxVZxORGTTc7NqPB{{(h*D
zvG25)mP?9+z=|13GvY-ef9lE7x#B(fiw=sUB$e5j8LB=K!}!EZKw<gVUffVlTtF`-
z0iknQt=#L)oo(S=KU2%%Dv5oqBdO^OK%=cui@^IvL8H+sZRqJQ2`a2Hlw3J~d-0>)
zsd`~UW7F95bm(|S4r!s~;;5lKAH`UN6ff4>c<HBy&9x0lYv;lnt(H8JI!cYiEV8{l
z<16gj9ukwDXK=zJb7OkB{KZK`RZX`Vq)G*Y>_bmtVWOeZsUxLYnUoi=Qb)wsw==h@
znZA4d@d>(UD0<OEGFw2N-FltW*m6-qUq82?u+Y6dMgYzKo7g6J8mYt8u4I-W7kv&s
zFQuC2BPkDuyzGUg(az!%nW=r}(6(;-G+or~f?_d#J9<(Gnb^cd2D|k)AWcgAwgHR5
zwHt{q!e_AS`O(e)gxult(yBE>U@R*eb-H@rd4r53z0CLBNJF8Nc3iWdWA1_b3kn%3
zDk?J*6LEfizQfC-pOdTe@X_E`b3RwiR-RQ-77}d&pTGK4s(X8QE2mAkIgC88&HK~G
zC%8r%PZ!c^_o5@iqjA5VA*r-#6`7gq$>@EmM8w70oDICZWGXQ_96HOjo%2+wyEuv9
z?3u0(qs5K(Mw8;>;{o>e_Vr47`^R4(Nm;w};9oKXzX>0s%q}dD-Eu4PEX@P5%KF13
z%@sR+5_b3LR05<D54jv8=Y|L1wH@lJ#JHW#PgfSC(W;9E+XuKz)`<iJbQP79CZIUU
z`_n@GtYN#EJ{P!vBTkmM<!sBIs=0-El29r^!km$UO(zCsbs{uQ+Tf9)lX6zs<5R)U
zqY7mLeoAk?6jB;Pf7%gUD{j}cv^2ExXksG4qG=&N5Ei#J6Af9KO@ZnU$4@x$_4A%g
zs-Iw<3V%Li;t4oN&}$#Ywv?5OeNG+*&&!?1A=T}`%a4qol7dL1LTwI&_4_qAIEYaC
zvNV)4e8h5KwD}lYKsTOz6&|%R^Br*Mh(3?&AkjMII+~W?jF-07bC-6MFJZ<a%GiIA
zV^e^Y!>AjsmXv|IvaA%vz2&WN00SVUW0OqY-75_-j5abg9rI$7?WGMvMn|o|he%zz
zJW^3I5El<&ZE758ah!0tl!}RrmX%N(g``r~DIoug+Vi#H<fQ{IO=JF$lr`e3CAnya
zbAijxCi!sHWNA=3iu_T5Yk|yRdOhYN)yz-!!uVWAQyGy>8}T1)PXWqH^Xbm%&fHz>
zRq5Z<>lEja7!cW#jxmykxmY4VejpPC!-;~Drs(-ei|U4sllJ2?kP)o}Th#-!bV{m%
zhNfm2xU87kwQo;>Rrl}i-u$7}>ZGx<x`1<h%4I%x5%GngG>s34WlWz~C_Rs^h@Z!4
zY9i{$tQd+vfzhnkDrRAa@nH8RKG<eIS0r23Kr;?dpv^M@KyC#KUVtl<i*GPx!5$A<
zYmW!xVaYJ(T~o0*9XMxkpQ5Kv&CC4~=h+kkrG3AD&%ntwHZhheYbBba6CgriL5D}h
z7At9fh1h|1*Wn8!At529pcrc2pUarZ{zR6$Jf})a^<K95!McGBVjmh0j9R}JY7m|%
zME@^+3i}<1OF*#rjpL>DVy4Et63%lalB^>TwwZk*EG&#Ex95*H2kGeB+H`omF3jq#
zfE65~JpwYpqn75Xrn4_8Cc^c`DLibB-WSLO&z`W|{oPUU&}|tnfi{cHmWJ3jVWnL{
zPY@HT|98|H+1I}0X*p%Ju_@onU{+SvW2B#9%oHY*mDVF&VSbSaF~ggJIT+x+cZ#lu
zf@-Cvv)T_uZ{gPysyA0r&q-A>tAw{2vWBvqp}pIjXJvY=R_YUWn}pNMfqF9Z@g)N&
z?p5$Cg<N@3J~|B-?~{x=lZmepmX7s@L$+~mw4-mbc5b_3-Ruv@-+|>774d5K?c~CS
zfT@m#?^3EsZMbnTBHclP+RlZi-i9&#VqvH2FX4$PNiA|3FWMR^WS%y|Z<AlB-c&jE
zX|dMXA1HFcX_W&0>C_5)kV`lx+ys^4mUou*l{^rfAT|v?mULKuKFjAmpCe`Ez~KE8
z;P>xG?x?2Ob+WlxjoANt#s80Vl|cr?B_vF_ykvTj36(j7p#lH_BqU_i)QZR?6uxay
zZxpOKa1*ZMN!2oD{HWv%yAjB*cZuU2MR@5USTHHM2y@yOR55nX6JTRn5vvz9i><$8
zN_q;jPf|%@qeCHO?j80{94&dUTro(0QW=aRH^#j>(<mx7VP5k---tSu@95jAov9vS
z=A;Vg)2eJ4e>YI*)af6Be-V(8C3QF0TKG9B4f9FF{POb-B(%KR$1^x|4Gec|q)RAn
zpqYvo!cUf9_B3s)5fHUG7$Qp!&B7ugSRMw4D26k+9C`O6JjKPtSYs3t<P$sf40g83
zG%rlk2_RN6JRZ}%9>%RuSHSt<&At36pVNne;Aqv%QO)h=2QgJ}Od!gSwRifxKOPom
zmUK`;iIRJdv{`ce>&Nj|?)ZKFB+FrT?xero%Gb9Pq80}%nzvYlhso6rf3G9Ibq@Wr
zjL>O8>~EX7CrfJuj(b<4M{0_=I5_Og@S@Vn%+lXu>96-GaM8k}!q}vEvhw`=f<uBM
z6}LraRvlI7wcCjTKfajUz>sBR#Qu0710y?SBLe(BRpVgaHhOTZlBdPzQGH^JUQXI<
zr=LtpItg#={n=yLO3AF)4?29BTwE~rML%f-;>@blOQ)66d&xuA6bXcei3u*s!o<!G
zD9EJSM=iCi+T#>rG2_cgLZ7&KR-mDh3a)tNCGF0CAMsdJJ?iv%pH(vF?WN{q8gNHB
z-lhplMQ7n#2VB1vU@yogLS#+5U&oV@voGdYRdoE{Yi6~klb!wTq?$gN9e<sLFV<2`
z6Q}1C1j;sVkDiWKR5VtYG|=eGQ+Bvcmhu<`=Kskq=sY;aG&koaIhf3hpQAY0^HM8D
zxVs-=lu$h5A_CqR<l4cr<~);)6&R_s813yI*C+nE!J8d`KPb$t(oCPqUgYIA5_$wK
zziuLp%$=kaiFdOHWp|S6@}pA%Kl<O4^ENiIOHW)H&U&3U9NBF()y^mgD>JeTmWTor
zc;wTTx#}uPM6OckS-Ab8vC+g-4>7V8aNah-L-UDs5|Y5Fab2&0Mq&oVwXN3up62go
z_}1DBWLvnKLN}E(+V7n}2dsm!pmTcnx3!ItG$lk1hR&$``35H{N-h0S|GI)Ay5^|1
zv#$AftC)zL{?Wkr=G^deG^~t7QO!|Hj$?6T;QYEMB}@ZUGEB*m!dR70H%{qiREI^E
zykNN`T}n+DN=HKCI9p(!KlB+|oB@*|oFs=Z)?%Wqwra4QR*ycind!^cF>#xiFLc>(
zMw0;L0L_ww$vyzTRyMj@6O3v3zdc4|mhb{h<vG3rUiO>1Xv)WyR;!)s)jzIctWaV2
z0~#zl_-1F;0Y53qcDu!W+3TPah-u;J=Kw~^X%mNnkG(Qf+S*k{D#Vt`ul4ng&?b&I
zlX^D1UWZB#mtC*AEnY_fzu$(@>wI?Dspz$hx%K825Fd6r37@;(=)O$4jyA%$47VbF
zZ!lCN*FLkps#o*<;o$pMx(OG(J)Wt88e1DofU^{h)MPIz3et!$k{$5Kr?6aHCrNvB
z<s@6(%>l6s1KT|I;&J#EXyWjByd?BYKpM~_KgLf*Ju(eKKTE-gc2(>HW5<$cQYLoY
z8_`zzzMm&yl9fY^2I|nF$%He~EiNd7A*WX}F)#BYocLUm-th`6vNBQ@?m#Z|jx8Jf
zmMp#nv+)d#2!}!uIP0;y5F`6n#RNjQiEg{qx8<T~IT~zOV4w^~4i5>TZ?pVY3;B|E
zmyDLW!j)1;dwm-YMIJ|@7=onx#zk^Ns_sKeg>!?Jw=Igq3RP(-i*%+v==kYH+us(U
z==gm*L!FF|ESR|J%G?e0q0BvX%<|4Iefwu7${e@cPO!NtM9f}{#wc<l?<VsgY4**8
zQrhyGi|g-5h@$fiJlFC!!%V(K>tAJjt<H+zJQT~s#Il-UQAQ5==3d%zj4NMnK}Eb!
zRhG48#P*j%PK9=tq^fx}8wtypo3nNcLLRTVn5pmU&gwB6yS_(-<;5~7^jP?~7nBZ`
zFn8A{7HLE8pI*-?_q&PFNKa3X1>eNIb>9Y?qLx2(OtE{@i<>pQQNL#ci*`kzPY*B(
zDkq_DQ8~}*JW7c1MOn?5Qs3^^-$F4kDj7|j#&i5j^!BTkCYNjbVD_dd2up<(GkG6(
z>lN*%u-H7mm<=&d{UBjQ`6`n<VzD0Lh$W`4GzWe5VG&Qjg1<&n@IvSJ?R{Ncg~740
z$?m6@fnmgrDy}XDZKp49`Wep&)6b-=sUNp`)YY`fF4}Kt%xAKx$@5b%Xj8&(GmJ;-
zfOFT8OK{ed5IP_JH1zJW?$^svLwMg^`milEP{R$Tf9WxlKVubv%Ik&S{5vz1^o)cH
zDI+JDP@D}OCv6_zzyW6~$?3IO2&s578|D<bucyg%Hz0&hY%&br)=6TyWQ~!STb`c;
z+6Nzj1P4LQfi$%Av2qls+Xy~^MCmWB#%o2^ibW<|sgGD?m=HzIsrWOOxZfiBFxf}~
zcwhhC*+x0eQJ>xPoT{tV3eh2_)dS7U;hAMR?c~zZCtVwHJAmR1SHr|g>uR0X0x}v!
zIxzTwuRw~+Gl#l#kBK!etABOS(9;8<G96(8b$gQ2VG(Mv>WwTNv~tcQ{sH<nS*s}B
zuuP`->+$@290kM0t1Y+UY6;YrYN&bg5dO7lM^kZUo$Ugn@`RylhPS8+>hBy}OD++(
zEFQUbA`rPR5AwDrY)n`4*9<tT0`2v!Z7Y&y>tBu19UG}`SQq||PO*@&*3*j0hKTdO
zDf>JUzL~U}627EIH?!Nskx8eV3{y_DKiiV|B_RI1I=}F#beyE)rXFR-=Zq}U%SLV(
zK_y{LJ>(;E;|~~)_PI>MTi`mTbyj_7dwcCNI^)aW(<W%_S|)!ZWfg!stBYSwOF(qk
zjra%#Utjh1;j_KBG6q4A)uPP^p3<p=JPYdOqc>?@e_z{7&bQL4yR2YQ&u==om!sEh
zlc=>9;uxH6QfPIb%Da-8@$LO-w?6T)>C+>U%Z?j1loXSBBuYCchPLoF|8Z?#OPDT#
z-2d|n{8rWJ(x`HydU2!<KrxiW&$~k#DfDMDNhcEUR~l7Wb)sDEMLW*rVdrI&g+1`t
zV;Xu~A1>*X9d)07w~(?HN4dgO*UP?w(MYtfGC@T7lCuxLZy0qSo_rY{^;0=gZM4&*
zvGl9j^N`5I=Xp+|mVB*bn2`lu8$6t?+30&2=UB@!_^#fRJBIX_mz!^WsI-;nA0D9N
z^UnL+kg#HUr4csv)qRDl|1M@TWdXH@H`bfLIedc)__YjEj7KvPzbYO~Z(<>Txi=Oa
zaF{CcB(ptYuJ=LMcS^}@HCjmU5$xmwI<9!Qo^!uD+I@;#ed}Kogx)_Aw#B20mkhM!
z@3v_+6wq~p;=?Xt6^la3&tDl`Pap1mnMlef?)lArN2w*Lq&jn84=kvvjfOm}vsunC
zQe~B#*w--Pp6DwoUP>kyy=)o>xU*C&31P%bsvTwo$OGHL2F~!_{ptz363~%L^k>P?
zxoImNl;yNtq%rN0g}xtcs}V*tPOQ=}CAYy}+&JMPa<mm?R^Y*}|J<Vb*sZDdhtI{a
z<tYs1?$r|6o=2E0#6CRRu(+==ueUk#LCpiH7S!{>0$jzhhIssX+5X>e!VJX`O7+)}
zu@w8fvG+x2iw0pFKdAeKa|L3s?dZuy9pPuN@f#$6o*uh&(KNZz1P4fSL{llVssn8F
z?Up%{s{73$!cndyHIc?1yei^aX!wSY!F%LsCJN4~E>%@M<4jphsW@AFp-4ml$aY{b
zk<F>oI>8q$JqHvV`sr!idyKZ=d}+j91l8>>$TkgnqKPD$k?QpT`Bh`>I3p^9DHF(U
zwQRFbG!4&wV(bLjp?%Q3;<Sdan9+j+y&j+1*Dd{_<9oXxiZeuiqZSDht+tL4FFljE
ze?RKVS;OdrrM1#w2HCw21|?AV8-!CGQA%ihV9N^qEm3>fnB3+v>>M|q1d0xJt#HCT
zI`&EHGz5qY64mh!PhME;bbFHq<K)V}o6VYeeljC?`jgX|nm^jg<SHpE3Knmr&db*Q
z__ka#Pv3+O7@F&>>qfU6Yi0WkxqmJwhdtx$(r9i3q8GON<AA3!gpx`e;S|UIJ69;O
z#a&uPZ^1=da1{JCyU^Y<RW<~S+?MPEhn=B#3{aPar42KxNKGb`LqyO5EE-<vb(`aY
zho5u&y}(to*eDf8@+VqhA%YNOfa0)#Ij@9TL2RYTC0hmR9NFPaWgJlJWcqNA6W?Oe
zr|G}VhQ-&%gBroo*48?1b{J&)l7(Z;g^H<m<CTi{cD?bs3}sZ=Y?|O9X~Ui!z5Mme
z&;odYIU|gMSs?G(PsHfPc6WQ*>N%XcBs@%a6Nx%>?vJm_!hj%x-BQ5t6>NwiS@^3(
z(~}ltEO!p=w_FKwfc&qB;vX^YOT5aUA^dA*Py@#eF5jS&WSdV&=*=6dj;QCOmn`rL
z*H~*~z8XBv+`lR8-3D0s{$+ARQm~z6W6dmvVh*JDpZ59N@s^L@e(g{_NGY0~FlKt#
zcmc;z{6Ah05fzB!RZ)w|RxxSryaD6yWq9v8F-&=t44s-FReX7BL+VBZSj6|J_~YPp
zCceQ@d(yamKV6^{Du&P_0nY$Mn;Fakt97BT+P;9evcNJJkq1VN(QCmV#K%Fz$2f{?
zriY&PIoHY%(7uJ=>fc6iq*t{rB4HgSlL>>Bi=4#TMU$2!Dpo=_A~fQ_xvpK{jM0<B
zz6z&&WaQz0c^*?#Zl6t^6p|5mOz~yiZMyXjG42psyKKxxa2HM)_YQ__Gky~02>nJ!
zFo#OKlq+jpcog&B4*A3)_0z)ze-3pqS6+Y_eUkye91>lJhGz4L%ejRkCh90Sr0lod
zfjAU2?YLM2B<1$^uE$cQCf?VZ>pD5lWLRiW_Ssfo&SAulCEC`>#g<QzD_S!n+JM7j
zoWN%k3bZ)U8aqrrBibu^-KHMeDuTrpr08j_60W!RMJ`$uflORdzOlHNelHKRV9{+7
zF8fuZfzRJ*r6K4}bWbEO%$k`=bRGH<+P(VF3j2iX8%VT_xz9}-^bq%nBG{&4X_#af
z{bUuTa6<`rgMWy2_51o_BVoN<ZLeo2n-v`A*tTM~sp6)2bFUUQtp0ThCXLQB3SR+R
zZ%1bv*m)if2N4PTKox!zWHEz;p1`LlcNB7AkrDA-BoveYIFg&m;^WsCqLQhOzu`<=
zy&X8~2A&&kYn1GZcNi&EOfwItM7N7<CMd#7lqmk{sM>gZdi@K+AiH*Kl%pKbol4N{
zZ3H+vl=D|zgX4qMm|{6W=bx*}CHXRXGIud?%~Tk%5JdPVKC&-y$^=|yzy|-4&X=_}
z^F`b3$5E0TTmxXY9u&QpI_*+s9YqUF8u2g$Eq>SYh_4xTvyi(CPcS2w0Sy@~>Yqfd
zJ-q@H`NQ8}aNyr0_Rl$x;iuFNUFXACPI}ly@3tb#SKz!2r(rk3$L#=;AudGSpi1NE
ze35qX$O;q*>`DC4UoqYze}I$=c6{pj9~nkBF1-6vSx|rCc}np8Jp^+E^~rUyb=P~o
zn0)DX0ws)0>AvKKeF>vu9bFiK`reO#><DsElFcG~4>FeEAk$ZcW-1eM>WCQLF$J|8
z9F!|V2f5B3&9>gixNs6+bqIC$F%vJhl0~$dPl2lI^?@KC_qriLt*%bS#SF#J?Yy_u
zo~G1)eDg;TiWZ?6imjWB!2^O!(mLQSb~;ls{2eO0PM9V?_%v9cBTLcscE-RO{}+x0
z<^%fZN1kj}N6252U(vVi7X2W7>)yN#KRZ!TkPivj9Tpmp-VUS7<MsxW5c}-t-gS_$
zeH}yf54WE{ib@Lco@7+>zvZbiBc+6IEQ09S=DKKq2!8d}=J?|^?<Q*i3i&3Co!dhi
zR)WNc$oF<-v%|VA+km<T0spi14v`_@yDsJWc?a&~yB3&osB&9eLW4(MkZoU9I04qb
zU2?eHD>#_jiTJd6oDvi)zmc;n6-HF)y9)|JiX4PU3$zF3EdE&i#SaHp*CNlcTT#kw
zvY*7n@zT#Sl`-=ba3d~Sj(6-#|8g?@GO3p&IIaYBFnEr|;B{>)D?T}wne86>Dh_lc
z-g;U6&ko8I#Y$@eS@TGh%bQ3VXFtU49uXK}=L+GVqdDWc;dALHs^(`n^+w?+0ISMN
zp`bvLmZ{1p7z;nW5kwG}aBxp7qv`+>_BoS<dsG?#-SNVnL`Z1yUZAQKN1(GsYDzbZ
zXA$4sykkJ<eyKV9{8Q~1xVYf{W$~#^qfLg3Ji+`m&{4|2%3PR>Pn0Nazp-uK^t}lF
z1KtlyatlOp_v`L+R;OW;$TBg~@Si6%St_^!$NjvtdfK;Sg5-#NXs9*J1FZY%m-ZG`
zE62VuN(Jui{K=I!`uiB0KB<=GZVbf|OhqUXMn+AfQZXfBT6b`4yki7u;Pv;?)M@TF
zns||XI$no#<Ui(;F-;B)K|&%T-L2J|v*9_EDuD6|Gm0>`4M`T@rfv8!$YyC^?$=tA
zl4p|LonIT1Ow=l&$9yXE7a9$JuNv`{!{L&f8?CsnD(Cm>%jK&R^Irj(#i&3*3pe}b
zUYMS(k37$fhbuY6Wb;Pe>{`kCWmL)ogYX|5EkcmIjC3$kV~2jP8$`i{Ie18i@(0{n
zN#BsMEi4bL^104&y=&#}T`P_<u+1-o*e$@Cb7JS$Zb~ZeEg<?aH9tRuwtyjVce<n%
zOCWYp4GM2bGD1a4|F%U+J$iDc*1V;j`LFvBjzY;w>_1nsa<Xi3cGGbYiwa*eWq!D(
zjs;p7vhJ^he=@l{%9POB8{EFwX=Q;9t(W51ANYM!b@AM%))-^2O<1+jI3fUN9Sr((
z)eCzXP<7p#`C%WH+nF_ZW~pS-GOFB@MJtnOC@2nxd`!p=_WomG|HWaY0hDrHgp7VP
zD1{_bgz*X(gQ@V4`wQFu_Sl1<Lu-(g8+JTNB2_6+5+xid@8g4?%w<-3ur)7WV)2G?
zjP>vkLBFjA8;=1swzx?ISgw4K$%j_XkMth}VSTf1U+V|#MDq?!F6)uOu~O9SV3@h!
zL6ogc3EanAY$dkjdOtUm2n?RmKCfu}Rn#IxawpOyW(=pt8e+=vtLb9P*3*8{q!;@$
z*xs8cu48O`h~9=SiEDphsm@QL!Sr0qXOQ2QGb^f@gt`@qC~~}oK07~rn3IK)FypAT
zBTPz3kxuJ`bvaux?p&h?aQmjAK~{YZrgoS}m{n0XXEJY2`f6tJ)of^lw6Ypr{dVIh
zgCr-F8`K@27+W%f3mZHQwbZMdpKDz&>P^+FVTb9^B3%lw`1Oo?B9~gJPw(P7rovWB
zw2P$}5Kk!Pa(HWiguEUV8q2ug-v84Y#$}wH0aiQnj5N3Sd!tJr4yHjH@-7Y-t^mWp
z?EU~&USxb`PWCL7D|3F&6_L+9!*J5#V!^>ZCMMdX<4cTveIQsG;dGB{U=c6bk_`r<
zLhqwLT^!04qMYh9+%Pfw{J`*M(%?zl#qR(g@cGZPG`%X&0u(_K?6Ea)vQ#wSh#V4Z
z65?hM09Li#7=M2V^nDmUHBJL9HOEbCF~zcZb?hh<0Y+Q{kswrDF|9L;;C?f!&U29t
zMn^><?eoDZXDju-z`t&Xr3OOm<B_RDg1rW+Zmzla5xhU<-4tYQKWE&Gw&V2lIA6%l
z+<Jks>DvcT5O7EYQ%J%O$}c)~t-<Ll9f}07k469ZT%o-Gr;eAhm<qDqVsH>-UCbKc
zwi7EfSAf?xZtI7%t3<6f1wV}1&a1In>7gh*z+V?$Bb^aghn&O$M;DI$B)&baTaHMg
z+?-*;85g}pk;oAdd^+Ye5pdH7jE09!+k|(Y;rE<|GOD^k=z%BdeWCp%iUD1;Nurkl
z<PoKv2JPRpv`^AQy(bi!v2cjjmW;W2=O0YQD6<T&7ae~}<DW8Af`dMWJzBa_LGRN)
zWgeCh5Z=}q?qvm>!^)wO<#LHqxBpwFk~aKVR44q|B%AB4Jmj4Y+7AwZ(Ln*X!6d;a
z0~of)l^UrU=CDZq_lkFg{GX*Oa3k~EC0}(1y>}qV)i0%~vAR4ohKW(7t?=%n1m~;b
zjW;p0orB?X?Y4r7iqk5;-r=_srQNRBD_=EMD?b`A;1)uco5g7l^m(k$kyet<pw+39
ziBC}aNc(yG+(z&RIMRV}d^-0QuB!>#cG_~a>Q3A;GMfimm4~U~=MBs#Zuq2T58>Bq
zy14hA=-{B3`%{){LG4ySq`XY%p#VAH4<-~q1V92X8VCovACFKJmXsXQr2P8(ac^T#
z6&{#oJM0@b`Zk9(?E4FC1xyOv1OGCv!`?9_L=L!FdxY|hKOu1!L!-_pte}8Y-Zf37
zOX;=JqUs0MBKdm*6}}AgXX~HNH2}p9>`Ea*5^>P};TQ`tf?L_pAD#IVz-?_X%@{KJ
z103CEEVJ;xlw;UWU}lNlk4e-2q`OhU(s+s{68{pKdj!Ee3559vng66Y!P0)hqX=RD
zu$_sp!F=Taf9@Q~f2ASB!P0gTX~h5b)S*6t>4J>k4p{z^-ULfSB2lsYM~2oT5(L73
zI`$&}PZ|#_-A!y9{NFMVz{}_n%$oX7+7C>XhUY98kNtNU0si1+V3%B{{qLgxPk7UI
z;;@x@c|a*CDWikQEJZCXyhx9$|K6Ql3duKJT?Wtl^Q6{RZZ40TlARSX88Ziu+hn@$
zq-<;|$@JRPR8&-QwmZ)MzHLD$JQBOTQA{q+y9%vNFAeZ7lyCm+Tm>{dU3BSo`|j-Q
zL=$$y|7&%Au)pac<Kp7bP*I7W?k`-uPOMQ^!q6!Ya=9k6zWi(TA5j0GL!hi2FO=hN
zcz*}&bE@h5Xa6FbNMO5b>+`#||M$^gvQ+G}35?pl)W0V0iVcCbg8wNqhU6dOi2pK}
zN5an2%&hkBhH35ti=Q)Ydg1+R<m2xg(&e6{mVY1IjW6a0atVPr*!lPPfSwC@vrZsY
ujVJwU<>!oGaUKLtLh*l}Rv*~?2BY5np_P1yWbFg^Pf}D)qzb4P@P7d!Kz)G#

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn
deleted file mode 100644
index 865bc4d3b..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/BUILD.gn
+++ /dev/null
@@ -1,33 +0,0 @@
-generated_doxygen_out_dir =
-    get_path_info(".", "gen_dir") + "/.."
-
-loadgen_doxygen_sources = [
-  "doxygen.cfg",
-  "doxygen_footer.html",
-  "doxygen_header.html",
-  "doxygen_layout.xml",
-  "doxygen_stylesheet.css",
-  "loadgen-integration_diagram.dia",
-  "mlperf_icon.png",
-  "mlperf_logo_horizontal_color.svg",
-  "README.md"
-]
-
-source_set("loadgen_doxygen_sources") {
-  sources = loadgen_doxygen_sources
-}
-
-source_set("doxygen_html_generator_script") {
-  sources = [ "doxygen_html_generator.py" ]
-}
-
-action("generate_doxygen_html") {
-  script = "doxygen_html_generator.py"
-  args = [ rebase_path(generated_doxygen_out_dir, root_build_dir),
-           rebase_path("../..") ]
-  outputs = [ generated_doxygen_out_dir ]
-  deps = [ ":loadgen_doxygen_sources",
-           ":doxygen_html_generator_script",
-           "../..:mlperf_loadgen_sources_no_gen",
-           "../..:docs" ]
-}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md
deleted file mode 100644
index d5cf5fe18..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# Generating the HTML docs {#ReadmeHtmlDocs}
-
-This document is generated from inline docstrings in the source and
-various markdown files checked into the git repository. If you've
-checked out the code, you can generate this documentation.
-
-*Prerequisite:* You must have [doxygen](http://www.doxygen.nl) installed
-on your system:
-
-## With gn / ninja
-
-If you are using the gn build flow, you may run:
-
-    ninja -C out/Release generate_doxygen_html
-
-* This will output the documentation to out/Release/gen/loadgen/docs/gen and
-avoid poluting the source directory.
-
-## Manually
-
-Alternatively, you can manually run:
-
-    python docs/src/doxygen_html_generator.py <target_dir> <loadgen_root>
-
-* If <loadgen_root> is omitted, it will default to ".".
-* If <target_dir> is also omitted, it will default to "./docs/gen".
-
-## Hosting
-
-A version of this doc is currently hosted online at
-https://mlperf.github.io/inference/loadgen/index.html
-
-To update the hosted version, submit a PR to the
-[mlperf.github.io](https://github.com/mlperf/mlperf.github.io) repository.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg
deleted file mode 100644
index fc05853d1..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen.cfg
+++ /dev/null
@@ -1,2495 +0,0 @@
-# Doxyfile 1.8.13
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a double hash (##) is considered a comment and is placed in
-# front of the TAG it is preceding.
-#
-# All text after a single hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists, items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (\" \").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
-# The default value is: UTF-8.
-
-DOXYFILE_ENCODING      = UTF-8
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
-# double-quotes, unless you are using Doxywizard) that should identify the
-# project for which the documentation is generated. This name is used in the
-# title of most generated pages and in a few other places.
-# The default value is: My Project.
-
-PROJECT_NAME           = "LoadGen Guide"
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
-# could be handy for archiving the generated documentation or if some version
-# control system is used.
-
-PROJECT_NUMBER         =
-
-# Using the PROJECT_BRIEF tag one can provide an optional one line description
-# for a project that appears at the top of each page and should give viewer a
-# quick idea about the purpose of the project. Keep the description short.
-
-PROJECT_BRIEF          =
-
-# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
-# in the documentation. The maximum height of the logo should not exceed 55
-# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
-# the logo to the output directory.
-
-PROJECT_LOGO           = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_logo_horizontal_color.svg
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
-# into which the generated documentation will be written. If a relative path is
-# entered, it will be relative to the location where doxygen was started. If
-# left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       = $(MLPERF_DOXYGEN_OUT_PATH)
-
-# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
-# directories (in 2 levels) under the output directory of each output format and
-# will distribute the generated files over these directories. Enabling this
-# option can be useful when feeding doxygen a huge amount of source files, where
-# putting all generated files in the same directory would otherwise causes
-# performance problems for the file system.
-# The default value is: NO.
-
-CREATE_SUBDIRS         = NO
-
-# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
-# characters to appear in the names of generated files. If set to NO, non-ASCII
-# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
-# U+3044.
-# The default value is: NO.
-
-ALLOW_UNICODE_NAMES    = NO
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all
-# documentation generated by doxygen is written. Doxygen will use this
-# information to generate all constant output in the proper language.
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
-# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
-# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
-# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
-# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
-# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
-# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
-# Ukrainian and Vietnamese.
-# The default value is: English.
-
-OUTPUT_LANGUAGE        = English
-
-# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
-# descriptions after the members that are listed in the file and class
-# documentation (similar to Javadoc). Set to NO to disable this.
-# The default value is: YES.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
-# description of a member or function before the detailed description
-#
-# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
-# brief descriptions will be completely suppressed.
-# The default value is: YES.
-
-REPEAT_BRIEF           = YES
-
-# This tag implements a quasi-intelligent brief description abbreviator that is
-# used to form the text in various listings. Each string in this list, if found
-# as the leading text of the brief description, will be stripped from the text
-# and the result, after processing the whole list, is used as the annotated
-# text. Otherwise, the brief description is used as-is. If left blank, the
-# following values are used ($name is automatically replaced with the name of
-# the entity):The $name class, The $name widget, The $name file, is, provides,
-# specifies, contains, represents, a, an and the.
-
-ABBREVIATE_BRIEF       = "The $name class" \
-                         "The $name widget" \
-                         "The $name file" \
-                         is \
-                         provides \
-                         specifies \
-                         contains \
-                         represents \
-                         a \
-                         an \
-                         the
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
-# doxygen will generate a detailed section even if there is only a brief
-# description.
-# The default value is: NO.
-
-ALWAYS_DETAILED_SEC    = YES
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
-# inherited members of a class in the documentation of that class as if those
-# members were ordinary class members. Constructors, destructors and assignment
-# operators of the base classes will not be shown.
-# The default value is: NO.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
-# before files name in the file list and in the header files. If set to NO the
-# shortest path that makes the file name unique will be used
-# The default value is: YES.
-
-FULL_PATH_NAMES        = YES
-
-# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
-# Stripping is only done if one of the specified strings matches the left-hand
-# part of the path. The tag can be used to show relative paths in the file list.
-# If left blank the directory from which doxygen is run is used as the path to
-# strip.
-#
-# Note that you can specify absolute paths here, but also relative paths, which
-# will be relative from the directory where doxygen is started.
-# This tag requires that the tag FULL_PATH_NAMES is set to YES.
-
-STRIP_FROM_PATH        =
-
-# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
-# path mentioned in the documentation of a class, which tells the reader which
-# header file to include in order to use a class. If left blank only the name of
-# the header file containing the class definition is used. Otherwise one should
-# specify the list of include paths that are normally passed to the compiler
-# using the -I flag.
-
-STRIP_FROM_INC_PATH    =
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
-# less readable) file names. This can be useful is your file systems doesn't
-# support long names like on DOS, Mac, or CD-ROM.
-# The default value is: NO.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
-# first line (until the first dot) of a Javadoc-style comment as the brief
-# description. If set to NO, the Javadoc-style will behave just like regular Qt-
-# style comments (thus requiring an explicit @brief command for a brief
-# description.)
-# The default value is: NO.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
-# line (until the first dot) of a Qt-style comment as the brief description. If
-# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
-# requiring an explicit \brief command for a brief description.)
-# The default value is: NO.
-
-QT_AUTOBRIEF           = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
-# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
-# a brief description. This used to be the default behavior. The new default is
-# to treat a multi-line C++ comment block as a detailed description. Set this
-# tag to YES if you prefer the old behavior instead.
-#
-# Note that setting this tag to YES also means that rational rose comments are
-# not recognized any more.
-# The default value is: NO.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
-# documentation from any documented member that it re-implements.
-# The default value is: YES.
-
-INHERIT_DOCS           = YES
-
-# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
-# page for each member. If set to NO, the documentation of a member will be part
-# of the file/class/namespace that contains it.
-# The default value is: NO.
-
-SEPARATE_MEMBER_PAGES  = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
-# uses this value to replace tabs by spaces in code fragments.
-# Minimum value: 1, maximum value: 16, default value: 4.
-
-TAB_SIZE               = 4
-
-# This tag can be used to specify a number of aliases that act as commands in
-# the documentation. An alias has the form:
-# name=value
-# For example adding
-# "sideeffect=@par Side Effects:\n"
-# will allow you to put the command \sideeffect (or @sideeffect) in the
-# documentation, which will result in a user-defined paragraph with heading
-# "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
-
-ALIASES                =
-
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
-# only. Doxygen will then generate output that is more tailored for C. For
-# instance, some of the names that are used will be different. The list of all
-# members will be omitted, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
-# Python sources only. Doxygen will then generate output that is more tailored
-# for that language. For instance, namespaces will be presented as packages,
-# qualified scopes will look different, etc.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
-# sources. Doxygen will then generate output that is tailored for Fortran.
-# The default value is: NO.
-
-OPTIMIZE_FOR_FORTRAN   = NO
-
-# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
-# sources. Doxygen will then generate output that is tailored for VHDL.
-# The default value is: NO.
-
-OPTIMIZE_OUTPUT_VHDL   = NO
-
-# Doxygen selects the parser to use depending on the extension of the files it
-# parses. With this tag you can assign which parser to use for a given
-# extension. Doxygen has a built-in mapping, but you can override or extend it
-# using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
-#
-# Note: For files without extension you can use no_extension as a placeholder.
-#
-# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
-
-EXTENSION_MAPPING      =
-
-# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
-# according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
-# The output of markdown processing is further processed by doxygen, so you can
-# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
-# case of backward compatibilities issues.
-# The default value is: YES.
-
-MARKDOWN_SUPPORT       = YES
-
-# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
-# to that level are automatically included in the table of contents, even if
-# they do not have an id attribute.
-# Note: This feature currently applies only to Markdown headings.
-# Minimum value: 0, maximum value: 99, default value: 0.
-# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
-
-TOC_INCLUDE_HEADINGS   = 1
-
-# When enabled doxygen tries to link words that correspond to documented
-# classes, or namespaces to their corresponding documentation. Such a link can
-# be prevented in individual cases by putting a % sign in front of the word or
-# globally by setting AUTOLINK_SUPPORT to NO.
-# The default value is: YES.
-
-AUTOLINK_SUPPORT       = YES
-
-# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
-# to include (a tag file for) the STL sources as input, then you should set this
-# tag to YES in order to let doxygen match functions declarations and
-# definitions whose arguments contain STL classes (e.g. func(std::string);
-# versus func(std::string) {}). This also make the inheritance and collaboration
-# diagrams that involve STL classes more complete and accurate.
-# The default value is: NO.
-
-BUILTIN_STL_SUPPORT    = NO
-
-# If you use Microsoft's C++/CLI language, you should set this option to YES to
-# enable parsing support.
-# The default value is: NO.
-
-CPP_CLI_SUPPORT        = NO
-
-# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
-# will parse them like normal C++ but will assume all classes use public instead
-# of private inheritance when no explicit protection keyword is present.
-# The default value is: NO.
-
-SIP_SUPPORT            = NO
-
-# For Microsoft's IDL there are propget and propput attributes to indicate
-# getter and setter methods for a property. Setting this option to YES will make
-# doxygen to replace the get and set methods by a property in the documentation.
-# This will only work if the methods are indeed getting or setting a simple
-# type. If this is not the case, or you want to show the methods anyway, you
-# should set this option to NO.
-# The default value is: YES.
-
-IDL_PROPERTY_SUPPORT   = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
-# tag is set to YES then doxygen will reuse the documentation of the first
-# member in the group (if any) for the other members of the group. By default
-# all members of a group must be documented explicitly.
-# The default value is: NO.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# If one adds a struct or class to a group and this option is enabled, then also
-# any nested class or struct is added to the same group. By default this option
-# is disabled and one has to add nested compounds explicitly via \ingroup.
-# The default value is: NO.
-
-GROUP_NESTED_COMPOUNDS = NO
-
-# Set the SUBGROUPING tag to YES to allow class member groups of the same type
-# (for instance a group of public functions) to be put as a subgroup of that
-# type (e.g. under the Public Functions section). Set it to NO to prevent
-# subgrouping. Alternatively, this can be done per class using the
-# \nosubgrouping command.
-# The default value is: YES.
-
-SUBGROUPING            = YES
-
-# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
-# are shown inside the group in which they are included (e.g. using \ingroup)
-# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
-# and RTF).
-#
-# Note that this feature does not work in combination with
-# SEPARATE_MEMBER_PAGES.
-# The default value is: NO.
-
-INLINE_GROUPED_CLASSES = NO
-
-# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
-# with only public data fields or simple typedef fields will be shown inline in
-# the documentation of the scope in which they are defined (i.e. file,
-# namespace, or group documentation), provided this scope is documented. If set
-# to NO, structs, classes, and unions are shown on a separate page (for HTML and
-# Man pages) or section (for LaTeX and RTF).
-# The default value is: NO.
-
-INLINE_SIMPLE_STRUCTS  = NO
-
-# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
-# enum is documented as struct, union, or enum with the name of the typedef. So
-# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
-# with name TypeT. When disabled the typedef will appear as a member of a file,
-# namespace, or class. And the struct will be named TypeS. This can typically be
-# useful for C code in case the coding convention dictates that all compound
-# types are typedef'ed and only the typedef is referenced, never the tag name.
-# The default value is: NO.
-
-TYPEDEF_HIDES_STRUCT   = NO
-
-# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
-# cache is used to resolve symbols given their name and scope. Since this can be
-# an expensive process and often the same symbol appears multiple times in the
-# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
-# doxygen will become slower. If the cache is too large, memory is wasted. The
-# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
-# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
-# symbols. At the end of a run doxygen will report the cache usage and suggest
-# the optimal cache size from a speed point of view.
-# Minimum value: 0, maximum value: 9, default value: 0.
-
-LOOKUP_CACHE_SIZE      = 0
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
-# documentation are documented, even if no documentation was available. Private
-# class members and static file members will be hidden unless the
-# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
-# Note: This will also disable the warnings about undocumented members that are
-# normally produced when WARNINGS is set to YES.
-# The default value is: NO.
-
-EXTRACT_ALL            = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
-# be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PRIVATE        = YES
-
-# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
-# scope will be included in the documentation.
-# The default value is: NO.
-
-EXTRACT_PACKAGE        = YES
-
-# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
-# included in the documentation.
-# The default value is: NO.
-
-EXTRACT_STATIC         = YES
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
-# locally in source files will be included in the documentation. If set to NO,
-# only classes defined in header files are included. Does not have any effect
-# for Java sources.
-# The default value is: YES.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# This flag is only useful for Objective-C code. If set to YES, local methods,
-# which are defined in the implementation section but not in the interface are
-# included in the documentation. If set to NO, only methods in the interface are
-# included.
-# The default value is: NO.
-
-EXTRACT_LOCAL_METHODS  = NO
-
-# If this flag is set to YES, the members of anonymous namespaces will be
-# extracted and appear in the documentation as a namespace called
-# 'anonymous_namespace{file}', where file will be replaced with the base name of
-# the file that contains the anonymous namespace. By default anonymous namespace
-# are hidden.
-# The default value is: NO.
-
-EXTRACT_ANON_NSPACES   = NO
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
-# undocumented members inside documented classes or files. If set to NO these
-# members will be included in the various overviews, but no documentation
-# section is generated. This option has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
-# undocumented classes that are normally visible in the class hierarchy. If set
-# to NO, these classes will be included in the various overviews. This option
-# has no effect if EXTRACT_ALL is enabled.
-# The default value is: NO.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
-# The default value is: NO.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
-# documentation blocks found inside the body of a function. If set to NO, these
-# blocks will be appended to the function's detailed documentation block.
-# The default value is: NO.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation that is typed after a
-# \internal command is included. If the tag is set to NO then the documentation
-# will be excluded. Set it to YES to include the internal documentation.
-# The default value is: NO.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
-# The default value is: system dependent.
-
-CASE_SENSE_NAMES       = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
-# their full class and namespace scopes in the documentation. If set to YES, the
-# scope will be hidden.
-# The default value is: NO.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
-# append additional text to a page's title, such as Class Reference. If set to
-# YES the compound reference will be hidden.
-# The default value is: NO.
-
-HIDE_COMPOUND_REFERENCE= NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
-# the files that are included by a file in the documentation of that file.
-# The default value is: YES.
-
-SHOW_INCLUDE_FILES     = YES
-
-# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
-# grouped member an include statement to the documentation, telling the reader
-# which file to include in order to use the member.
-# The default value is: NO.
-
-SHOW_GROUPED_MEMB_INC  = NO
-
-# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
-# files with double quotes in the documentation rather than with sharp brackets.
-# The default value is: NO.
-
-FORCE_LOCAL_INCLUDES   = NO
-
-# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
-# documentation for inline members.
-# The default value is: YES.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
-# (detailed) documentation of file and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order.
-# The default value is: YES.
-
-SORT_MEMBER_DOCS       = YES
-
-# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
-# descriptions of file, namespace and class members alphabetically by member
-# name. If set to NO, the members will appear in declaration order. Note that
-# this will also influence the order of the classes in the class list.
-# The default value is: NO.
-
-SORT_BRIEF_DOCS        = NO
-
-# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
-# (brief and detailed) documentation of class members so that constructors and
-# destructors are listed first. If set to NO the constructors will appear in the
-# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
-# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
-# member documentation.
-# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
-# detailed member documentation.
-# The default value is: NO.
-
-SORT_MEMBERS_CTORS_1ST = NO
-
-# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
-# of group names into alphabetical order. If set to NO the group names will
-# appear in their defined order.
-# The default value is: NO.
-
-SORT_GROUP_NAMES       = NO
-
-# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
-# fully-qualified names, including namespaces. If set to NO, the class list will
-# be sorted only by class name, not including the namespace part.
-# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
-# Note: This option applies only to the class list, not to the alphabetical
-# list.
-# The default value is: NO.
-
-SORT_BY_SCOPE_NAME     = NO
-
-# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
-# type resolution of all parameters of a function it will reject a match between
-# the prototype and the implementation of a member function even if there is
-# only one candidate or it is obvious which candidate to choose by doing a
-# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
-# accept a match between prototype and implementation in such cases.
-# The default value is: NO.
-
-STRICT_PROTO_MATCHING  = NO
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
-# list. This list is created by putting \todo commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
-# list. This list is created by putting \test commands in the documentation.
-# The default value is: YES.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
-# list. This list is created by putting \bug commands in the documentation.
-# The default value is: YES.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
-# the deprecated list. This list is created by putting \deprecated commands in
-# the documentation.
-# The default value is: YES.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional documentation
-# sections, marked by \if <section_label> ... \endif and \cond <section_label>
-# ... \endcond blocks.
-
-ENABLED_SECTIONS       =
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
-# initial value of a variable or macro / define can have for it to appear in the
-# documentation. If the initializer consists of more lines than specified here
-# it will be hidden. Use a value of 0 to hide initializers completely. The
-# appearance of the value of individual variables and macros / defines can be
-# controlled using \showinitializer or \hideinitializer command in the
-# documentation regardless of this setting.
-# Minimum value: 0, maximum value: 10000, default value: 30.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
-# the bottom of the documentation of classes and structs. If set to YES, the
-# list will mention the files that were used to generate the documentation.
-# The default value is: YES.
-
-SHOW_USED_FILES        = YES
-
-# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
-# will remove the Files entry from the Quick Index and from the Folder Tree View
-# (if specified).
-# The default value is: YES.
-
-SHOW_FILES             = YES
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
-# page. This will remove the Namespaces entry from the Quick Index and from the
-# Folder Tree View (if specified).
-# The default value is: YES.
-
-SHOW_NAMESPACES        = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command command input-file, where command is the value of the
-# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
-# by doxygen. Whatever the program writes to standard output is used as the file
-# version. For an example see the documentation.
-
-FILE_VERSION_FILTER    =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option. You can
-# optionally specify a file name after the option, if omitted DoxygenLayout.xml
-# will be used as the name of the layout file.
-#
-# Note that if you run doxygen from a directory containing a file called
-# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
-# tag is left empty.
-
-LAYOUT_FILE            = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_layout.xml
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
-# the reference definitions. This must be a list of .bib files. The .bib
-# extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
-# For LaTeX the style of the bibliography can be controlled using
-# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
-# search path. See also \cite for info how to create references.
-
-CITE_BIB_FILES         =
-
-#---------------------------------------------------------------------------
-# Configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated to
-# standard output by doxygen. If QUIET is set to YES this implies that the
-# messages are off.
-# The default value is: NO.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
-# this implies that the warnings are on.
-#
-# Tip: Turn warnings on while writing the documentation.
-# The default value is: YES.
-
-WARNINGS               = YES
-
-# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
-# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
-# will automatically be disabled.
-# The default value is: YES.
-
-WARN_IF_UNDOCUMENTED   = NO
-
-# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some parameters
-# in a documented function, or documenting parameters that don't exist or using
-# markup commands wrongly.
-# The default value is: YES.
-
-WARN_IF_DOC_ERROR      = YES
-
-# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
-# are documented, but have no documentation for their parameters or return
-# value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
-# The default value is: NO.
-
-WARN_NO_PARAMDOC       = NO
-
-# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
-# a warning is encountered.
-# The default value is: NO.
-
-WARN_AS_ERROR          = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that doxygen
-# can produce. The string should contain the $file, $line, and $text tags, which
-# will be replaced by the file and line number from which the warning originated
-# and the warning text. Optionally the format may contain $version, which will
-# be replaced by the version of the file (if it could be obtained via
-# FILE_VERSION_FILTER)
-# The default value is: $file:$line: $text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning and error
-# messages should be written. If left blank the output is written to standard
-# error (stderr).
-
-WARN_LOGFILE           =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag is used to specify the files and/or directories that contain
-# documented source files. You may enter file names like myfile.cpp or
-# directories like /usr/src/myproject. Separate the files or directories with
-# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
-# Note: If this tag is empty the current directory is searched.
-
-INPUT                  = $(MLPERF_LOADGEN_SRC_PATH)
-
-# This tag can be used to specify the character encoding of the source files
-# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
-# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
-# The default value is: UTF-8.
-
-INPUT_ENCODING         = UTF-8
-
-# If the value of the INPUT tag contains directories, you can use the
-# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
-# *.h) to filter out the source-files in the directories.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# read by doxygen.
-#
-# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
-# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
-# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f95, *.f03, *.f08,
-# *.f, *.for, *.tcl, *.vhd, *.vhdl, *.ucf and *.qsf.
-
-FILE_PATTERNS          = *.c \
-                         *.cc \
-                         *.cxx \
-                         *.cpp \
-                         *.c++ \
-                         *.java \
-                         *.ii \
-                         *.ixx \
-                         *.ipp \
-                         *.i++ \
-                         *.inl \
-                         *.idl \
-                         *.ddl \
-                         *.odl \
-                         *.h \
-                         *.hh \
-                         *.hxx \
-                         *.hpp \
-                         *.h++ \
-                         *.cs \
-                         *.d \
-                         *.php \
-                         *.php4 \
-                         *.php5 \
-                         *.phtml \
-                         *.inc \
-                         *.m \
-                         *.markdown \
-                         *.md \
-                         *.mm \
-                         *.dox \
-                         *.py \
-                         *.pyw \
-                         *.f90 \
-                         *.f95 \
-                         *.f03 \
-                         *.f08 \
-                         *.f \
-                         *.for \
-                         *.tcl \
-                         *.vhd \
-                         *.vhdl \
-                         *.ucf \
-                         *.qsf
-
-# The RECURSIVE tag can be used to specify whether or not subdirectories should
-# be searched for input files as well.
-# The default value is: NO.
-
-RECURSIVE              = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should be
-# excluded from the INPUT source files. This way you can easily exclude a
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-#
-# Note that relative paths are relative to the directory from which doxygen is
-# run.
-
-EXCLUDE                = depot_tools
-
-# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
-# directories that are symbolic links (a Unix file system feature) are excluded
-# from the input.
-# The default value is: NO.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
-# certain files from those directories.
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories for example use the pattern */test/*
-
-EXCLUDE_PATTERNS       =
-
-# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
-# (namespaces, classes, functions, etc.) that should be excluded from the
-# output. The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-#
-# Note that the wildcards are matched against the file with absolute path, so to
-# exclude all test directories use the pattern */test/*
-
-EXCLUDE_SYMBOLS        =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or directories
-# that contain example code fragments that are included (see the \include
-# command).
-
-EXAMPLE_PATH           =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
-# *.h) to filter out the source-files in the directories. If left blank all
-# files are included.
-
-EXAMPLE_PATTERNS       = *
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude commands
-# irrespective of the value of the RECURSIVE tag.
-# The default value is: NO.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or directories
-# that contain images that are to be included in the documentation (see the
-# \image command).
-
-IMAGE_PATH             = $(MLPERF_LOADGEN_SRC_PATH)/docs/src
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command:
-#
-# <filter> <input-file>
-#
-# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
-# name of an input file. Doxygen will then use the output that the filter
-# program writes to standard output. If FILTER_PATTERNS is specified, this tag
-# will be ignored.
-#
-# Note that the filter must not add or remove lines; it is applied before the
-# code is scanned, but not when the output code is generated. If lines are added
-# or removed, the anchors will not be placed correctly.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# properly processed by doxygen.
-
-INPUT_FILTER           =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis. Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match. The filters are a list of the form: pattern=filter
-# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
-# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
-# patterns match the file name, INPUT_FILTER is applied.
-#
-# Note that for custom extensions or not directly supported extensions you also
-# need to set EXTENSION_MAPPING for the extension otherwise the files are not
-# properly processed by doxygen.
-
-FILTER_PATTERNS        =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will also be used to filter the input files that are used for
-# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
-# The default value is: NO.
-
-FILTER_SOURCE_FILES    = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
-# it is also possible to disable source filtering for a specific pattern using
-# *.ext= (so without naming a filter).
-# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
-
-FILTER_SOURCE_PATTERNS =
-
-# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
-# is part of the input, its contents will be placed on the main page
-# (index.html). This can be useful if you have a project on for instance GitHub
-# and want to reuse the introduction page also for the doxygen output.
-
-USE_MDFILE_AS_MAINPAGE =
-
-#---------------------------------------------------------------------------
-# Configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
-# generated. Documented entities will be cross-referenced with these sources.
-#
-# Note: To get rid of all source code in the generated output, make sure that
-# also VERBATIM_HEADERS is set to NO.
-# The default value is: NO.
-
-SOURCE_BROWSER         = YES
-
-# Setting the INLINE_SOURCES tag to YES will include the body of functions,
-# classes and enums directly into the documentation.
-# The default value is: NO.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
-# special comment blocks from generated source code fragments. Normal C, C++ and
-# Fortran comments will always remain visible.
-# The default value is: YES.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
-# The default value is: NO.
-
-REFERENCED_BY_RELATION = NO
-
-# If the REFERENCES_RELATION tag is set to YES then for each documented function
-# all documented entities called/used by that function will be listed.
-# The default value is: NO.
-
-REFERENCES_RELATION    = NO
-
-# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
-# to YES then the hyperlinks from functions in REFERENCES_RELATION and
-# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
-# link to the documentation.
-# The default value is: YES.
-
-REFERENCES_LINK_SOURCE = YES
-
-# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
-# source code will show a tooltip with additional information such as prototype,
-# brief description and links to the definition and documentation. Since this
-# will make the HTML file larger and loading of large files a bit slower, you
-# can opt to disable this feature.
-# The default value is: YES.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-SOURCE_TOOLTIPS        = YES
-
-# If the USE_HTAGS tag is set to YES then the references to source code will
-# point to the HTML generated by the htags(1) tool instead of doxygen built-in
-# source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
-# 4.8.6 or higher.
-#
-# To use it do the following:
-# - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
-# - Make sure the INPUT points to the root of the source tree
-# - Run doxygen as normal
-#
-# Doxygen will invoke htags (and that will in turn invoke gtags), so these
-# tools must be available from the command line (i.e. in the search path).
-#
-# The result: instead of the source browser generated by doxygen, the links to
-# source code will now point to the output of htags.
-# The default value is: NO.
-# This tag requires that the tag SOURCE_BROWSER is set to YES.
-
-USE_HTAGS              = NO
-
-# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
-# verbatim copy of the header file for each class for which an include is
-# specified. Set to NO to disable this.
-# See also: Section \class.
-# The default value is: YES.
-
-VERBATIM_HEADERS       = YES
-
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# generated with the -Duse-libclang=ON option for CMake.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = YES
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          = -I ../third_party/pybind/include --std=c++14
-
-#---------------------------------------------------------------------------
-# Configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
-# compounds will be generated. Enable this if the project contains a lot o=
-# classes, structs, unions or interfaces.
-# The default value is: YES.
-
-ALPHABETICAL_INDEX     = YES
-
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all classes will
-# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
-# can be used to specify a prefix (or a list of prefixes) that should be ignored
-# while generating the index headers.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-IGNORE_PREFIX          =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
-# The default value is: YES.
-
-GENERATE_HTML          = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
-# generated HTML page (for example: .htm, .php, .asp).
-# The default value is: .html.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
-# each generated HTML page. If the tag is left blank doxygen will generate a
-# standard header.
-#
-# To get valid HTML the header file that includes any scripts and style sheets
-# that doxygen needs, which is dependent on the configuration options used (e.g.
-# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
-# default header using
-# doxygen -w html new_header.html new_footer.html new_stylesheet.css
-# YourConfigFile
-# and then modify the file new_header.html. See also section "Doxygen usage"
-# for information on how to generate the default header that doxygen normally
-# uses.
-# Note: The header is subject to change so you typically have to regenerate the
-# default header when upgrading to a newer version of doxygen. For a description
-# of the possible markers and block names see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_HEADER            = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_header.html
-
-# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
-# generated HTML page. If the tag is left blank doxygen will generate a standard
-# footer. See HTML_HEADER for more information on how to generate a default
-# footer and what special commands can be used inside the footer. See also
-# section "Doxygen usage" for information on how to generate the default footer
-# that doxygen normally uses.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_FOOTER            = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_footer.html
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
-# sheet that is used by each HTML page. It can be used to fine-tune the look of
-# the HTML output. If left blank doxygen will generate a default style sheet.
-# See also section "Doxygen usage" for information on how to generate the style
-# sheet that doxygen normally uses.
-# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
-# it is more robust and this tag (HTML_STYLESHEET) will in the future become
-# obsolete.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_STYLESHEET        =
-
-# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# cascading style sheets that are included after the standard style sheets
-# created by doxygen. Using this option one can overrule certain style aspects.
-# This is preferred over using HTML_STYLESHEET since it does not replace the
-# standard style sheet and is therefore more robust against future updates.
-# Doxygen will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list). For an example see the documentation.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_STYLESHEET  = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/doxygen_stylesheet.css
-
-# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the HTML output directory. Note
-# that these files will be copied to the base HTML output directory. Use the
-# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
-# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
-# files will be copied as-is; there are no commands or markers available.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_EXTRA_FILES       = $(MLPERF_LOADGEN_SRC_PATH)/docs/src/mlperf_icon.png \
-                         $(MLPERF_LOADGEN_SRC_PATH)/loadgen_integration_diagram.svg
-
-# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
-# will adjust the colors in the style sheet and background images according to
-# this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
-# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
-# purple, and 360 is red again.
-# Minimum value: 0, maximum value: 359, default value: 220.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_HUE    = 220
-
-# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
-# in the HTML output. For a value of 0 the output will use grayscales only. A
-# value of 255 will produce the most vivid colors.
-# Minimum value: 0, maximum value: 255, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_SAT    = 127
-
-# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
-# luminance component of the colors in the HTML output. Values below 100
-# gradually make the output lighter, whereas values above 100 make the output
-# darker. The value divided by 100 is the actual gamma applied, so 80 represents
-# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
-# change the gamma.
-# Minimum value: 40, maximum value: 240, default value: 80.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_COLORSTYLE_GAMMA  = 80
-
-# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
-# page will contain the date and time when the page was generated. Setting this
-# to YES can help to show when doxygen was last run and thus if the
-# documentation is up to date.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_TIMESTAMP         = NO
-
-# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
-# documentation will contain sections that can be hidden and shown after the
-# page has loaded.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_DYNAMIC_SECTIONS  = YES
-
-# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
-# shown in the various tree structured indices initially; the user can expand
-# and collapse entries dynamically later on. Doxygen will expand the tree to
-# such a level that at most the specified number of entries are visible (unless
-# a fully collapsed tree already exceeds this amount). So setting the number of
-# entries 1 will produce a full collapsed tree by default. 0 is a special value
-# representing an infinite number of entries and will result in a full expanded
-# tree by default.
-# Minimum value: 0, maximum value: 9999, default value: 100.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-HTML_INDEX_NUM_ENTRIES = 50
-
-# If the GENERATE_DOCSET tag is set to YES, additional index files will be
-# generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
-# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_DOCSET        = NO
-
-# This tag determines the name of the docset feed. A documentation feed provides
-# an umbrella under which multiple documentation sets from a single provider
-# (such as a company or product suite) can be grouped.
-# The default value is: Doxygen generated docs.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_FEEDNAME        = "Doxygen generated docs"
-
-# This tag specifies a string that should uniquely identify the documentation
-# set bundle. This should be a reverse domain-name style string, e.g.
-# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_BUNDLE_ID       = org.doxygen.Project
-
-# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
-# the documentation publisher. This should be a reverse domain-name style
-# string, e.g. com.mycompany.MyDocSet.documentation.
-# The default value is: org.doxygen.Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
-
-# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
-# The default value is: Publisher.
-# This tag requires that the tag GENERATE_DOCSET is set to YES.
-
-DOCSET_PUBLISHER_NAME  = Publisher
-
-# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
-# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
-# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
-#
-# The HTML Help Workshop contains a compiler that can convert all HTML output
-# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
-# files are now used as the Windows 98 help format, and will replace the old
-# Windows help format (.hlp) on all Windows platforms in the future. Compressed
-# HTML files also contain an index, a table of contents, and you can search for
-# words in the documentation. The HTML workshop also contains a viewer for
-# compressed HTML files.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_HTMLHELP      = NO
-
-# The CHM_FILE tag can be used to specify the file name of the resulting .chm
-# file. You can add a path in front of the file if the result should not be
-# written to the html output directory.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_FILE               =
-
-# The HHC_LOCATION tag can be used to specify the location (absolute path
-# including file name) of the HTML help compiler (hhc.exe). If non-empty,
-# doxygen will try to run the HTML help compiler on the generated index.hhp.
-# The file has to be specified with full path.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-HHC_LOCATION           =
-
-# The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-GENERATE_CHI           = NO
-
-# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
-# and project file content.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-CHM_INDEX_ENCODING     =
-
-# The BINARY_TOC flag controls whether a binary table of contents is generated
-# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
-# enables the Previous and Next buttons.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members to
-# the table of contents of the HTML help documentation and to the tree view.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
-
-TOC_EXPAND             = NO
-
-# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
-# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
-# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
-# (.qch) of the generated HTML documentation.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_QHP           = NO
-
-# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
-# the file name of the resulting .qch file. The path specified is relative to
-# the HTML output folder.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QCH_FILE               =
-
-# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
-# Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_NAMESPACE          = org.doxygen.Project
-
-# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
-# Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
-# The default value is: doc.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_VIRTUAL_FOLDER     = doc
-
-# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
-# filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_NAME   =
-
-# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
-# custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_CUST_FILTER_ATTRS  =
-
-# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
-# project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHP_SECT_FILTER_ATTRS  =
-
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
-# This tag requires that the tag GENERATE_QHP is set to YES.
-
-QHG_LOCATION           =
-
-# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
-# generated, together with the HTML files, they form an Eclipse help plugin. To
-# install this plugin and make it available under the help contents menu in
-# Eclipse, the contents of the directory containing the HTML and XML files needs
-# to be copied into the plugins directory of eclipse. The name of the directory
-# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
-# After copying Eclipse needs to be restarted before the help appears.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_ECLIPSEHELP   = NO
-
-# A unique identifier for the Eclipse help plugin. When installing the plugin
-# the directory name containing the HTML and XML files should also have this
-# name. Each documentation set should have its own identifier.
-# The default value is: org.doxygen.Project.
-# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
-
-ECLIPSE_DOC_ID         = org.doxygen.Project
-
-# If you want full control over the layout of the generated HTML pages it might
-# be necessary to disable the index and replace it with your own. The
-# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
-# of each HTML page. A value of NO enables the index and the value YES disables
-# it. Since the tabs in the index contain the same information as the navigation
-# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-DISABLE_INDEX          = NO
-
-# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
-# structure should be generated to display hierarchical information. If the tag
-# value is set to YES, a side panel will be generated containing a tree-like
-# index structure (just like the one that is generated for HTML Help). For this
-# to work a browser that supports JavaScript, DHTML, CSS and frames is required
-# (i.e. any modern browser). Windows users are probably better off using the
-# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
-# further fine-tune the look of the index. As an example, the default style
-# sheet generated by doxygen has an example that shows how to put an image at
-# the root of the tree instead of the PROJECT_NAME. Since the tree basically has
-# the same information as the tab index, you could consider setting
-# DISABLE_INDEX to YES when enabling this option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-GENERATE_TREEVIEW      = YES
-
-# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
-# doxygen will group on one line in the generated HTML documentation.
-#
-# Note that a value of 0 will completely suppress the enum values from appearing
-# in the overview section.
-# Minimum value: 0, maximum value: 20, default value: 4.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-ENUM_VALUES_PER_LINE   = 4
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
-# to set the initial width (in pixels) of the frame in which the tree is shown.
-# Minimum value: 0, maximum value: 1500, default value: 250.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-TREEVIEW_WIDTH         = 250
-
-# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
-# external symbols imported via tag files in a separate window.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-EXT_LINKS_IN_WINDOW    = NO
-
-# Use this tag to change the font size of LaTeX formulas included as images in
-# the HTML documentation. When you change the font size after a successful
-# doxygen run you need to manually remove any form_*.png images from the HTML
-# output directory to force them to be regenerated.
-# Minimum value: 8, maximum value: 50, default value: 10.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_FONTSIZE       = 10
-
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
-# generated for formulas are transparent PNGs. Transparent PNGs are not
-# supported properly for IE 6.0, but are supported on all modern browsers.
-#
-# Note that when changing this option you need to delete any form_*.png files in
-# the HTML output directory before the changes have effect.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-FORMULA_TRANSPARENT    = YES
-
-# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
-# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
-# installed or if you want to formulas look prettier in the HTML output. When
-# enabled you may also need to install MathJax separately and configure the path
-# to it using the MATHJAX_RELPATH option.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-USE_MATHJAX            = NO
-
-# When MathJax is enabled you can set the default output format to be used for
-# the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
-# Possible values are: HTML-CSS (which is slower, but has the best
-# compatibility), NativeMML (i.e. MathML) and SVG.
-# The default value is: HTML-CSS.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_FORMAT         = HTML-CSS
-
-# When MathJax is enabled you need to specify the location relative to the HTML
-# output directory using the MATHJAX_RELPATH option. The destination directory
-# should contain the MathJax.js script. For instance, if the mathjax directory
-# is located at the same level as the HTML output directory, then
-# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
-# Content Delivery Network so you can quickly see the result without installing
-# MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
-
-# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
-# extension names that should be enabled during MathJax rendering. For example
-# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_EXTENSIONS     =
-
-# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
-# of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
-# example see the documentation.
-# This tag requires that the tag USE_MATHJAX is set to YES.
-
-MATHJAX_CODEFILE       =
-
-# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
-# the HTML output. The underlying search engine uses javascript and DHTML and
-# should work on any modern browser. Note that when using HTML help
-# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
-# there is already a search function so this one should typically be disabled.
-# For large projects the javascript based search engine can be slow, then
-# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
-# search using the keyboard; to jump to the search box use <access key> + S
-# (what the <access key> is depends on the OS and browser, but it is typically
-# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
-# key> to jump into the search results window, the results can be navigated
-# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
-# the search. The filter options can be selected when the cursor is inside the
-# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
-# to select a filter and <Enter> or <escape> to activate or cancel the filter
-# option.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_HTML is set to YES.
-
-SEARCHENGINE           = YES
-
-# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
-# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
-# setting. When disabled, doxygen will generate a PHP script for searching and
-# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
-# and searching needs to be provided by external tools. See the section
-# "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SERVER_BASED_SEARCH    = NO
-
-# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
-# script for searching. Instead the search results are written to an XML file
-# which needs to be processed by an external indexer. Doxygen will invoke an
-# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
-# search results.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
-#
-# See the section "External Indexing and Searching" for details.
-# The default value is: NO.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH        = NO
-
-# The SEARCHENGINE_URL should point to a search engine hosted by a web server
-# which will return the search results when EXTERNAL_SEARCH is enabled.
-#
-# Doxygen ships with an example indexer (doxyindexer) and search engine
-# (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHENGINE_URL       =
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
-# search data is written to a file for indexing by an external tool. With the
-# SEARCHDATA_FILE tag the name of this file can be specified.
-# The default file is: searchdata.xml.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-SEARCHDATA_FILE        = searchdata.xml
-
-# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
-# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
-# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
-# projects and redirect the results back to the right project.
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTERNAL_SEARCH_ID     =
-
-# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
-# projects other than the one defined by this configuration file, but that are
-# all added to the same external search index. Each project needs to have a
-# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
-# to a relative location where the documentation can be found. The format is:
-# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
-# This tag requires that the tag SEARCHENGINE is set to YES.
-
-EXTRA_SEARCH_MAPPINGS  =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
-# The default value is: YES.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
-# invoked.
-#
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
-# index for LaTeX.
-# The default file is: makeindex.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used by the
-# printer.
-# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
-# 14 inches) and executive (7.25 x 10.5 inches).
-# The default value is: a4.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PAPER_TYPE             = a4
-
-# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
-# that should be included in the LaTeX output. The package can be specified just
-# by its name or with the correct syntax as to be used with the LaTeX
-# \usepackage command. To get the times font for instance you can specify :
-# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
-# To use the option intlimits with the amsmath package you can specify:
-# EXTRA_PACKAGES=[intlimits]{amsmath}
-# If left blank no extra packages will be included.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-EXTRA_PACKAGES         =
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the
-# generated LaTeX document. The header should contain everything until the first
-# chapter. If it is left blank doxygen will generate a standard header. See
-# section "Doxygen usage" for information on how to let doxygen write the
-# default header to a separate file.
-#
-# Note: Only use a user-defined header if you know what you are doing! The
-# following commands have a special meaning inside the header: $title,
-# $datetime, $date, $doxygenversion, $projectname, $projectnumber,
-# $projectbrief, $projectlogo. Doxygen will replace $title with the empty
-# string, for the replacement values of the other commands the user is referred
-# to HTML_HEADER.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HEADER           =
-
-# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the
-# generated LaTeX document. The footer should contain everything after the last
-# chapter. If it is left blank doxygen will generate a standard footer. See
-# LATEX_HEADER for more information on how to generate a default footer and what
-# special commands can be used inside the footer.
-#
-# Note: Only use a user-defined footer if you know what you are doing!
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_FOOTER           =
-
-# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
-# LaTeX style sheets that are included after the standard style sheets created
-# by doxygen. Using this option one can overrule certain style aspects. Doxygen
-# will copy the style sheet files to the output directory.
-# Note: The order of the extra style sheet files is of importance (e.g. the last
-# style sheet in the list overrules the setting of the previous ones in the
-# list).
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_STYLESHEET =
-
-# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
-# other source files which should be copied to the LATEX_OUTPUT output
-# directory. Note that the files will be copied as-is; there are no commands or
-# markers available.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_EXTRA_FILES      =
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
-# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
-# contain links (just like the HTML output) instead of page references. This
-# makes the output suitable for online browsing using a PDF viewer.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-PDF_HYPERLINKS         = YES
-
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-USE_PDFLATEX           = YES
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
-# command to the generated LaTeX files. This will instruct LaTeX to keep running
-# if errors occur, instead of asking the user for help. This option is also used
-# when generating formulas in HTML.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BATCHMODE        = NO
-
-# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
-# index chapters (such as File Index, Compound Index, etc.) in the output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_HIDE_INDICES     = NO
-
-# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source
-# code with syntax highlighting in the LaTeX output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_SOURCE_CODE      = NO
-
-# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
-# bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
-# The default value is: plain.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_BIB_STYLE        = plain
-
-# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
-# page will contain the date and time when the page was generated. Setting this
-# to NO can help when comparing the output of multiple runs.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_LATEX is set to YES.
-
-LATEX_TIMESTAMP        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
-# RTF output is optimized for Word 97 and may not look too pretty with other RTF
-# readers/editors.
-# The default value is: NO.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: rtf.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
-# documents. This may be useful for small projects and may help to save some
-# trees in general.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
-# contain hyperlink fields. The RTF file will contain links (just like the HTML
-# output) instead of page references. This makes the output suitable for online
-# browsing using Word or some other Word compatible readers that support those
-# fields.
-#
-# Note: WordPad (write) and others do not support links.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
-#
-# See also section "Doxygen usage" for information on how to generate the
-# default style sheet that doxygen normally uses.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_STYLESHEET_FILE    =
-
-# Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_EXTENSIONS_FILE    =
-
-# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code
-# with syntax highlighting in the RTF output.
-#
-# Note that which sources are shown also depends on other settings such as
-# SOURCE_BROWSER.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_RTF is set to YES.
-
-RTF_SOURCE_CODE        = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
-# classes and files.
-# The default value is: NO.
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it. A directory man3 will be created inside the directory specified by
-# MAN_OUTPUT.
-# The default directory is: man.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to the generated
-# man pages. In case the manual section does not start with a number, the number
-# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
-# optional.
-# The default value is: .3.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_EXTENSION          = .3
-
-# The MAN_SUBDIR tag determines the name of the directory created within
-# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
-# MAN_EXTENSION with the initial . removed.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_SUBDIR             =
-
-# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
-# will generate one additional man file for each entity documented in the real
-# man page(s). These additional files only source the real man page, but without
-# them the man command would be unable to find the correct page.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_MAN is set to YES.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
-# captures the structure of the code including all documentation.
-# The default value is: NO.
-
-GENERATE_XML           = NO
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
-# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
-# it.
-# The default directory is: xml.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_OUTPUT             = xml
-
-# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
-# listings (including syntax highlighting and cross-referencing information) to
-# the XML output. Note that enabling this will significantly increase the size
-# of the XML output.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_XML is set to YES.
-
-XML_PROGRAMLISTING     = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to the DOCBOOK output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
-# that can be used to generate PDF.
-# The default value is: NO.
-
-GENERATE_DOCBOOK       = NO
-
-# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
-# front of it.
-# The default directory is: docbook.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_OUTPUT         = docbook
-
-# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the
-# program listings (including syntax highlighting and cross-referencing
-# information) to the DOCBOOK output. Note that enabling this will significantly
-# increase the size of the DOCBOOK output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
-
-DOCBOOK_PROGRAMLISTING = NO
-
-#---------------------------------------------------------------------------
-# Configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# Configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
-# file that captures the structure of the code including all documentation.
-#
-# Note that this feature is still experimental and incomplete at the moment.
-# The default value is: NO.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
-# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
-# output from the Perl module output.
-# The default value is: NO.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
-# formatted so it can be parsed by a human reader. This is useful if you want to
-# understand what is going on. On the other hand, if this tag is set to NO, the
-# size of the Perl module output will be much smaller and Perl will parse it
-# just the same.
-# The default value is: YES.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file are
-# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
-# so different doxyrules.make files included by the same Makefile don't
-# overwrite each other's variables.
-# This tag requires that the tag GENERATE_PERLMOD is set to YES.
-
-PERLMOD_MAKEVAR_PREFIX =
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
-# C-preprocessor directives found in the sources and include files.
-# The default value is: YES.
-
-ENABLE_PREPROCESSING   = YES
-
-# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
-# in the source code. If set to NO, only conditional compilation will be
-# performed. Macro expansion can be done in a controlled way by setting
-# EXPAND_ONLY_PREDEF to YES.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-MACRO_EXPANSION        = NO
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
-# the macro expansion is limited to the macros specified with the PREDEFINED and
-# EXPAND_AS_DEFINED tags.
-# The default value is: NO.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_ONLY_PREDEF     = NO
-
-# If the SEARCH_INCLUDES tag is set to YES, the include files in the
-# INCLUDE_PATH will be searched if a #include is found.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SEARCH_INCLUDES        = YES
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that
-# contain include files that are not input files but should be processed by the
-# preprocessor.
-# This tag requires that the tag SEARCH_INCLUDES is set to YES.
-
-INCLUDE_PATH           =
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
-# patterns (like *.h and *.hpp) to filter out the header-files in the
-# directories. If left blank, the patterns specified with FILE_PATTERNS will be
-# used.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-INCLUDE_FILE_PATTERNS  =
-
-# The PREDEFINED tag can be used to specify one or more macro names that are
-# defined before the preprocessor is started (similar to the -D option of e.g.
-# gcc). The argument of the tag is a list of macros of the form: name or
-# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
-# is assumed. To prevent a macro definition from being undefined via #undef or
-# recursively expanded use the := operator instead of the = operator.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-PREDEFINED             =
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
-# tag can be used to specify a list of macro names that should be expanded. The
-# macro definition that is found in the sources will be used. Use the PREDEFINED
-# tag if you want to use a different macro definition that overrules the
-# definition found in the source code.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-EXPAND_AS_DEFINED      =
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
-# remove all references to function-like macros that are alone on a line, have
-# an all uppercase name, and do not end with a semicolon. Such function macros
-# are typically used for boiler-plate code, and will confuse the parser if not
-# removed.
-# The default value is: YES.
-# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration options related to external references
-#---------------------------------------------------------------------------
-
-# The TAGFILES tag can be used to specify one or more tag files. For each tag
-# file the location of the external documentation should be added. The format of
-# a tag file without this location is as follows:
-# TAGFILES = file1 file2 ...
-# Adding location for the tag files is done as follows:
-# TAGFILES = file1=loc1 "file2 = loc2" ...
-# where loc1 and loc2 can be relative or absolute paths or URLs. See the
-# section "Linking to external documentation" for more information about the use
-# of tag files.
-# Note: Each tag file must have a unique name (where the name does NOT include
-# the path). If a tag file is not located in the directory in which doxygen is
-# run, you must also specify the path to the tagfile here.
-
-TAGFILES               =
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
-# tag file that is based on the input files it reads. See section "Linking to
-# external documentation" for more information about the usage of tag files.
-
-GENERATE_TAGFILE       =
-
-# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
-# the class index. If set to NO, only the inherited external classes will be
-# listed.
-# The default value is: NO.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
-# in the modules index. If set to NO, only the current project's groups will be
-# listed.
-# The default value is: YES.
-
-EXTERNAL_GROUPS        = YES
-
-# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
-# the related pages index. If set to NO, only the current project's pages will
-# be listed.
-# The default value is: YES.
-
-EXTERNAL_PAGES         = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram
-# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to
-# NO turns the diagrams off. Note that this option also works with HAVE_DOT
-# disabled, but it is recommended to install and use dot, since it yields more
-# powerful graphs.
-# The default value is: YES.
-
-CLASS_DIAGRAMS         = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
-# You can include diagrams made with dia in doxygen documentation. Doxygen will
-# then run dia to produce the diagram and insert it in the documentation. The
-# DIA_PATH tag allows you to specify the directory where the dia binary resides.
-# If left empty dia is assumed to be found in the default search path.
-
-DIA_PATH               =
-
-# If set to YES the inheritance and collaboration graphs will hide inheritance
-# and usage relations if the target is undocumented or is not a class.
-# The default value is: YES.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
-# available from the path. This tool is part of Graphviz (see:
-# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
-# Bell Labs. The other options in this section have no effect if this option is
-# set to NO
-# The default value is: YES.
-
-HAVE_DOT               = YES
-
-# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
-# to run in parallel. When set to 0 doxygen will base this on the number of
-# processors available in the system. You can set it explicitly to a value
-# larger than 0 to get control over the balance between CPU load and processing
-# speed.
-# Minimum value: 0, maximum value: 32, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_NUM_THREADS        = 0
-
-# When you want a differently looking font in the dot files that doxygen
-# generates you can specify the font name using DOT_FONTNAME. You need to make
-# sure dot is able to find the font, which can be done by putting it in a
-# standard location or by setting the DOTFONTPATH environment variable or by
-# setting DOT_FONTPATH to the directory containing the font.
-# The default value is: Helvetica.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTNAME           = Helvetica
-
-# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
-# dot graphs.
-# Minimum value: 4, maximum value: 24, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTSIZE           = 10
-
-# By default doxygen will tell dot to use the default font as specified with
-# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
-# the path where dot can find it using this tag.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_FONTPATH           =
-
-# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for
-# each documented class showing the direct and indirect inheritance relations.
-# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
-# graph for each documented class showing the direct and indirect implementation
-# dependencies (inheritance, containment, and class references variables) of the
-# class with other documented classes.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-COLLABORATION_GRAPH    = YES
-
-# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
-# groups, showing the direct groups dependencies.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GROUP_GRAPHS           = YES
-
-# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
-# collaboration diagrams in a style similar to the OMG's Unified Modeling
-# Language.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LOOK               = YES
-
-# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
-# class node. If there are many fields or methods and many nodes the graph may
-# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
-# number of items for each type to make the size more manageable. Set this to 0
-# for no limit. Note that the threshold may be exceeded by 50% before the limit
-# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
-# but if the number exceeds 15, the total amount of fields shown is limited to
-# 10.
-# Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-UML_LIMIT_NUM_FIELDS   = 10
-
-# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
-# collaboration graphs will show the relations between templates and their
-# instances.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-TEMPLATE_RELATIONS     = YES
-
-# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
-# YES then doxygen will generate a graph for each documented file showing the
-# direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDE_GRAPH          = YES
-
-# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
-# set to YES then doxygen will generate a graph for each documented file showing
-# the direct and indirect include dependencies of the file with other documented
-# files.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable call graphs for selected
-# functions only using the \callgraph command. Disabling a call graph can be
-# accomplished by means of the command \hidecallgraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALL_GRAPH             = YES
-
-# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
-# dependency graph for every global function or class method.
-#
-# Note that enabling this option will significantly increase the time of a run.
-# So in most cases it will be better to enable caller graphs for selected
-# functions only using the \callergraph command. Disabling a caller graph can be
-# accomplished by means of the command \hidecallergraph.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-CALLER_GRAPH           = YES
-
-# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
-# hierarchy of all classes instead of a textual one.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
-# dependencies a directory has on other directories in a graphical way. The
-# dependency relations are determined by the #include relations between the
-# files in the directories.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DIRECTORY_GRAPH        = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
-# generated by dot. For an explanation of the image formats see the section
-# output formats in the documentation of the dot tool (Graphviz (see:
-# http://www.graphviz.org/)).
-# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
-# to make the SVG files visible in IE 9+ (other browsers do not have this
-# requirement).
-# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd,
-# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo,
-# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo,
-# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
-# png:gdiplus:gdiplus.
-# The default value is: png.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_IMAGE_FORMAT       = png
-
-# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
-# enable generation of interactive SVG images that allow zooming and panning.
-#
-# Note that this requires a modern browser other than Internet Explorer. Tested
-# and working are Firefox, Chrome, Safari, and Opera.
-# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
-# the SVG files visible. Older versions of IE do not have SVG support.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-INTERACTIVE_SVG        = YES
-
-# The DOT_PATH tag can be used to specify the path where the dot tool can be
-# found. If left blank, it is assumed the dot tool can be found in the path.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_PATH               =
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that
-# contain dot files that are included in the documentation (see the \dotfile
-# command).
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOTFILE_DIRS           =
-
-# The MSCFILE_DIRS tag can be used to specify one or more directories that
-# contain msc files that are included in the documentation (see the \mscfile
-# command).
-
-MSCFILE_DIRS           =
-
-# The DIAFILE_DIRS tag can be used to specify one or more directories that
-# contain dia files that are included in the documentation (see the \diafile
-# command).
-
-DIAFILE_DIRS           =
-
-# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
-# path where java can find the plantuml.jar file. If left blank, it is assumed
-# PlantUML is not used or called during a preprocessing step. Doxygen will
-# generate a warning when it encounters a \startuml command in this case and
-# will not generate output for the diagram.
-
-PLANTUML_JAR_PATH      =
-
-# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
-# configuration file for plantuml.
-
-PLANTUML_CFG_FILE      =
-
-# When using plantuml, the specified paths are searched for files specified by
-# the !include statement in a plantuml block.
-
-PLANTUML_INCLUDE_PATH  =
-
-# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
-# that will be shown in the graph. If the number of nodes in a graph becomes
-# larger than this value, doxygen will truncate the graph, which is visualized
-# by representing a node as a red box. Note that doxygen if the number of direct
-# children of the root node in a graph is already larger than
-# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
-# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
-# Minimum value: 0, maximum value: 10000, default value: 50.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_GRAPH_MAX_NODES    = 50
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
-# generated by dot. A depth value of 3 means that only nodes reachable from the
-# root by following a path via at most 3 edges will be shown. Nodes that lay
-# further from the root node will be omitted. Note that setting this option to 1
-# or 2 may greatly reduce the computation time needed for large code bases. Also
-# note that the size of a graph can be further restricted by
-# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
-# Minimum value: 0, maximum value: 1000, default value: 0.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
-# background. This is disabled by default, because dot on Windows does not seem
-# to support this out of the box.
-#
-# Warning: Depending on the platform used, enabling this option may lead to
-# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
-# read).
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_TRANSPARENT        = NO
-
-# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
-# files in one run (i.e. multiple -o and -T options on the command line). This
-# makes dot run faster, but since only newer versions of dot (>1.8.10) support
-# this, this feature is disabled by default.
-# The default value is: NO.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_MULTI_TARGETS      = NO
-
-# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
-# explaining the meaning of the various boxes and arrows in the dot generated
-# graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
-# files that are used to generate the various graphs.
-# The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
-
-DOT_CLEANUP            = YES
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html
deleted file mode 100644
index 0d7b177af..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_footer.html
+++ /dev/null
@@ -1,26 +0,0 @@
-<!-- HTML footer for doxygen 1.8.13-->
-<!-- start footer part -->
-<!--BEGIN GENERATE_TREEVIEW-->
-<div id="nav-path" class="navpath"><!-- id is needed for treeview function! -->
-  <ul>
-    $navpath
-    <li class="footer">
-      <div style="flex-grow:1"></div>
-      <div>
-        $generatedby
-        <a href="http://www.doxygen.org/index.html">doxygen</a>
-        $doxygenversion
-      </div>
-    </li>
-  </ul>
-</div>
-<!--END GENERATE_TREEVIEW-->
-<!--BEGIN !GENERATE_TREEVIEW-->
-<hr class="footer"/><address class="footer"><small>
-$generatedby &#160;<a href="http://www.doxygen.org/index.html">
-doxygen
-</a> $doxygenversion
-</small></address>
-<!--END !GENERATE_TREEVIEW-->
-</body>
-</html>
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html
deleted file mode 100644
index 91d214b95..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_header.html
+++ /dev/null
@@ -1,49 +0,0 @@
-<!-- HTML header for doxygen 1.8.13-->
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head>
-<meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
-<meta http-equiv="X-UA-Compatible" content="IE=9"/>
-<meta name="generator" content="Doxygen $doxygenversion"/>
-<meta name="viewport" content="width=device-width, initial-scale=1"/>
-<link rel="shortcut icon" href="mlperf_icon.png">
-<!--BEGIN PROJECT_NAME--><title>LoadGen: $title</title><!--END PROJECT_NAME-->
-<!--BEGIN !PROJECT_NAME--><title>$title</title><!--END !PROJECT_NAME-->
-<link href="$relpath^tabs.css" rel="stylesheet" type="text/css"/>
-<script type="text/javascript" src="$relpath^jquery.js"></script>
-<script type="text/javascript" src="$relpath^dynsections.js"></script>
-$treeview
-$search
-$mathjax
-<link href="$relpath^$stylesheet" rel="stylesheet" type="text/css" />
-$extrastylesheet
-</head>
-<body>
-<div id="top" style="display:flex; flex-flow:row wrap; justify-content:flex-start; align-items:center;"><!-- do not remove this div, it is closed by doxygen! -->
-
-<!--BEGIN TITLEAREA-->
-<div id="titlearea" style="display:flex; flex-flow:row wrap; justify-content:flex-start; align-items:center;">
-  <!--BEGIN PROJECT_LOGO-->
-  <a href="https://www.mlperf.org"><img alt="MLPerf" src="$relpath^$projectlogo"/ height="55px"></a>
-  <!--END PROJECT_LOGO-->
-  <!--BEGIN PROJECT_NAME-->
-  <div style="padding-left: 1em;">
-    <div id="projectname"><a href="index.html">$projectname</a>
-   <!--BEGIN PROJECT_NUMBER-->&#160;<span id="projectnumber">$projectnumber</span><!--END PROJECT_NUMBER-->
-   </div>
-   <!--BEGIN PROJECT_BRIEF--><div id="projectbrief">$projectbrief</div><!--END PROJECT_BRIEF-->
-  </div>
-  <!--END PROJECT_NAME-->
-  <!--BEGIN !PROJECT_NAME-->
-   <!--BEGIN PROJECT_BRIEF-->
-    <div id="projectbrief" style="padding-left: 1em;">$projectbrief</div>
-   <!--END PROJECT_BRIEF-->
-  <!--END !PROJECT_NAME-->
-  <!--BEGIN DISABLE_INDEX-->
-   <!--BEGIN SEARCHENGINE-->
-   <div>$searchbox</div>
-   <!--END SEARCHENGINE-->
-  <!--END DISABLE_INDEX-->
-</div>
-<!--END TITLEAREA-->
-<!-- end header part -->
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py
deleted file mode 100644
index 4065d7bd0..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_html_generator.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-# \file
-#  \brief A script that sets the environment variables expected by doxygen.cfg.
-#  \details This can be run manually without any arguments, but also allows a
-#  build system to customize the output directory.
-
-import os
-import sys
-
-
-def generate_doxygen_html(doxygen_out_dir, loadgen_root):
-    os.environ["MLPERF_LOADGEN_SRC_PATH"] = loadgen_root
-    os.environ["MLPERF_DOXYGEN_OUT_PATH"] = doxygen_out_dir
-    os.popen("doxygen " + loadgen_root + "/docs/src/doxygen.cfg")
-
-
-def main(argv):
-    doxygen_out_dir = "./docs/gen" if len(argv) < 2 else argv[1]
-    loadgen_root = "." if len(argv) < 3 else argv[2]
-    generate_doxygen_html(doxygen_out_dir, loadgen_root)
-
-
-main(sys.argv)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml
deleted file mode 100644
index 1fc5a9cb4..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_layout.xml
+++ /dev/null
@@ -1,211 +0,0 @@
-<doxygenlayout version="1.0">
-  <!-- Generated by doxygen 1.8.13 -->
-  <!-- Navigation index tabs for HTML output -->
-  <navindex>
-    <tab type="mainpage" visible="yes" title="Overview"/>
-    <tab type="usergroup" visible="yes" title="API and Usage">
-      <tab type="user" url="@ref LoadgenAPI" visible="yes" title="API"/>
-      <tab type="user" url="@ref LoadgenAPITestSettings" visible="yes" title="Settings"/>
-    </tab>
-    <tab type="usergroup" visible="yes" title="Development">
-      <tab type="user" url="@ref ReadmeBuild" visible="yes" title="Building"/>
-      <tab type="user" url="@ref ReadmeHtmlDocs" visible="yes" title="Generating this Document"/>
-      <tab type="user" url="@ref todo" visible="yes" title="TODOs"/>
-    </tab>
-    <tab type="usergroup" visible="yes" title="Test Coverage">
-      <tab type="user" url="@ref ReadmeTests" visible="yes" title="Build and Run"/>
-      <tab type="user" url="@ref LoadgenTestsBasic" visible="yes" title="Basic"/>
-      <tab type="user" url="@ref LoadgenTestsPerformance" visible="yes" title="Performance"/>
-    </tab>
-    <tab type="usergroup" visible="yes" title="The Code">
-      <tab type="modules" visible="no" title="" intro=""/>
-      <tab type="namespaces" visible="yes" title="">
-        <tab type="namespacelist" visible="yes" title="" intro=""/>
-        <tab type="namespacemembers" visible="yes" title="" intro=""/>
-      </tab>
-      <tab type="classes" visible="yes" title="">
-        <tab type="classlist" visible="yes" title="" intro=""/>
-        <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
-        <tab type="hierarchy" visible="yes" title="" intro=""/>
-        <tab type="classmembers" visible="yes" title="" intro=""/>
-      </tab>
-      <tab type="files" visible="yes" title="">
-        <tab type="filelist" visible="yes" title="" intro=""/>
-        <tab type="globals" visible="yes" title="" intro=""/>
-      </tab>
-      <tab type="examples" visible="no" title="" intro=""/>
-    </tab>
-    <tab type="pages" visible="no" title="" intro=""/>
-    <tab type="user" url="@ref ReadmeFAQ" visible="yes" title="FAQ"/>
-  </navindex>
-
-  <!-- Layout definition for a class page -->
-  <class>
-    <briefdescription visible="yes"/>
-    <includes visible="$SHOW_INCLUDE_FILES"/>
-    <inheritancegraph visible="$CLASS_GRAPH"/>
-    <collaborationgraph visible="$COLLABORATION_GRAPH"/>
-    <memberdecl>
-      <nestedclasses visible="yes" title=""/>
-      <publictypes title=""/>
-      <services title=""/>
-      <interfaces title=""/>
-      <publicslots title=""/>
-      <signals title=""/>
-      <publicmethods title=""/>
-      <publicstaticmethods title=""/>
-      <publicattributes title=""/>
-      <publicstaticattributes title=""/>
-      <protectedtypes title=""/>
-      <protectedslots title=""/>
-      <protectedmethods title=""/>
-      <protectedstaticmethods title=""/>
-      <protectedattributes title=""/>
-      <protectedstaticattributes title=""/>
-      <packagetypes title=""/>
-      <packagemethods title=""/>
-      <packagestaticmethods title=""/>
-      <packageattributes title=""/>
-      <packagestaticattributes title=""/>
-      <properties title=""/>
-      <events title=""/>
-      <privatetypes title=""/>
-      <privateslots title=""/>
-      <privatemethods title=""/>
-      <privatestaticmethods title=""/>
-      <privateattributes title=""/>
-      <privatestaticattributes title=""/>
-      <friends title=""/>
-      <related title="" subtitle=""/>
-      <membergroups visible="yes"/>
-    </memberdecl>
-    <detaileddescription title=""/>
-    <memberdef>
-      <inlineclasses title=""/>
-      <typedefs title=""/>
-      <enums title=""/>
-      <services title=""/>
-      <interfaces title=""/>
-      <constructors title=""/>
-      <functions title=""/>
-      <related title=""/>
-      <variables title=""/>
-      <properties title=""/>
-      <events title=""/>
-    </memberdef>
-    <allmemberslink visible="yes"/>
-    <usedfiles visible="$SHOW_USED_FILES"/>
-    <authorsection visible="yes"/>
-  </class>
-
-  <!-- Layout definition for a namespace page -->
-  <namespace>
-    <briefdescription visible="yes"/>
-    <memberdecl>
-      <nestednamespaces visible="yes" title=""/>
-      <constantgroups visible="yes" title=""/>
-      <classes visible="yes" title=""/>
-      <typedefs title=""/>
-      <enums title=""/>
-      <functions title=""/>
-      <variables title=""/>
-      <membergroups visible="yes"/>
-    </memberdecl>
-    <detaileddescription title=""/>
-    <memberdef>
-      <inlineclasses title=""/>
-      <typedefs title=""/>
-      <enums title=""/>
-      <functions title=""/>
-      <variables title=""/>
-    </memberdef>
-    <authorsection visible="yes"/>
-  </namespace>
-
-  <!-- Layout definition for a file page -->
-  <file>
-    <briefdescription visible="yes"/>
-    <includes visible="$SHOW_INCLUDE_FILES"/>
-    <includegraph visible="$INCLUDE_GRAPH"/>
-    <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
-    <sourcelink visible="yes"/>
-    <memberdecl>
-      <classes visible="yes" title=""/>
-      <namespaces visible="yes" title=""/>
-      <constantgroups visible="yes" title=""/>
-      <defines title=""/>
-      <typedefs title=""/>
-      <enums title=""/>
-      <functions title=""/>
-      <variables title=""/>
-      <membergroups visible="yes"/>
-    </memberdecl>
-    <detaileddescription title=""/>
-    <memberdef>
-      <inlineclasses title=""/>
-      <defines title=""/>
-      <typedefs title=""/>
-      <enums title=""/>
-      <functions title=""/>
-      <variables title=""/>
-    </memberdef>
-    <authorsection/>
-  </file>
-
-  <!-- Layout definition for a group page -->
-  <group>
-    <briefdescription visible="yes"/>
-    <groupgraph visible="$GROUP_GRAPHS"/>
-    <memberdecl>
-      <nestedgroups visible="yes" title=""/>
-      <dirs visible="yes" title=""/>
-      <files visible="yes" title=""/>
-      <namespaces visible="yes" title=""/>
-      <classes visible="yes" title=""/>
-      <defines title=""/>
-      <typedefs title=""/>
-      <enums title=""/>
-      <enumvalues title=""/>
-      <functions title=""/>
-      <variables title=""/>
-      <signals title=""/>
-      <publicslots title=""/>
-      <protectedslots title=""/>
-      <privateslots title=""/>
-      <events title=""/>
-      <properties title=""/>
-      <friends title=""/>
-      <membergroups visible="yes"/>
-    </memberdecl>
-    <detaileddescription title=""/>
-    <memberdef>
-      <pagedocs/>
-      <inlineclasses title=""/>
-      <defines title=""/>
-      <typedefs title=""/>
-      <enums title=""/>
-      <enumvalues title=""/>
-      <functions title=""/>
-      <variables title=""/>
-      <signals title=""/>
-      <publicslots title=""/>
-      <protectedslots title=""/>
-      <privateslots title=""/>
-      <events title=""/>
-      <properties title=""/>
-      <friends title=""/>
-    </memberdef>
-    <authorsection visible="yes"/>
-  </group>
-
-  <!-- Layout definition for a directory page -->
-  <directory>
-    <briefdescription visible="yes"/>
-    <directorygraph visible="yes"/>
-    <memberdecl>
-      <dirs visible="yes"/>
-      <files visible="yes"/>
-    </memberdecl>
-    <detaileddescription title=""/>
-  </directory>
-</doxygenlayout>
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css
deleted file mode 100644
index 3bd61261c..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/doxygen_stylesheet.css
+++ /dev/null
@@ -1,1629 +0,0 @@
-/* The standard CSS for doxygen 1.8.13 */
-
-body, table, div, p, dl {
-	font: 400 14px/22px Roboto,sans-serif;
-}
-
-p.reference, p.definition {
-	font: 400 14px/22px Roboto,sans-serif;
-}
-
-/* @group Heading Levels */
-
-h1.groupheader {
-	font-size: 150%;
-}
-
-.title {
-	font: 400 14px/28px Roboto,sans-serif;
-	font-size: 175%;
-	font-weight: bold;
-	margin: 10px 2px;
-	color: #135384;
-}
-
-h2.groupheader {
-	border-bottom: 1px solid #879ECB;
-	color: #354C7B;
-	font-size: 150%;
-	font-weight: normal;
-	margin-top: 1.75em;
-	padding-top: 8px;
-	padding-bottom: 4px;
-	width: 100%;
-}
-
-h3.groupheader {
-	font-size: 100%;
-}
-
-h1, h2, h3, h4, h5, h6 {
-	-webkit-transition: text-shadow 0.5s linear;
-	-moz-transition: text-shadow 0.5s linear;
-	-ms-transition: text-shadow 0.5s linear;
-	-o-transition: text-shadow 0.5s linear;
-	transition: text-shadow 0.5s linear;
-	margin-right: 15px;
-	color: #135384;
-
-}
-
-h1.glow, h2.glow, h3.glow, h4.glow, h5.glow, h6.glow {
-	text-shadow: 0 0 15px cyan;
-}
-
-dt {
-	font-weight: bold;
-}
-
-div.multicol {
-	-moz-column-gap: 1em;
-	-webkit-column-gap: 1em;
-	-moz-column-count: 3;
-	-webkit-column-count: 3;
-}
-
-p.startli, p.startdd {
-	margin-top: 2px;
-}
-
-p.starttd {
-	margin-top: 0px;
-}
-
-p.endli {
-	margin-bottom: 0px;
-}
-
-p.enddd {
-	margin-bottom: 4px;
-}
-
-p.endtd {
-	margin-bottom: 2px;
-}
-
-/* @end */
-
-caption {
-	font-weight: bold;
-}
-
-span.legend {
-        font-size: 70%;
-        text-align: center;
-}
-
-h3.version {
-        font-size: 90%;
-        text-align: center;
-}
-
-div.qindex, div.navtab{
-	background-color: #EBEFF6;
-	border: 1px solid #A3B4D7;
-	text-align: center;
-}
-
-div.qindex, div.navpath {
-	width: 100%;
-	line-height: 140%;
-}
-
-div.navtab {
-	margin-right: 15px;
-}
-
-/* @group Link Styling */
-
-a {
-	color: #3D578C;
-	font-weight: normal;
-	text-decoration: none;
-}
-
-.contents a:visited {
-	color: #4665A2;
-}
-
-a:hover {
-	text-decoration: underline;
-}
-
-a.qindex {
-	font-weight: bold;
-}
-
-a.qindexHL {
-	font-weight: bold;
-	background-color: #9CAFD4;
-	color: #ffffff;
-	border: 1px double #869DCA;
-}
-
-.contents a.qindexHL:visited {
-        color: #ffffff;
-}
-
-a.el {
-	font-weight: bold;
-}
-
-a.elRef {
-}
-
-a.code, a.code:visited, a.line, a.line:visited {
-	color: #4665A2; 
-}
-
-a.codeRef, a.codeRef:visited, a.lineRef, a.lineRef:visited {
-	color: #4665A2; 
-}
-
-/* @end */
-
-dl.el {
-	margin-left: -1cm;
-}
-
-pre.fragment {
-        border: 1px solid #C4CFE5;
-        background-color: #FBFCFD;
-        padding: 4px 6px;
-        margin: 4px 8px 4px 2px;
-        overflow: auto;
-        word-wrap: break-word;
-        font-size:  9pt;
-        line-height: 125%;
-        font-family: monospace, fixed;
-        font-size: 105%;
-}
-
-div.fragment {
-        padding: 0px;
-        margin: 4px 8px 4px 2px;
-	background-color: #FBFCFD;
-	border: 1px solid #C4CFE5;
-}
-
-div.line {
-	font-family: monospace, fixed;
-        font-size: 13px;
-	min-height: 13px;
-	line-height: 1.0;
-	text-wrap: unrestricted;
-	white-space: -moz-pre-wrap; /* Moz */
-	white-space: -pre-wrap;     /* Opera 4-6 */
-	white-space: -o-pre-wrap;   /* Opera 7 */
-	white-space: pre-wrap;      /* CSS3  */
-	word-wrap: break-word;      /* IE 5.5+ */
-	text-indent: -53px;
-	padding-left: 53px;
-	padding-bottom: 0px;
-	margin: 0px;
-	-webkit-transition-property: background-color, box-shadow;
-	-webkit-transition-duration: 0.5s;
-	-moz-transition-property: background-color, box-shadow;
-	-moz-transition-duration: 0.5s;
-	-ms-transition-property: background-color, box-shadow;
-	-ms-transition-duration: 0.5s;
-	-o-transition-property: background-color, box-shadow;
-	-o-transition-duration: 0.5s;
-	transition-property: background-color, box-shadow;
-	transition-duration: 0.5s;
-}
-
-div.line:after {
-    content:"\000A";
-    white-space: pre;
-}
-
-div.line.glow {
-	background-color: cyan;
-	box-shadow: 0 0 10px cyan;
-}
-
-
-span.lineno {
-	padding-right: 4px;
-	text-align: right;
-	border-right: 2px solid #0F0;
-	background-color: #E8E8E8;
-        white-space: pre;
-}
-span.lineno a {
-	background-color: #D8D8D8;
-}
-
-span.lineno a:hover {
-	background-color: #C8C8C8;
-}
-
-.lineno {
-	-webkit-touch-callout: none;
-	-webkit-user-select: none;
-	-khtml-user-select: none;
-	-moz-user-select: none;
-	-ms-user-select: none;
-	user-select: none;
-}
-
-div.ah, span.ah {
-	background-color: black;
-	font-weight: bold;
-	color: #ffffff;
-	margin-bottom: 3px;
-	margin-top: 3px;
-	padding: 0.2em;
-	border: solid thin #333;
-	border-radius: 0.5em;
-	-webkit-border-radius: .5em;
-	-moz-border-radius: .5em;
-	box-shadow: 2px 2px 3px #999;
-	-webkit-box-shadow: 2px 2px 3px #999;
-	-moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px;
-	background-image: -webkit-gradient(linear, left top, left bottom, from(#eee), to(#000),color-stop(0.3, #444));
-	background-image: -moz-linear-gradient(center top, #eee 0%, #444 40%, #000 110%);
-}
-
-div.classindex ul {
-        list-style: none;
-        padding-left: 0;
-}
-
-div.classindex span.ai {
-        display: inline-block;
-}
-
-div.groupHeader {
-	margin-left: 16px;
-	margin-top: 12px;
-	font-weight: bold;
-}
-
-div.groupText {
-	margin-left: 16px;
-	font-style: italic;
-}
-
-body {
-	background-color: white;
-	color: black;
-        margin: 0;
-}
-
-div.contents {
-	margin-top: 10px;
-	margin-left: 12px;
-	margin-right: 8px;
-}
-
-td.indexkey {
-	background-color: #EBEFF6;
-	font-weight: bold;
-	border: 1px solid #C4CFE5;
-	margin: 2px 0px 2px 0;
-	padding: 2px 10px;
-        white-space: nowrap;
-        vertical-align: top;
-}
-
-td.indexvalue {
-	background-color: #EBEFF6;
-	border: 1px solid #C4CFE5;
-	padding: 2px 10px;
-	margin: 2px 0px;
-}
-
-tr.memlist {
-	background-color: #EEF1F7;
-}
-
-p.formulaDsp {
-	text-align: center;
-}
-
-img.formulaDsp {
-	
-}
-
-img.formulaInl {
-	vertical-align: middle;
-}
-
-div.center {
-	text-align: center;
-        margin-top: 0px;
-        margin-bottom: 0px;
-        padding: 0px;
-}
-
-div.center img {
-	border: 0px;
-}
-
-address.footer {
-	text-align: right;
-	padding-right: 12px;
-}
-
-img.footer {
-	border: 0px;
-	vertical-align: middle;
-}
-
-/* @group Code Colorization */
-
-span.keyword {
-	color: #008000
-}
-
-span.keywordtype {
-	color: #604020
-}
-
-span.keywordflow {
-	color: #e08000
-}
-
-span.comment {
-	color: #800000
-}
-
-span.preprocessor {
-	color: #806020
-}
-
-span.stringliteral {
-	color: #002080
-}
-
-span.charliteral {
-	color: #008080
-}
-
-span.vhdldigit { 
-	color: #ff00ff 
-}
-
-span.vhdlchar { 
-	color: #000000 
-}
-
-span.vhdlkeyword { 
-	color: #700070 
-}
-
-span.vhdllogic { 
-	color: #ff0000 
-}
-
-blockquote {
-        background-color: #F7F8FB;
-        border-left: 2px solid #9CAFD4;
-        margin: 0 24px 0 4px;
-        padding: 0 12px 0 16px;
-}
-
-/* @end */
-
-/*
-.search {
-	color: #003399;
-	font-weight: bold;
-}
-
-form.search {
-	margin-bottom: 0px;
-	margin-top: 0px;
-}
-
-input.search {
-	font-size: 75%;
-	color: #000080;
-	font-weight: normal;
-	background-color: #e8eef2;
-}
-*/
-
-td.tiny {
-	font-size: 75%;
-}
-
-.dirtab {
-	padding: 4px;
-	border-collapse: collapse;
-	border: 1px solid #A3B4D7;
-}
-
-th.dirtab {
-	background: #EBEFF6;
-	font-weight: bold;
-}
-
-hr {
-	height: 0px;
-	border: none;
-	border-top: 1px solid #4A6AAA;
-}
-
-hr.footer {
-	height: 1px;
-}
-
-/* @group Member Descriptions */
-
-table.memberdecls {
-	border-spacing: 0px;
-	padding: 0px;
-}
-
-.memberdecls td, .fieldtable tr {
-	-webkit-transition-property: background-color, box-shadow;
-	-webkit-transition-duration: 0.5s;
-	-moz-transition-property: background-color, box-shadow;
-	-moz-transition-duration: 0.5s;
-	-ms-transition-property: background-color, box-shadow;
-	-ms-transition-duration: 0.5s;
-	-o-transition-property: background-color, box-shadow;
-	-o-transition-duration: 0.5s;
-	transition-property: background-color, box-shadow;
-	transition-duration: 0.5s;
-}
-
-.memberdecls td.glow, .fieldtable tr.glow {
-	background-color: cyan;
-	box-shadow: 0 0 15px cyan;
-}
-
-.mdescLeft, .mdescRight,
-.memItemLeft, .memItemRight,
-.memTemplItemLeft, .memTemplItemRight, .memTemplParams {
-	background-color: #F9FAFC;
-	border: none;
-	margin: 4px;
-	padding: 1px 0 0 8px;
-}
-
-.mdescLeft, .mdescRight {
-	padding: 0px 8px 4px 8px;
-	color: #555;
-}
-
-.memSeparator {
-        border-bottom: 1px solid #DEE4F0;
-        line-height: 1px;
-        margin: 0px;
-        padding: 0px;
-}
-
-.memItemLeft, .memTemplItemLeft {
-        white-space: nowrap;
-}
-
-.memItemRight {
-	width: 100%;
-}
-
-.memTemplParams {
-	color: #4665A2;
-        white-space: nowrap;
-	font-size: 80%;
-}
-
-/* @end */
-
-/* @group Member Details */
-
-/* Styles for detailed member documentation */
-
-.memtitle {
-	padding: 8px;
-	border-top: 1px solid #A8B8D9;
-	border-left: 1px solid #A8B8D9;
-	border-right: 1px solid #A8B8D9;
-	border-top-right-radius: 4px;
-	border-top-left-radius: 4px;
-	margin-bottom: -1px;
-	background-image: url('nav_f.png');
-	background-repeat: repeat-x;
-	background-color: #E2E8F2;
-	line-height: 1.25;
-	font-weight: 300;
-	float:left;
-}
-
-.permalink
-{
-        font-size: 65%;
-        display: inline-block;
-        vertical-align: middle;
-}
-
-.memtemplate {
-	font-size: 80%;
-	color: #4665A2;
-	font-weight: normal;
-	margin-left: 9px;
-}
-
-.memnav {
-	background-color: #EBEFF6;
-	border: 1px solid #A3B4D7;
-	text-align: center;
-	margin: 2px;
-	margin-right: 15px;
-	padding: 2px;
-}
-
-.mempage {
-	width: 100%;
-}
-
-.memitem {
-	padding: 0;
-	margin-bottom: 10px;
-	margin-right: 5px;
-        -webkit-transition: box-shadow 0.5s linear;
-        -moz-transition: box-shadow 0.5s linear;
-        -ms-transition: box-shadow 0.5s linear;
-        -o-transition: box-shadow 0.5s linear;
-        transition: box-shadow 0.5s linear;
-        display: table !important;
-        width: 100%;
-}
-
-.memitem.glow {
-         box-shadow: 0 0 15px cyan;
-}
-
-.memname {
-        font-weight: 400;
-        margin-left: 6px;
-}
-
-.memname td {
-	vertical-align: bottom;
-}
-
-.memproto, dl.reflist dt {
-        border-top: 1px solid #A8B8D9;
-        border-left: 1px solid #A8B8D9;
-        border-right: 1px solid #A8B8D9;
-        padding: 6px 0px 6px 0px;
-        color: #253555;
-        font-weight: bold;
-        text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9);
-        background-color: #DFE5F1;
-        /* opera specific markup */
-        box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
-        border-top-right-radius: 4px;
-        /* firefox specific markup */
-        -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px;
-        -moz-border-radius-topright: 4px;
-        /* webkit specific markup */
-        -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
-        -webkit-border-top-right-radius: 4px;
-
-}
-
-.overload {
-        font-family: "courier new",courier,monospace;
-	font-size: 65%;
-}
-
-.memdoc, dl.reflist dd {
-        border-bottom: 1px solid #A8B8D9;      
-        border-left: 1px solid #A8B8D9;      
-        border-right: 1px solid #A8B8D9; 
-        padding: 6px 10px 2px 10px;
-        background-color: #FBFCFD;
-        border-top-width: 0;
-        background-image:url('nav_g.png');
-        background-repeat:repeat-x;
-        background-color: #FFFFFF;
-        /* opera specific markup */
-        border-bottom-left-radius: 4px;
-        border-bottom-right-radius: 4px;
-        box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
-        /* firefox specific markup */
-        -moz-border-radius-bottomleft: 4px;
-        -moz-border-radius-bottomright: 4px;
-        -moz-box-shadow: rgba(0, 0, 0, 0.15) 5px 5px 5px;
-        /* webkit specific markup */
-        -webkit-border-bottom-left-radius: 4px;
-        -webkit-border-bottom-right-radius: 4px;
-        -webkit-box-shadow: 5px 5px 5px rgba(0, 0, 0, 0.15);
-}
-
-dl.reflist dt {
-        padding: 5px;
-}
-
-dl.reflist dd {
-        margin: 0px 0px 10px 0px;
-        padding: 5px;
-}
-
-.paramkey {
-	text-align: right;
-}
-
-.paramtype {
-	white-space: nowrap;
-}
-
-.paramname {
-	color: #602020;
-	white-space: nowrap;
-}
-.paramname em {
-	font-style: normal;
-}
-.paramname code {
-        line-height: 14px;
-}
-
-.params, .retval, .exception, .tparams {
-        margin-left: 0px;
-        padding-left: 0px;
-}       
-
-.params .paramname, .retval .paramname {
-        font-weight: bold;
-        vertical-align: top;
-}
-        
-.params .paramtype {
-        font-style: italic;
-        vertical-align: top;
-}       
-        
-.params .paramdir {
-        font-family: "courier new",courier,monospace;
-        vertical-align: top;
-}
-
-table.mlabels {
-	border-spacing: 0px;
-}
-
-td.mlabels-left {
-	width: 100%;
-	padding: 0px;
-}
-
-td.mlabels-right {
-	vertical-align: bottom;
-	padding: 0px;
-	white-space: nowrap;
-}
-
-span.mlabels {
-        margin-left: 8px;
-}
-
-span.mlabel {
-        background-color: #728DC1;
-        border-top:1px solid #5373B4;
-        border-left:1px solid #5373B4;
-        border-right:1px solid #C4CFE5;
-        border-bottom:1px solid #C4CFE5;
-	text-shadow: none;
-	color: white;
-	margin-right: 4px;
-	padding: 2px 3px;
-	border-radius: 3px;
-	font-size: 7pt;
-	white-space: nowrap;
-	vertical-align: middle;
-}
-
-
-
-/* @end */
-
-/* these are for tree view inside a (index) page */
-
-div.directory {
-        margin: 10px 0px;
-        border-top: 1px solid #9CAFD4;
-        border-bottom: 1px solid #9CAFD4;
-        width: 100%;
-}
-
-.directory table {
-        border-collapse:collapse;
-}
-
-.directory td {
-        margin: 0px;
-        padding: 0px;
-	vertical-align: top;
-}
-
-.directory td.entry {
-        white-space: nowrap;
-        padding-right: 6px;
-	padding-top: 3px;
-}
-
-.directory td.entry a {
-        outline:none;
-}
-
-.directory td.entry a img {
-        border: none;
-}
-
-.directory td.desc {
-        width: 100%;
-        padding-left: 6px;
-	padding-right: 6px;
-	padding-top: 3px;
-	border-left: 1px solid rgba(0,0,0,0.05);
-}
-
-.directory tr.even {
-	padding-left: 6px;
-	background-color: #F7F8FB;
-}
-
-.directory img {
-	vertical-align: -30%;
-}
-
-.directory .levels {
-        white-space: nowrap;
-        width: 100%;
-        text-align: right;
-        font-size: 9pt;
-}
-
-.directory .levels span {
-        cursor: pointer;
-        padding-left: 2px;
-        padding-right: 2px;
-	color: #3D578C;
-}
-
-.arrow {
-    color: #9CAFD4;
-    -webkit-user-select: none;
-    -khtml-user-select: none;
-    -moz-user-select: none;
-    -ms-user-select: none;
-    user-select: none;
-    cursor: pointer;
-    font-size: 80%;
-    display: inline-block;
-    width: 16px;
-    height: 22px;
-}
-
-.icon {
-    font-family: Arial, Helvetica;
-    font-weight: bold;
-    font-size: 12px;
-    height: 14px;
-    width: 16px;
-    display: inline-block;
-    background-color: #728DC1;
-    color: white;
-    text-align: center;
-    border-radius: 4px;
-    margin-left: 2px;
-    margin-right: 2px;
-}
-
-.icona {
-    width: 24px;
-    height: 22px;
-    display: inline-block;
-}
-
-.iconfopen {
-    width: 24px;
-    height: 18px;
-    margin-bottom: 4px;
-    background-image:url('folderopen.png');
-    background-position: 0px -4px;
-    background-repeat: repeat-y;
-    vertical-align:top;
-    display: inline-block;
-}
-
-.iconfclosed {
-    width: 24px;
-    height: 18px;
-    margin-bottom: 4px;
-    background-image:url('folderclosed.png');
-    background-position: 0px -4px;
-    background-repeat: repeat-y;
-    vertical-align:top;
-    display: inline-block;
-}
-
-.icondoc {
-    width: 24px;
-    height: 18px;
-    margin-bottom: 4px;
-    background-image:url('doc.png');
-    background-position: 0px -4px;
-    background-repeat: repeat-y;
-    vertical-align:top;
-    display: inline-block;
-}
-
-table.directory {
-    font: 400 14px Roboto,sans-serif;
-}
-
-/* @end */
-
-div.dynheader {
-        margin-top: 8px;
-	-webkit-touch-callout: none;
-	-webkit-user-select: none;
-	-khtml-user-select: none;
-	-moz-user-select: none;
-	-ms-user-select: none;
-	user-select: none;
-}
-
-address {
-	font-style: normal;
-	color: #2A3D61;
-}
-
-table.doxtable caption {
-	caption-side: top;
-}
-
-table.doxtable {
-	border-collapse:collapse;
-        margin-top: 4px;
-        margin-bottom: 4px;
-}
-
-table.doxtable td, table.doxtable th {
-	border: 1px solid #2D4068;
-	padding: 3px 7px 2px;
-}
-
-table.doxtable th {
-	background-color: #374F7F;
-	color: #FFFFFF;
-	font-size: 110%;
-	padding-bottom: 4px;
-	padding-top: 5px;
-}
-
-table.fieldtable {
-        /*width: 100%;*/
-        margin-bottom: 10px;
-        border: 1px solid #A8B8D9;
-        border-spacing: 0px;
-        -moz-border-radius: 4px;
-        -webkit-border-radius: 4px;
-        border-radius: 4px;
-        -moz-box-shadow: rgba(0, 0, 0, 0.15) 2px 2px 2px;
-        -webkit-box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15);
-        box-shadow: 2px 2px 2px rgba(0, 0, 0, 0.15);
-}
-
-.fieldtable td, .fieldtable th {
-        padding: 3px 7px 2px;
-}
-
-.fieldtable td.fieldtype, .fieldtable td.fieldname {
-        white-space: nowrap;
-        border-right: 1px solid #A8B8D9;
-        border-bottom: 1px solid #A8B8D9;
-        vertical-align: top;
-}
-
-.fieldtable td.fieldname {
-        padding-top: 3px;
-}
-
-.fieldtable td.fielddoc {
-        border-bottom: 1px solid #A8B8D9;
-        /*width: 100%;*/
-}
-
-.fieldtable td.fielddoc p:first-child {
-        margin-top: 0px;
-}       
-        
-.fieldtable td.fielddoc p:last-child {
-        margin-bottom: 2px;
-}
-
-.fieldtable tr:last-child td {
-        border-bottom: none;
-}
-
-.fieldtable th {
-        background-image:url('nav_f.png');
-        background-repeat:repeat-x;
-        background-color: #E2E8F2;
-        font-size: 90%;
-        color: #253555;
-        padding-bottom: 4px;
-        padding-top: 5px;
-        text-align:left;
-        font-weight: 400;
-        -moz-border-radius-topleft: 4px;
-        -moz-border-radius-topright: 4px;
-        -webkit-border-top-left-radius: 4px;
-        -webkit-border-top-right-radius: 4px;
-        border-top-left-radius: 4px;
-        border-top-right-radius: 4px;
-        border-bottom: 1px solid #A8B8D9;
-}
-
-
-.tabsearch {
-	top: 0px;
-	left: 10px;
-	height: 36px;
-	background-image: url('tab_b.png');
-	z-index: 101;
-	overflow: hidden;
-	font-size: 13px;
-}
-
-.navpath ul {
-	display: flex;
-	flex-flow: row wrap;
-	justify-content: flex-start;
-	align-items: center;
-	font-size: 11px;
-	background-image:none;
-	background-repeat:repeat-x;
-	background-position: 0 -5px;
-	height:auto;
-	line-height:30px;
-	color:#8AA0CC;
-	border:solid 1px #C2CDE4;
-	overflow:hidden;
-	margin:0px;
-	padding:0px;
-}
-
-.navpath li
-{
-	list-style-type:none;
-	float:left;
-	padding-left:10px;
-	padding-right:15px;
-	background-image:url('bc_s.png');
-	background-repeat:no-repeat;
-	background-position:right;
-	color:#364D7C;
-}
-
-.navpath li.navelem a
-{
-	height:32px;
-	display:block;
-	text-decoration: none;
-	outline: none;
-	color: #283A5D;
-	font-family: 'Lucida Grande',Geneva,Helvetica,Arial,sans-serif;
-	text-shadow: 0px 1px 1px rgba(255, 255, 255, 0.9);
-	text-decoration: none;        
-}
-
-.navpath li.navelem a:hover
-{
-	color:#6884BD;
-}
-
-.navpath li.footer
-{
-	display: flex;
-	flex-flow: row wrap;
-	justify-content: flex-start;
-	align-items: center;
-        flex-grow: 1;
-        list-style-type:none;
-        float:none;
-        padding-left:10px;
-        padding-right:15px;
-        background-image:none;
-        background-repeat:no-repeat;
-        background-position:right;
-        color:#364D7C;
-        font-size: 8pt;
-}
-
-div.summary
-{
-	float: right;
-	font-size: 8pt;
-	padding-right: 5px;
-	width: 50%;
-	text-align: right;
-}       
-
-div.summary a
-{
-	white-space: nowrap;
-}
-
-table.classindex
-{
-        margin: 10px;
-        white-space: nowrap;
-        margin-left: 3%;
-        margin-right: 3%;
-        width: 94%;
-        border: 0;
-        border-spacing: 0; 
-        padding: 0;
-}
-
-div.ingroups
-{
-	font-size: 8pt;
-	width: 50%;
-	text-align: left;
-}
-
-div.ingroups a
-{
-	white-space: nowrap;
-}
-
-div.header
-{
-        background-image:url('nav_h.png');
-        background-repeat:repeat-x;
-	background-color: #F9FAFC;
-	margin:  0px;
-	border-bottom: 1px solid #C4CFE5;
-}
-
-div.headertitle
-{
-	padding: 5px 5px 5px 10px;
-       	color: #135384;
-}
-
-dl
-{
-        padding: 0 0 0 10px;
-}
-
-/* dl.note, dl.warning, dl.attention, dl.pre, dl.post, dl.invariant, dl.deprecated, dl.todo, dl.test, dl.bug */
-dl.section
-{
-	margin-left: 0px;
-	padding-left: 0px;
-}
-
-dl.note
-{
-        margin-left:-7px;
-        padding-left: 3px;
-        border-left:4px solid;
-        border-color: #D0C000;
-}
-
-dl.warning, dl.attention
-{
-        margin-left:-7px;
-        padding-left: 3px;
-        border-left:4px solid;
-        border-color: #FF0000;
-}
-
-dl.pre, dl.post, dl.invariant
-{
-        margin-left:-7px;
-        padding-left: 3px;
-        border-left:4px solid;
-        border-color: #00D000;
-}
-
-dl.deprecated
-{
-        margin-left:-7px;
-        padding-left: 3px;
-        border-left:4px solid;
-        border-color: #505050;
-}
-
-dl.todo
-{
-        margin-left:-7px;
-        padding-left: 3px;
-        border-left:4px solid;
-        border-color: #00C0E0;
-}
-
-dl.test
-{
-        margin-left:-7px;
-        padding-left: 3px;
-        border-left:4px solid;
-        border-color: #3030E0;
-}
-
-dl.bug
-{
-        margin-left:-7px;
-        padding-left: 3px;
-        border-left:4px solid;
-        border-color: #C08050;
-}
-
-dl.section dd {
-	margin-bottom: 6px;
-}
-
-
-#projectlogo
-{
-	text-align: center;
-	vertical-align: bottom;
-	border-collapse: separate;
-}
- 
-#projectlogo img
-{ 
-	border: 0px none;
-}
- 
-#projectalign
-{
-        vertical-align: middle;
-}
-
-#projectname
-{
-	font: 200% Tahoma, Arial,sans-serif;
-	margin: 0px;
-	padding: 2px 0px;
-}
-    
-#projectbrief
-{
-	font: 120% Tahoma, Arial,sans-serif;
-	margin: 0px;
-	padding: 0px;
-}
-
-#projectnumber
-{
-	font: 50% Tahoma, Arial,sans-serif;
-	margin: 0px;
-	padding: 0px;
-}
-
-#top {
-	border-bottom: 1px solid #5373B4;
-}
-
-#titlearea
-{
-	flex-grow: 1;
-	padding: 0px;
-	margin: 0px;
-	width: auto;
-	border-bottom: none;
-}
-
-#main-nav {
-}
-
-#main-menu {
-	display: flex;
-	flex-flow: row wrap;
-	justify-content: flex-start;
-	align-items: center;
-	background-image: none;
-	min-width: 770px;
-}
-
-.ui-resizable-e {
-	height: 100%;
-	background-repeat: repeat-y;
-}
-
-.image
-{
-        text-align: center;
-}
-
-.dotgraph
-{
-        text-align: center;
-}
-
-.mscgraph
-{
-        text-align: center;
-}
-
-.plantumlgraph
-{
-        text-align: center;
-}
-
-.diagraph
-{
-        text-align: center;
-}
-
-.caption
-{
-	font-weight: bold;
-}
-
-div.zoom
-{
-	border: 1px solid #90A5CE;
-}
-
-dl.citelist {
-        margin-bottom:50px;
-}
-
-dl.citelist dt {
-        color:#334975;
-        float:left;
-        font-weight:bold;
-        margin-right:10px;
-        padding:5px;
-}
-
-dl.citelist dd {
-        margin:2px 0;
-        padding:5px 0;
-}
-
-div.toc {
-        padding: 14px 25px;
-        background-color: #F4F6FA;
-        border: 1px solid #D8DFEE;
-        border-radius: 7px 7px 7px 7px;
-        float: right;
-        height: auto;
-        margin: 0 8px 10px 10px;
-        width: 200px;
-}
-
-div.toc li {
-        background: url("bdwn.png") no-repeat scroll 0 5px transparent;
-        font: 10px/1.2 Verdana,DejaVu Sans,Geneva,sans-serif;
-        margin-top: 5px;
-        padding-left: 10px;
-        padding-top: 2px;
-}
-
-div.toc h3 {
-        font: bold 12px/1.2 Arial,FreeSans,sans-serif;
-	color: #4665A2;
-        border-bottom: 0 none;
-        margin: 0;
-}
-
-div.toc ul {
-        list-style: none outside none;
-        border: medium none;
-        padding: 0px;
-}       
-
-div.toc li.level1 {
-        margin-left: 0px;
-}
-
-div.toc li.level2 {
-        margin-left: 15px;
-}
-
-div.toc li.level3 {
-        margin-left: 30px;
-}
-
-div.toc li.level4 {
-        margin-left: 45px;
-}
-
-.inherit_header {
-        font-weight: bold;
-        color: gray;
-        cursor: pointer;
-	-webkit-touch-callout: none;
-	-webkit-user-select: none;
-	-khtml-user-select: none;
-	-moz-user-select: none;
-	-ms-user-select: none;
-	user-select: none;
-}
-
-.inherit_header td {
-        padding: 6px 0px 2px 5px;
-}
-
-.inherit {
-        display: none;
-}
-
-tr.heading h2 {
-        margin-top: 12px;
-        margin-bottom: 4px;
-}
-
-/* tooltip related style info */
-
-.ttc {
-        position: absolute;
-        display: none;
-}
-
-#powerTip {
-	cursor: default;
-	white-space: nowrap;
-	background-color: white;
-	border: 1px solid gray;
-	border-radius: 4px 4px 4px 4px;
-	box-shadow: 1px 1px 7px gray;
-	display: none;
-	font-size: smaller;
-	max-width: 80%;
-	opacity: 0.9;
-	padding: 1ex 1em 1em;
-	position: absolute;
-	z-index: 2147483647;
-}
-
-#powerTip div.ttdoc {
-        color: grey;
-	font-style: italic;
-}
-
-#powerTip div.ttname a {
-        font-weight: bold;
-}
-
-#powerTip div.ttname {
-        font-weight: bold;
-}
-
-#powerTip div.ttdeci {
-        color: #006318;
-}
-
-#powerTip div {
-        margin: 0px;
-        padding: 0px;
-        font: 12px/16px Roboto,sans-serif;
-}
-
-#powerTip:before, #powerTip:after {
-	content: "";
-	position: absolute;
-	margin: 0px;
-}
-
-#powerTip.n:after,  #powerTip.n:before,
-#powerTip.s:after,  #powerTip.s:before,
-#powerTip.w:after,  #powerTip.w:before,
-#powerTip.e:after,  #powerTip.e:before,
-#powerTip.ne:after, #powerTip.ne:before,
-#powerTip.se:after, #powerTip.se:before,
-#powerTip.nw:after, #powerTip.nw:before,
-#powerTip.sw:after, #powerTip.sw:before {
-	border: solid transparent;
-	content: " ";
-	height: 0;
-	width: 0;
-	position: absolute;
-}
-
-#powerTip.n:after,  #powerTip.s:after,
-#powerTip.w:after,  #powerTip.e:after,
-#powerTip.nw:after, #powerTip.ne:after,
-#powerTip.sw:after, #powerTip.se:after {
-	border-color: rgba(255, 255, 255, 0);
-}
-
-#powerTip.n:before,  #powerTip.s:before,
-#powerTip.w:before,  #powerTip.e:before,
-#powerTip.nw:before, #powerTip.ne:before,
-#powerTip.sw:before, #powerTip.se:before {
-	border-color: rgba(128, 128, 128, 0);
-}
-
-#powerTip.n:after,  #powerTip.n:before,
-#powerTip.ne:after, #powerTip.ne:before,
-#powerTip.nw:after, #powerTip.nw:before {
-	top: 100%;
-}
-
-#powerTip.n:after, #powerTip.ne:after, #powerTip.nw:after {
-	border-top-color: #ffffff;
-	border-width: 10px;
-	margin: 0px -10px;
-}
-#powerTip.n:before {
-	border-top-color: #808080;
-	border-width: 11px;
-	margin: 0px -11px;
-}
-#powerTip.n:after, #powerTip.n:before {
-	left: 50%;
-}
-
-#powerTip.nw:after, #powerTip.nw:before {
-	right: 14px;
-}
-
-#powerTip.ne:after, #powerTip.ne:before {
-	left: 14px;
-}
-
-#powerTip.s:after,  #powerTip.s:before,
-#powerTip.se:after, #powerTip.se:before,
-#powerTip.sw:after, #powerTip.sw:before {
-	bottom: 100%;
-}
-
-#powerTip.s:after, #powerTip.se:after, #powerTip.sw:after {
-	border-bottom-color: #ffffff;
-	border-width: 10px;
-	margin: 0px -10px;
-}
-
-#powerTip.s:before, #powerTip.se:before, #powerTip.sw:before {
-	border-bottom-color: #808080;
-	border-width: 11px;
-	margin: 0px -11px;
-}
-
-#powerTip.s:after, #powerTip.s:before {
-	left: 50%;
-}
-
-#powerTip.sw:after, #powerTip.sw:before {
-	right: 14px;
-}
-
-#powerTip.se:after, #powerTip.se:before {
-	left: 14px;
-}
-
-#powerTip.e:after, #powerTip.e:before {
-	left: 100%;
-}
-#powerTip.e:after {
-	border-left-color: #ffffff;
-	border-width: 10px;
-	top: 50%;
-	margin-top: -10px;
-}
-#powerTip.e:before {
-	border-left-color: #808080;
-	border-width: 11px;
-	top: 50%;
-	margin-top: -11px;
-}
-
-#powerTip.w:after, #powerTip.w:before {
-	right: 100%;
-}
-#powerTip.w:after {
-	border-right-color: #ffffff;
-	border-width: 10px;
-	top: 50%;
-	margin-top: -10px;
-}
-#powerTip.w:before {
-	border-right-color: #808080;
-	border-width: 11px;
-	top: 50%;
-	margin-top: -11px;
-}
-
-@media print
-{
-  #top { display: none; }
-  #side-nav { display: none; }
-  #nav-path { display: none; }
-  body { overflow:visible; }
-  h1, h2, h3, h4, h5, h6 { page-break-after: avoid; }
-  .summary { display: none; }
-  .memitem { page-break-inside: avoid; }
-  #doc-content
-  {
-    margin-left:0 !important;
-    height:auto !important;
-    width:auto !important;
-    overflow:inherit;
-    display:inline;
-  }
-}
-
-/* @group Markdown */
-
-/*
-table.markdownTable {
-	border-collapse:collapse;
-        margin-top: 4px;
-        margin-bottom: 4px;
-}
-
-table.markdownTable td, table.markdownTable th {
-	border: 1px solid #2D4068;
-	padding: 3px 7px 2px;
-}
-
-table.markdownTableHead tr {
-}
-
-table.markdownTableBodyLeft td, table.markdownTable th {
-	border: 1px solid #2D4068;
-	padding: 3px 7px 2px;
-}
-
-th.markdownTableHeadLeft th.markdownTableHeadRight th.markdownTableHeadCenter th.markdownTableHeadNone {
-	background-color: #374F7F;
-	color: #FFFFFF;
-	font-size: 110%;
-	padding-bottom: 4px;
-	padding-top: 5px;
-}
-
-th.markdownTableHeadLeft {
-	text-align: left
-}
-
-th.markdownTableHeadRight {
-	text-align: right
-}
-
-th.markdownTableHeadCenter {
-	text-align: center
-}
-*/
-
-table.markdownTable {
-	border-collapse:collapse;
-        margin-top: 4px;
-        margin-bottom: 4px;
-}
-
-table.markdownTable td, table.markdownTable th {
-	border: 1px solid #2D4068;
-	padding: 3px 7px 2px;
-}
-
-table.markdownTable tr {
-}
-
-th.markdownTableHeadLeft, th.markdownTableHeadRight, th.markdownTableHeadCenter, th.markdownTableHeadNone {
-	background-color: #374F7F;
-	color: #FFFFFF;
-	font-size: 110%;
-	padding-bottom: 4px;
-	padding-top: 5px;
-}
-
-th.markdownTableHeadLeft, td.markdownTableBodyLeft {
-	text-align: left
-}
-
-th.markdownTableHeadRight, td.markdownTableBodyRight {
-	text-align: right
-}
-
-th.markdownTableHeadCenter, td.markdownTableBodyCenter {
-	text-align: center
-}
-
-
-/* @end */
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/loadgen_integration_diagram.dia b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/loadgen_integration_diagram.dia
deleted file mode 100644
index 569089f243e4584e12134caf36d078248cb50af1..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1943
zcmV;I2Wa>oiwFP!000021MOW~kJ>mCexF|v(Z^*%?A+VVRH|0H`_NS@ZTA@&;(=^K
z%qE@-FZ<iqPC_9!7?Rj+kdZnS!8ykc{=V-V`y{;o@Un@tC!C}qiSH&rHzpd6mm~<|
z)!pRpufN@TlMnYd?}HG%W&c(Q+Gy;IIOXr|CTmK!Z)daT=Vv|IrHGP5kHSYi#k2oV
z6rmYwG@IPtXxg!X08zw`6-N=JNw|2VSc}mH-%S>1`E8Yu$2geet%`O_5|KoELebsi
z%|r2;%nC!ZlBE;Yw`heI2}a-AjT`Lc``wIhaZ)wB*^)G5P3Uf0Ytmwe|2=9`v`Sf{
zcy<5g6Q&d=Z}M&x2M_j|I@{`qZcwrc<GL?)g`)g&K;1Td{&Ik4n+9~v{dKwA-9oOr
zNUpm?uDdXuZ%INE6w<oO1tAeeaqg8SkGQ*^bcrH1#ZC<ibhrKxQcBMB{{cnmYX*=X
z{*Jgy(Ys2*;52wkEn2JuJNAFL6#A7shXGyBUj%(6x%HhO>rY`CE+X92`!J@1ncod#
zzB&Ukoj9D{bH?S?T7@X^u#N35LbQ4e1Du|j%;h#MmexhH*}3apZP)YC1Yx?3(C+jQ
zs*PG~p_qn@!%&I?u}m4G?JXs@6`By|h%ElWOF9sHw)t9<=huvA2scX-$810>6usRN
z2G?|$tN7Zfm>;S{shL+c$#7Ei^y48u)e5#LdZHEmM@NSc8_v`I-O+r{zq(=`{}z$w
z<r*bayVX9Dm&qX@XW@7l+)X|kWgt|q9<#=Rff|^zi&#V+Q&`1j{W`UoUM#AX6F$s%
z72$$Wb>E)yKktB{8}`&<f4aHD5#G#~B#G-$@(`oR%x3~uAJJaiydI|4I9#o1yX}tM
z=IEF=Bu3{!afz2L3W9Q^SLwZ>cl=f<eyLrQDw$W>v|WiT-zl}Lq(^^}07u#{+OG`2
zDW)xX(g}WQi&RNWoBZ<Q+5?OEq8;lViqnaf(j8|22HyPCKXCMfX}CnkWOaEDYLlz+
zEAou;%j)N}Oe;-bbf%^be7m2Uawv2nGdGMwK-A3@3k6p31~-43IDXtTM2V;5tVvzk
zLY>)x`@4$)=r0D2Xfd!4V&DlE14oI05`#uD_>y4lOG1`7O-Z7pK)e*VN(y930qiXW
zmQXE#=65kLffO;YM5=+K`=yr$l{}1E9!N@+KFHGt=<2qT2)Pnr_LT@=PJzgD0dk2j
zxJ1BAB7DIT(S1BWI$`jXB8XOmk2qegHz@h0#6Ywd0G9=ZN(v-Ofi*z30EA}?#vxm<
z#b*lwrN9*^1(uQm;ZndZ6OIEV1X8jBwTB>83hW`W0`X}9ILZrvDIqZcYAZp>L37zH
zLBa1hR&pRAFJRzTYA~1@{Ka@RKuN&F;8Pgm1~qVctAQ~!1&V>swE*gl>A`oI46ty)
zbU;T&<z-rP7TMKm%ubK0Q7}4=n$3yjw(7a5K=mS}h=Vyw67pO%e3*g|_B1_@aJ)LD
za-F&S;NpyqzS_f~oG`6v#Qqa)&9G6BD<xT_op$)NL9}Z|he4n_00Ldn2y~7S=t@GM
zdj^57Lf~~F5d0Vfdc6^7PkoUIk(Hru>SdKxc7{HduXN6dUXQ`jxEeq2nl>pA{;VK0
z%OC`>HUk<?2F1yC<zz_+_Io4P6}+b597EWZ2;uU;8XNcu!q)}i3JNbma25WhBnX4v
zH@W~{tO|R_5cVZP*gvDfzEa_<S7ED%3QL1<|JxIQ(8VNZ9IYlnOWJBud035`(P}m6
zN&vjapjF18D+_}|)P=jkG3Xp)P|A8zd034YRqIJtCZbR<2EXzR#-nE_?J?ND@Yc}1
zOsH&;0d3*Luvt6{bZEQ$ooB!iY{`NjxEbr!He+u;FQV|@@)HTJXH&LeE?&p3tRY}m
zw(TJ)S(n<m{9O1q5Ha;G0|jtu2`&9$0l8%`K4N}!V2^={R9T2{1}aTD#XaNXp8j=*
zj1i`MH(Kmg8~0)IkjT<)!_5j2!6;%PsA3{%j8vbBik8MnOZ^Mr95xpiNm#bn{X;(C
zJkCqbcV2Qv!Aqv%C273WFc%dujgy%A*9CeK;wA48FL@)2mu$sL(s;>J@zRyhlGlHa
zx7dUg$xC}Dd~F*-#!n+!+qM)hNxT2FVJ<3S8YeLgQUfaVQOUd%Ghf}#?{ibj+iH2c
zKJOI&NamjbD{X-zsl)&pLp;z$B-=qt=XZzb4s^_S(AN#Xb3+!2xu!RoTfVhVWR)rg
z6ds0Ua|P-U0#gy8Fe2;-_u7wJU0lO8Olcz=qP$ov=Vi5xob(RGjhynNR2Vnhs5K1T
ztNO-~+~?!T3O?WvYd~2tBzg^KnpLmmYe3oJG#JUInWGkgWG(_#C!>lzM!5(C`&VAu
zJQQ_xp_^uz4@dWn>~{XlpQ9FoR4UXw{}dOElZ$#6TT7s#BPLQrq=@L`<WoHKbMsK!
zvPtj4>+C7LLNhg22mH|rR`$qOSNHg%6&&$TR;UiFrG=x`O|EO*#A<)W$@v?*Ve7+H
dUb*O!zt4VH`2ME!%ft6K{{zrB$(5Fa002n3teyY>

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_icon.png b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_icon.png
deleted file mode 100644
index 95321896d3e467b923909c3654a4260346df5b9f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4632
zcmb_gXHXMNyG2n1{1g!clqO9CBp9Uz2+B(h0@6F7BQ<mqs!{|5i6~846eM&bMM@x`
zAR;9oHHMN%3%!>RAaZ%{ulwU0f8Ck0`^@Z~bDr7Rvwt?}v4Qp_)*Gx03=Efabu^4m
z>+!#Xndx*kspxh;E!TrIErU#8Zb6}r0j>-WU0_bGH+6j+-Cd1c9bFLq-L5JO4D484
zjfbZ2@zs3&08VYLr}rz)yNSCgFmgQeeHw9Q1eeE<(8)3h7k6`{og2>kvKC*(FKO^J
z5eV$fjChu2;2`1j#U2~VGnny)Xm)IQvn=Xf#l6~A8TAK>pG-{`EH+$Q!`>Q8h%vkq
z6rDWF5O?A4cmQYkJN~~7kgq0V{Ag5f`8x#@9Wx!a1SkQI%6R2<NU)Jh=uF~VRy+aE
zgwlhvPpIDH<HOD;bpulyOvRk-)Q8>@OxAA2Vq7bSB2YpbCM3ct@U0s2<-Lx1%cM>r
zwzF{wFH}(Yx}S{v)uUm5#9$ZXMUlI5f;lo0Tt}hWMgM@!;u4VO6+CD^V}JOla{PpX
zwtMN%ELZ8yvLK3y_CZ1Mq8jX$wO>9fYz*a|&=U~pAT!MNR7NXwpXU;@VAWjMJFl;u
zZgEB}y1V3IwpP_S?E_{}g=C*@p6=dW8BKXt5_0qFCW`32_A=9!RcRfH^_5v8v^_HH
z6vnCDp!d@sz)?jsi^)qBDn`P35;UO}ZWR<=WsEUExn<kR0SHZGq<e*h28nCJ<6s#j
zwZy#&r%z=tdR!fj9t33@*M)X!A7pAm*}LmoJwS!E%n1j~f{0}U4ef|HWKK&I0G+tr
zc3>U9_G^X4ym6;+U@Tq-eYo%Ir*L*Cs;M!{r^~um8-8JDSIRo*AwoK6t1Vi(M%Ozy
zBCY{!XrUFtH8<n%F^;b*og&h5zHh?+t&Z^!6+V2d;WEnu+RI@jA4z|V%AvR5u5=64
zbS2Zg@$TMWFM%kHV2ur{Izr$P{_(=T#=(tjV_qX%+u)xUI;nV`o9)_{r8fQCHlw3M
z>)1_m1`6%sIirN;1oQ!ZTJMXVJ<ff9XEANaLT5USAI!evn_ImV)w|KCJCj9s5RMO%
zEUoIGRYKDG4nMsR^nnaS@Oy60k)AmB@qrbI^I!>;k$eJ*Rc^OBh+uk<@e5q;gredU
zu@fU{oHe`eOw{cbgxuZjF^pQ0+j#9FCowRQgP%C+d0Rbs9;iQacd*Uf$-h$Vl{2r@
zcbZ#{lQetm=SQV-O)3RFd}f`D)7uwH;JaEbCX~&GKh{@f@`y=hZ#}*{5*9w;R4YCz
z_IT=GPL!{=z^Ys?%aDSb)(&qs8gx@Rs((?N`g7adRCvmd8!N!;@6`w>!W?Nam1=~e
zwjYUwN2R<@z3TCU_bz8{bV_&t=hzz){?UIB4XuRsBcX7<*Hpu(TCp)FzVRq3?iz`a
zi1J_=0_P{0=OGowqL1&*?b=D*r#saFS+-`vB_gUHP{{zM<W@%y;TjJVQWapG+_3kP
zH$j$+bQDP;v5@heExISBo)Edb;Qa!>{UxNk_JlnP`PnL{uy?Ih5N7)PST#s)OaJ(n
zeC_JXP1;ILY)2+nYUYlLa9AUI+smA=K}=e~vWL3O><4blSZlNb3AbA93!7x>RLxx+
z@EUiV9jZY`HN@v&5sgda*1j$@;ten{IOWPvXY9z{?<6`y>_@bNv>;{BTA$8wv8yVG
zsEHa{ps&tAKZ>X`!>oX9YN0$0_9oF_<zFWU4{x;vtELumwU!=%gRhS|6ETyad}~)H
zb@g%Zr;$`$Ax-%J+V3Rug9tWIt#rnl(g&t&D&rGjmYA1q&409PZBC$AGsuwB`}KpE
z6%?CB{kmkLMpH?CQyJ~~8-R;~DusB3*!*=eH1K3+>^IiPj?|_+&Lf|(41=CD(yi^4
zZP_sp%min08u!cIQK!|gq%3yQv~WIQf_pCsWb=wx{%(@*<P*T9HM{h~;Xa{@O;7cB
zr+&u8eBav6`x(C-Wa9~`|DtSx@kjA?7gx{8x)`S+stLlUFoQ-}tNY!#KK7|SXs!|B
z@ykWHp1ht6fot#VH|YvVG|=wQn_!QvZM>R*fBa2!JX!~o9gBOw(P@8bM;c45q;pU&
zyzy>t@0KgISap7uY9^-KwfQHc4TwV0^7{K$P;XDhO!wPTL&to<zlekyHTs)X&^$n@
znMI>>BG$|bCwUb^*&#8*%way6-jA@9mHwhlASDol51mA~L2Rg6yf4`zTDJE3Mbl<X
z+?fk(jWD--&I#|-MviZglG8~KB?o3~k+dVYd$w*`6wlzmTqhNZX4BKO*YkpmL4u81
z^k7OE+WQ4gT=Vw5Uc;*~87K4VcDh7;v4<y-yz>*IPAP2Rr*~gbH)f1**^toGpVIi5
z+!L`Af3>(Pp&;7OTh99}aJ_?lG#|YyMzU59X?(@zwJfG}RFs8mfZ(V2*UCrWqu|B#
z)^9d?KQ1qzHw4E<^!$v?_l0S}Tem2l1#*e~Tw5T$rJXTdDcG=3_`qGc;2ntfd7`GV
zP)36t5cFm|+9ZW;uz0t90v-ckqfcl(i*oQ>W|UYCnp7<t;xmz@HVPbdkTl@6@WEr2
z<qa0p@0;Z)s7YH8DZ&gV$<KMm;J(a@?R@Fq!B?lH(?Jx<l&1?4Rfq<UW{!Z)ndr(X
zk?8d!ia<QDu-K=G^@4TuMa&*EtjAAZk00Wa+XCQKa5Gx}9-D94>x&)sC8RIE^A@-L
zL%rY;exe>M6)NO3jX%~Yx&OwkCRzJEKij#{M%rF~qlP~2n?pCGWE-u7UMs50!T!c=
z(g3KJzRR_bk<2ky3z5;53NHbWB)h=~?L7G$=a}T}D*SXZfiri~bZ4lYM|$QF87fYw
z|GZc)FXdCE^xZfe<|8?19oHtw^s=8{VJC!ZX=c_(-|lv@GKbXX$h=Iu(pn=Rt9VZX
z*K}^5#iOtgBqH22J10yyOKQ~Iu&|26S2Xl-Lo?T?HG`S#?P_0;9ohF-w6N5EWF(&M
zT;e6$b>PQkAO7z)W_W828wsL3J!4E$d}3`P!%iw`Tl3}5x~P77p5uxpY0G=QB0b6*
znH4p|3nj(~Zsln8=9V=RZeOUUQdm>7*&+WP%Iyf$@v0#bYv_8pdNayUSvq{m)YU*z
z5Ip9&oOQ{?to3X<OldxA^^)C|NISQU1>Ja~eU7m{<Fe(hWaKg)QCJi%VRdKYt3$p?
z8hyS{qLM)0gza+ZjlH(xyBYpvsd*~ru=uS*CnbBtg1EE7n|&dx)+8+ty$ga6AV&9$
zX*Yq%0b^dvSucSSk|UE@cpx+^inn13w3LWV4n_o#c?t~`8_UbGu1f(7Q<r=JsK_Y$
z!0d+PkUj33_sr)R+r0&rO3_hD1qxjwf83?R4JLmFT!xl>Jz?>|On=xgIJMSJ*Om$C
zRp`2-5k8!fI=KXB=Wa$+nxFjVeyZO-U+wMCov;0Mx+v^<gO6?)2r{Ek>6WOu-CE+T
z23%sqZSSP3Zz^dWyi<=R(Re){%5~K(JkFAcoKJRjB|%9(0%a{%7OaTQg=fW6n4vQH
zS>mfsv+jrXve@Gp-Sj;7a%-Zb8p^KV9Q-K%!(I{{)~Ir(&J$!9L`?yBE{ies>^9Wd
z2KHT#vE?nFHC}0#5~=`aP%xcNY@NrtWSGh?q7>JBgrqaSnX(c^wZthc$DeE5-Ke8D
zBwk@2dE;}x>Nu4CZaC-AmC<Z8nz4!pdWPxRY3O?d%(>UNdVMO(aV#1V{m5P1DQ7+%
ztVB6}^0wGm6dJ!zoHHQizP2=2s_0jmV<~Q2-~51%T1v-20{aki$?ZY_5;4DPqM*Y9
zt0=)4NKL&hk5t+BB4in+q_N_GX{ico@iGSADlJocjYqt=Yj3;nM-gtRUK40YqQtz2
z<wiE#Nlm-iHBpWqcB^8+e0v&1qL!{(oW8nuH4{1+$cD|K!)4!mWT&RG*^-0?h%C@q
zXj`t`U*!P6ss}zA^<OxWMVHw~vqD>OV2<nU%IIGz_wdK$-RV0SInI(|d+O_6OW{&2
zotn*_xo5W}t85<W<4!5EM16dux7YjiS^rS+FAtny9r{wI*}Y(<Q}OK#1pC!n05SWu
z|Gnw9@-(e*Qc1<jIuPl9Y!=+-0g>r+!1Y{I0CkFDm37F5^PhF$JHu*5O-g8cYndQx
zLGxbkb<055Rb}p?F-cNDT*HqEt*kGu+>mlzJcWn-=%8XfdqOFqSo1t(gUjTA3hS9-
zJw^rWuFNJ>wYc#w9@N(Oxr-HBWB|1I-<N&njFR~_WedKJu-2JZ6G+$&xlPr!x=J#S
z!+cT{MG(Kotb*dnNX?MI=kHeHvCQA6%emVvpsE@N=<FYWW1o<+p5qr`a!bE~9FvN5
z)yAm0$U(@RE3MOcs+m7pNBW%t|1<mP+VmTUVZMU;%?SNR2{wyTTkgh)UOccG+2Q#S
z!9Go#LjP&8OXw(-zi4$ExW4R84d#>QvNyL>ox9Y0n9X6Oc~IQIcA=vv#B+TsMcc!%
znDl&;FjyVGND3qNNc878JAHrdRfXPdF}6Xfy&ib9+<c?At;_Ya^<ecT<cC~((6Nif
zJ@n$AN_tK>-!DJBdYnv~^k>f3=bK+-uQw!8)~jT@z9|dj3WiR3LJTjpYK_1B;V8>q
z{+Yt_y@<R0pQ<1**R+jS>X|sUw|lpltZmJh;x4$KQ;Xg}D=}RWz`iS@@GRQG=^d8E
z8@v#NQJ}lZ)#<O1lF^&@A{W;H)sZU9#cK*{@JkKaK}0@A4fqp<7Paj%%4KFjR&gQw
zo!%~+HxCLPotYw22roAOa7?P)XOYaXNQlIm$5H7bnVxE&^e$H_Y3(mt+gxi^RAnrW
zcwqIoeE?H#G5cyA!<kG6+f|&O?j4Xye_T0aF{g=Dxu=wfKzpQaeWpYSOy^yfG4xJn
zmtKVzr49}?=?KZ@pPS$-Vl=w(#nrm0!KckbL4&*;kx6%CnwwlXD5~)<rBYx1Ko^k7
zv@**N_RJW#-I!9ZtQ9M)sKg^x0xj^+Dddw`JNd}#BYq6fEFY&<xf!bPvq7K}jRb4G
zf5VgGtdhFDGcz7{nwI$`@9*t)Q@1hcp&qp|yvE~6+e6K#%5I9lAy*tGWNaTI$_om5
z=Ai*xsx>T&coew~O<9MBM&sqIA|EuIAm4G3<G7n`+!nG$*qCM?`b!)6ht^fEb)`|3
zZ?|#vUbosQnJFFOD6npcr?rI1ty&(6sKUr8dw&X8<yOtZg43a~u@c!;@$sx`eu*{s
zg&jLxm~b<V2<)4X$r%V0j|4PV)NYvHxfIh;Bej_ab~X&7PlWkL>?%li|A|cTI7STR
z9vwx5{9LBvly87=ZRWx)S8Lka)t_(HFYH35xPA2?;uQB?&HdWIFx6H&GP#L%9Y9$%
zh?60m#qK`O4*Im5CkW`OB(2DX<Gn$S2#0<=b%Y!~wNO=V>ula*s9BE|!R;M+wU*FG
z^%k=-em^e{xYq`YT@~5!<PPNc0TjpQbciSKEH3-{M>>(n3YgvRDi)Yn1ky!QbRxj2
z78AZZTw~?#dDhiTFx@zc8Fnz>U0;8tTjOM_;=W;Z8N!B{IMJD>=bl<FX@F^3Oaa|z
zVAJ2gd30T($r0tdV<#VwZEC-W69%Fgc)sE(*8<3Ojq*!jI}kX7pPtDyaf|7l_DvOr
zcE-Qsl=64{ufYS+N?BDb6GP%A!_)r;W;TYu;=hXi@8Vz8|9a1yoc#|&&S#NfSoEu^
R)2|)|T}=axkLnIF{{zz;5aa*=

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg
deleted file mode 100644
index d932bdae6..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/docs/src/mlperf_logo_horizontal_color.svg
+++ /dev/null
@@ -1,55 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!-- Generator: Adobe Illustrator 22.0.1, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
-<svg version="1.1" id="mlperf_x5F_logo_x5F_horizontal_x5F_color"
-	 xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 494 166"
-	 style="enable-background:new 0 0 494 166;" xml:space="preserve">
-<style type="text/css">
-	.st0{fill:#333333;}
-	.st1{fill:#1A6197;}
-	.st2{fill:#B4BFC1;}
-</style>
-<g>
-	<g>
-		<path class="st0" d="M242.3,58.3c0.9,0.9,1.4,1.9,1.4,3.2v43c0,1.3-0.4,2.4-1.3,3.2c-0.8,0.9-2,1.3-3.3,1.3
-			c-1.4,0-2.5-0.4-3.4-1.3c-0.9-0.8-1.3-1.9-1.3-3.3V74.6l-12.2,18.7c-0.5,0.6-1.1,1.1-1.7,1.4c-0.6,0.4-1.3,0.5-2,0.5
-			c-0.6,0-1.3-0.1-1.9-0.5c-0.6-0.4-1.2-0.9-1.7-1.4l-12.2-19.4v30.6c0,1.3-0.4,2.4-1.2,3.3c-0.8,0.8-1.8,1.3-3.1,1.3
-			c-1.2,0-2.3-0.4-3-1.3c-0.8-0.9-1.2-1.9-1.2-3.2v-43c0-1.2,0.5-2.3,1.4-3.2c0.9-0.9,2-1.3,3.3-1.3c0.7,0,1.5,0.2,2.2,0.6
-			c0.7,0.4,1.3,0.9,1.7,1.4l16.2,24.9L235.4,59c0.4-0.6,0.9-1.1,1.6-1.4c0.7-0.4,1.3-0.6,2-0.6C240.3,57,241.3,57.4,242.3,58.3z"/>
-		<path class="st0" d="M286.7,101.5c0.9,0.8,1.3,1.9,1.3,3.2c0,1.2-0.4,2.3-1.3,3.1c-0.9,0.8-1.9,1.2-3.2,1.2h-24.1
-			c-1.3,0-2.4-0.4-3.2-1.3c-0.9-0.9-1.3-1.9-1.3-3.2V61.5c0-1.3,0.4-2.4,1.3-3.2c0.9-0.9,2-1.3,3.4-1.3c1.2,0,2.3,0.4,3.2,1.3
-			c0.9,0.9,1.3,1.9,1.3,3.2v38.8h19.4C284.7,100.3,285.8,100.7,286.7,101.5z"/>
-		<path class="st0" d="M325.2,59.3c2.3,1.5,4.1,3.5,5.5,6.1c1.4,2.6,2.1,5.3,2.1,8.4c0,3-0.7,5.7-2.1,8.3c-1.4,2.6-3.2,4.6-5.5,6.1
-			c-2.3,1.5-4.7,2.3-7.2,2.3H305v14c0,1.3-0.4,2.4-1.2,3.3s-1.8,1.3-3.1,1.3c-1.2,0-2.3-0.4-3-1.3c-0.8-0.9-1.2-1.9-1.2-3.2V61.5
-			c0-1.3,0.4-2.4,1.3-3.2c0.9-0.9,1.9-1.3,3.2-1.3H318C320.5,57,322.9,57.8,325.2,59.3z M320.8,80.8c0.9-0.8,1.7-1.8,2.2-3.1
-			c0.6-1.3,0.9-2.6,0.9-4c0-1.4-0.3-2.7-0.9-4c-0.6-1.3-1.3-2.3-2.2-3c-0.9-0.8-1.8-1.2-2.8-1.2H305V82H318
-			C318.9,82,319.8,81.6,320.8,80.8z"/>
-		<path class="st0" d="M373.9,90.4c-0.9,0.7-1.9,1.1-3.1,1.1h-24.5c0.6,2.9,2,5.2,4.1,6.9c2.1,1.7,4.5,2.5,7.2,2.5
-			c2,0,3.6-0.2,4.7-0.6c1.1-0.4,2-0.8,2.7-1.2c0.7-0.4,1.1-0.7,1.4-0.9c0.9-0.4,1.7-0.7,2.5-0.7c1,0,1.9,0.4,2.7,1.1
-			c0.7,0.7,1.1,1.6,1.1,2.6c0,1.3-0.7,2.5-2.1,3.6c-1.4,1.1-3.2,2.1-5.6,2.9c-2.3,0.8-4.7,1.2-7,1.2c-4.2,0-7.8-0.9-10.9-2.6
-			c-3.1-1.7-5.5-4.1-7.2-7.2c-1.7-3-2.5-6.5-2.5-10.3c0-4.3,0.9-8,2.7-11.2c1.8-3.2,4.1-5.7,7-7.4c2.9-1.7,6-2.6,9.3-2.6
-			c3.3,0,6.3,0.9,9.2,2.7c2.9,1.8,5.2,4.2,6.9,7.2c1.7,3,2.6,6.3,2.6,9.8C375.2,88.7,374.8,89.6,373.9,90.4z M346.4,84.1h19.4v-0.5
-			c-0.2-2.1-1.2-3.9-3-5.3c-1.8-1.5-3.9-2.2-6.2-2.2C350.8,76,347.4,78.7,346.4,84.1z"/>
-		<path class="st0" d="M407.5,69.8c1,0.8,1.5,1.8,1.5,3c0,1.5-0.4,2.7-1.2,3.4c-0.8,0.8-1.7,1.1-2.8,1.1c-0.7,0-1.6-0.2-2.5-0.5
-			c-0.1,0-0.5-0.1-1-0.3c-0.5-0.1-1.1-0.2-1.7-0.2c-1.3,0-2.5,0.4-3.7,1.2c-1.2,0.8-2.2,2-2.9,3.6c-0.8,1.6-1.1,3.5-1.1,5.7v17.6
-			c0,1.3-0.4,2.4-1.3,3.2c-0.8,0.9-1.9,1.3-3.2,1.3c-1.3,0-2.3-0.4-3.2-1.3c-0.8-0.9-1.3-1.9-1.3-3.2V73.8c0-1.3,0.4-2.4,1.3-3.2
-			c0.8-0.9,1.9-1.3,3.2-1.3c1.3,0,2.3,0.4,3.2,1.3c0.8,0.9,1.3,1.9,1.3,3.2v1c1.1-2,2.8-3.6,4.9-4.6c2.1-1.1,4.4-1.6,6.8-1.6
-			C405.2,68.6,406.5,69,407.5,69.8z"/>
-		<path class="st0" d="M432.3,63.3c-0.7,1-1,2-1,3.2V70h6.8c1.2,0,2.2,0.4,3,1.2c0.8,0.8,1.2,1.7,1.2,2.9c0,1.2-0.4,2.2-1.2,2.9
-			c-0.8,0.8-1.8,1.2-3,1.2h-6.8v26.3c0,1.3-0.4,2.4-1.3,3.2c-0.8,0.9-1.9,1.3-3.2,1.3s-2.4-0.4-3.2-1.3c-0.8-0.9-1.3-1.9-1.3-3.2
-			V78.2h-4c-1.2,0-2.2-0.4-3-1.2c-0.8-0.8-1.2-1.7-1.2-2.9c0-1.2,0.4-2.2,1.2-2.9c0.8-0.8,1.8-1.2,3-1.2h4v-3.4
-			c0-3.7,1.3-6.7,3.8-9.1c2.6-2.3,6.2-3.5,11-3.5c1.8,0,3.4,0.4,4.8,1.1c1.4,0.7,2,1.9,2,3.4c0,1.2-0.3,2.2-1,2.9
-			c-0.7,0.7-1.5,1.1-2.5,1.1c-0.2,0-0.5,0-0.8-0.1c-0.3,0-0.6-0.1-0.9-0.1c-1.2-0.3-2.2-0.4-3-0.4C434.1,61.9,433,62.4,432.3,63.3z"
-			/>
-	</g>
-	<g>
-		<path class="st1" d="M138.7,71.3l-24.2,39.3l0,0c-1.9,3.2-5.4,5.3-9.4,5.3c-6.1,0-11-4.9-11-11c0-4,2.1-7.5,5.3-9.4l0,0
-			L138.7,71.3"/>
-		<g>
-			<path class="st2" d="M136.1,67.1l-11.7,7.2c-7.3-6.6-17-10.6-27.6-10.6C74,63.8,55.5,82.2,55.5,105c0,2.7,0.3,5.3,0.7,7.8l0,0
-				c0,0.1,0,0.3,0,0.4c0,1.6-1.3,2.8-2.8,2.8c-1.4,0-2.6-1-2.8-2.4l0,0c-0.4-2.8-0.7-5.7-0.7-8.6c0-30.4,24.6-55,55-55
-				c14,0,26.7,5.2,36.4,13.8L136.1,67.1z M146.2,68.6l-3.3,5.3l-8.6,14c2.4,5.2,3.7,11,3.7,17.1c0,0,0,0,0,0c0,6.1,4.9,11,11,11
-				c6.1,0,11-4.9,11-11h0C160,91,154.8,78.3,146.2,68.6z"/>
-		</g>
-	</g>
-</g>
-</svg>
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc
deleted file mode 100644
index 41f74b803..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-#include "early_stopping.h"
-
-#include <algorithm>
-#include <cmath>
-#include <cstdint>
-#include <iostream>
-#include <map>
-#include <numeric>
-#include <string>
-
-namespace mlperf {
-
-namespace loadgen {
-
-double lbeta(int64_t x, int64_t y) {
-  return std::lgamma(x) + std::lgamma(y) - std::lgamma(x + y);
-}
-
-// The Gaussian Hypergeometric function specialized for a = 1.
-// Based on http://dlmf.nist.gov/15.2.E1.
-// Converges if c > 0 and (b <= 0 or x < 1).
-// TODO(ckstanton): http://dlmf.nist.gov/15.2.E1 says there are transformations
-// to replace x with with a value less than 0.5, for faster convergence.
-// Presently, this function can take up to 200,000 iterations to converge.
-double hypergeometric_2F1_A1(int64_t b, int64_t c, double x) {
-  // TODO(ckstanton): Is there a more principled way to pick kTolerance?
-  constexpr double kTolerance = 1.0 / (1LL << 33);
-  double term = 1.0;
-  double result = 1.0;
-  for (int64_t i = 0; std::abs(term) > kTolerance; ++i) {
-    term *= (b + i) * x / (c + i);
-    result += term;
-  }
-  return result;
-}
-
-// BetaRegularized[x, a, b] =
-// Beta[x, a, b]/Beta[a, b] =
-// x^a/a Hypergeometric2F1[a, 1-b, 1+a, x]/Beta[a, b] =
-// (http://dlmf.nist.gov/15.8.E1.)
-// x^a/a (1-x)^(b-1) Hypergeometric2F1[1, 1-b, 1+a, x/(x-1)]/Beta[a, b]
-double beta_regularized(double x, int64_t a, int64_t b) {
-  return std::exp(a * std::log(x) + (b - 1) * std::log(1 - x) - lbeta(a, b)) /
-         a * hypergeometric_2F1_A1(1 - b, 1 + a, x / (x - 1));
-}
-
-// Compute the odds of t or fewer overlatency queries in h + t total queries.
-// The binomial distribution is the discrete probability distribution for
-// independent boolean experiments. The CDF of the binomial distribution is:
-// BetaRegularized[q, n - k, 1 + k] where 1 - q is the probability of an event
-// per experiment, n is the total number of experiments, and k is the number of
-// events. An even in our case is an overlatency query, so q = p - d, n = h + t,
-// and k = t.
-// Sum[Binomial[h + t, x] (p - d)^(h + t - x) (1 - p + d)^x, {x, 0, t}] =
-// BetaRegularized[p - d, h, 1 + t]
-double odds(int64_t h, int64_t t, double p, double d) {
-  return beta_regularized(p - d, h, 1 + t);
-}
-
-// Binary search to find the minimum value h such that:
-// odds(h, t, p, d) <= 1 - c on the range [min_h, max_h] given t, p, d, and c.
-int64_t find_min_passing(int64_t min_h, int64_t max_h, int64_t t, double p,
-                         double d, double c) {
-  int64_t count = max_h - min_h;
-  while (count > 0) {
-    int64_t step = count / 2;
-    int64_t h = min_h + step;
-    double prob = odds(h, t, p, d);
-    if (prob < 1 - c) {
-      count = step;
-    } else {
-      min_h = h + 1;
-      count -= step + 1;
-    }
-  }
-  return min_h;
-}
-
-int64_t MinPassingQueriesFinder::operator()(int64_t t, double p, double d,
-                                            double c) {
-  // Given t, p, d, and c, return the minimum h such that odds(h, t, p, d) <= 1
-  // - c
-
-  auto &cache = caches_[std::make_tuple(p, d, c)];
-  auto it = cache.lower_bound(t);
-  if (it != cache.end() && it->first == t) {
-    return it->second;
-  }
-
-  int64_t x0 = -1;
-  int64_t y0 = 0;
-  int64_t x1 = 0;
-  int64_t y1 = std::ceil(std::log(1 - c) / std::log(p - d));
-
-  if (it != cache.begin()) {
-    --it;
-    x1 = it->first;
-    y1 = it->second;
-  }
-
-  if (it != cache.begin()) {
-    --it;
-    x0 = it->first;
-    y0 = it->second;
-  }
-
-  double min_slope = (p - d) / (1 - p + d);
-  double max_slope = (y1 - y0) * (x1 - x0);
-  int64_t min_h = (t - x1) * min_slope + y1;
-  int64_t max_h = (t - x1) * max_slope + y1 + 1;
-  int64_t h = find_min_passing(min_h, max_h, t, p, d, c);
-  cache[t] = h;
-  return h;
-}
-
-}  // namespace loadgen
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h
deleted file mode 100644
index 49b7a901e..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/early_stopping.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef MLPERF_LOADGEN_EARLYSTOPPING_H_
-#define MLPERF_LOADGEN_EARLYSTOPPING_H_
-
-#include <cstdint>
-#include <map>
-
-namespace mlperf {
-namespace loadgen {
-
-class MinPassingQueriesFinder {
- public:
-  int64_t operator()(int64_t t, double p, double d, double c);
-
- private:
-  // Memoize prior computations results and use them to bound the binary search
-  // range for subsequent computations.
-
-  // TODO: Is there something more efficient to use besides std::map for
-  // caches_?
-  std::map<std::tuple<double, double, double>, std::map<int64_t, int64_t>>
-      caches_;
-};
-
-}  // namespace loadgen
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_EARLYSTOPPING_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc
deleted file mode 100644
index 75fdc9519..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/generated/version_generated.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// DO NOT EDIT: Autogenerated by version_generator.py.
-
-#include <string>
-
-namespace mlperf {
-
-const std::string& LoadgenVersion() {
-  static const std::string str = "4.1";
-  return str;
-}
-
-const std::string& LoadgenBuildDateLocal() {
-  static const std::string str = "2024-10-18T23:12:51.002440";
-  return str;
-}
-
-const std::string& LoadgenBuildDateUtc() {
-  static const std::string str = "2024-10-19T06:12:51.002446";
-  return str;
-}
-
-const std::string& LoadgenGitRevision() {
-  static const std::string str = "f5c8f17583";
-  return str;
-}
-
-const std::string& LoadgenGitCommitDate() {
-  static const std::string str = "2024-10-08T18:30:16+01:00";
-  return str;
-}
-
-const std::string& LoadgenGitStatus() {
-  static const std::string str = R"LGVG_RSLD()LGVG_RSLD";
-  return str;
-}
-
-const std::string& LoadgenGitLog() {
-  static const std::string str = R"LGVG_RSLD(f5c8f1758374aeaba26b2e84d31690111cfdf054 Fix bug: Loadgen ignoring token latency targets in user conf (#1874)
-976bb1ad9c7946be79507f3ff67955c27426af52 Set correct remote repo (#1871)
-41fa8aadd1ba0ecc97f6a519d8b42b04278e5f24 Add format files github action (#1682)
-518b454fd8647bfbd23a074e875e87353f33393e Tflite tpu (#1449)
-e0fdec1c7a75c98cfc194f13d62ac4388d419c8a Fix link in GettingStarted.ipynb (#1512)
-92bd8198d15411d7fb7d7c27f8904bc5a0bcfe7a Fix warning in the submission checker (#1808)
-224cfbf5c0e82cae6d48620025b7e1258ae3666a Fix typo in reference datatype (#1851)
-3ef1249b7f50a250c02c568342e0aea6638fc5a7 Fix docs (#1853)
-a0874c100c54cbc54fb743ac8bf9fb5fadc64135 Update build_wheels.yml (#1758)
-6eff09986e337ccf03f675c9f244d8ee93644e16 Extend the final report generation script to output a json file of results (#1825)
-54f3f93a73cc8ca5e3319ad87fb325e510574f56 Add binding for server_num_issue_query_threads parameter (#1862)
-c4d0b3ea98e6fe7252e50cb573f0d523da7979df Update docs: SCC24,  fix broken redirect  (#1843)
-7d2f0c41e5cd79c9178702867392e38f57953338 Update DLRM readme (#1811)
-cf5fddc5d0746bf3820eb0ab7294bbf709d788ab Enable systems to be marked as power only (#1850)
-81c2de69de4af90410cd1ba000fc5bd731bf6dee Documentation updates (#1821)
-73b02798219c794a735a7f2ddabbc3df9173352d Fix error with generate_final_report.py when the input CSV file is empty (#1827))LGVG_RSLD";
-  return str;
-}
-
-const std::string& LoadgenSha1OfFiles() {
-  static const std::string str = R"LGVG_RSLD(012aad77e5206c89d50718c46c119d1f3cb056b2 /.clang-format
-e173f4513f3c5dac1f0bea1473bb0a058e23f190 /=42
-d5274ff0b56e8d3cdb273174628a4461fca6f02a /CMakeLists.txt
-20a55bb946c2c0bbb564ced2af1e48efd096b3a8 /README.md
-5f6c6a784e9cd6995db47f9b9f70b1769909c9d8 /README_BUILD.md
-01f9ae9887f50bc030dc6107e740f40c43ca388f /README_FAQ.md
-32181da9e161c285f8fe46ddaa49e6cba2f9f918 /bindings/c_api.cc
-91f58bd79b83b278f3240174a9af747fc38aff74 /bindings/c_api.h
-ea4c89decad19eaf3217bfa2fb757d3b83a561d6 /bindings/python_api.cc
-53dba8ad4272190ceb6335c12fd25e53dc02a8cb /diagram_network_submission.png
-84c2f79309b237cef652aef6a187ba8e875a3952 /diagram_submission.png
-0cd7b546a389deac73f7955cd39255ed76557d62 /early_stopping.cc
-158fcae6a5f47e82150d6416fa1f7bcef37e77fe /early_stopping.h
-126e952d00f4ea9efd12405fb209aa3ed585e4b2 /issue_query_controller.cc
-923d9d5cdf598e3ec33d7a1110a31f7e11527ec7 /issue_query_controller.h
-6650091ba7a918f343b06eb7a5aa540eae87275f /loadgen.cc
-e00fdc6dbc85a8c9a8485dbcbfe2944f81251c4e /loadgen.h
-47f748307536f80cfc606947b440dd732afc2637 /loadgen_integration_diagram.svg
-197efc96d178e5d33a750d07fa7b2966417506ea /logging.cc
-ddb961df7bcc145bcd7cce8c21f7cf075350dcbe /logging.h
-ca17720f9c8246e821331946d893e830fc88f8bd /pyproject.toml
-13ad6d842200cb161d6927eb74a3fafd79c46c75 /query_dispatch_library.h
-e9187c8612bbdc972305b789feb6e15c26e96cfe /query_sample.h
-8323a2225be1dff31f08ecc86b76eb3de06568bc /query_sample_library.h
-a5ff7e77caa6e9e22ada90f0de0c865c987bf167 /requirements.txt
-34e2d2a44324cb07c884f92146ecbb8ef9d704e2 /results.cc
-d82500c326c2de83db411f1146882aa4692b419c /results.h
-13c49b028b22749b5f3c44f3d9bb489e8c0574e9 /setup.py
-18d4809589dae33317d88d9beeb5491a6e1ccdec /system_under_test.h
-c15c3e150030089a8d634bd2ad6d4b644002e613 /test_settings.h
-e21febd60f9b5bedd1fc81bb990f09c34b32043c /test_settings_internal.cc
-f1d5335b53ca610c30e0edc5d07999a27b5b4b9a /test_settings_internal.h
-3df8fdabf6eaea4697cf25d1dcb89cae88e36efd /utils.cc
-40775e32d619ea6356826ae5ea4174c7911f6894 /utils.h
-cbec2a5f98f9786c8c3d8b06b3d12df0b6550fa0 /version.cc
-9d574baa64424e9c708fcfedd3dbb0b518a65fcc /version.h
-eea9b9cb1a06cd1abe1bbdaee82f9af31527fedb /version_generator.py)LGVG_RSLD";
-  return str;
-}
-
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc
deleted file mode 100644
index c1abea9d1..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.cc
+++ /dev/null
@@ -1,552 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Implements IssueQueryController and other helper classes for
-/// query issuing.
-
-#include "issue_query_controller.h"
-
-#include <sstream>
-
-namespace mlperf {
-
-void RegisterIssueQueryThread() {
-  loadgen::IssueQueryController::GetInstance().RegisterThread();
-}
-
-/// \brief Loadgen implementation details.
-namespace loadgen {
-
-QueryMetadata::QueryMetadata(
-    const std::vector<QuerySampleIndex>& query_sample_indices,
-    std::chrono::nanoseconds scheduled_delta,
-    ResponseDelegate* response_delegate, SequenceGen* sequence_gen)
-    : scheduled_delta(scheduled_delta),
-      response_delegate(response_delegate),
-      sequence_id(sequence_gen->NextQueryId()),
-      wait_count_(query_sample_indices.size()) {
-  samples_.reserve(query_sample_indices.size());
-  for (QuerySampleIndex qsi : query_sample_indices) {
-    samples_.push_back({this, sequence_gen->NextSampleId(), qsi,
-                        sequence_gen->NextAccLogRng()});
-  }
-  query_to_send.reserve(query_sample_indices.size());
-  for (auto& s : samples_) {
-    query_to_send.push_back({reinterpret_cast<ResponseId>(&s), s.sample_index});
-  }
-}
-
-QueryMetadata::QueryMetadata(QueryMetadata&& src)
-    : query_to_send(std::move(src.query_to_send)),
-      scheduled_delta(src.scheduled_delta),
-      response_delegate(src.response_delegate),
-      sequence_id(src.sequence_id),
-      wait_count_(src.samples_.size()),
-      samples_(std::move(src.samples_)) {
-  // The move constructor should only be called while generating a
-  // vector of QueryMetadata, before it's been used.
-  // Assert that wait_count_ is in its initial state.
-  assert(src.wait_count_.load() == samples_.size());
-  // Update the "parent" of each sample to be this query; the old query
-  // address will no longer be valid.
-  // TODO: Only set up the sample parenting once after all the queries have
-  //       been created, rather than re-parenting on move here.
-  for (size_t i = 0; i < samples_.size(); i++) {
-    SampleMetadata* s = &samples_[i];
-    s->query_metadata = this;
-    query_to_send[i].id = reinterpret_cast<ResponseId>(s);
-  }
-}
-
-void QueryMetadata::NotifyOneSampleCompleted(PerfClock::time_point timestamp) {
-  size_t old_count = wait_count_.fetch_sub(1, std::memory_order_relaxed);
-  if (old_count == 1) {
-    all_samples_done_time = timestamp;
-    all_samples_done_.set_value();
-    response_delegate->QueryComplete();
-  }
-}
-
-void QueryMetadata::WaitForAllSamplesCompleted() {
-  all_samples_done_.get_future().wait();
-}
-
-PerfClock::time_point QueryMetadata::WaitForAllSamplesCompletedWithTimestamp() {
-  all_samples_done_.get_future().wait();
-  return all_samples_done_time;
-}
-
-// When server_coalesce_queries is set to true in Server scenario, we
-// sometimes coalesce multiple queries into one query. This is done by moving
-// the other query's sample into current query, while maintaining their
-// original scheduled_time.
-void QueryMetadata::CoalesceQueries(QueryMetadata* queries, size_t first,
-                                    size_t last, size_t stride) {
-  // Copy sample data over to current query, boldly assuming that each query
-  // only has one sample.
-  query_to_send.reserve((last - first) / stride +
-                        2);  // Extra one for the current query.
-  for (size_t i = first; i <= last; i += stride) {
-    auto& q = queries[i];
-    auto& s = q.samples_[0];
-    query_to_send.push_back({reinterpret_cast<ResponseId>(&s), s.sample_index});
-    q.scheduled_time = scheduled_time + q.scheduled_delta - scheduled_delta;
-    q.issued_start_time = issued_start_time;
-  }
-}
-
-void QueryMetadata::Decoalesce() { query_to_send.resize(1); }
-
-/// \brief A base template that should never be used since each scenario has
-/// its own specialization.
-template <TestScenario scenario>
-struct QueryScheduler {
-  static_assert(scenario != scenario, "Unhandled TestScenario");
-};
-
-/// \brief Schedules queries for issuance in the single stream scenario.
-template <>
-struct QueryScheduler<TestScenario::SingleStream> {
-  QueryScheduler(const TestSettingsInternal& /*settings*/,
-                 const PerfClock::time_point) {}
-
-  PerfClock::time_point Wait(QueryMetadata* next_query) {
-    auto tracer = MakeScopedTracer([](AsyncTrace& trace) { trace("Waiting"); });
-    if (prev_query != nullptr) {
-      prev_query->WaitForAllSamplesCompleted();
-    }
-    prev_query = next_query;
-
-    auto now = PerfClock::now();
-    next_query->scheduled_time = now;
-    next_query->issued_start_time = now;
-    return now;
-  }
-
-  QueryMetadata* prev_query = nullptr;
-};
-
-/// \brief Schedules queries for issuance in the multi stream scenario.
-template <>
-struct QueryScheduler<TestScenario::MultiStream> {
-  QueryScheduler(const TestSettingsInternal& /*settings*/,
-                 const PerfClock::time_point) {}
-
-  PerfClock::time_point Wait(QueryMetadata* next_query) {
-    auto tracer = MakeScopedTracer([](AsyncTrace& trace) { trace("Waiting"); });
-    if (prev_query != nullptr) {
-      prev_query->WaitForAllSamplesCompleted();
-    }
-    prev_query = next_query;
-
-    auto now = PerfClock::now();
-    next_query->scheduled_time = now;
-    next_query->issued_start_time = now;
-    return now;
-  }
-
-  QueryMetadata* prev_query = nullptr;
-};
-
-/// \brief Schedules queries for issuance in the server scenario.
-template <>
-struct QueryScheduler<TestScenario::Server> {
-  QueryScheduler(const TestSettingsInternal& /*settings*/,
-                 const PerfClock::time_point start)
-      : start(start) {}
-
-  PerfClock::time_point Wait(QueryMetadata* next_query) {
-    auto tracer =
-        MakeScopedTracer([](AsyncTrace& trace) { trace("Scheduling"); });
-
-    auto scheduled_time = start + next_query->scheduled_delta;
-    next_query->scheduled_time = scheduled_time;
-
-    auto now = PerfClock::now();
-    if (now < scheduled_time) {
-      std::this_thread::sleep_until(scheduled_time);
-      now = PerfClock::now();
-    }
-    next_query->issued_start_time = now;
-    return now;
-  }
-
-  const PerfClock::time_point start;
-};
-
-/// \brief Schedules queries for issuance in the offline scenario.
-template <>
-struct QueryScheduler<TestScenario::Offline> {
-  QueryScheduler(const TestSettingsInternal& /*settings*/,
-                 const PerfClock::time_point start)
-      : start(start) {}
-
-  PerfClock::time_point Wait(QueryMetadata* next_query) {
-    next_query->scheduled_time = start;
-    auto now = PerfClock::now();
-    next_query->issued_start_time = now;
-    return now;
-  }
-
-  const PerfClock::time_point start;
-};
-
-IssueQueryController& IssueQueryController::GetInstance() {
-  // The singleton.
-  static IssueQueryController instance;
-  return instance;
-}
-
-void IssueQueryController::RegisterThread() {
-  // Push this thread to thread queue.
-  auto thread_id = std::this_thread::get_id();
-  size_t thread_idx{0};
-  {
-    std::lock_guard<std::mutex> lock(mtx);
-    thread_idx = thread_ids.size();
-    thread_ids.emplace_back(thread_id);
-  }
-
-  LogDetail([thread_id, thread_idx](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    std::stringstream ss;
-    ss << "Registered IssueQueryThread[" << thread_idx
-       << "]. thread ID : " << std::hash<std::thread::id>()(thread_id);
-    MLPERF_LOG(detail, "generic_message", ss.str());
-#else
-    detail("Registered IssueQueryThread[" + std::to_string(thread_idx) +
-               "]. thread ID : ",
-           std::to_string(std::hash<std::thread::id>()(thread_id)));
-#endif
-  });
-
-  // Start test.
-  while (true) {
-    // Wait until the main thread signals a start or the end.
-    {
-      std::unique_lock<std::mutex> lock(mtx);
-      cond_var.wait(lock, [this]() { return issuing || end_test; });
-      // The test has ended.
-      if (end_test) {
-        break;
-      }
-    }
-
-    // Start issuing queries.
-    if (thread_idx <= num_threads) {
-      IssueQueriesInternal<TestScenario::Server, true>(num_threads, thread_idx);
-      {
-        std::lock_guard<std::mutex> lock(mtx);
-        thread_complete[thread_idx] = true;
-      }
-      cond_var.notify_all();
-    }
-
-    // Wait until all issue threads complete.
-    {
-      std::unique_lock<std::mutex> lock(mtx);
-      cond_var.wait(lock, [this]() { return !issuing; });
-    }
-  }
-}
-
-void IssueQueryController::SetNumThreads(size_t n) {
-  // Try waiting for IssueQueryThreads() to registered themselves.
-  std::unique_lock<std::mutex> lock(mtx);
-  const std::chrono::seconds timeout(10);
-  num_threads = n;
-  cond_var.wait_for(lock, timeout,
-                    [this]() { return thread_ids.size() >= num_threads; });
-  // If the number of registered threads do not match the settings, report an
-  // error.
-  if (num_threads != thread_ids.size()) {
-    LogDetail([this](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-      std::stringstream ss;
-      ss << "Mismatch between settings and number of registered "
-         << "IssueQueryThreads! settings.server_num_issue_query_threads = "
-         << num_threads << " but " << thread_ids.size()
-         << " threads registered.";
-      MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
-#else
-      detail.Error(
-          "Mismatch between settings and number of registered ",
-          "IssueQueryThreads! settings.server_num_issue_query_threads = ",
-          num_threads, " but ", thread_ids.size(), " threads registered.");
-#endif
-    });
-  }
-}
-
-template <TestScenario scenario>
-void IssueQueryController::StartIssueQueries(IssueQueryState* s) {
-  // Get the state.
-  state = s;
-  state->start_for_power = std::chrono::system_clock::now();
-  state->start_time = PerfClock::now();
-
-  if (scenario != TestScenario::Server || num_threads == 0) {
-    // Usually, we just use the same thread to issue queries.
-    IssueQueriesInternal<scenario, false>(1, 0);
-  } else {
-    // If server_num_issue_query_threads is non-zero, issue queries on the
-    // registered threads.
-    // Tell all threads to start issuing queries.
-    {
-      std::unique_lock<std::mutex> lock(mtx);
-      issuing = true;
-      thread_complete.assign(num_threads, false);
-    }
-    cond_var.notify_all();
-    // Wait until all issue threads complete.
-    {
-      std::unique_lock<std::mutex> lock(mtx);
-      cond_var.wait(lock, [this]() {
-        return std::all_of(thread_complete.begin(), thread_complete.end(),
-                           [](bool in) { return in; });
-      });
-      issuing = false;
-    }
-    cond_var.notify_all();
-  }
-}
-
-template void IssueQueryController::StartIssueQueries<
-    TestScenario::MultiStream>(IssueQueryState* s);
-template void IssueQueryController::StartIssueQueries<TestScenario::Offline>(
-    IssueQueryState* s);
-template void IssueQueryController::StartIssueQueries<TestScenario::Server>(
-    IssueQueryState* s);
-template void IssueQueryController::StartIssueQueries<
-    TestScenario::SingleStream>(IssueQueryState* s);
-
-void IssueQueryController::EndThreads() {
-  // Tell all the issue threads to end.
-  {
-    std::lock_guard<std::mutex> lock(mtx);
-    end_test = true;
-  }
-  cond_var.notify_all();
-}
-
-template <TestScenario scenario, bool multi_thread>
-void IssueQueryController::IssueQueriesInternal(size_t query_stride,
-                                                size_t thread_idx) {
-  // Get all the needed information.
-  auto sut = state->sut;
-  auto& queries = *state->queries;
-  auto& response_logger = *state->response_delegate;
-
-  // Some book-keeping about the number of queries issued.
-  size_t queries_issued = 0;
-  size_t queries_issued_per_iter = 0;
-  size_t queries_count = queries.size();
-
-  // Calculate the min/max queries per issue thread.
-  const auto& settings = *state->settings;
-  const size_t min_query_count = settings.min_query_count;
-  const size_t min_query_count_for_thread =
-      (thread_idx < (min_query_count % query_stride))
-          ? (min_query_count / query_stride + 1)
-          : (min_query_count / query_stride);
-  const size_t max_query_count = settings.max_query_count;
-  const size_t max_query_count_for_thread =
-      (thread_idx < (max_query_count % query_stride))
-          ? (max_query_count / query_stride + 1)
-          : (max_query_count / query_stride);
-
-  // Create query scheduler.
-  const auto start = state->start_time;
-  QueryScheduler<scenario> query_scheduler(settings, start);
-  auto last_now = start;
-
-  // We can never run out of generated queries in the server scenario,
-  // since the duration depends on the scheduled query time and not
-  // the actual issue time.
-  bool ran_out_of_generated_queries = scenario != TestScenario::Server;
-  // This is equal to the sum of numbers of samples issued.
-  size_t expected_latencies = 0;
-
-  for (size_t queries_idx = thread_idx; queries_idx < queries_count;
-       queries_idx += query_stride) {
-    queries_issued_per_iter = 0;
-    auto& query = queries[queries_idx];
-    auto tracer1 =
-        MakeScopedTracer([](AsyncTrace& trace) { trace("SampleLoop"); });
-    last_now = query_scheduler.Wait(&query);
-
-    // If in Server scenario and server_coalesce_queries is enabled, multiple
-    // queries are coalesed into one big query if the current time has already
-    // passed the scheduled time of multiple queries.
-    if (scenario == TestScenario::Server &&
-        settings.requested.server_coalesce_queries) {
-      auto current_query_idx = queries_idx;
-      for (; queries_idx + query_stride < queries_count;
-           queries_idx += query_stride) {
-        auto next_scheduled_time =
-            start + queries[queries_idx + query_stride].scheduled_delta;
-        // If current time hasn't reached the next query's scheduled time yet,
-        // don't include next query.
-        if (last_now < next_scheduled_time) {
-          break;
-        }
-        queries_issued_per_iter++;
-      }
-      if (queries_idx > current_query_idx) {
-        // Coalesced all the pass due queries.
-        query.CoalesceQueries(queries.data(), current_query_idx + query_stride,
-                              queries_idx, query_stride);
-      }
-    }
-
-    // Issue the query to the SUT.
-    {
-      auto tracer3 =
-          MakeScopedTracer([](AsyncTrace& trace) { trace("IssueQuery"); });
-      sut->IssueQuery(query.query_to_send);
-    }
-
-    // Increment the counter.
-    expected_latencies += query.query_to_send.size();
-    queries_issued_per_iter++;
-    queries_issued += queries_issued_per_iter;
-
-    if (scenario == TestScenario::Server &&
-        settings.requested.server_coalesce_queries) {
-      // Set the query back to its clean state.
-      query.Decoalesce();
-    }
-
-    if (state->mode == TestMode::AccuracyOnly) {
-      // TODO: Rate limit in accuracy mode so accuracy mode works even
-      //       if the expected/target performance is way off.
-      continue;
-    }
-
-    auto duration = (last_now - start);
-    if (scenario == TestScenario::Server) {
-      if (settings.max_async_queries != 0) {
-        // Checks if there are too many outstanding queries.
-        size_t queries_issued_total{0};
-        if (multi_thread) {
-          // To check actual number of async queries in multi-thread case,
-          // we would have to combine the number of queries_issued from all
-          // issue threads.
-          {
-            std::lock_guard<std::mutex> lock(state->mtx);
-            state->queries_issued += queries_issued_per_iter;
-            queries_issued_total = state->queries_issued;
-          }
-        } else {
-          queries_issued_total = queries_issued;
-        }
-        size_t queries_outstanding =
-            queries_issued_total -
-            response_logger.queries_completed.load(std::memory_order_relaxed);
-        if (queries_outstanding > settings.max_async_queries) {
-          LogDetail([thread_idx, queries_issued_total,
-                     queries_outstanding](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-            std::stringstream ss;
-            ss << "IssueQueryThread " << thread_idx
-               << " Ending early: Too many outstanding queries." << " issued "
-               << queries_issued_total << " outstanding "
-               << queries_outstanding;
-            MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
-#else
-            detail.Error("IssueQueryThread ", std::to_string(thread_idx),
-                         " Ending early: Too many outstanding queries.",
-                         "issued", std::to_string(queries_issued_total),
-                         "outstanding", std::to_string(queries_outstanding));
-#endif
-          });
-          break;
-        }
-      }
-    } else {
-      // Checks if we end normally.
-      if (queries_issued >= min_query_count_for_thread &&
-          duration >= settings.target_duration) {
-        LogDetail([thread_idx](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-          MLPERF_LOG(
-              detail, "generic_message",
-              "Ending naturally: Minimum query count and test duration met.");
-#else
-          detail(
-              " Ending naturally: Minimum query count and test duration met.");
-#endif
-        });
-        ran_out_of_generated_queries = false;
-        break;
-      }
-    }
-
-    // Checks if we have exceeded max_query_count for this thread.
-    if (settings.max_query_count != 0 &&
-        queries_issued >= max_query_count_for_thread) {
-      LogDetail([thread_idx, queries_issued](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-        std::stringstream ss;
-        ss << "IssueQueryThread " << thread_idx
-           << " Ending early: Max query count reached." << " query_count "
-           << queries_issued;
-        MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
-#else
-        detail.Error("IssueQueryThread ", std::to_string(thread_idx),
-                     " Ending early: Max query count reached.", "query_count",
-                     std::to_string(queries_issued));
-#endif
-      });
-      ran_out_of_generated_queries = false;
-      break;
-    }
-
-    // Checks if we have exceeded max_duration.
-    if (settings.max_duration.count() != 0 &&
-        duration > settings.max_duration) {
-      LogDetail([thread_idx, duration](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-        std::stringstream ss;
-        ss << "IssueQueryThread " << thread_idx
-           << " Ending early: Max test duration reached." << " duration_ns "
-           << duration.count();
-        MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
-#else
-        detail.Error("IssueQueryThread ", std::to_string(thread_idx),
-                     " Ending early: Max test duration reached.", "duration_ns",
-                     std::to_string(duration.count()));
-#endif
-      });
-      ran_out_of_generated_queries = false;
-      break;
-    }
-  }
-
-  // Combine the issuing statistics from multiple issue threads.
-  {
-    std::lock_guard<std::mutex> lock(state->mtx);
-    state->ran_out_of_generated_queries |= ran_out_of_generated_queries;
-    // In Server scenario and when max_async_queries != 0, we would have set
-    // state->queries_issued when we check max_async_queries in the loop.
-    if (!(scenario == TestScenario::Server && settings.max_async_queries != 0 &&
-          multi_thread)) {
-      state->queries_issued += queries_issued;
-    }
-    state->expected_latencies += expected_latencies;
-  }
-}
-
-}  // namespace loadgen
-
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h
deleted file mode 100644
index 5668c574e..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/issue_query_controller.h
+++ /dev/null
@@ -1,215 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Declare IssueQueryController and other helper classes for
-/// query issuing.
-
-#ifndef MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
-#define MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
-
-#include <stdint.h>
-
-#include <atomic>
-#include <chrono>
-#include <condition_variable>
-#include <future>
-#include <mutex>
-#include <queue>
-#include <random>
-#include <thread>
-#include <vector>
-
-#include "loadgen.h"
-#include "logging.h"
-#include "query_sample.h"
-#include "system_under_test.h"
-#include "test_settings_internal.h"
-#include "utils.h"
-
-namespace mlperf {
-
-namespace loadgen {
-
-struct SampleMetadata;
-class QueryMetadata;
-
-/// \brief Every query and sample within a call to StartTest gets a unique
-/// sequence id for easy cross reference, and a random number which is used to
-/// determine accuracy logging when it is enabled.
-struct SequenceGen {
-  uint64_t NextQueryId() { return query_id++; }
-  uint64_t NextSampleId() { return sample_id++; }
-  uint64_t CurrentSampleId() { return sample_id; }
-  double NextAccLogRng() { return accuracy_log_dist(accuracy_log_rng); }
-  void InitAccLogRng(uint64_t accuracy_log_rng_seed) {
-    accuracy_log_rng = std::mt19937(accuracy_log_rng_seed);
-  }
-
- private:
-  uint64_t query_id = 0;
-  uint64_t sample_id = 0;
-  std::mt19937 accuracy_log_rng;
-  std::uniform_real_distribution<double> accuracy_log_dist =
-      std::uniform_real_distribution<double>(0, 1);
-};
-
-/// \brief An interface for a particular scenario + mode to implement for
-/// extended hanlding of sample completion.
-struct ResponseDelegate {
-  virtual ~ResponseDelegate() = default;
-  virtual void SampleComplete(SampleMetadata*, QuerySampleResponse*,
-                              PerfClock::time_point,
-                              const ResponseCallback&) = 0;
-  virtual void TokenComplete(SampleMetadata*, QuerySampleResponse*,
-                             PerfClock::time_point,
-                             const ResponseCallback&) = 0;
-  virtual void QueryComplete() = 0;
-  std::atomic<size_t> queries_completed{0};
-};
-
-/// \brief Used by the loadgen to coordinate response data and completion.
-struct SampleMetadata {
-  QueryMetadata* query_metadata;
-  uint64_t sequence_id;
-  QuerySampleIndex sample_index;
-  double accuracy_log_val;
-};
-
-/// \brief Maintains data and timing info for a query and all its samples.
-class QueryMetadata {
- public:
-  QueryMetadata(const std::vector<QuerySampleIndex>& query_sample_indices,
-                std::chrono::nanoseconds scheduled_delta,
-                ResponseDelegate* response_delegate, SequenceGen* sequence_gen);
-  QueryMetadata(QueryMetadata&& src);
-
-  void NotifyOneSampleCompleted(PerfClock::time_point timestamp);
-
-  void WaitForAllSamplesCompleted();
-
-  PerfClock::time_point WaitForAllSamplesCompletedWithTimestamp();
-
-  /// \brief Coalesce multiple queries into one query.
-  /// When server_coalesce_queries is set to true in Server scenario, we
-  /// sometimes coalesce multiple queries into one query. This is done by moving
-  /// the other query's sample into current query, while maintaining their
-  /// original scheduled_time.
-  void CoalesceQueries(QueryMetadata* queries, size_t first, size_t last,
-                       size_t stride);
-
-  /// \brief Set a coalesced query back to its original state.
-  void Decoalesce();
-
- public:
-  std::vector<QuerySample> query_to_send;
-  const std::chrono::nanoseconds scheduled_delta;
-  ResponseDelegate* const response_delegate;
-  const uint64_t sequence_id;
-
-  // Performance information.
-
-  size_t scheduled_intervals = 0;  // Number of intervals between queries, as
-                                   // actually scheduled during the run.
-                                   // For the MultiStream scenario only.
-  PerfClock::time_point scheduled_time;
-  PerfClock::time_point issued_start_time;
-  PerfClock::time_point all_samples_done_time;
-
- private:
-  std::atomic<size_t> wait_count_;
-  std::promise<void> all_samples_done_;
-  std::vector<SampleMetadata> samples_;
-};
-
-/// \brief A state object for communications between the controller and its
-/// caller.
-struct IssueQueryState {
-  // Information from caller to controller.
-  SystemUnderTest* sut;
-  std::vector<QueryMetadata>* queries;
-  ResponseDelegate* response_delegate;
-  const TestSettingsInternal* settings;
-  TestMode mode;
-  // Information from controller to caller.
-  std::chrono::system_clock::time_point start_for_power;
-  PerfClock::time_point start_time;
-  bool ran_out_of_generated_queries;
-  size_t queries_issued;
-  size_t expected_latencies;
-  // The lock to modify this state (in multi-thread case).
-  std::mutex mtx;
-};
-
-/// \brief Controls the query issuing part.
-/// This controller handles both the cases if the user registers or does not
-/// register IssueQueryThreads. It is implemented as a singleton, and is NOT
-/// thread-safe (i.e. users should not call StartTest() on multiple threads).
-/// It is thread-safe with regard to IssueQueryThreads.
-class IssueQueryController {
- public:
-  /// \brief Get the controller instance singleton.
-  static IssueQueryController& GetInstance();
-
-  /// \brief Don't allow copy. This is a singleton.
-  IssueQueryController(IssueQueryController const&) = delete;
-  void operator=(IssueQueryController const&) = delete;
-
-  /// \brief Register an IssueQueryThread.
-  /// It is blocking until the entire test ends.
-  void RegisterThread();
-
-  /// \brief Set number of IssueQueryThreads and wait for thread registration.
-  /// If for any reason the number of registered threads do not match the
-  /// specified number, it prints out an error.
-  void SetNumThreads(size_t n);
-
-  /// \brief Kick off the query issuing.
-  /// The query issuing will be done on the current thread if there is no
-  /// registered IssueQueryThreads or if it is not in Server scenario.
-  template <TestScenario scenario>
-  void StartIssueQueries(IssueQueryState* s);
-
-  /// \brief Notify the IssueQueryThreads to end.
-  void EndThreads();
-
- private:
-  /// \brief Hide constructor. This is a singleton.
-  IssueQueryController() {}
-
-  /// \brief The internal helper which actually issues queries.
-  /// This should be called by the thread(s) which issues queries.
-  template <TestScenario scenario, bool multi_thread>
-  void IssueQueriesInternal(size_t query_stride, size_t thread_idx);
-
-  /// \brief The issue query state.
-  IssueQueryState* state;
-  /// \brief Locks for communications across IssueQueryThreads and the main
-  /// thread.
-  std::mutex mtx;
-  std::condition_variable cond_var;
-  /// \brief Thread ids of the registered IssueQueryThreads.
-  std::vector<std::thread::id> thread_ids;
-  size_t num_threads{0};
-  /// \brief Whether the threads should be actively issuing queries.
-  bool issuing{false};
-  /// \brief Flags for each IssueQueryThread to mark that it is done.
-  std::vector<bool> thread_complete;
-  /// \brief Whether the threads can end now.
-  bool end_test{false};
-};
-
-}  // namespace loadgen
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_ISSUE_QUERY_CONTROLLER_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc
deleted file mode 100644
index 42b2140de..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.cc
+++ /dev/null
@@ -1,1345 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "loadgen.h"
-
-#include <stdint.h>
-
-#include <algorithm>
-#include <atomic>
-#include <cassert>
-#include <chrono>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <future>
-#include <iomanip>
-#include <iostream>
-#include <queue>
-#include <random>
-#include <sstream>
-#include <string>
-#include <thread>
-#include <vector>
-
-#include "early_stopping.h"
-#include "issue_query_controller.h"
-#include "logging.h"
-#include "query_sample.h"
-#include "query_sample_library.h"
-#include "results.h"
-#include "system_under_test.h"
-#include "test_settings.h"
-#include "test_settings_internal.h"
-#include "utils.h"
-#include "version.h"
-
-namespace mlperf {
-
-/// \brief Loadgen implementation details.
-namespace loadgen {
-
-/// \brief A random set of samples in the QSL that should fit in RAM when
-/// loaded together.
-struct LoadableSampleSet {
-  std::vector<QuerySampleIndex> set;
-  const size_t sample_distribution_end;  // Excludes padding in MultiStream.
-};
-
-/// \brief Generates nanoseconds from a start time to multiple end times.
-/// TODO: This isn't very useful anymore. Remove it.
-struct DurationGeneratorNs {
-  const PerfClock::time_point start;
-  int64_t delta(PerfClock::time_point end) const {
-    return std::chrono::duration_cast<std::chrono::nanoseconds>(end - start)
-        .count();
-  }
-};
-
-/// \brief ResponseDelegate implementation templated by scenario and mode.
-template <TestScenario scenario, TestMode mode>
-struct ResponseDelegateDetailed : public ResponseDelegate {
-  double accuracy_log_offset = 0.0f;
-  double accuracy_log_prob = 0.0f;
-
-  void SampleComplete(SampleMetadata* sample, QuerySampleResponse* response,
-                      PerfClock::time_point complete_begin_time,
-                      const ResponseCallback& response_cb) override {
-    // Using a raw pointer here should help us hit the std::function
-    // small buffer optimization code path when we aren't copying data.
-    // For some reason, using std::unique_ptr<std::vector> wasn't moving
-    // into the lambda; even with C++14.
-    std::vector<uint8_t>* sample_data_copy = nullptr;
-    double accuracy_log_val =
-        sample->accuracy_log_val + accuracy_log_offset < 1.0
-            ? sample->accuracy_log_val + accuracy_log_offset
-            : sample->accuracy_log_val + accuracy_log_offset - 1.0;
-    if (mode == TestMode::AccuracyOnly ||
-        accuracy_log_val <= accuracy_log_prob) {
-      // if a response_cb callback is provided, data only needs to reside on the
-      // host *after* calling it note that the callback is blocking and will
-      // likely involve a memcpy from accelerator to host
-      if (response_cb) {
-        response_cb(response);
-      }
-      // TODO: Verify accuracy with the data copied here.
-      uint8_t* src_begin = reinterpret_cast<uint8_t*>(response->data);
-      uint8_t* src_end = src_begin + response->size;
-      sample_data_copy = new std::vector<uint8_t>(src_begin, src_end);
-    }
-    int64_t n_tokens = response->n_tokens;
-    Log([sample, complete_begin_time, sample_data_copy,
-         n_tokens](AsyncLog& log) {
-      QueryMetadata* query = sample->query_metadata;
-      DurationGeneratorNs sched{query->scheduled_time};
-      if (scenario == TestScenario::Server) {
-        // Trace the server scenario as a stacked graph via counter events.
-        DurationGeneratorNs issued{query->issued_start_time};
-        log.TraceCounterEvent("Latency", query->scheduled_time, "issue_delay",
-                              sched.delta(query->issued_start_time),
-                              "issue_to_done",
-                              issued.delta(complete_begin_time));
-      }
-
-      // While visualizing overlapping samples in offline mode is not
-      // practical, sample completion is still recorded for auditing purposes.
-      log.TraceSample("Sample", sample->sequence_id, query->scheduled_time,
-                      complete_begin_time, "sample_seq", sample->sequence_id,
-                      "query_seq", query->sequence_id, "sample_idx",
-                      sample->sample_index, "issue_start_ns",
-                      sched.delta(query->issued_start_time), "complete_ns",
-                      sched.delta(complete_begin_time));
-
-      if (sample_data_copy) {
-        log.LogAccuracy(sample->sequence_id, sample->sample_index,
-                        LogBinaryAsHexString{sample_data_copy}, n_tokens);
-        delete sample_data_copy;
-      }
-
-      // Record the latency at the end, since it will unblock the issuing
-      // thread and potentially destroy the metadata being used above.
-      QuerySampleLatency latency = sched.delta(complete_begin_time);
-      log.RecordSampleCompletion(sample->sequence_id, complete_begin_time,
-                                 latency, n_tokens);
-    });
-  }
-
-  void TokenComplete(SampleMetadata* sample, QuerySampleResponse* response,
-                     PerfClock::time_point complete_begin_time,
-                     const ResponseCallback& response_cb) override {
-    // Using a raw pointer here should help us hit the std::function
-    // small buffer optimization code path when we aren't copying data.
-    // For some reason, using std::unique_ptr<std::vector> wasn't moving
-    // into the lambda; even with C++14.
-    std::vector<uint8_t>* token_data_copy = nullptr;
-    double accuracy_log_val =
-        sample->accuracy_log_val + accuracy_log_offset < 1.0
-            ? sample->accuracy_log_val + accuracy_log_offset
-            : sample->accuracy_log_val + accuracy_log_offset - 1.0;
-    if (mode == TestMode::AccuracyOnly ||
-        accuracy_log_val <= accuracy_log_prob) {
-      uint8_t* src_begin = reinterpret_cast<uint8_t*>(response->data);
-      uint8_t* src_end = src_begin + response->size;
-      token_data_copy = new std::vector<uint8_t>(src_begin, src_end);
-    }
-    Log([sample, complete_begin_time, token_data_copy](AsyncLog& log) {
-      QueryMetadata* query = sample->query_metadata;
-      DurationGeneratorNs sched{query->scheduled_time};
-      if (scenario == TestScenario::Server) {
-        DurationGeneratorNs issued{query->issued_start_time};
-        log.TraceCounterEvent(
-            "Token_Latency", query->scheduled_time, "issue_delay",
-            sched.delta(query->issued_start_time), "issue_to_done",
-            issued.delta(complete_begin_time));
-      } else {
-        log.TraceSample("Token", sample->sequence_id, query->scheduled_time,
-                        complete_begin_time, "sample_seq", sample->sequence_id,
-                        "query_seq", query->sequence_id, "sample_idx",
-                        sample->sample_index, "issue_start_ns",
-                        sched.delta(query->issued_start_time), "complete_ns",
-                        sched.delta(complete_begin_time));
-      }
-      if (token_data_copy) {
-        log.CacheToken(sample->sequence_id,
-                       LogBinaryAsHexString{token_data_copy});
-      }
-      QuerySampleLatency latency = sched.delta(complete_begin_time);
-      log.RecordTokenCompletion(sample->sequence_id, complete_begin_time,
-                                latency);
-    });
-  }
-
-  void QueryComplete() override {
-    // We only need to track outstanding queries in the server scenario to
-    // detect when the SUT has fallen too far behind.
-    if (scenario == TestScenario::Server) {
-      queries_completed.fetch_add(1, std::memory_order_relaxed);
-    }
-  }
-};
-
-/// \brief Selects the query timestamps for all scenarios except Server.
-template <TestScenario scenario>
-auto ScheduleDistribution(double qps) {
-  return [period = std::chrono::duration_cast<std::chrono::nanoseconds>(
-              std::chrono::duration<double>(1.0 / qps))](auto& /*gen*/) {
-    return period;
-  };
-}
-
-/// \brief Selects the query timestamps for the Server scenario.
-template <>
-auto ScheduleDistribution<TestScenario::Server>(double qps) {
-  // Poisson arrival process corresponds to exponentially distributed
-  // interarrival times.
-  return [dist = std::exponential_distribution<>(qps)](auto& gen) mutable {
-    return std::chrono::duration_cast<std::chrono::nanoseconds>(
-        std::chrono::duration<double>(dist(gen)));
-  };
-}
-
-/// \brief Selects samples for the accuracy mode.
-template <TestMode mode>
-auto SampleDistribution(size_t sample_count, size_t stride, std::mt19937* rng) {
-  std::vector<size_t> indices;
-  for (size_t i = 0; i < sample_count; i += stride) {
-    indices.push_back(i);
-  }
-  std::shuffle(indices.begin(), indices.end(), *rng);
-  return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
-    return indices.at(i++);
-  };
-}
-
-/// \brief Selects samples for the performance mode.
-template <>
-auto SampleDistribution<TestMode::PerformanceOnly>(size_t sample_count,
-                                                   size_t /*stride*/,
-                                                   std::mt19937* /*rng*/) {
-  return [dist = std::uniform_int_distribution<>(0, sample_count - 1)](
-             auto& gen) mutable { return dist(gen); };
-}
-
-/// \brief Sample across the dataset, and ensure coverage of each of the
-/// samples.
-// Useful for non-uniform dataset (e.g. Llama2, GPTJ, 3d-unet)
-auto SampleDistributionEqualIssue(size_t sample_count, size_t set_size,
-                                  std::mt19937* rng) {
-  std::vector<size_t> indices;
-  std::vector<size_t> shuffle_indices(set_size);
-  std::iota(shuffle_indices.begin(), shuffle_indices.end(), 0);
-  for (size_t j = 0; j < sample_count; j += set_size) {
-    std::shuffle(shuffle_indices.begin(), shuffle_indices.end(), *rng);
-    indices.insert(indices.end(), shuffle_indices.begin(),
-                   shuffle_indices.end());
-  }
-  return [indices = std::move(indices), i = size_t(0)](auto& /*gen*/) mutable {
-    return indices.at((i++) % indices.size());
-  };
-}
-
-/// \brief Generates queries for the requested settings, templated by
-/// scenario and mode.
-/// \todo Make GenerateQueries faster.
-/// QueryMetadata is expensive to move; either reserve queries in advance
-/// so the queries vector doesn't need to grow. And/or parent samples to their
-/// queries only after all queries have been generated.
-/// \todo For the server scenario only, scale the query timeline at the end so
-/// the QPS as scheduled is equal to the QPS as requested.
-template <TestScenario scenario, TestMode mode>
-std::vector<QueryMetadata> GenerateQueries(
-    const TestSettingsInternal& settings,
-    const LoadableSampleSet& loaded_sample_set, SequenceGen* sequence_gen,
-    ResponseDelegate* response_delegate) {
-  auto tracer =
-      MakeScopedTracer([](AsyncTrace& trace) { trace("GenerateQueries"); });
-
-  auto& loaded_samples = loaded_sample_set.set;
-
-  // Generate 2x more samples than we think we'll need given the expected
-  // QPS in case the SUT is faster than expected.
-  // We should exit before issuing all queries.
-  // Does not apply to the server scenario since the duration only
-  // depends on the ideal scheduled time, not the actual issue time.
-  const int duration_multiplier = scenario == TestScenario::Server ? 1 : 2;
-  std::chrono::microseconds gen_duration =
-      duration_multiplier * settings.target_duration;
-  size_t min_queries = settings.min_query_count;
-
-  size_t samples_per_query = settings.samples_per_query;
-  if (mode == TestMode::AccuracyOnly && scenario == TestScenario::Offline) {
-    samples_per_query = loaded_sample_set.sample_distribution_end;
-  }
-
-  // We should not exit early in accuracy mode.
-  if (mode == TestMode::AccuracyOnly || settings.performance_issue_unique) {
-    gen_duration = std::chrono::microseconds(0);
-    // Integer truncation here is intentional.
-    // For MultiStream, loaded samples is properly padded.
-    // For Offline, we create a 'remainder' query at the end of this function.
-    min_queries = loaded_samples.size() / samples_per_query;
-  }
-
-  std::vector<QueryMetadata> queries;
-
-  // Using the std::mt19937 pseudo-random number generator ensures a modicum of
-  // cross platform reproducibility for trace generation.
-  std::mt19937 sample_rng(settings.sample_index_rng_seed);
-  std::mt19937 schedule_rng(settings.schedule_rng_seed);
-
-  constexpr bool kIsMultiStream = scenario == TestScenario::MultiStream;
-  const size_t sample_stride = kIsMultiStream ? samples_per_query : 1;
-
-  auto sample_distribution = SampleDistribution<mode>(
-      loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng);
-  // Use the unique sample distribution same as in AccuracyMode to
-  // to choose samples when either flag performance_issue_unique
-  // or performance_issue_same is set.
-  auto sample_distribution_unique = SampleDistribution<TestMode::AccuracyOnly>(
-      loaded_sample_set.sample_distribution_end, sample_stride, &sample_rng);
-
-  auto sample_distribution_equal_issue = SampleDistributionEqualIssue(
-      min_queries, loaded_samples.size(), &sample_rng);
-
-  auto schedule_distribution =
-      ScheduleDistribution<scenario>(settings.target_qps);
-
-  // When sample_concatenate_permutation is turned on, pad to a multiple of the
-  // complete dataset to ensure fairness.
-  auto enable_equal_issue = settings.sample_concatenate_permutation;
-  if (mode != TestMode::AccuracyOnly && enable_equal_issue) {
-    if (scenario == TestScenario::Offline &&
-        samples_per_query % loaded_samples.size() != 0) {
-      // In offline mode, we pad samples_per_query
-      size_t pad_size =
-          (loaded_samples.size() - samples_per_query % loaded_samples.size());
-      samples_per_query += pad_size;
-    } else if ((scenario != TestScenario::Offline) &&
-               (min_queries % loaded_samples.size() != 0)) {
-      // In Server, SingleStream, MultiStream mode, the min_queries should be
-      // padded
-      size_t pad_size =
-          (loaded_samples.size() - min_queries % loaded_samples.size());
-      min_queries += pad_size;
-    }
-  }
-
-  std::vector<QuerySampleIndex> samples(samples_per_query);
-  std::chrono::nanoseconds timestamp(0);
-  std::chrono::nanoseconds prev_timestamp(0);
-  // Choose a single sample to repeat when in performance_issue_same mode
-  QuerySampleIndex same_sample = settings.performance_issue_same_index;
-
-  while (prev_timestamp < gen_duration || queries.size() < min_queries) {
-    if (kIsMultiStream) {
-      QuerySampleIndex sample_i = settings.performance_issue_unique
-                                      ? sample_distribution_unique(sample_rng)
-                                  : settings.performance_issue_same
-                                      ? same_sample
-                                      : sample_distribution(sample_rng);
-      for (auto& s : samples) {
-        // Select contiguous samples in the MultiStream scenario.
-        // This will not overflow, since GenerateLoadableSets adds padding at
-        // the end of the loadable sets in the MultiStream scenario.
-        // The padding allows the starting samples to be the same for each
-        // query with respect to samples_per_query.
-        s = loaded_samples[sample_i++];
-      }
-    } else if (scenario == TestScenario::Offline) {
-      // For the Offline + Performance scenario, we also want to support
-      // contiguous samples. In this scenario the query can be much larger than
-      // what fits into memory. We simply repeat loaded_samples N times, plus a
-      // remainder to ensure we fill up samples. Note that this eliminates
-      // randomization.
-      size_t num_loaded_samples = loaded_samples.size();
-      size_t num_full_repeats = samples_per_query / num_loaded_samples;
-      uint64_t remainder = samples_per_query % (num_loaded_samples);
-      if (settings.performance_issue_same) {
-        std::fill(samples.begin(), samples.begin() + samples_per_query,
-                  loaded_samples[same_sample]);
-      } else {
-        for (size_t i = 0; i < num_full_repeats; ++i) {
-          std::copy(loaded_samples.begin(), loaded_samples.end(),
-                    samples.begin() + i * num_loaded_samples);
-
-          if (settings.sample_concatenate_permutation) {
-            std::shuffle(samples.begin() + i * num_loaded_samples,
-                         samples.begin() + (i + 1) * num_loaded_samples,
-                         sample_rng);
-          }
-        }
-
-        std::copy(loaded_samples.begin(), loaded_samples.begin() + remainder,
-                  samples.begin() + num_full_repeats * num_loaded_samples);
-
-        if (settings.sample_concatenate_permutation) {
-          assert(remainder == 0);
-        }
-      }
-    } else {
-      for (auto& s : samples) {
-        s = loaded_samples[settings.performance_issue_unique
-                               ? sample_distribution_unique(sample_rng)
-                           : settings.performance_issue_same ? same_sample
-                           : enable_equal_issue
-                               ? sample_distribution_equal_issue(sample_rng)
-                               : sample_distribution(sample_rng)];
-      }
-    }
-    queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
-    prev_timestamp = timestamp;
-    timestamp += schedule_distribution(schedule_rng);
-    // In equal_issue mode, the min_queries will be bumped up by a multiple of
-    // the dataset size if the test time has not met the threshold.
-    if (enable_equal_issue && (queries.size() >= min_queries) &&
-        (prev_timestamp < gen_duration) &&
-        (scenario != TestScenario::Offline)) {
-      min_queries += loaded_samples.size();
-    }
-  }
-
-  // See if we need to create a "remainder" query for offline+accuracy to
-  // ensure we issue all samples in loaded_samples. Offline doesn't pad
-  // loaded_samples like MultiStream does.
-  if (scenario == TestScenario::Offline && mode == TestMode::AccuracyOnly) {
-    size_t remaining_samples = loaded_samples.size() % samples_per_query;
-    if (remaining_samples != 0) {
-      samples.resize(remaining_samples);
-      for (auto& s : samples) {
-        s = loaded_samples[sample_distribution(sample_rng)];
-      }
-      queries.emplace_back(samples, timestamp, response_delegate, sequence_gen);
-    }
-  }
-
-  LogDetail([count = queries.size(), spq = samples_per_query,
-             duration = timestamp.count()](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "generated_query_count", count);
-    MLPERF_LOG(detail, "generated_samples_per_query", spq);
-    MLPERF_LOG(detail, "generated_query_duration", duration);
-#else
-    detail("GeneratedQueries: ", "queries", count, "samples per query", spq,
-           "duration", duration);
-#endif
-  });
-
-  return queries;
-}
-
-/// \brief Issues a series of pre-generated queries.
-// TODO: Templates for scenario and mode are overused, given the loadgen
-//       no longer generates queries on the fly. Should we reduce the
-//       use of templates?
-template <TestScenario scenario, TestMode mode>
-PerformanceResult IssueQueries(SystemUnderTest* sut,
-                               const TestSettingsInternal& settings,
-                               const LoadableSampleSet& loaded_sample_set,
-                               SequenceGen* sequence_gen) {
-  // Create reponse handler.
-  ResponseDelegateDetailed<scenario, mode> response_logger;
-  std::uniform_real_distribution<double> accuracy_log_offset_dist =
-      std::uniform_real_distribution<double>(0.0, 1.0);
-  std::mt19937 accuracy_log_offset_rng(settings.accuracy_log_rng_seed);
-  response_logger.accuracy_log_offset =
-      accuracy_log_offset_dist(accuracy_log_offset_rng);
-  response_logger.accuracy_log_prob = settings.accuracy_log_probability;
-
-  // Generate queries.
-  auto sequence_id_start = sequence_gen->CurrentSampleId();
-  std::vector<QueryMetadata> queries = GenerateQueries<scenario, mode>(
-      settings, loaded_sample_set, sequence_gen, &response_logger);
-
-  // Calculated expected number of queries
-  uint64_t expected_queries =
-      settings.target_qps * settings.min_duration.count() / 1000;
-  uint64_t minimum_queries =
-      settings.min_query_count * settings.samples_per_query;
-
-  if (scenario != TestScenario::Offline) {
-    expected_queries *= settings.samples_per_query;
-  } else {
-    minimum_queries = settings.min_sample_count;
-  }
-
-  expected_queries =
-      expected_queries < minimum_queries ? minimum_queries : expected_queries;
-
-  if (settings.accuracy_log_sampling_target > 0) {
-    response_logger.accuracy_log_prob =
-        (double)settings.accuracy_log_sampling_target / expected_queries;
-  }
-  auto sequence_id_end = sequence_gen->CurrentSampleId();
-  size_t max_latencies_to_record = sequence_id_end - sequence_id_start;
-
-  // Initialize logger for latency recording.
-  GlobalLogger().RestartLatencyRecording(sequence_id_start,
-                                         max_latencies_to_record);
-
-  // Create and initialize an IssueQueryState.
-  IssueQueryState state{
-      sut, &queries, &response_logger, &settings, mode, {}, {}, false, 0,
-      0,   {}};
-  auto& controller = IssueQueryController::GetInstance();
-
-  // Set number of IssueQueryThreads and wait for the threads to register.
-  controller.SetNumThreads(settings.requested.server_num_issue_query_threads);
-
-  // Start issuing the queries.
-  controller.StartIssueQueries<scenario>(&state);
-
-  // Gather query issuing statistics.
-  const auto start_for_power = state.start_for_power;
-  const auto start = state.start_time;
-  const auto ran_out_of_generated_queries = state.ran_out_of_generated_queries;
-  const auto queries_issued = state.queries_issued;
-  const auto expected_latencies = state.expected_latencies;
-
-  // Let the SUT know it should not expect any more queries.
-  sut->FlushQueries();
-
-  if (mode == TestMode::PerformanceOnly && ran_out_of_generated_queries) {
-    LogDetail([](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_ERROR(
-          detail, "error_runtime",
-          "Ending early: Ran out of generated queries to issue before the "
-          "minimum query count and test duration were reached. "
-          "Please update the relevant expected latency or target qps in the "
-          "TestSettings so they are more accurate.");
-#else
-      detail.Error(
-          "Ending early: Ran out of generated queries to issue before the "
-          "minimum query count and test duration were reached.");
-      detail(
-          "Please update the relevant expected latency or target qps in the "
-          "TestSettings so they are more accurate.");
-#endif
-    });
-  }
-
-  // Wait for tail queries to complete and collect all the latencies.
-  // We have to keep the synchronization primitives alive until the SUT
-  // is done with them.
-  auto& final_query = queries[queries_issued - 1];
-  std::vector<QuerySampleLatency> sample_latencies(
-      GlobalLogger().GetLatenciesBlocking(expected_latencies));
-
-  std::vector<QuerySampleLatency> first_token_latencies(
-      GlobalLogger().GetTokenLatencies(expected_latencies));
-
-  std::vector<QuerySampleLatency> time_per_output_token_arr(
-      GlobalLogger().GetTimePerOutputToken(expected_latencies));
-
-  std::vector<int64_t> tokens_per_sample(
-      GlobalLogger().GetTokensPerSample(expected_latencies));
-
-  // Log contention counters after every test as a sanity check.
-  GlobalLogger().LogContentionAndAllocations();
-
-  // This properly accounts for the fact that the max completion time may not
-  // belong to the final query. It also excludes any time spent postprocessing
-  // in the loadgen itself after final completion, which may be significant
-  // in the offline scenario.
-  PerfClock::time_point max_completion_time =
-      GlobalLogger().GetMaxCompletionTime();
-  auto sut_active_duration = max_completion_time - start;
-  LogDetail([start_for_power, sut_active_duration](AsyncDetail& detail) {
-    auto end_for_power =
-        start_for_power +
-        std::chrono::duration_cast<std::chrono::system_clock::duration>(
-            sut_active_duration);
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG_INTERVAL_START(detail, "power_begin",
-                              DateTimeStringForPower(start_for_power));
-    MLPERF_LOG_INTERVAL_END(detail, "power_end",
-                            DateTimeStringForPower(end_for_power));
-#else
-    detail("POWER_BEGIN: ", "mode", ToString(mode), "time",
-           DateTimeStringForPower(start_for_power));
-    detail("POWER_END: ", "mode", ToString(mode), "time",
-           DateTimeStringForPower(end_for_power));
-#endif
-  });
-
-  double max_latency =
-      QuerySampleLatencyToSeconds(GlobalLogger().GetMaxLatencySoFar());
-  double final_query_scheduled_time =
-      DurationToSeconds(final_query.scheduled_delta);
-  double final_query_issued_time =
-      DurationToSeconds(final_query.issued_start_time - start);
-  double final_query_all_samples_done_time =
-      DurationToSeconds(final_query.all_samples_done_time - start);
-
-  std::vector<QuerySampleLatency> query_latencies;
-  if (scenario == TestScenario::MultiStream) {
-    query_latencies.resize(queries_issued);
-    for (size_t i = 0; i < queries_issued; i++) {
-      query_latencies[i] = DurationGeneratorNs{queries[i].scheduled_time}.delta(
-          queries[i].all_samples_done_time);
-    }
-  }
-
-  return PerformanceResult{
-      std::move(sample_latencies),
-      std::move(query_latencies),
-      queries_issued,
-      max_latency,
-      final_query_scheduled_time,
-      final_query_issued_time,
-      final_query_all_samples_done_time,
-      TokenPerformanceResults{first_token_latencies, time_per_output_token_arr,
-                              tokens_per_sample}};
-}
-
-void LoadSamplesToRam(QuerySampleLibrary* qsl,
-                      const std::vector<QuerySampleIndex>& samples) {
-  LogDetail([&samples](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "loaded_qsl_set", samples);
-#else
-    std::string set("\"[");
-    for (auto i : samples) {
-      set += std::to_string(i) + ",";
-    }
-    set.resize(set.size() - 1);
-    set += "]\"";
-    detail("Loading QSL : ", "set", set);
-#endif
-  });
-  qsl->LoadSamplesToRam(samples);
-}
-
-/// \brief Generates random sets of samples in the QSL that we can load into
-/// RAM at the same time.
-std::vector<LoadableSampleSet> GenerateLoadableSets(
-    QuerySampleLibrary* qsl, const TestSettingsInternal& settings) {
-  auto tracer = MakeScopedTracer(
-      [](AsyncTrace& trace) { trace("GenerateLoadableSets"); });
-
-  std::vector<LoadableSampleSet> result;
-  std::mt19937 qsl_rng(settings.qsl_rng_seed);
-
-  // Generate indices for all available samples in the QSL.
-  const size_t qsl_total_count = qsl->TotalSampleCount();
-  std::vector<QuerySampleIndex> samples(qsl_total_count);
-  for (size_t i = 0; i < qsl_total_count; i++) {
-    samples[i] = static_cast<QuerySampleIndex>(i);
-  }
-
-  // Randomize the order of the samples.
-  std::shuffle(samples.begin(), samples.end(), qsl_rng);
-
-  // Partition the samples into loadable sets.
-  const size_t set_size = settings.performance_sample_count;
-  const size_t set_padding = (settings.scenario == TestScenario::MultiStream)
-                                 ? settings.samples_per_query - 1
-                                 : 0;
-  std::vector<QuerySampleIndex> loadable_set;
-  loadable_set.reserve(set_size + set_padding);
-
-  for (auto s : samples) {
-    loadable_set.push_back(s);
-    if (loadable_set.size() == set_size) {
-      result.push_back({std::move(loadable_set), set_size});
-      loadable_set.clear();
-      loadable_set.reserve(set_size + set_padding);
-    }
-  }
-
-  if (!loadable_set.empty()) {
-    // Copy the size since it will become invalid after the move.
-    size_t loadable_set_size = loadable_set.size();
-    result.push_back({std::move(loadable_set), loadable_set_size});
-  }
-
-  // Add padding for the multi stream scenario. Padding allows the
-  // starting sample to be the same for all SUTs, independent of the value
-  // of samples_per_query, while enabling samples in a query to be contiguous.
-  for (auto& loadable_set : result) {
-    auto& set = loadable_set.set;
-    for (size_t i = 0; i < set_padding; i++) {
-      // It's not clear in the spec if the STL deallocates the old container
-      // before assigning, which would invalidate the source before the
-      // assignment happens. Even though we should have reserved enough
-      // elements above, copy the source first anyway since we are just moving
-      // integers around.
-      QuerySampleIndex p = set[i];
-      set.push_back(p);
-    }
-  }
-
-  return result;
-}
-
-/// \brief Opens and owns handles to all of the log files.
-struct LogOutputs {
-  LogOutputs(const LogOutputSettings& output_settings,
-             const std::string& test_date_time) {
-    std::string prefix = output_settings.outdir;
-    prefix += "/" + output_settings.prefix;
-    if (output_settings.prefix_with_datetime) {
-      prefix += test_date_time + "_";
-    }
-    const std::string& suffix = output_settings.suffix;
-
-    summary_out.open(prefix + "summary" + suffix + ".txt");
-    detail_out.open(prefix + "detail" + suffix + ".txt");
-    accuracy_out.open(prefix + "accuracy" + suffix + ".json");
-    trace_out.open(prefix + "trace" + suffix + ".json");
-  }
-
-  bool CheckOutputs() {
-    bool all_ofstreams_good = true;
-    if (!summary_out.good()) {
-      all_ofstreams_good = false;
-      std::cerr << "LoadGen: Failed to open summary file.";
-    }
-    if (!detail_out.good()) {
-      all_ofstreams_good = false;
-      std::cerr << "LoadGen: Failed to open detailed log file.";
-    }
-    if (!accuracy_out.good()) {
-      all_ofstreams_good = false;
-      std::cerr << "LoadGen: Failed to open accuracy log file.";
-    }
-    if (!trace_out.good()) {
-      all_ofstreams_good = false;
-      std::cerr << "LoadGen: Failed to open trace file.";
-    }
-    return all_ofstreams_good;
-  }
-
-  std::ofstream summary_out;
-  std::ofstream detail_out;
-  std::ofstream accuracy_out;
-  std::ofstream trace_out;
-};
-
-/// \brief Find boundaries of performance settings by widening bounds
-/// exponentially.
-/// \details To find an upper bound of performance, widen an
-/// upper bound exponentially until finding a bound that can't satisfy
-/// performance constraints. i.e. [1, 2) -> [2, 4) -> [4, 8) -> ...
-template <TestScenario scenario>
-std::pair<PerformanceSummary, PerformanceSummary> FindBoundaries(
-    SystemUnderTest* sut, QuerySampleLibrary* qsl, SequenceGen* sequence_gen,
-    PerformanceSummary l_perf_summary) {
-  // Get upper bound
-  TestSettingsInternal u_settings = l_perf_summary.settings;
-  find_peak_performance::WidenPerformanceField<scenario>(&u_settings);
-
-  LogDetail(
-      [l_field = find_peak_performance::ToStringPerformanceField<scenario>(
-           l_perf_summary.settings),
-       u_field = find_peak_performance::ToStringPerformanceField<scenario>(
-           u_settings)](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-        MLPERF_LOG(detail, "generic_message",
-                   "FindBoundaries: Checking fields [" + l_field + ", " +
-                       u_field + ")");
-#else
-        detail("FindBoundaries: Checking fields [" + l_field + ", " + u_field +
-               ")");
-#endif
-      });
-
-  std::vector<loadgen::LoadableSampleSet> loadable_sets(
-      loadgen::GenerateLoadableSets(qsl, u_settings));
-  const LoadableSampleSet& performance_set = loadable_sets.front();
-  LoadSamplesToRam(qsl, performance_set.set);
-
-  PerformanceResult u_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
-      sut, u_settings, performance_set, sequence_gen));
-  PerformanceSummary u_perf_summary{sut->Name(), u_settings, std::move(u_pr)};
-
-  qsl->UnloadSamplesFromRam(performance_set.set);
-
-  std::string tmp;
-  if (!u_perf_summary.PerfConstraintsMet(&tmp)) {
-    return std::make_pair(l_perf_summary, u_perf_summary);
-  } else {
-    return FindBoundaries<scenario>(sut, qsl, sequence_gen, u_perf_summary);
-  }
-}
-
-/// \brief Find peak performance by binary search.
-/// \details The found lower & upper bounds by the function 'FindBoundaries' are
-/// used as initial bounds of binary search
-template <TestScenario scenario>
-PerformanceSummary FindPeakPerformanceBinarySearch(
-    SystemUnderTest* sut, QuerySampleLibrary* qsl, SequenceGen* sequence_gen,
-    const LoadableSampleSet& performance_set, PerformanceSummary l_perf_summary,
-    PerformanceSummary u_perf_summary) {
-  if (find_peak_performance::IsFinished<scenario>(l_perf_summary.settings,
-                                                  u_perf_summary.settings)) {
-    return l_perf_summary;
-  }
-
-  const TestSettingsInternal m_settings =
-      find_peak_performance::MidOfBoundaries<scenario>(l_perf_summary.settings,
-                                                       u_perf_summary.settings);
-
-  LogDetail([l_field =
-                 find_peak_performance::ToStringPerformanceField<scenario>(
-                     l_perf_summary.settings),
-             u_field =
-                 find_peak_performance::ToStringPerformanceField<scenario>(
-                     u_perf_summary.settings),
-             m_field =
-                 find_peak_performance::ToStringPerformanceField<scenario>(
-                     m_settings)](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(
-        detail, "generic_message",
-        "FindPeakPerformanceBinarySearch: Testing the mid value of bounds [" +
-            l_field + ", " + u_field + "): " + m_field);
-#else
-    detail(
-        "FindPeakPerformanceBinarySearch: Testing the mid value of bounds [" +
-        l_field + ", " + u_field + "): " + m_field);
-#endif
-  });
-
-  PerformanceResult m_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
-      sut, m_settings, performance_set, sequence_gen));
-  PerformanceSummary m_perf_summary{sut->Name(), m_settings, std::move(m_pr)};
-
-  std::string tmp;
-  if (m_perf_summary.PerfConstraintsMet(&tmp)) {
-    return FindPeakPerformanceBinarySearch<scenario>(
-        sut, qsl, sequence_gen, performance_set, m_perf_summary,
-        u_perf_summary);
-  } else {
-    return FindPeakPerformanceBinarySearch<scenario>(
-        sut, qsl, sequence_gen, performance_set, l_perf_summary,
-        m_perf_summary);
-  }
-}
-
-/// \brief Runs the performance mode, templated by scenario.
-template <TestScenario scenario>
-void RunPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
-                        const TestSettingsInternal& settings,
-                        SequenceGen* sequence_gen) {
-  LogDetail([](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "generic_message", "Starting performance mode");
-#else
-    detail("Starting performance mode:");
-#endif
-  });
-
-  // Use first loadable set as the performance set.
-  std::vector<loadgen::LoadableSampleSet> loadable_sets(
-      loadgen::GenerateLoadableSets(qsl, settings));
-  const LoadableSampleSet& performance_set = loadable_sets.front();
-  LoadSamplesToRam(qsl, performance_set.set);
-
-  // Start PerfClock/system_clock timers for measuring performance interval
-  // for comparison vs external timer.
-  auto pc_start_ts = PerfClock::now();
-  auto sc_start_ts = std::chrono::system_clock::now();
-  if (settings.print_timestamps) {
-    std::cout << "Loadgen :: Perf mode start. system_clock Timestamp = "
-              << std::chrono::system_clock::to_time_t(sc_start_ts) << "\n"
-              << std::flush;
-  }
-
-  PerformanceResult pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
-      sut, settings, performance_set, sequence_gen));
-
-  // Measure PerfClock/system_clock timer durations for comparison vs
-  // external timer.
-  auto pc_stop_ts = PerfClock::now();
-  auto sc_stop_ts = std::chrono::system_clock::now();
-  auto pc_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-                         pc_stop_ts - pc_start_ts)
-                         .count();
-  auto sc_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-                         sc_stop_ts - sc_start_ts)
-                         .count();
-  float pc_sc_ratio = static_cast<float>(pc_duration) / sc_duration;
-  if (settings.print_timestamps) {
-    std::cout << "Loadgen :: Perf mode stop. systme_clock Timestamp = "
-              << std::chrono::system_clock::to_time_t(sc_stop_ts) << "\n"
-              << std::flush;
-    std::cout << "Loadgen :: PerfClock Perf duration = " << pc_duration
-              << "ms\n"
-              << std::flush;
-    std::cout << "Loadgen :: system_clock Perf duration = " << sc_duration
-              << "ms\n"
-              << std::flush;
-    std::cout << "Loadgen :: PerfClock/system_clock ratio = " << std::fixed
-              << std::setprecision(4) << pc_sc_ratio << "\n"
-              << std::flush;
-  }
-
-  if (pc_sc_ratio > 1.01 || pc_sc_ratio < 0.99) {
-    LogDetail([pc_sc_ratio](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-      std::stringstream ss;
-      ss << "PerfClock and system_clock differ by more than 1%! "
-         << " pc_sc_ratio: " << pc_sc_ratio;
-      MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
-#else
-      detail.Error("PerfClock and system_clock differ by more than 1\%! ",
-                   "pc_sc_ratio", pc_sc_ratio);
-#endif
-    });
-  } else if (pc_sc_ratio > 1.001 || pc_sc_ratio < 0.999) {
-    LogDetail([pc_sc_ratio](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-      std::stringstream ss;
-      ss << "PerfClock and system_clock differ by more than 0.1%! "
-         << " pc_sc_ratio: " << pc_sc_ratio;
-      MLPERF_LOG_WARNING(detail, "warning_generic_message", ss.str());
-#else
-      detail.Warning("PerfClock and system_clock differ by more than 0.1\%. ",
-                     "pc_sc_ratio", pc_sc_ratio);
-#endif
-    });
-  }
-
-  PerformanceSummary perf_summary{sut->Name(), settings, std::move(pr)};
-  LogSummary([perf_summary](AsyncSummary& summary) mutable {
-    perf_summary.LogSummary(summary);
-  });
-  // Create a copy to prevent thread hazard between LogSummary and LogDetail.
-  PerformanceSummary perf_summary_detail{perf_summary};
-  LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
-    perf_summary_detail.LogDetail(detail);
-  });
-
-  qsl->UnloadSamplesFromRam(performance_set.set);
-}
-
-/// \brief Runs the binary search mode, templated by scenario.
-/// \details 1. Check whether lower bound from user satisfies the performance
-/// constraints, 2. Find an upper bound using the function 'FindBoundaries'
-/// based on the lower bound, 3. Find peak performance settings using the
-/// function 'FindPeakPerformanceBinarySearch'. note: Since we can't find a
-/// lower bound programmatically because of the monotonicity issue of Server
-/// scenario, rely on user's settings. After resolving this issue, we can
-/// make the function 'FindBoundaries' find a lower bound as well from some
-/// random initial settings.
-template <TestScenario scenario>
-void FindPeakPerformanceMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
-                             const TestSettingsInternal& base_settings,
-                             SequenceGen* sequence_gen) {
-  LogDetail([](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "generic_message", "Starting FindPeakPerformance mode");
-#else
-    detail("Starting FindPeakPerformance mode:");
-#endif
-  });
-
-  if (scenario != TestScenario::Server) {
-    LogDetail([unsupported_scenario = ToString(scenario)](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_ERROR(detail, "error_invalid_config",
-                       find_peak_performance::kNotSupportedMsg);
-#else
-      detail.Error(find_peak_performance::kNotSupportedMsg);
-#endif
-    });
-    return;
-  }
-
-  LogDetail(
-      [base_field = find_peak_performance::ToStringPerformanceField<scenario>(
-           base_settings)](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-        MLPERF_LOG(
-            detail, "generic_message",
-            "FindPeakPerformance: Check validity of the base settings field: " +
-                base_field);
-#else
-        detail(
-            "FindPeakPerformance: Check validity of the base settings field: " +
-            base_field);
-#endif
-      });
-
-  // 1. Check whether the lower bound came from user satisfy performance
-  // constraints or not.
-  std::vector<loadgen::LoadableSampleSet> base_loadable_sets(
-      loadgen::GenerateLoadableSets(qsl, base_settings));
-  const LoadableSampleSet& base_performance_set = base_loadable_sets.front();
-  LoadSamplesToRam(qsl, base_performance_set.set);
-
-  PerformanceResult base_pr(IssueQueries<scenario, TestMode::PerformanceOnly>(
-      sut, base_settings, base_performance_set, sequence_gen));
-  PerformanceSummary base_perf_summary{sut->Name(), base_settings,
-                                       std::move(base_pr)};
-
-  // We can also use all_constraints_met to check performance constraints,
-  // but to reduce searching time, leave it up to whether the settings satisfy
-  // min duration & min queries or not to users.
-  std::string msg;
-  if (!base_perf_summary.PerfConstraintsMet(&msg)) {
-    LogDetail([msg](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-      std::stringstream ss;
-      ss << "FindPeakPerformance: Initial lower bound does not satisfy "
-         << "performance constraints, msg: " << msg;
-      MLPERF_LOG_ERROR(detail, "error_runtime", ss.str());
-#else
-      detail.Error(
-          "FindPeakPerformance: Initial lower bound does not satisfy "
-          "performance constraints, msg: " +
-          msg);
-#endif
-    });
-
-    PerformanceSummary perf_summary{sut->Name(), base_settings,
-                                    std::move(base_perf_summary.pr)};
-    LogSummary([perf_summary](AsyncSummary& summary) mutable {
-      perf_summary.LogSummary(summary);
-    });
-    // Create a copy to prevent thread hazard between LogSummary and LogDetail.
-    PerformanceSummary perf_summary_detail{perf_summary};
-    LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
-      perf_summary_detail.LogDetail(detail);
-    });
-
-    qsl->UnloadSamplesFromRam(base_performance_set.set);
-
-    return;
-  }
-
-  // Clear loaded samples.
-  qsl->UnloadSamplesFromRam(base_performance_set.set);
-
-  // 2. Find an upper bound based on the lower bound.
-  std::pair<PerformanceSummary, PerformanceSummary> boundaries =
-      FindBoundaries<scenario>(sut, qsl, sequence_gen, base_perf_summary);
-  PerformanceSummary l_perf_summary = boundaries.first;
-  PerformanceSummary u_perf_summary = boundaries.second;
-
-  LogDetail(
-      [l_field = find_peak_performance::ToStringPerformanceField<scenario>(
-           l_perf_summary.settings),
-       u_field = find_peak_performance::ToStringPerformanceField<scenario>(
-           u_perf_summary.settings)](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-        MLPERF_LOG(detail, "generic_message",
-                   "FindPeakPerformance: Found boundaries: [" + l_field + ", " +
-                       u_field + ")");
-#else
-        detail("FindPeakPerformance: Found boundaries: [" + l_field + ", " +
-               u_field + ")");
-#endif
-      });
-
-  // Reuse performance_set, u_perf_summary has the largest 'samples_per_query'.
-  std::vector<loadgen::LoadableSampleSet> loadable_sets(
-      loadgen::GenerateLoadableSets(qsl, u_perf_summary.settings));
-  const LoadableSampleSet& performance_set = loadable_sets.front();
-  LoadSamplesToRam(qsl, performance_set.set);
-
-  // 3. Find peak performance settings using the found boundaries
-  PerformanceSummary perf_summary = FindPeakPerformanceBinarySearch<scenario>(
-      sut, qsl, sequence_gen, performance_set, l_perf_summary, u_perf_summary);
-
-  // Print-out the peak performance test setting.
-  LogDetail([field = find_peak_performance::ToStringPerformanceField<scenario>(
-                 perf_summary.settings)](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "generic_message",
-               "FindPeakPerformance: Found peak performance field: " + field);
-#else
-    detail("FindPeakPerformance: Found peak performance field: " + field);
-#endif
-  });
-
-  LogSummary([perf_summary](AsyncSummary& summary) mutable {
-    perf_summary.LogSummary(summary);
-  });
-  // Create a copy to prevent thread hazard between LogSummary and LogDetail.
-  PerformanceSummary perf_summary_detail{perf_summary};
-  LogDetail([perf_summary_detail](AsyncDetail& detail) mutable {
-    perf_summary_detail.LogDetail(detail);
-  });
-
-  qsl->UnloadSamplesFromRam(performance_set.set);
-}
-
-/// \brief Runs the accuracy mode, templated by scenario.
-template <TestScenario scenario>
-void RunAccuracyMode(SystemUnderTest* sut, QuerySampleLibrary* qsl,
-                     const TestSettingsInternal& settings,
-                     SequenceGen* sequence_gen) {
-  LogDetail([](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "generic_message", "Starting accuracy mode");
-#else
-    detail("Starting accuracy mode:");
-#endif
-  });
-
-  std::vector<loadgen::LoadableSampleSet> loadable_sets(
-      loadgen::GenerateLoadableSets(qsl, settings));
-
-  for (auto& loadable_set : loadable_sets) {
-    {
-      auto tracer = MakeScopedTracer(
-          [count = loadable_set.set.size()](AsyncTrace& trace) {
-            trace("LoadSamples", "count", count);
-          });
-      LoadSamplesToRam(qsl, loadable_set.set);
-    }
-
-    PerformanceResult pr(IssueQueries<scenario, TestMode::AccuracyOnly>(
-        sut, settings, loadable_set, sequence_gen));
-
-    {
-      auto tracer = MakeScopedTracer(
-          [count = loadable_set.set.size()](AsyncTrace& trace) {
-            trace("UnloadSampes", "count", count);
-          });
-      qsl->UnloadSamplesFromRam(loadable_set.set);
-    }
-  }
-}
-
-/// \brief Routes runtime scenario requests to the corresponding instances
-/// of its templated mode functions.
-struct RunFunctions {
-  using Signature = void(SystemUnderTest* sut, QuerySampleLibrary* qsl,
-                         const TestSettingsInternal& settings,
-                         SequenceGen* sequence_gen);
-
-  template <TestScenario compile_time_scenario>
-  static RunFunctions GetCompileTime() {
-    return {(RunAccuracyMode<compile_time_scenario>),
-            (RunPerformanceMode<compile_time_scenario>),
-            (FindPeakPerformanceMode<compile_time_scenario>)};
-  }
-
-  static RunFunctions Get(TestScenario run_time_scenario) {
-    switch (run_time_scenario) {
-      case TestScenario::SingleStream:
-        return GetCompileTime<TestScenario::SingleStream>();
-      case TestScenario::MultiStream:
-        return GetCompileTime<TestScenario::MultiStream>();
-      case TestScenario::Server:
-        return GetCompileTime<TestScenario::Server>();
-      case TestScenario::Offline:
-        return GetCompileTime<TestScenario::Offline>();
-    }
-    // We should not reach this point.
-    assert(false);
-    return GetCompileTime<TestScenario::SingleStream>();
-  }
-
-  Signature& accuracy;
-  Signature& performance;
-  Signature& find_peak_performance;
-};
-
-}  // namespace loadgen
-
-void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl,
-               const TestSettings& requested_settings,
-               const LogSettings& log_settings,
-               const std::string audit_config_filename) {
-  GlobalLogger().StartIOThread();
-
-  const std::string test_date_time = CurrentDateTimeISO8601();
-
-  loadgen::LogOutputs log_outputs(log_settings.log_output, test_date_time);
-  if (!log_outputs.CheckOutputs()) {
-    return;
-  }
-
-  GlobalLogger().StartLogging(&log_outputs.summary_out, &log_outputs.detail_out,
-                              &log_outputs.accuracy_out,
-                              log_settings.log_output.copy_detail_to_stdout,
-                              log_settings.log_output.copy_summary_to_stdout);
-
-  GlobalLogger().SetUseTokens(requested_settings.use_token_latencies);
-  bool needs_first_token =
-      (requested_settings.scenario != TestScenario::Offline);
-  GlobalLogger().SetNeedsFirstToken(needs_first_token);
-
-  if (log_settings.enable_trace) {
-    GlobalLogger().StartNewTrace(&log_outputs.trace_out, PerfClock::now());
-  }
-
-  // measure sut->Name() response time
-  PerfClock::time_point pre_get_sut_name_ts = PerfClock::now();
-  const std::string& sut_name = sut->Name();
-  PerfClock::time_point post_get_sut_name_ts = PerfClock::now();
-
-  auto get_sut_name_duration_ns =
-      std::chrono::duration_cast<std::chrono::nanoseconds>(
-          post_get_sut_name_ts - pre_get_sut_name_ts)
-          .count();
-
-  LogLoadgenVersion();
-  LogDetail([sut, qsl, test_date_time, &sut_name,
-             &get_sut_name_duration_ns](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "test_datetime", test_date_time);
-    MLPERF_LOG(detail, "sut_name", sut_name);
-    MLPERF_LOG(detail, "get_sut_name_duration_ns", get_sut_name_duration_ns);
-    MLPERF_LOG(detail, "qsl_name", qsl->Name());
-    MLPERF_LOG(detail, "qsl_reported_total_count", qsl->TotalSampleCount());
-    MLPERF_LOG(detail, "qsl_reported_performance_count",
-               qsl->PerformanceSampleCount());
-#else
-    detail("Date + time of test: ", test_date_time);
-    detail("System Under Test (SUT) name: ", sut_name);
-    detail("Get SUT name time [ns]: ", get_sut_name_duration_ns);
-    detail("Query Sample Library (QSL) name: ", qsl->Name());
-    detail("QSL total size: ", qsl->TotalSampleCount());
-    detail("QSL performance size*: ", qsl->PerformanceSampleCount());
-    detail("*TestSettings (performance_sample_count_override) can override");
-    detail("*Refer to Effective Settings for actual value");
-#endif
-  });
-
-  TestSettings test_settings = requested_settings;
-  // Look for Audit Config file to override TestSettings during audit
-  if (FileExists(audit_config_filename)) {
-    LogDetail([](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_WARNING(detail, "warning_generic_message",
-                         "Found Audit Config file (audit.config)."
-                         " Overriding TestSettings from audit.config file.");
-#else
-      detail(
-          "Found Audit Config file (audit.config)."
-          " Overriding TestSettings from audit.config file.");
-#endif
-    });
-    std::string audit_scenario = loadgen::ToString(test_settings.scenario);
-    // Remove Spaces from the string
-    RemoveValue(&audit_scenario, ' ');
-    const std::string generic_model = "*";
-    test_settings.FromConfig(audit_config_filename, generic_model,
-                             audit_scenario, 2);
-  }
-  if (test_settings.test05) {
-    // If the configuration indicates we are running test05,
-    // random seeds
-    LogDetail([](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_WARNING(detail, "warning_generic_message",
-                         "Test05 flag detected"
-                         " Overriding random seeds");
-#else
-      detail(
-          "Test05 flag detected"
-          " Overriding random seeds");
-#endif
-    });
-    test_settings.mode = TestMode::PerformanceOnly;
-    test_settings.qsl_rng_seed = requested_settings.test05_qsl_rng_seed;
-    test_settings.sample_index_rng_seed =
-        requested_settings.test05_sample_index_rng_seed;
-    test_settings.schedule_rng_seed =
-        requested_settings.test05_schedule_rng_seed;
-  }
-
-  loadgen::TestSettingsInternal sanitized_settings(
-      test_settings, qsl->PerformanceSampleCount());
-  sanitized_settings.LogAllSettings();
-
-  auto run_funcs = loadgen::RunFunctions::Get(sanitized_settings.scenario);
-
-  loadgen::SequenceGen sequence_gen;
-  switch (sanitized_settings.mode) {
-    case TestMode::SubmissionRun:
-      run_funcs.accuracy(sut, qsl, sanitized_settings, &sequence_gen);
-      run_funcs.performance(sut, qsl, sanitized_settings, &sequence_gen);
-      break;
-    case TestMode::AccuracyOnly:
-      run_funcs.accuracy(sut, qsl, sanitized_settings, &sequence_gen);
-      break;
-    case TestMode::PerformanceOnly:
-      run_funcs.performance(sut, qsl, sanitized_settings, &sequence_gen);
-      break;
-    case TestMode::FindPeakPerformance:
-      run_funcs.find_peak_performance(sut, qsl, sanitized_settings,
-                                      &sequence_gen);
-      break;
-  }
-
-  loadgen::IssueQueryController::GetInstance().EndThreads();
-
-  // Stop tracing after logging so all logs are captured in the trace.
-  GlobalLogger().StopLogging();
-  GlobalLogger().StopTracing();
-  GlobalLogger().StopIOThread();
-}
-
-void AbortTest() {
-  loadgen::IssueQueryController::GetInstance().EndThreads();
-  GlobalLogger().StopLogging();
-  GlobalLogger().StopTracing();
-  GlobalLogger().StopIOThread();
-}
-
-void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count,
-                          const ResponseCallback& response_cb) {
-  PerfClock::time_point timestamp = PerfClock::now();
-
-  auto tracer = MakeScopedTracer(
-      [](AsyncTrace& trace) { trace("QuerySamplesComplete"); });
-
-  const QuerySampleResponse* end = responses + response_count;
-
-  // Notify first to unblock loadgen production ASAP.
-  for (QuerySampleResponse* response = responses; response < end; response++) {
-    loadgen::SampleMetadata* sample =
-        reinterpret_cast<loadgen::SampleMetadata*>(response->id);
-    loadgen::QueryMetadata* query = sample->query_metadata;
-    query->NotifyOneSampleCompleted(timestamp);
-  }
-
-  // Log samples.
-  for (QuerySampleResponse* response = responses; response < end; response++) {
-    loadgen::SampleMetadata* sample =
-        reinterpret_cast<loadgen::SampleMetadata*>(response->id);
-    loadgen::QueryMetadata* query = sample->query_metadata;
-    query->response_delegate->SampleComplete(sample, response, timestamp,
-                                             response_cb);
-  }
-  // PerfClock::time_point end_timestamp = PerfClock::now();
-  // mlperf::samples_overhead_acum += (end_timestamp - timestamp).count();
-}
-
-void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count,
-                        const ResponseCallback& response_cb) {
-  PerfClock::time_point timestamp = PerfClock::now();
-
-  auto tracer =
-      MakeScopedTracer([](AsyncTrace& trace) { trace("FirstTokenComplete"); });
-
-  const QuerySampleResponse* end = responses + response_count;
-
-  // Log samples.
-  for (QuerySampleResponse* response = responses; response < end; response++) {
-    loadgen::SampleMetadata* sample =
-        reinterpret_cast<loadgen::SampleMetadata*>(response->id);
-    loadgen::QueryMetadata* query = sample->query_metadata;
-    query->response_delegate->TokenComplete(sample, response, timestamp,
-                                            response_cb);
-  }
-  // PerfClock::time_point end_timestamp = PerfClock::now();
-  // mlperf::tokens_overhead_acum += (end_timestamp - timestamp).count();
-}
-
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h
deleted file mode 100644
index 84e02656c..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Provides the entry points for a SUT to start a test and respond
-/// to issued queries.
-
-#ifndef MLPERF_LOADGEN_LOADGEN_H_
-#define MLPERF_LOADGEN_LOADGEN_H_
-
-#include <cstddef>
-#include <functional>
-#include <numeric>
-#include <string>
-
-/// \brief Contains the loadgen API.
-namespace mlperf {
-
-struct QuerySampleResponse;
-class QuerySampleLibrary;
-class SystemUnderTest;
-struct TestSettings;
-struct LogSettings;
-
-using ResponseCallback = std::function<void(QuerySampleResponse*)>;
-
-/// \addtogroup LoadgenAPI Loadgen API
-/// @{
-
-///
-/// \brief SUT calls this to notify loadgen of completed samples.
-/// \details
-/// * The samples may be from any combination of queries or partial queries as
-///   issued by \link mlperf::SystemUnderTest::IssueQuery
-///
-///   SystemUnderTest::IssueQuery \endlink.
-/// * The SUT is responsible for owning and allocating the reponse data. The
-///   loadgen will copy the response data if needed (e.g. for accuracy mode).
-///   + If no response callback is provided, the response data must remain valid
-///     for the entire duration of this call.
-///   + The response callback is untimed; it is called for each response in
-///     responses after the loadgen records the completion time and before the
-///     loadgen copies the response data. The response callback enables the
-///     loadgen to simulate response data being stored in accelerator DRAM.
-///     After the response callback is called, response data must reside on the
-///     host so that the loadgen can copy it. Submitters must seek prior
-///     approval to use this feature of loadgen (refer to
-///     https://github.com/mlcommons/inference_policies/blob/master/inference_rules.adoc#5-load-generator).
-/// * All calls to QuerySampleComplete are thread-safe and wait-free bounded.
-///   + Any number of threads can call QuerySampleComplete simultaneously.
-///   + Regardless of where any other thread stalls, the current thread will
-///     finish QuerySampleComplete in a bounded number of cycles.
-///   + Note: If a callback is provided, the SUT must ensure that the callback
-///     is also thread-safe and wait-free bounded for the above to hold.
-void QuerySamplesComplete(QuerySampleResponse* responses, size_t response_count,
-                          const ResponseCallback& response_cb = {});
-
-void FirstTokenComplete(QuerySampleResponse* responses, size_t response_count,
-                        const ResponseCallback& response_cb = {});
-
-///
-/// \brief Starts the test against SUT with the specified settings.
-/// \details This is the C++ entry point. See mlperf::c::StartTest for the
-/// C entry point.
-///
-void StartTest(SystemUnderTest* sut, QuerySampleLibrary* qsl,
-               const TestSettings& requested_settings,
-               const LogSettings& log_settings,
-               const std::string audit_config_filename = "audit.config");
-
-///
-/// \brief Aborts the running test.
-/// \details This function will stop issueing new samples to the SUT. StartTest
-/// will return after the current inference finishes. Since StartTest is a
-/// blocking function, this function can only be called in another thread.
-void AbortTest();
-
-///
-/// \brief Register a thread for query issuing in Server scenario.
-/// \details If a thread registers itself, the thread(s) is used to call SUT's
-/// IssueQuery(). This function is blocking until the entire test is done. The
-/// number of registered threads must match server_num_issue_query_threads in
-/// TestSettings. This function only has effect in Server scenario.
-/// This is the C++ entry point. See mlperf::c::RegisterIssueQueryThread for the
-/// C entry point.
-///
-void RegisterIssueQueryThread();
-// inline long long samples_overhead_acum;
-// inline long long tokens_overhead_acum;
-/// @}
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_LOADGEN_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg
deleted file mode 100644
index 17dd1b481..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/loadgen_integration_diagram.svg
+++ /dev/null
@@ -1,85 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created by diasvg.py -->
-<svg width="29.100cm" height="12.100cm" viewBox="1.950 0.950 29.100 12.100"
- xmlns="http://www.w3.org/2000/svg"
- xmlns:xlink="http://www.w3.org/1999/xlink">
-<!-- Layer: Background -->
-<g id="Background">
-<!-- Flowchart - Box -->
-<rect x="2.000" y="2.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
-<text x="6.000" y="3.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-Model + Dataset</text>
-<!-- Flowchart - Box -->
-<rect x="2.000" y="6.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
-<text x="6.000" y="7.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-Pre Processor</text>
-<!-- Flowchart - Box -->
-<rect x="24.000" y="6.000" width="7.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
-<text x="27.500" y="7.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-Post Processor</text>
-<!-- Flowchart - Box -->
-<rect x="13.000" y="1.000" width="8.000" height="8.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
-<text x="17.000" y="5.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-Benchmark</text>
-<!-- Flowchart - Box -->
-<rect x="24.000" y="1.000" width="7.000" height="4.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
-<text x="27.500" y="3.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-Backend</text>
-<!-- Flowchart - Box -->
-<rect x="13.000" y="11.000" width="8.000" height="2.000" fill="#FFFFFF" stroke="#000000" stroke-width="0.100"  rx="0.000" />
-<text x="17.000" y="12.195"  fill="#000000" text-anchor="middle" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-LoadGen</text>
-<!-- Standard - Line -->
-<line x1="10.000" y1="3.000" x2="12.513" y2="3.000" stroke="#000000" stroke-width="0.100" />
-<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="12.888,3.000 12.388,3.250 12.513,3.000 12.388,2.750 "/>
-<!-- Standard - Line -->
-<line x1="10.000" y1="7.000" x2="12.513" y2="7.000" stroke="#000000" stroke-width="0.100" />
-<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="12.888,7.000 12.388,7.250 12.513,7.000 12.388,6.750 "/>
-<!-- Standard - Line -->
-<line x1="15.000" y1="9.000" x2="15.000" y2="10.513" stroke="#000000" stroke-width="0.100" />
-<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="15.000,10.888 14.750,10.388 15.000,10.513 15.250,10.388 "/>
-<!-- Standard - Line -->
-<line x1="17.000" y1="11.000" x2="17.000" y2="9.487" stroke="#000000" stroke-width="0.100" />
-<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="17.000,9.112 17.250,9.612 17.000,9.487 16.750,9.612 "/>
-<!-- Standard - Line -->
-<line x1="19.000" y1="9.000" x2="19.000" y2="10.513" stroke="#000000" stroke-width="0.100" />
-<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="19.000,10.888 18.750,10.388 19.000,10.513 19.250,10.388 "/>
-<!-- Standard - Line -->
-<line x1="21.000" y1="3.000" x2="23.513" y2="3.000" stroke="#000000" stroke-width="0.100" />
-<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="23.888,3.000 23.388,3.250 23.513,3.000 23.388,2.750 "/>
-<!-- Standard - Line -->
-<line x1="24.000" y1="7.000" x2="21.487" y2="7.000" stroke="#000000" stroke-width="0.100" />
-<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="21.112,7.000 21.612,6.750 21.487,7.000 21.612,7.250 "/>
-<!-- Standard - Text -->
-<text x="10.850" y="2.650"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-1</text>
-<!-- Standard - Text -->
-<!-- Standard - Text -->
-<text x="14.100" y="9.950"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-2</text>
-<!-- Standard - Text -->
-<text x="16.100" y="10.450"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-3</text>
-<!-- Standard - Text -->
-<text x="18.100" y="9.950"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-5</text>
-<!-- Standard - Text -->
-<text x="21.950" y="2.650"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-4</text>
-<!-- Standard - Text -->
-<!-- Standard - Text -->
-<text x="24.500" y="12.221"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-LoadGen Logs</text>
-<!-- Standard - Line -->
-<line x1="21.000" y1="12.000" x2="23.513" y2="12.000" stroke="#000000" stroke-width="0.100" />
-<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="23.888,12.000 23.388,12.250 23.513,12.000 23.388,11.750 "/>
-<!-- Standard - Text -->
-<text x="21.900" y="11.450"  fill="#000000" text-anchor="start" font-size="0.80" font-family="sans" font-style="normal" font-weight="400">
-6</text>
-<!-- Standard - Text -->
-<!-- Standard - Text -->
-<!-- Standard - Line -->
-<line x1="27.500" y1="5.000" x2="27.500" y2="5.513" stroke="#000000" stroke-width="0.100" />
-<polygon fill="#000000" stroke="#000000" stroke-width="0.100"  points="27.500,5.888 27.250,5.388 27.500,5.513 27.750,5.388 "/>
-</g>
-</svg>
\ No newline at end of file
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc
deleted file mode 100644
index 807c1954a..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.cc
+++ /dev/null
@@ -1,1301 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Implements a logging system with a central IO thread that handles
-/// all stringification and IO.
-/// \details Log-producing threads only submit lambdas to be executed on the
-/// IO thread.
-/// All producers and consumers use lock-free operations that guarantee
-/// forward progress independent of a) other stalled threads and b) where
-/// those threads are stalled.
-/// Each thread uses a double-buffering scheme to queue its logs. One buffer
-/// is always reserved for writes and the other is reserved for reads.
-/// A producing thread sends requests to the IOThread to swap the buffers
-/// and the IOThread does the actual read/write swap after it has finished
-/// reading the buffer it was working on.
-
-#include "logging.h"
-
-#include <cassert>
-#include <cmath>
-#include <future>
-#include <iomanip>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
-#define WIN32_LEAN_AND_MEAN
-#define NOMINMAX
-#include <process.h>
-#include <windows.h>
-#define MLPERF_GET_PID() _getpid()
-#else
-#include <unistd.h>
-#define MLPERF_GET_PID() getpid()
-#endif
-
-// Use system-level TID for tracing. This enables correlation with other
-// performance tools that are not aware of C++ std::thread::id.
-#if defined(__linux__)
-#include <sys/syscall.h>
-#define MLPERF_GET_TID() syscall(SYS_gettid)
-#elif defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
-#define MLPERF_GET_TID() GetCurrentThreadId()
-#elif defined(__APPLE__)
-#define MLPERF_GET_TID() \
-  std::hash<std::thread::id>{}(std::this_thread::get_id())
-#else
-// TODO: std::this_thread::id is a class but MLPERF_GET_TID() assigned to
-// uint64_t
-#define MLPERF_GET_TID() std::this_thread::get_id()
-#endif
-
-#include "utils.h"
-
-namespace mlperf {
-namespace logging {
-
-namespace {
-
-uintptr_t SwapRequestSlotIsWritableValue(size_t id) {
-  // LSB of 1 indicates that this isn't a pointer.
-  // MSBs encode the id to detect collisions when a slot in
-  // |thread_swap_request_slots_| is reused for a different id and the request
-  // for the previous id is very slow.
-  return (id << 1) | 0x1;
-}
-
-bool SwapRequestSlotIsReadable(uintptr_t value) {
-  // Valid pointers will not have their lsb set.
-  return (value & 0x1) != 0x1;
-}
-
-constexpr size_t kMaxThreadsToLog = 1024;
-constexpr std::chrono::milliseconds kLogPollPeriod(10);
-
-/// \brief How many log entries to pre-allocate per thread to help avoid
-/// runtime allocation.
-constexpr size_t kTlsLogReservedEntryCount = 1024;
-
-constexpr auto kInvalidLatency = std::numeric_limits<QuerySampleLatency>::min();
-constexpr auto nTokenInvalid = std::numeric_limits<int64_t>::min();
-
-}  // namespace
-
-const std::string& ArgValueTransform(const bool& value) {
-  static const std::string v_true("true");
-  static const std::string v_false("false");
-  return value ? v_true : v_false;
-}
-
-char Bin2Hex(uint8_t four_bits) {
-  char number = '0' + four_bits;
-  char letter = ('A' - 10) + four_bits;
-  return four_bits < 10 ? number : letter;
-}
-
-const std::string ArgValueTransform(const LogBinaryAsHexString& value) {
-  if (value.data == nullptr) {
-    return "\"\"";
-  }
-  std::string hex;
-  hex.reserve(value.data->size() + 2);
-  hex.push_back('"');
-  for (auto b : *value.data) {
-    hex.push_back(Bin2Hex(b >> 4));
-    hex.push_back(Bin2Hex(b & 0x0F));
-  }
-  hex.push_back('"');
-  return hex;
-}
-
-#if USE_NEW_LOGGING_FORMAT
-const std::string ArgValueTransform(const std::string& value) {
-  return std::string("\"") + value + std::string("\"");
-}
-
-const std::string ArgValueTransform(const char* value) {
-  return std::string("\"") + std::string(value) + std::string("\"");
-}
-
-const std::string ArgValueTransform(const std::vector<size_t>& value) {
-  std::string s("[");
-  for (auto i : value) {
-    s += std::to_string(i) + ",";
-  }
-  s.resize(s.size() - 1);
-  s += "]";
-  return s;
-}
-
-const std::string ArgValueTransform(
-    const std::map<std::string, std::string>& value) {
-  std::string s("{");
-  for (const auto& i : value) {
-    s += "\"";
-    s += i.first;
-    s += "\":\"";
-    s += i.second;
-    s += "\",";
-  }
-  s.resize(s.size() - 1);
-  s += "}";
-  return s;
-}
-
-const std::string ArgValueTransform(const float value) {
-  if (value == std::numeric_limits<float>::infinity()) {
-    return "Infinity";
-  } else if (value == -std::numeric_limits<float>::infinity()) {
-    return "-Infinity";
-  } else if (std::isnan(value)) {
-    return "NaN";
-  }
-  return std::to_string(value);
-}
-
-const std::string ArgValueTransform(const double value) {
-  if (value == std::numeric_limits<double>::infinity()) {
-    return "Infinity";
-  } else if (value == -std::numeric_limits<double>::infinity()) {
-    return "-Infinity";
-  } else if (std::isnan(value)) {
-    return "NaN";
-  }
-  return std::to_string(value);
-}
-#endif
-
-ChromeTracer::ChromeTracer(std::ostream* out, PerfClock::time_point origin)
-    : out_(out), origin_(origin) {
-  WriteTraceEventHeader();
-}
-
-ChromeTracer::~ChromeTracer() {
-  WriteTraceEventFooter();
-  out_->flush();
-}
-
-void ChromeTracer::WriteTraceEventHeader() {
-  // Times and durations are converted from nanoseconds to microseconds, use
-  // 3 decimal digits to preserve precision.
-  *out_ << std::fixed << std::setprecision(3) << "{\"traceEvents\":[\n";
-}
-
-void ChromeTracer::WriteTraceEventFooter() {
-  *out_ << "{\"name\":\"LastTrace\"}\n"
-        << "],\n"
-        << "\"displayTimeUnit\":\"ns\",\n"
-        << "\"otherData\":{\n"
-        << "\"ts\":" << Micros(origin_.time_since_epoch()).count() << ",\n"
-        << "\"version\":\"MLPerf LoadGen v1.0\"\n"
-        << "}\n"
-        << "}\n";
-}
-
-void AsyncLog::SetCurrentPidTid(uint64_t pid, uint64_t tid) {
-  current_pid_ = pid;
-  current_tid_ = tid;
-}
-
-void AsyncLog::SetLogFiles(std::ostream* summary, std::ostream* detail,
-                           std::ostream* accuracy, bool copy_detail_to_stdout,
-                           bool copy_summary_to_stdout,
-                           PerfClock::time_point log_origin) {
-  std::unique_lock<std::mutex> lock(log_mutex_);
-  if (summary_out_ != &std::cerr) {
-    std::string warning_summary;
-    if (log_warning_count_ == 0) {
-      warning_summary = "\nNo warnings encountered during test.\n";
-    } else if (log_warning_count_ == 1) {
-      warning_summary = "\n1 warning encountered. See detailed log.\n";
-    } else if (log_warning_count_ != 0) {
-      warning_summary = "\n" + std::to_string(log_warning_count_) +
-                        " warnings encountered. See detailed log.\n";
-    }
-
-    std::string error_summary;
-    if (log_error_count_ == 0) {
-      error_summary = "\nNo errors encountered during test.\n";
-    } else if (log_error_count_ == 1) {
-      error_summary = "\n1 ERROR encountered. See detailed log.\n";
-    } else if (log_error_count_ != 0) {
-      error_summary = "\n" + std::to_string(log_error_count_) +
-                      " ERRORS encountered. See detailed log.\n";
-    }
-
-    *summary_out_ << warning_summary << error_summary;
-    if (copy_summary_to_stdout_) {
-      std::cout << warning_summary << error_summary;
-    }
-  }
-  if (summary_out_) {
-    summary_out_->flush();
-  }
-  if (detail_out_) {
-    detail_out_->flush();
-  }
-  if (accuracy_out_ != &std::cerr) {
-    WriteAccuracyFooterLocked();
-    accuracy_out_->flush();
-  }
-  summary_out_ = summary;
-  detail_out_ = detail;
-  accuracy_out_ = accuracy;
-  if (accuracy_out_ != &std::cerr) {
-    WriteAccuracyHeaderLocked();
-  }
-  copy_detail_to_stdout_ = copy_detail_to_stdout;
-  copy_summary_to_stdout_ = copy_summary_to_stdout;
-  log_origin_ = log_origin;
-  log_error_count_ = 0;
-  log_warning_count_ = 0;
-}
-
-void AsyncLog::StartNewTrace(std::ostream* trace_out,
-                             PerfClock::time_point origin) {
-  std::unique_lock<std::mutex> lock(trace_mutex_);
-  if (trace_out) {
-    tracer_ = std::make_unique<ChromeTracer>(trace_out, origin);
-  } else {
-    tracer_.reset();
-  }
-}
-
-void AsyncLog::StopTrace() {
-  std::unique_lock<std::mutex> lock(trace_mutex_);
-  tracer_.reset();
-}
-
-void AsyncLog::LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx,
-                           const LogBinaryAsHexString& response,
-                           int64_t n_tokens = 0) {
-  std::unique_lock<std::mutex> lock(log_mutex_);
-  if (!accuracy_out_) {
-    return;
-  }
-  *accuracy_out_ << (accuracy_needs_comma_ ? ",\n{ " : "\n{ ");
-  if (!use_tokens_) {
-    LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
-            response);
-  } else if (!needs_first_token_) {
-    LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
-            response, "token_count", n_tokens);
-  } else {
-    const size_t i = seq_id - latencies_first_sample_sequence_id_;
-    LogArgs(accuracy_out_, "seq_id", seq_id, "qsl_idx", qsl_idx, "data",
-            response, "token_data", token_records_[i], "token_count", n_tokens);
-  }
-
-  *accuracy_out_ << " }";
-  accuracy_needs_comma_ = true;
-}
-
-void AsyncLog::CacheToken(uint64_t seq_id,
-                          const LogBinaryAsHexString& response) {
-  std::unique_lock<std::mutex> lock(token_record_mutex_);
-  const size_t i = seq_id - latencies_first_sample_sequence_id_;
-  if (token_records_.size() <= i) {
-    token_records_.resize(i + 1);
-  }
-  token_records_[i] = response;
-}
-
-void AsyncLog::Flush() {
-  {
-    std::unique_lock<std::mutex> lock(log_mutex_);
-    if (summary_out_) {
-      summary_out_->flush();
-    }
-    if (detail_out_) {
-      detail_out_->flush();
-    }
-    if (accuracy_out_) {
-      accuracy_out_->flush();
-    }
-  }
-
-  {
-    std::unique_lock<std::mutex> lock(trace_mutex_);
-    if (tracer_) {
-      tracer_->Flush();
-    }
-  }
-}
-
-void AsyncLog::WriteAccuracyHeaderLocked() {
-  *accuracy_out_ << "[";
-  accuracy_needs_comma_ = false;
-}
-
-void AsyncLog::WriteAccuracyFooterLocked() { *accuracy_out_ << "\n]\n"; }
-
-void AsyncLog::RestartLatencyRecording(uint64_t first_sample_sequence_id,
-                                       size_t latencies_to_reserve) {
-  std::unique_lock<std::mutex> lock(latencies_mutex_);
-  assert(latencies_.empty());
-  assert(latencies_recorded_ == latencies_expected_);
-  latencies_recorded_ = 0;
-  latencies_expected_ = 0;
-  max_latency_ = 0;
-  max_completion_timstamp_ = PerfClock::now();
-  latencies_first_sample_sequence_id_ = first_sample_sequence_id;
-  latencies_.reserve(latencies_to_reserve);
-  token_latencies_.reserve(latencies_to_reserve);
-  tokens_per_sample_.reserve(latencies_to_reserve);
-  time_per_output_token_.reserve(latencies_to_reserve);
-}
-
-void AsyncLog::RecordSampleCompletion(uint64_t sample_sequence_id,
-                                      PerfClock::time_point completion_time,
-                                      QuerySampleLatency latency,
-                                      int64_t n_tokens = 0) {
-  std::unique_lock<std::mutex> lock(latencies_mutex_);
-
-  max_latency_ = std::max(max_latency_, latency);
-
-  max_completion_timstamp_ =
-      std::max(max_completion_timstamp_, completion_time);
-
-  if (sample_sequence_id < latencies_first_sample_sequence_id_) {
-    // Call LogErrorSync here since this kind of error could result in a
-    // segfault in the near future.
-#if USE_NEW_LOGGING_FORMAT
-    std::stringstream ss;
-    ss << "Received completion for an old sample."
-       << " Min expected id: " << latencies_first_sample_sequence_id_
-       << " Actual id: " << sample_sequence_id;
-    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
-#else
-    GlobalLogger().LogErrorSync(
-        "Received completion for an old sample.", "Min expected id",
-        latencies_first_sample_sequence_id_, "Actual id", sample_sequence_id);
-#endif
-    return;
-  }
-
-  const size_t i = sample_sequence_id - latencies_first_sample_sequence_id_;
-
-  if (latencies_.size() <= i) {
-    // TODO: Reserve in advance.
-    latencies_.resize(i + 1, kInvalidLatency);
-  } else if (latencies_[i] != kInvalidLatency) {
-    // Call LogErrorSync here since this kind of error could result in a
-    // segfault in the near future.
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
-                          "Attempted to complete a sample twice.");
-#else
-    GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
-#endif
-
-    // Return without recording the latency again to avoid potentially
-    // ending the test before the SUT is actually done, which could result
-    // in a segfault.
-    // If the SUT recorded the wrong sample, the test will hang and see
-    // the error above.
-    return;
-  }
-
-  if (use_tokens_) {
-    if (needs_first_token_ && (token_latencies_.size() <= i)) {
-      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
-                            "Attempted to record a sample latency before it's "
-                            "first token latency");
-    } else if (needs_first_token_ && (token_latencies_[i] == kInvalidLatency)) {
-      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
-                            "Attempted to record a sample latency before it's "
-                            "first token latency");
-    }
-
-    if (tokens_per_sample_.size() <= i) {
-      // TODO: Reserve in advance.
-      tokens_per_sample_.resize(i + 1, nTokenInvalid);
-    } else if (tokens_per_sample_[i] != nTokenInvalid) {
-      // Call LogErrorSync here since this kind of error could result in a
-      // segfault in the near future.
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
-                            "Attempted to complete a sample twice.");
-#else
-      GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
-#endif
-
-      // Return without recording the latency again to avoid potentially
-      // ending the test before the SUT is actually done, which could result
-      // in a segfault.
-      // If the SUT recorded the wrong sample, the test will hang and see
-      // the error above.
-      return;
-    }
-    if (n_tokens == 0) {
-      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
-                            "n_tokens argument missing or attempted to record "
-                            "0 as number of tokens");
-    } else if (n_tokens < 0) {
-      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
-                            "Attempted to record a negative number of tokens");
-      n_tokens = 0;
-    } else if (n_tokens == 1) {
-      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
-                            "Number of tokens need to be greater than 1");
-      n_tokens = 0;
-    }
-    if (time_per_output_token_.size() <= i) {
-      time_per_output_token_.resize(i + 1, kInvalidLatency);
-    } else if (time_per_output_token_[i] != kInvalidLatency) {
-      // Call LogErrorSync here since this kind of error could result in a
-      // segfault in the near future.
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
-                            "Attempted to complete a sample twice.");
-#else
-      GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
-#endif
-
-      // Return without recording the latency again to avoid potentially
-      // ending the test before the SUT is actually done, which could result
-      // in a segfault.
-      // If the SUT recorded the wrong sample, the test will hang and see
-      // the error above.
-      return;
-    }
-    tokens_per_sample_[i] = n_tokens;
-    time_per_output_token_[i] =
-        (latency - token_latencies_[i]) / (n_tokens - 1);
-  }
-  latencies_[i] = latency;
-  latencies_recorded_++;
-  if (AllLatenciesRecorded()) {
-    all_latencies_recorded_.notify_all();
-  }
-}
-
-void AsyncLog::RecordTokenCompletion(uint64_t sample_sequence_id,
-                                     PerfClock::time_point completion_time,
-                                     QuerySampleLatency latency) {
-  std::unique_lock<std::mutex> lock(token_latencies_mutex_);
-  // std::unique_lock<std::mutex> lock(latencies_mutex_);
-  // max_latency_ = std::max(max_latency_, latency);
-
-  // max_completion_timstamp_ =
-  //     std::max(max_completion_timstamp_, completion_time);
-
-  if (sample_sequence_id < latencies_first_sample_sequence_id_) {
-    // Call LogErrorSync here since this kind of error could result in a
-    // segfault in the near future.
-#if USE_NEW_LOGGING_FORMAT
-    std::stringstream ss;
-    ss << "Received completion for an old sample."
-       << " Min expected id: " << latencies_first_sample_sequence_id_
-       << " Actual id: " << sample_sequence_id;
-    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
-#else
-    GlobalLogger().LogErrorSync(
-        "Received completion for an old sample.", "Min expected id",
-        latencies_first_sample_sequence_id_, "Actual id", sample_sequence_id);
-#endif
-    return;
-  }
-
-  const size_t i = sample_sequence_id - latencies_first_sample_sequence_id_;
-
-  if (latencies_.size() > i) {
-    if (latencies_[i] != kInvalidLatency) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_ERROR_SYNC(
-          GlobalLogger(), "error_runtime",
-          "Attempted to record token latency after sample was completed");
-#else
-      GlobalLogger().LogErrorSync(
-          "Attempted to record token latency after sample was completed");
-#endif
-
-      // Return without recording the latency again to avoid potentially
-      // ending the test before the SUT is actually done, which could result
-      // in a segfault.
-      // If the SUT recorded the wrong sample, the test will hang and see
-      // the error above.
-      return;
-    }
-  }
-  if (token_latencies_.size() <= i) {
-    // TODO: Reserve in advance.
-    token_latencies_.resize(i + 1, kInvalidLatency);
-  } else if (token_latencies_[i] != kInvalidLatency) {
-    // Call LogErrorSync here since this kind of error could result in a
-    // segfault in the near future.
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime",
-                          "Attempted to complete a sample twice.");
-#else
-    GlobalLogger().LogErrorSync("Attempted to complete a sample twice.");
-#endif
-
-    // Return without recording the latency again to avoid potentially
-    // ending the test before the SUT is actually done, which could result
-    // in a segfault.
-    // If the SUT recorded the wrong sample, the test will hang and see
-    // the error above.
-    return;
-  }
-  token_latencies_[i] = latency;
-}
-
-std::vector<QuerySampleLatency> AsyncLog::GetLatenciesBlocking(
-    size_t expected_count) {
-  std::vector<QuerySampleLatency> latencies;
-  {
-    std::unique_lock<std::mutex> lock(latencies_mutex_);
-    latencies_expected_ = expected_count;
-    all_latencies_recorded_.wait(lock, [&] { return AllLatenciesRecorded(); });
-    latencies.swap(latencies_);
-  }
-
-  if (latencies.size() != expected_count) {
-    // Call LogErrorSync here since this kind of error could result in a
-    // segfault in the near future.
-#if USE_NEW_LOGGING_FORMAT
-    std::stringstream ss;
-    ss << "Received SequenceId that was too large."
-       << " expected_size: " << expected_count
-       << " actual_size: " << latencies.size();
-    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
-#else
-    GlobalLogger().LogErrorSync("Received SequenceId that was too large.",
-                                "expected_size", expected_count, "actual_size",
-                                latencies.size());
-#endif
-  }
-
-  size_t invalid_latency_count = 0;
-  for (auto l : latencies) {
-    if (l == kInvalidLatency) {
-      invalid_latency_count++;
-    }
-  }
-  if (invalid_latency_count != 0) {
-    // Call LogErrorSync here since this kind of error could result in a
-    // segfault in the near future.
-#if USE_NEW_LOGGING_FORMAT
-    std::stringstream ss;
-    ss << "Encountered incomplete samples at the end of a series of queries."
-       << " count: " << invalid_latency_count;
-    MLPERF_LOG_ERROR_SYNC(GlobalLogger(), "error_runtime", ss.str());
-#else
-    GlobalLogger().LogErrorSync(
-        "Encountered incomplete samples at the end of a series of queries.",
-        "count", invalid_latency_count);
-#endif
-  }
-
-  return latencies;
-}
-
-std::vector<QuerySampleLatency> AsyncLog::GetTokenLatencies(
-    size_t expected_count) {
-  std::vector<QuerySampleLatency> token_latencies;
-  token_latencies.swap(token_latencies_);
-  return token_latencies;
-}
-
-std::vector<QuerySampleLatency> AsyncLog::GetTimePerOutputToken(
-    size_t expected_count) {
-  std::vector<QuerySampleLatency> tpot_latencies;
-  tpot_latencies.swap(time_per_output_token_);
-  return tpot_latencies;
-}
-
-std::vector<int64_t> AsyncLog::GetTokensPerSample(size_t expected_count) {
-  std::vector<int64_t> tokens_per_sample;
-  tokens_per_sample.swap(tokens_per_sample_);
-  return tokens_per_sample;
-}
-
-PerfClock::time_point AsyncLog::GetMaxCompletionTime() {
-  return max_completion_timstamp_;
-}
-
-QuerySampleLatency AsyncLog::GetMaxLatencySoFar() {
-  std::unique_lock<std::mutex> lock(latencies_mutex_);
-  return max_latency_;
-}
-
-void AsyncLog::SetUseTokens(bool use_tokens) { use_tokens_ = use_tokens; }
-
-void AsyncLog::SetNeedsFirstToken(bool needs_first_token) {
-  needs_first_token_ = needs_first_token;
-}
-
-/// \brief Records a single thread using thread-local storage and submits
-/// entries to the central Logger.
-///
-/// \details This setup allows for each log entry to be added:
-///   * With forward-progress guarantees. (i.e.: no locking or blocking
-///       operations even if other threads have stalled.)
-///   * Without expensive syscalls or I/O operations, which are deferred to
-///       the central Logger.
-class TlsLogger {
- public:
-  TlsLogger(std::function<void()> forced_detatch);
-  ~TlsLogger();
-  void ForcedDetatchFromThread() { forced_detatch_(); }
-
-  void Log(AsyncLogEntry&& entry);
-  void SwapBuffers();
-
-  std::vector<AsyncLogEntry>* StartReadingEntries();
-  void FinishReadingEntries();
-  bool ReadBufferHasBeenConsumed();
-  size_t MaxEntryVectorSize() { return max_entry_size_; }
-
-  uint64_t Pid() const { return pid_; }
-  uint64_t Tid() const { return tid_; }
-
-  void RequestSwapBuffersSlotRetried() {
-    swap_buffers_slot_retry_count_.fetch_add(1, std::memory_order_relaxed);
-  }
-
-  size_t ReportLogCasFailCount() {
-    size_t c = log_cas_fail_count_.load(std::memory_order_relaxed);
-    log_cas_fail_count_.fetch_sub(c, std::memory_order_relaxed);
-    return c;
-  }
-
-  size_t ReportSwapBuffersSlotRetryCount() {
-    size_t c = swap_buffers_slot_retry_count_.load(std::memory_order_relaxed);
-    swap_buffers_slot_retry_count_.fetch_sub(c, std::memory_order_relaxed);
-    return c;
-  }
-
-  void TraceCounters();
-
- private:
-  using EntryVector = std::vector<AsyncLogEntry>;
-  enum class EntryState { Unlocked, ReadLock, WriteLock };
-
-  // Accessed by producer only.
-  size_t i_read_ = 0;
-
-  // Accessed by producer and consumer atomically.
-  EntryVector entries_[2];
-  std::atomic<EntryState> entry_states_[2]{{EntryState::ReadLock},
-                                           {EntryState::Unlocked}};
-  std::atomic<size_t> i_write_{1};
-
-  std::atomic<size_t> log_cas_fail_count_{0};
-  std::atomic<size_t> swap_buffers_slot_retry_count_{0};
-
-  // Accessed by consumer only.
-  size_t unread_swaps_ = 0;
-  size_t i_write_prev_ = 0;
-  uint64_t pid_;
-  uint64_t tid_;
-  size_t max_entry_size_ = kTlsLogReservedEntryCount;
-
-  std::function<void()> forced_detatch_;
-};
-
-Logger::Logger(std::chrono::duration<double> poll_period,
-               size_t max_threads_to_log)
-    : poll_period_(poll_period),
-      max_threads_to_log_(max_threads_to_log),
-      thread_swap_request_slots_(max_threads_to_log * 2) {
-  const size_t kSlotCount = max_threads_to_log * 2;
-  for (size_t i = 0; i < kSlotCount; i++) {
-    std::atomic_init(&thread_swap_request_slots_[i],
-                     SwapRequestSlotIsWritableValue(i));
-  }
-}
-
-Logger::~Logger() {
-  // TlsLoggers might outlive this Logger when loaded as a python module.
-  // Forcefully make all currently registered TlsLoggers orphans.
-  std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
-  TlsLogger* tls_logger_prev = nullptr;
-  (void)tls_logger_prev;  // Avoid unused error in release builds.
-  while (!tls_loggers_registerd_.empty()) {
-    TlsLogger* tls_logger = *tls_loggers_registerd_.begin();
-    // Otherwise, this is an infinite loop.
-    assert(tls_logger != tls_logger_prev);
-    tls_loggers_registerd_mutex_.unlock();
-    tls_logger->ForcedDetatchFromThread();
-    tls_loggers_registerd_mutex_.lock();
-    tls_logger_prev = tls_logger;
-  }
-}
-
-void Logger::RequestSwapBuffers(TlsLogger* tls_logger) {
-  auto tls_logger_as_uint = reinterpret_cast<uintptr_t>(tls_logger);
-  assert(SwapRequestSlotIsReadable(tls_logger_as_uint));
-  size_t id, slot;
-  uintptr_t slot_is_writeable_value;
-  // The compare_exchange below should almost always succeed.
-  // The compare_exchange may fail if a recycled slot is still actively used
-  // by another thread, so we retry with subsequent slots here if needed.
-  // Since the slot count is 2x the expected number of threads to log,
-  // the CAS should only fail at most 50% of the time when all logging threads
-  // happen to be descheduled between the fetch_add and CAS below, which is
-  // very unlikely.
-  id = swap_request_id_.fetch_add(1, std::memory_order_relaxed);
-  slot = id % thread_swap_request_slots_.size();
-  slot_is_writeable_value = SwapRequestSlotIsWritableValue(id);
-  while (!thread_swap_request_slots_[slot].compare_exchange_strong(
-      slot_is_writeable_value, tls_logger_as_uint, std::memory_order_release)) {
-    id = swap_request_id_.fetch_add(1, std::memory_order_relaxed);
-    slot = id % thread_swap_request_slots_.size();
-    slot_is_writeable_value = SwapRequestSlotIsWritableValue(id);
-    tls_logger->RequestSwapBuffersSlotRetried();
-  }
-}
-
-void Logger::RegisterTlsLogger(TlsLogger* tls_logger) {
-  std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
-  if (tls_loggers_registerd_.size() >= max_threads_to_log_) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG_ERROR_SYNC((*this), "error_runtime",
-                          "Warning: More TLS loggers registerd than can be "
-                          "active simultaneously.");
-#else
-    LogErrorSync(
-        "Warning: More TLS loggers registerd than can "
-        "be active simultaneously.\n");
-#endif
-  }
-  tls_loggers_registerd_.insert(tls_logger);
-}
-
-// This moves ownership of the tls_logger data to Logger so the
-// exiting thread can exit immediately, even if all the logs of the
-// exiting thread haven't been processed.
-void Logger::UnRegisterTlsLogger(std::unique_ptr<TlsLogger> tls_logger) {
-  OrphanContainer::iterator orphan;
-  {
-    std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
-    tls_logger_orphans_.emplace_front(std::move(tls_logger));
-    orphan = tls_logger_orphans_.begin();
-  }
-
-  // Only remove the TlsLogger from the registry after adding to orphans so
-  // CollectTlsLoggerStats doesn't have any gaps in coverage.
-  {
-    std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
-    tls_loggers_registerd_.erase(orphan->get());
-  }
-
-  // This will flush the logs of |tls_logger| and mark it for destruction.
-  // Deferring destruction via orphans_to_destroy helps avoid use-after-frees
-  // when the IOThread calls FinishReadingEntries.
-  (*orphan)->Log([this, orphan](AsyncLog&) {
-    CollectTlsLoggerStats(orphan->get());
-    orphans_to_destroy_.push_back(orphan);
-  });
-}
-
-void Logger::CollectTlsLoggerStats(TlsLogger* tls_logger) {
-  tls_total_log_cas_fail_count_ += tls_logger->ReportLogCasFailCount();
-  tls_total_swap_buffers_slot_retry_count_ +=
-      tls_logger->ReportSwapBuffersSlotRetryCount();
-
-  size_t max_entry_vector_size = tls_logger->MaxEntryVectorSize();
-  if (max_entry_vector_size > kTlsLogReservedEntryCount) {
-#if USE_NEW_LOGGING_FORMAT
-    std::stringstream msg;
-    msg << "Logging allocation detected:" << " tid: " << tls_logger->Tid()
-        << " reserved_entries: " << kTlsLogReservedEntryCount
-        << " max_entries: " << max_entry_vector_size;
-    MLPERF_LOG_WARNING((*this), "warning_generic_message", msg.str());
-#else
-    async_logger_.FlagWarning();
-    async_logger_.LogDetail("Logging allocation detected: ", "tid",
-                            tls_logger->Tid(), "reserved_entries",
-                            kTlsLogReservedEntryCount, "max_entries",
-                            max_entry_vector_size);
-#endif
-  }
-}
-
-void Logger::StartIOThread() {
-  {
-    std::unique_lock<std::mutex> lock(io_thread_mutex_);
-    keep_io_thread_alive_ = true;
-  }
-  io_thread_ = std::thread(&Logger::IOThread, this);
-}
-
-void Logger::StopIOThread() {
-  {
-    std::unique_lock<std::mutex> lock(io_thread_mutex_);
-    keep_io_thread_alive_ = false;
-    io_thread_cv_.notify_all();
-  }
-  io_thread_.join();
-}
-
-void Logger::StartLogging(std::ostream* summary, std::ostream* detail,
-                          std::ostream* accuracy, bool copy_detail_to_stdout,
-                          bool copy_summary_to_stdout) {
-  async_logger_.SetLogFiles(summary, detail, accuracy, copy_detail_to_stdout,
-                            copy_summary_to_stdout, PerfClock::now());
-}
-
-void Logger::StopLogging() {
-  if (std::this_thread::get_id() == io_thread_.get_id()) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG_ERROR_SYNC((*this), "error_runtime",
-                          "StopLogging() not supported from IO thread.");
-#else
-    LogErrorSync("StopLogging() not supported from IO thread.");
-#endif
-    return;
-  }
-
-  // Flush logs from this thread.
-  std::promise<void> io_thread_flushed_this_thread;
-  Log([&](AsyncLog&) { io_thread_flushed_this_thread.set_value(); });
-  io_thread_flushed_this_thread.get_future().wait();
-  async_logger_.SetLogFiles(&std::cerr, &std::cerr, &std::cerr, false, false,
-                            PerfClock::now());
-}
-
-void Logger::StartNewTrace(std::ostream* trace_out,
-                           PerfClock::time_point origin) {
-  async_logger_.StartNewTrace(trace_out, origin);
-}
-
-void Logger::StopTracing() {
-  // Flush traces from this thread.
-  std::promise<void> io_thread_flushed_this_thread;
-  Log([&](AsyncLog&) { io_thread_flushed_this_thread.set_value(); });
-  io_thread_flushed_this_thread.get_future().wait();
-  async_logger_.StopTrace();
-}
-
-void Logger::LogContentionAndAllocations() {
-  LogDetail([&](AsyncDetail& detail) {
-    {
-      std::unique_lock<std::mutex> lock(tls_loggers_registerd_mutex_);
-      for (auto tls_logger : tls_loggers_registerd_) {
-        CollectTlsLoggerStats(tls_logger);
-      }
-    }
-
-    {
-      std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
-      for (auto& orphan : tls_logger_orphans_) {
-        CollectTlsLoggerStats(orphan.get());
-      }
-    }
-
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "logger_swap_request_slots_retry_count",
-               swap_request_slots_retry_count_);
-    MLPERF_LOG(detail, "logger_swap_request_slots_retry_retry_count",
-               swap_request_slots_retry_retry_count_);
-    MLPERF_LOG(detail, "logger_swap_request_slots_retry_reencounter_count",
-               swap_request_slots_retry_reencounter_count_);
-    MLPERF_LOG(detail, "logger_start_reading_entries_retry_count",
-               start_reading_entries_retry_count_);
-    MLPERF_LOG(detail, "logger_tls_total_log_cas_fail_count",
-               tls_total_log_cas_fail_count_);
-    MLPERF_LOG(detail, "logger_tls_total_swap_buffers_slot_retry_count",
-               tls_total_swap_buffers_slot_retry_count_);
-#else
-    detail("Log Contention Counters:");
-    detail(std::to_string(swap_request_slots_retry_count_) +
-           " : swap_request_slots_retry_count");
-    detail(std::to_string(swap_request_slots_retry_retry_count_) +
-           " : swap_request_slots_retry_retry_count");
-    detail(std::to_string(swap_request_slots_retry_reencounter_count_) +
-           " : swap_request_slots_retry_reencounter_count");
-    detail(std::to_string(start_reading_entries_retry_count_) +
-           " : start_reading_entries_retry_count");
-    detail(std::to_string(tls_total_log_cas_fail_count_) +
-           " : tls_total_log_cas_fail_count");
-    detail(std::to_string(tls_total_swap_buffers_slot_retry_count_) +
-           " : tls_total_swap_buffers_slot_retry_count");
-#endif
-
-    swap_request_slots_retry_count_ = 0;
-    swap_request_slots_retry_retry_count_ = 0;
-    swap_request_slots_retry_reencounter_count_ = 0;
-    start_reading_entries_retry_count_ = 0;
-    tls_total_log_cas_fail_count_ = 0;
-    tls_total_swap_buffers_slot_retry_count_ = 0;
-  });
-}
-
-void Logger::RestartLatencyRecording(uint64_t first_sample_sequence_id,
-                                     size_t latencies_to_reserve) {
-  async_logger_.RestartLatencyRecording(first_sample_sequence_id,
-                                        latencies_to_reserve);
-}
-
-std::vector<QuerySampleLatency> Logger::GetLatenciesBlocking(
-    size_t expected_count) {
-  return async_logger_.GetLatenciesBlocking(expected_count);
-}
-std::vector<QuerySampleLatency> Logger::GetTokenLatencies(
-    size_t expected_count) {
-  return async_logger_.GetTokenLatencies(expected_count);
-}
-std::vector<QuerySampleLatency> Logger::GetTimePerOutputToken(
-    size_t expected_count) {
-  return async_logger_.GetTimePerOutputToken(expected_count);
-}
-std::vector<QuerySampleLatency> Logger::GetTokensPerSample(
-    size_t expected_count) {
-  return async_logger_.GetTokensPerSample(expected_count);
-}
-
-PerfClock::time_point Logger::GetMaxCompletionTime() {
-  return async_logger_.GetMaxCompletionTime();
-}
-
-QuerySampleLatency Logger::GetMaxLatencySoFar() {
-  return async_logger_.GetMaxLatencySoFar();
-}
-
-void Logger::SetUseTokens(bool use_tokens) {
-  async_logger_.SetUseTokens(use_tokens);
-}
-
-void Logger::SetNeedsFirstToken(bool needs_first_token) {
-  async_logger_.SetNeedsFirstToken(needs_first_token);
-}
-
-TlsLogger* Logger::GetTlsLoggerThatRequestedSwap(size_t slot, size_t next_id) {
-  uintptr_t slot_value = thread_swap_request_slots_[slot].load();
-  if (SwapRequestSlotIsReadable(slot_value)) {
-    // TODO: Convert this block to a simple write once we are confidient
-    // that we don't need to check for success.
-    bool success = thread_swap_request_slots_[slot].compare_exchange_strong(
-        slot_value, SwapRequestSlotIsWritableValue(next_id));
-    if (!success) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_WARNING((*this), "warning_generic_message", "CAS failed.");
-#else
-      LogErrorSync("CAS failed.", "line", __LINE__);
-#endif
-      assert(success);
-    }
-    return reinterpret_cast<TlsLogger*>(slot_value);
-  }
-  return nullptr;
-}
-
-void Logger::GatherRetrySwapRequests(std::vector<TlsLogger*>* threads_to_swap) {
-  if (swap_request_slots_to_retry_.empty()) {
-    return;
-  }
-
-  std::vector<SlotRetry> retry_slots;
-  retry_slots.swap(swap_request_slots_to_retry_);
-  for (auto& slot_retry : retry_slots) {
-    TlsLogger* tls_logger =
-        GetTlsLoggerThatRequestedSwap(slot_retry.slot, slot_retry.next_id);
-    if (tls_logger) {
-      threads_to_swap->push_back(tls_logger);
-    } else {
-      swap_request_slots_to_retry_.push_back(slot_retry);
-      swap_request_slots_retry_retry_count_++;
-    }
-  }
-}
-
-void Logger::GatherNewSwapRequests(std::vector<TlsLogger*>* threads_to_swap) {
-  auto swap_request_end = swap_request_id_.load(std::memory_order_acquire);
-  for (; swap_request_id_read_ < swap_request_end; swap_request_id_read_++) {
-    size_t slot = swap_request_id_read_ % thread_swap_request_slots_.size();
-    size_t next_id = swap_request_id_read_ + thread_swap_request_slots_.size();
-    TlsLogger* tls_logger = GetTlsLoggerThatRequestedSwap(slot, next_id);
-    if (tls_logger) {
-      threads_to_swap->push_back(tls_logger);
-    } else {
-      swap_request_slots_retry_count_++;
-      // A thread is in the middle of its call to RequestSwapBuffers.
-      // Retry later once it's done.
-      auto it = std::find_if(swap_request_slots_to_retry_.begin(),
-                             swap_request_slots_to_retry_.end(),
-                             [=](SlotRetry& s) { return s.slot == slot; });
-      if (it == swap_request_slots_to_retry_.end()) {
-        // This is the first time we are retrying the slot.
-        swap_request_slots_to_retry_.push_back({slot, next_id});
-      } else {
-        // Whoa. We've been retrying this slot since the last time it was
-        // encountered. Just update the next_id.
-        it->next_id = next_id;
-        swap_request_slots_retry_reencounter_count_++;
-      }
-    }
-  }
-}
-
-void Logger::IOThread() {
-  while (keep_io_thread_alive_) {
-    auto tracer1 =
-        MakeScopedTracer([](AsyncTrace& trace) { trace("IOThreadLoop"); });
-    {
-      auto tracer2 = MakeScopedTracer([](AsyncTrace& trace) { trace("Wait"); });
-      std::unique_lock<std::mutex> lock(io_thread_mutex_);
-      io_thread_cv_.wait_for(lock, poll_period_,
-                             [&] { return !keep_io_thread_alive_; });
-    }
-
-    {
-      auto tracer3 =
-          MakeScopedTracer([](AsyncTrace& trace) { trace("Gather"); });
-      std::vector<TlsLogger*> threads_to_swap;
-      threads_to_swap.swap(threads_to_swap_deferred_);
-      GatherRetrySwapRequests(&threads_to_swap);
-      GatherNewSwapRequests(&threads_to_swap);
-      for (TlsLogger* thread : threads_to_swap) {
-        if (thread->ReadBufferHasBeenConsumed()) {
-          thread->SwapBuffers();
-          // After swapping a thread, it's ready to be read.
-          threads_to_read_.push_back(thread);
-        } else {
-          // Don't swap buffers again until we've finish reading the
-          // previous swap.
-          threads_to_swap_deferred_.push_back(thread);
-        }
-      }
-    }
-
-    {
-      auto tracer4 =
-          MakeScopedTracer([](AsyncTrace& trace) { trace("Process"); });
-      // Read from the threads we are confident have activity.
-      for (std::vector<TlsLogger*>::iterator thread = threads_to_read_.begin();
-           thread != threads_to_read_.end(); thread++) {
-        auto tracer5 =
-            MakeScopedTracer([tid = (*thread)->Tid()](AsyncTrace& trace) {
-              trace("Thread", "tid", tid);
-            });
-        std::vector<AsyncLogEntry>* entries = (*thread)->StartReadingEntries();
-        if (!entries) {
-          start_reading_entries_retry_count_++;
-          continue;
-        }
-
-        async_logger_.SetCurrentPidTid((*thread)->Pid(), (*thread)->Tid());
-        for (auto& entry : *entries) {
-          // Execute the entry to perform the serialization and I/O.
-          entry(async_logger_);
-        }
-        (*thread)->FinishReadingEntries();
-        // Mark for removal by the call to RemoveValue below.
-        *thread = nullptr;
-      }
-
-      // Only remove threads where reading succeeded so we retry the failed
-      // threads the next time around.
-      RemoveValue(&threads_to_read_, nullptr);
-    }
-
-    // Explicitly flush every time we wake up. The goal being minimization
-    // of large implicit flushes which could affect tail latency measurements,
-    // especially at percentiles closer to 100%.
-    /// \todo Determine if explicitly flushing logs every wake up is better
-    /// than relying on implicit flushing.
-    {
-      auto tracer6 =
-          MakeScopedTracer([](AsyncTrace& trace) { trace("FlushAll"); });
-      async_logger_.Flush();
-    }
-
-    if (!orphans_to_destroy_.empty()) {
-      auto tracer7 = MakeScopedTracer(
-          [](AsyncTrace& trace) { trace("Abandoning Orphans"); });
-      std::unique_lock<std::mutex> lock(tls_logger_orphans_mutex_);
-      for (auto orphan : orphans_to_destroy_) {
-        tls_logger_orphans_.erase(orphan);
-      }
-      orphans_to_destroy_.clear();
-    }
-  }
-}
-
-TlsLogger::TlsLogger(std::function<void()> forced_detatch)
-    : pid_(MLPERF_GET_PID()),
-      tid_(MLPERF_GET_TID()),
-      forced_detatch_(std::move(forced_detatch)) {
-  for (auto& entry : entries_) {
-    entry.reserve(kTlsLogReservedEntryCount);
-  }
-}
-
-TlsLogger::~TlsLogger() {}
-
-// Log always makes forward progress since it can unconditionally obtain a
-// "lock" on at least one of the buffers for writing.
-// Notificiation is also lock free.
-void TlsLogger::Log(AsyncLogEntry&& entry) {
-  size_t cas_fail_count = 0;
-  auto unlocked = EntryState::Unlocked;
-  size_t i_write = i_write_.load(std::memory_order_relaxed);
-  while (!entry_states_[i_write].compare_exchange_strong(
-      unlocked, EntryState::WriteLock, std::memory_order_acquire,
-      std::memory_order_relaxed)) {
-    unlocked = EntryState::Unlocked;
-    i_write ^= 1;
-    // We may need to try 3 times, since there could be a race with a
-    // previous SwapBuffers request and we use memory_order_relaxed when
-    // loading i_write_ above.
-    cas_fail_count++;
-    if (cas_fail_count >= 3) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
-                         "CAS failed.");
-#else
-      GlobalLogger().LogErrorSync("CAS failed.", "times", cas_fail_count,
-                                  "line", __LINE__);
-#endif
-    }
-    log_cas_fail_count_.fetch_add(1, std::memory_order_relaxed);
-  }
-  entries_[i_write].emplace_back(std::forward<AsyncLogEntry>(entry));
-
-  // TODO: Convert this block to a simple write once we are confidient
-  // that we don't need to check for success.
-  auto write_lock = EntryState::WriteLock;
-  bool success = entry_states_[i_write].compare_exchange_strong(
-      write_lock, EntryState::Unlocked, std::memory_order_release);
-  if (!success) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
-                       "CAS failed.");
-#else
-    GlobalLogger().LogErrorSync("CAS failed.", "line", __LINE__);
-#endif
-    assert(success);
-  }
-
-  bool write_buffer_swapped = i_write_prev_ != i_write;
-  if (write_buffer_swapped) {
-    GlobalLogger().RequestSwapBuffers(this);
-    i_write_prev_ = i_write;
-  }
-}
-
-void TlsLogger::SwapBuffers() {
-  // TODO: Convert this block to a simple write once we are confidient
-  // that we don't need to check for success.
-  auto read_lock = EntryState::ReadLock;
-  bool success = entry_states_[i_read_].compare_exchange_strong(
-      read_lock, EntryState::Unlocked, std::memory_order_release);
-  if (!success) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG_WARNING(GlobalLogger(), "warning_generic_message",
-                       "CAS failed.");
-#else
-    GlobalLogger().LogErrorSync("CAS failed.", "line", __LINE__);
-#endif
-    assert(success);
-  }
-
-  i_write_.store(i_read_, std::memory_order_relaxed);
-  i_read_ ^= 1;
-  unread_swaps_++;
-}
-
-// Returns nullptr if read lock fails.
-std::vector<AsyncLogEntry>* TlsLogger::StartReadingEntries() {
-  auto unlocked = EntryState::Unlocked;
-  if (entry_states_[i_read_].compare_exchange_strong(
-          unlocked, EntryState::ReadLock, std::memory_order_acquire,
-          std::memory_order_relaxed)) {
-    return &entries_[i_read_];
-  }
-  return nullptr;
-}
-
-void TlsLogger::FinishReadingEntries() {
-  // Detect first logging allocation and track max allocated size.
-  size_t new_size = entries_[i_read_].size();
-  if (new_size > max_entry_size_) {
-    if (max_entry_size_ == kTlsLogReservedEntryCount) {
-      Log([ts = PerfClock::now()](AsyncLog& log) {
-        log.TraceAsyncInstant("FirstAllocation", 0, ts);
-      });
-    }
-    max_entry_size_ = new_size;
-  }
-
-  entries_[i_read_].clear();
-  unread_swaps_--;
-}
-
-bool TlsLogger::ReadBufferHasBeenConsumed() { return unread_swaps_ == 0; }
-
-void TlsLogger::TraceCounters() {
-  auto tracer = MakeScopedTracer(
-      [lcfc = log_cas_fail_count_.load(std::memory_order_relaxed),
-       sbsrc = swap_buffers_slot_retry_count_.load(std::memory_order_relaxed)](
-          AsyncTrace& trace) {
-        trace("TlsLogger:ContentionCounters", "log_cas_fail_count", lcfc,
-              "swap_buffers_slot_retry_count", sbsrc);
-      });
-}
-
-Logger& GlobalLogger() {
-  static Logger g_logger(kLogPollPeriod, kMaxThreadsToLog);
-  return g_logger;
-}
-
-/// \brief Moves ownership of the TlsLogger to Logger on thread exit
-/// so no round-trip synchronization with the IO thread is required.
-struct TlsLoggerWrapper {
-  TlsLoggerWrapper(std::function<void()> forced_detatch)
-      : tls_logger(std::make_unique<TlsLogger>(std::move(forced_detatch))) {
-    GlobalLogger().RegisterTlsLogger(tls_logger.get());
-  }
-  ~TlsLoggerWrapper() {
-    tls_logger->TraceCounters();
-    GlobalLogger().UnRegisterTlsLogger(std::move(tls_logger));
-  }
-  std::unique_ptr<TlsLogger> tls_logger;
-};
-
-TlsLoggerWrapper* InitializeMyTlsLoggerWrapper() {
-  thread_local std::unique_ptr<TlsLoggerWrapper> tls_logger_wrapper;
-  // forced_detatch lets the global Logger forcefully detatch TlsLoggers
-  // from the thread in the Logger's destructor, which may run before
-  // thread-local variables are destroyed when the loadgen is used as a python
-  // module and dynamically unloaded.
-  // Note: We capture a pointer to the tls_logger_wrapper since variables of
-  // the thread-local storage class aren't actually captured. C++ spec says
-  // only variables of the automatic storage class are captured.
-  /// \todo There is a race where the same TlsLoggerWrapper might be
-  /// destroyed both naturally and via forced_detatch. Destruction of
-  /// the TlsLoggerWrapper should be locked.
-  auto forced_detatch = [tls_logger_wrapper = &tls_logger_wrapper]() {
-    tls_logger_wrapper->reset();
-  };
-  tls_logger_wrapper = std::make_unique<TlsLoggerWrapper>(forced_detatch);
-  return tls_logger_wrapper.get();
-}
-
-TlsLogger* InitializeMyTlsLogger() {
-  thread_local TlsLoggerWrapper* wrapper = InitializeMyTlsLoggerWrapper();
-  return wrapper->tls_logger.get();
-}
-
-void Log(AsyncLogEntry&& entry) {
-  thread_local TlsLogger* const tls_logger = InitializeMyTlsLogger();
-  tls_logger->Log(std::forward<AsyncLogEntry>(entry));
-}
-
-}  // namespace logging
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h
deleted file mode 100644
index 8f1a398e9..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/logging.h
+++ /dev/null
@@ -1,816 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Internal logging implementation details.
-
-#ifndef MLPERF_LOADGEN_LOGGING_H_
-#define MLPERF_LOADGEN_LOGGING_H_
-
-#define USE_NEW_LOGGING_FORMAT 1
-#define MLPERF_LOG(logger, key, value) \
-  logger.Log((key), (value), __FILE__, __LINE__)
-#define MLPERF_LOG_ERROR(logger, key, value) \
-  logger.LogError((key), (value), __FILE__, __LINE__)
-#define MLPERF_LOG_ERROR_SYNC(logger, key, value) \
-  logger.LogErrorSync((key), (value), __FILE__, __LINE__)
-#define MLPERF_LOG_WARNING(logger, key, value) \
-  logger.LogWarning((key), (value), __FILE__, __LINE__)
-#define MLPERF_LOG_INTERVAL_START(logger, key, value) \
-  logger.LogIntervalStart((key), (value), __FILE__, __LINE__)
-#define MLPERF_LOG_INTERVAL_END(logger, key, value) \
-  logger.LogIntervalEnd((key), (value), __FILE__, __LINE__)
-
-#include <algorithm>
-#include <atomic>
-#include <cassert>
-#include <chrono>
-#include <condition_variable>
-#include <functional>
-#include <future>
-#include <iomanip>
-#include <iostream>
-#include <list>
-#include <map>
-#include <mutex>
-#include <set>
-#include <string>
-#include <thread>
-#include <unordered_set>
-#include <vector>
-
-#include "query_sample.h"
-
-namespace mlperf {
-
-/// \brief Wait-free logging utilities that defer stringification
-/// and syscalls to a worker thread.
-namespace logging {
-
-class AsyncLog;
-class Logger;
-class TlsLogger;
-struct TlsLoggerWrapper;
-
-/// \todo Verify lambas are not allocating when bounded to a std::function.
-using AsyncLogEntry = std::function<void(AsyncLog&)>;
-using PerfClock = std::chrono::high_resolution_clock;
-
-/// \brief Logs the raw bytes as a hexadecimal ascii string.
-struct LogBinaryAsHexString {
-  std::vector<uint8_t>* data;
-};
-
-/// \brief By default, print out the value directly.
-template <typename T>
-const T& ArgValueTransform(const T& value) {
-  return value;
-}
-
-/// \brief Print out True/False.
-const std::string& ArgValueTransform(const bool& value);
-/// \brief Print out binary day as hex string.
-const std::string ArgValueTransform(const LogBinaryAsHexString& value);
-#if USE_NEW_LOGGING_FORMAT
-/// \brief Print out a string in JSON format (with quotes).
-const std::string ArgValueTransform(const std::string& value);
-const std::string ArgValueTransform(const char* value);
-/// \brief Prints a list of int in JSON format.
-const std::string ArgValueTransform(const std::vector<size_t>& value);
-/// \brief Prints a dict in JSON format.
-const std::string ArgValueTransform(
-    const std::map<std::string, std::string>& value);
-#endif
-
-/// \brief Helper to print out values without quotes when value is a string.
-template <typename T>
-const T& ArgValueTransformWithoutQuote(const T& value) {
-  return ArgValueTransform<T>(value);
-}
-inline const std::string ArgValueTransformWithoutQuote(
-    const LogBinaryAsHexString& value) {
-  return ArgValueTransform(value);
-}
-/// \brief Helper to print out a string without the quotes.
-inline const std::string ArgValueTransformWithoutQuote(
-    const std::string& value) {
-  return value;
-}
-
-/// \brief Outputs a trace that can be uploaded to chrome://tracing for
-/// visualization.
-/// \details Trace event format definition:
-/// https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit?usp=sharing
-class ChromeTracer {
- public:
-  ChromeTracer(std::ostream* trace_out, PerfClock::time_point origin);
-  ~ChromeTracer();
-
-  template <typename... Args>
-  void AddCompleteEvent(const std::string& name, uint64_t pid, uint64_t tid,
-                        PerfClock::time_point start, PerfClock::time_point end,
-                        const Args... args) {
-    *out_ << "{\"name\":\"" << name << "\"," << "\"ph\":\"X\","
-          << "\"pid\":" << pid << "," << "\"tid\":" << tid << ","
-          << "\"ts\":" << Micros(start - origin_).count() << ","
-          << "\"dur\":" << Micros(end - start).count() << "," << "\"args\":{";
-    AddArgs(args...);
-    *out_ << "}},\n";
-  }
-
-  template <typename... Args>
-  void AddAsyncBeginEvent(const std::string& name, uint64_t pid, uint64_t id,
-                          PerfClock::time_point time, const Args... args) {
-    *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
-          << "\"ph\":\"b\"," << "\"pid\":" << pid << "," << "\"id\":" << id
-          << "," << "\"ts\":" << Micros(time - origin_).count() << ","
-          << "\"args\":{";
-    AddArgs(args...);
-    *out_ << "}},\n";
-  }
-
-  template <typename... Args>
-  void AddAsyncInstantEvent(const std::string& name, uint64_t pid, uint64_t id,
-                            PerfClock::time_point time, const Args... args) {
-    *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
-          << "\"ph\":\"n\"," << "\"pid\":" << pid << "," << "\"id\":" << id
-          << "," << "\"ts\":" << Micros(time - origin_).count() << ","
-          << "\"args\":{";
-    AddArgs(args...);
-    *out_ << "}},\n";
-  }
-
-  template <typename... Args>
-  void AddAsyncEndEvent(const std::string& name, uint64_t pid, uint64_t id,
-                        PerfClock::time_point time) {
-    *out_ << "{\"name\":\"" << name << "\"," << "\"cat\":\"default\","
-          << "\"ph\":\"e\", " << "\"pid\":" << pid << "," << "\"id\":" << id
-          << "," << "\"ts\":" << Micros(time - origin_).count() << "},\n";
-  }
-
-  template <typename... Args>
-  void AddCounterEvent(const std::string& name, uint64_t pid,
-                       PerfClock::time_point time, const Args... args) {
-    *out_ << "{\"name\":\"" << name << "\"," << "\"ph\": \"C\","
-          << "\"pid\":" << pid << ","
-          << "\"ts\":" << Micros(time - origin_).count() << ","
-          << "\"args\":{ ";
-    AddArgs(args...);
-    *out_ << "}},\n";
-  }
-
-  void Flush() { out_->flush(); }
-
- private:
-  using Micros = std::chrono::duration<double, std::micro>;
-
-  void WriteTraceEventHeader();
-  void WriteTraceEventFooter();
-
-  void AddArgs() {}
-
-  template <typename T>
-  void AddArgs(const std::string& arg_name, const T& arg_value) {
-    *out_ << "\"" << arg_name << "\":" << ArgValueTransform(arg_value);
-  }
-
-  template <typename T, typename... Args>
-  void AddArgs(const std::string& arg_name, const T& arg_value,
-               const Args... args) {
-    *out_ << "\"" << arg_name << "\":" << ArgValueTransform(arg_value) << ",";
-    AddArgs(args...);
-  }
-
-  std::ostream* out_;
-  PerfClock::time_point origin_;
-};
-
-/// \brief The proxy all logging lambdas ultimately use to write any log type.
-/// \details Passed as an argument to the log lambda on the
-/// recording thread to serialize the data captured by the lambda and
-/// forward it to the output stream.
-/// \todo Make summary_out_, detail_out_, accuracy_out_, and trace_out_
-/// instances of a new LogOutput interface that the client may override.
-class AsyncLog {
- public:
-  void SetLogFiles(std::ostream* summary, std::ostream* detail,
-                   std::ostream* accuracy, bool copy_detail_to_stdout,
-                   bool copy_summary_to_stdout,
-                   PerfClock::time_point log_origin);
-  void StartNewTrace(std::ostream* trace_out, PerfClock::time_point origin);
-  void StopTrace();
-  void Flush();
-
-  void SetCurrentPidTid(uint64_t pid, uint64_t tid);
-
-  void LogAccuracy(uint64_t seq_id, const QuerySampleIndex qsl_idx,
-                   const LogBinaryAsHexString& response, int64_t n_tokens);
-  void CacheToken(uint64_t seq_id, const LogBinaryAsHexString& response);
-
-  template <typename... Args>
-  void LogSummary(const std::string& message, const Args... args);
-
-  void SetLogDetailTime(PerfClock::time_point time) { log_detail_time_ = time; }
-
-  void FlagError() {
-    std::unique_lock<std::mutex> lock(log_mutex_);
-    log_error_count_++;
-    error_flagged_ = true;
-  }
-
-  void FlagWarning() {
-    std::unique_lock<std::mutex> lock(log_mutex_);
-    log_warning_count_++;
-    warning_flagged_ = true;
-  }
-
-#if USE_NEW_LOGGING_FORMAT
-  template <typename T>
-  void LogDetail(const std::string& key, const T& value,
-                 const std::string file_name, const unsigned int line_no);
-#else
-  template <typename... Args>
-  void LogDetail(const std::string& message, const Args... args);
-#endif
-
-  template <typename... Args>
-  void Trace(const std::string& trace_name, PerfClock::time_point start,
-             PerfClock::time_point end, const Args... args) {
-    std::unique_lock<std::mutex> lock(trace_mutex_);
-    if (tracer_) {
-      tracer_->AddCompleteEvent(trace_name, current_pid_, current_tid_, start,
-                                end, args...);
-    }
-  }
-
-  template <typename... Args>
-  void TraceAsyncInstant(const std::string& trace_name, uint64_t id,
-                         PerfClock::time_point instant_time,
-                         const Args... args) {
-    std::unique_lock<std::mutex> lock(trace_mutex_);
-    if (tracer_) {
-      tracer_->AddAsyncInstantEvent(trace_name, current_pid_, id, instant_time,
-                                    args...);
-    }
-  }
-
-  void SetScopedTraceTimes(PerfClock::time_point start,
-                           PerfClock::time_point end) {
-    scoped_start_ = start;
-    scoped_end_ = end;
-  }
-
-  template <typename... Args>
-  void ScopedTrace(const std::string& trace_name, const Args... args) {
-    std::unique_lock<std::mutex> lock(trace_mutex_);
-    if (tracer_) {
-      tracer_->AddCompleteEvent(trace_name, current_pid_, current_tid_,
-                                scoped_start_, scoped_end_, args...);
-    }
-  }
-
-  template <typename... Args>
-  void TraceSample(const std::string& trace_name, uint64_t id,
-                   PerfClock::time_point start, PerfClock::time_point end,
-                   const Args... args) {
-    std::unique_lock<std::mutex> lock(trace_mutex_);
-    if (tracer_) {
-      tracer_->AddAsyncBeginEvent(trace_name, current_pid_, id, start, args...);
-      tracer_->AddAsyncEndEvent(trace_name, current_pid_, id, end);
-    }
-  }
-
-  template <typename... Args>
-  void TraceCounterEvent(const std::string& trace_name,
-                         PerfClock::time_point time, const Args... args) {
-    std::unique_lock<std::mutex> lock(trace_mutex_);
-    if (tracer_) {
-      tracer_->AddCounterEvent(trace_name, current_pid_, time, args...);
-    }
-  }
-
-  void RestartLatencyRecording(uint64_t first_sample_sequence_id,
-                               size_t latencies_to_reserve);
-  void RecordSampleCompletion(uint64_t sample_sequence_id,
-                              PerfClock::time_point completion_time,
-                              QuerySampleLatency latency, int64_t n_tokens);
-  void RecordTokenCompletion(uint64_t sample_sequence_id,
-                             PerfClock::time_point completion_time,
-                             QuerySampleLatency latency);
-  std::vector<QuerySampleLatency> GetLatenciesBlocking(size_t expected_count);
-  std::vector<QuerySampleLatency> GetTokenLatencies(size_t expected_count);
-  std::vector<QuerySampleLatency> GetTimePerOutputToken(size_t expected_count);
-  std::vector<int64_t> GetTokensPerSample(size_t expected_count);
-  PerfClock::time_point GetMaxCompletionTime();
-  QuerySampleLatency GetMaxLatencySoFar();
-  void SetUseTokens(bool use_tokens);
-  void SetNeedsFirstToken(bool needs_first_token);
-  size_t GetErrorCount() { return log_error_count_; };
-
- private:
-  void WriteAccuracyHeaderLocked();
-  void WriteAccuracyFooterLocked();
-
-  void LogArgs(std::ostream*) {}
-
-  template <typename T>
-  void LogArgs(std::ostream* out, const T& value_only) {
-    *out << ArgValueTransformWithoutQuote(value_only);
-  }
-
-  template <typename T>
-  void LogArgs(std::ostream* out, const std::string& arg_name,
-               const T& arg_value) {
-    *out << "\"" << arg_name
-         << "\" : " << ArgValueTransformWithoutQuote(arg_value);
-  }
-
-  template <typename T, typename... Args>
-  void LogArgs(std::ostream* out, const std::string& arg_name,
-               const T& arg_value, const Args... args) {
-    *out << "\"" << arg_name
-         << "\" : " << ArgValueTransformWithoutQuote(arg_value) << ", ";
-    LogArgs(out, args...);
-  }
-
-  std::mutex log_mutex_;
-  std::ostream* summary_out_ = &std::cerr;
-  std::ostream* detail_out_ = &std::cerr;
-  std::ostream* accuracy_out_ = &std::cerr;
-  // TODO: Instead of these bools, use a class that forwards to two streams.
-  bool copy_detail_to_stdout_ = false;
-  bool copy_summary_to_stdout_ = false;
-  bool accuracy_needs_comma_ = false;
-  PerfClock::time_point log_origin_;
-  size_t log_error_count_ = 0;
-  bool error_flagged_ = false;
-  size_t log_warning_count_ = 0;
-  bool warning_flagged_ = false;
-  bool use_tokens_ = false;
-  bool needs_first_token_ = false;
-
-  std::mutex trace_mutex_;
-  std::unique_ptr<ChromeTracer> tracer_;
-
-  uint64_t current_pid_;
-  uint64_t current_tid_;
-  PerfClock::time_point log_detail_time_;
-  PerfClock::time_point scoped_start_;
-  PerfClock::time_point scoped_end_;
-
-  std::mutex latencies_mutex_;
-  std::mutex token_latencies_mutex_;
-  std::mutex token_record_mutex_;
-  std::condition_variable all_latencies_recorded_;
-  uint64_t latencies_first_sample_sequence_id_ = 0;
-  std::vector<QuerySampleLatency> latencies_;
-  std::vector<QuerySampleLatency> token_latencies_;
-  std::vector<QuerySampleLatency> time_per_output_token_;
-  std::vector<LogBinaryAsHexString> token_records_;
-  std::vector<int64_t> tokens_per_sample_;
-  QuerySampleLatency max_latency_ = 0;
-  PerfClock::time_point max_completion_timstamp_;
-  size_t latencies_recorded_ = 0;
-  size_t latencies_expected_ = 0;
-  // Must be called with latencies_mutex_ held.
-  bool AllLatenciesRecorded() {
-    return latencies_recorded_ == latencies_expected_;
-  }
-};
-
-/// \brief The central logger that logs all threads belonging to a run.
-class Logger {
- public:
-  Logger(std::chrono::duration<double> poll_period, size_t max_threads_to_log);
-  ~Logger();
-
-  void StartIOThread();
-  void StopIOThread();
-
-  void StartLogging(std::ostream* summary, std::ostream* detail,
-                    std::ostream* accuracy, bool copy_detail_to_stdout,
-                    bool copy_summary_to_stdout);
-  void StopLogging();
-
-  void StartNewTrace(std::ostream* trace_out, PerfClock::time_point origin);
-  void StopTracing();
-
-  void LogContentionAndAllocations();
-
-  void RestartLatencyRecording(uint64_t first_sample_sequence_id,
-                               size_t latencies_to_reserve);
-  std::vector<QuerySampleLatency> GetLatenciesBlocking(size_t expected_count);
-  std::vector<QuerySampleLatency> GetTokenLatencies(size_t expected_count);
-  std::vector<QuerySampleLatency> GetTimePerOutputToken(size_t expected_count);
-  std::vector<int64_t> GetTokensPerSample(size_t expected_count);
-  PerfClock::time_point GetMaxCompletionTime();
-  QuerySampleLatency GetMaxLatencySoFar();
-  void SetUseTokens(bool use_tokens);
-  void SetNeedsFirstToken(bool needs_first_token);
-
- private:
-  friend AsyncLog;
-  friend TlsLogger;
-  friend TlsLoggerWrapper;
-
-  void RegisterTlsLogger(TlsLogger* tls_logger);
-  void UnRegisterTlsLogger(std::unique_ptr<TlsLogger> tls_logger);
-  void RequestSwapBuffers(TlsLogger* tls_logger);
-  void CollectTlsLoggerStats(TlsLogger* tls_logger);
-
-  TlsLogger* GetTlsLoggerThatRequestedSwap(size_t slot, size_t next_id);
-  void GatherRetrySwapRequests(std::vector<TlsLogger*>* threads_to_swap);
-  void GatherNewSwapRequests(std::vector<TlsLogger*>* threads_to_swap);
-
-  /// \brief The main logging thread function that handles the serialization
-  /// and I/O to the stream or file.
-  ///
-  /// \todo Provide client hook to set logging thead affinity and priority.
-  void IOThread();
-
-// Slow synchronous error logging for internals that may prevent
-// async logging from working.
-#if USE_NEW_LOGGING_FORMAT
-  template <typename T>
-  void LogErrorSync(const std::string& key, const T& value,
-                    const std::string file_name, const unsigned int line_no) {
-    /// \todo Acquire mutex once for FlagError + LogDetail to avoid
-    ///       races. Better yet, switch to a non-stateful error API.
-    //       This is better than nothing though.
-    async_logger_.FlagError();
-    async_logger_.LogDetail(key, value, file_name, line_no);
-  }
-  template <typename T>
-  void LogWarning(const std::string& key, const T& value,
-                  const std::string file_name, const unsigned int line_no) {
-    async_logger_.FlagWarning();
-    async_logger_.LogDetail(key, value, file_name, line_no);
-  }
-#else
-  template <typename... Args>
-  void LogErrorSync(const std::string& message, Args&&... args) {
-    /// \todo Acquire mutex once for FlagError + LogDetail to avoid
-    ///       races. Better yet, switch to a non-stateful error API.
-    //       This is better than nothing though.
-    async_logger_.FlagError();
-    async_logger_.LogDetail(message, std::forward<Args>(args)...);
-  }
-#endif
-
-  // Accessed by IOThead only.
-  const std::chrono::duration<double> poll_period_;
-  AsyncLog async_logger_;
-
-  const size_t max_threads_to_log_;
-  std::thread io_thread_;
-
-  // Accessed by producers and IOThead during thread registration and
-  // destruction. Protected by io_thread_mutex_.
-  std::mutex io_thread_mutex_;
-  std::condition_variable io_thread_cv_;
-  bool keep_io_thread_alive_ = false;
-
-  std::mutex tls_loggers_registerd_mutex_;
-  std::unordered_set<TlsLogger*> tls_loggers_registerd_;
-
-  // Temporarily stores TlsLogger data for threads that have exited until
-  // all their log entries have been processed.
-  // Accessed by IOThread and producers as their threads exit.
-  std::mutex tls_logger_orphans_mutex_;
-  using OrphanContainer = std::list<std::unique_ptr<TlsLogger>>;
-  OrphanContainer tls_logger_orphans_;
-
-  // Accessed by producers and IOThead atomically.
-  std::atomic<size_t> swap_request_id_{0};
-  std::vector<std::atomic<uintptr_t>> thread_swap_request_slots_;
-
-  // Accessed by IOThead only.
-  size_t swap_request_id_read_{0};
-  struct SlotRetry {
-    size_t slot;
-    uintptr_t next_id;
-  };
-  std::vector<SlotRetry> swap_request_slots_to_retry_;
-  std::vector<TlsLogger*> threads_to_swap_deferred_;
-  std::vector<TlsLogger*> threads_to_read_;
-  std::vector<OrphanContainer::iterator> orphans_to_destroy_;
-
-  // Counts for retries related to the lock-free scheme.
-  // Abnormally high counts could be an indicator of contention.
-  // Access on IOThread only.
-  size_t swap_request_slots_retry_count_ = 0;
-  size_t swap_request_slots_retry_retry_count_ = 0;
-  size_t swap_request_slots_retry_reencounter_count_ = 0;
-  size_t start_reading_entries_retry_count_ = 0;
-  size_t tls_total_log_cas_fail_count_ = 0;
-  size_t tls_total_swap_buffers_slot_retry_count_ = 0;
-};
-
-Logger& GlobalLogger();
-
-/// \brief The generic way to add a log entry.
-/// \details Supports all types of logs, which is useful for complex
-/// lambdas that may wish to log in multiple places or log something other
-/// than a simple summary, detail, or trace entry.
-void Log(AsyncLogEntry&& entry);
-
-/// \brief The convenience proxy a LogSummary lambda uses to write to the
-/// summary log.
-class AsyncSummary {
- public:
-  explicit AsyncSummary(AsyncLog& async_log) : async_log_(async_log) {}
-  AsyncLog& async_log() { return async_log_; }
-
-  template <typename... Args>
-  AsyncLog& operator()(Args&&... args) {
-    async_log_.LogSummary(std::forward<Args>(args)...);
-    return async_log_;
-  }
-
- private:
-  AsyncLog& async_log_;
-};
-
-/// \brief A helper to simplify adding a summary log entry.
-template <typename LambdaT>
-void LogSummary(LambdaT&& lambda) {
-  Log([lambda = std::forward<LambdaT>(lambda)](AsyncLog& log) mutable {
-    AsyncSummary async_summary(log);
-    lambda(async_summary);
-  });
-}
-
-/// \brief The convenience proxy a LogDetail lambda uses to write to the detail
-/// log.
-class AsyncDetail {
- public:
-  explicit AsyncDetail(AsyncLog& async_log) : async_log_(async_log) {}
-  AsyncLog& async_log() { return async_log_; }
-
-#if USE_NEW_LOGGING_FORMAT
-  template <typename T>
-  AsyncLog& Log(const std::string& key, const T& value,
-                const std::string file_name, const unsigned int line_no) {
-    async_log_.LogDetail(key, value, file_name, line_no);
-    return async_log_;
-  }
-
-  template <typename T>
-  AsyncLog& LogError(const std::string& key, const T& value,
-                     const std::string file_name, const unsigned int line_no) {
-    async_log_.FlagError();
-    async_log_.LogDetail(key, value, file_name, line_no);
-    return async_log_;
-  }
-
-  template <typename T>
-  AsyncLog& LogWarning(const std::string& key, const T& value,
-                       const std::string file_name,
-                       const unsigned int line_no) {
-    async_log_.FlagWarning();
-    async_log_.LogDetail(key, value, file_name, line_no);
-    return async_log_;
-  }
-
-  template <typename T>
-  AsyncLog& LogIntervalStart(const std::string& key, const T& value,
-                             const std::string file_name,
-                             const unsigned int line_no) {
-    async_log_.LogDetail(key, value, file_name, line_no);
-    return async_log_;
-  }
-
-  template <typename T>
-  AsyncLog& LogIntervalEnd(const std::string& key, const T& value,
-                           const std::string file_name,
-                           const unsigned int line_no) {
-    async_log_.LogDetail(key, value, file_name, line_no);
-    return async_log_;
-  }
-#else
-  template <typename... Args>
-  AsyncLog& operator()(Args&&... args) {
-    async_log_.LogDetail(std::forward<Args>(args)...);
-    return async_log_;
-  }
-
-  template <typename... Args>
-  AsyncLog& Error(Args&&... args) {
-    async_log_.FlagError();
-    async_log_.LogDetail(std::forward<Args>(args)...);
-    return async_log_;
-  }
-
-  template <typename... Args>
-  AsyncLog& Warning(Args&&... args) {
-    async_log_.FlagWarning();
-    async_log_.LogDetail(std::forward<Args>(args)...);
-    return async_log_;
-  }
-#endif
-
- private:
-  AsyncLog& async_log_;
-};
-
-/// \brief A helper to simplify adding a detail log entry.
-template <typename LambdaT>
-void LogDetail(LambdaT&& lambda) {
-  Log([lambda = std::forward<LambdaT>(lambda),
-       timestamp = PerfClock::now()](AsyncLog& log) mutable {
-    log.SetLogDetailTime(timestamp);
-    AsyncDetail async_detail(log);
-    lambda(async_detail);
-  });
-}
-
-/// \brief The convenience proxy a ScopedTracer lambda uses to write to the
-/// detail log.
-class AsyncTrace {
- public:
-  explicit AsyncTrace(AsyncLog& async_log) : async_log_(async_log) {}
-  AsyncLog& async_log() { return async_log_; }
-
-  template <typename... Args>
-  AsyncLog& operator()(Args&&... args) {
-    async_log_.ScopedTrace(std::forward<Args>(args)...);
-    return async_log_;
-  }
-
- private:
-  AsyncLog& async_log_;
-};
-
-/// \brief ScopedTracer is an RAII object that traces the start and end
-/// of its lifetime.
-template <typename LambdaT>
-class ScopedTracer {
- public:
-  ScopedTracer(LambdaT&& lambda)
-      : start_(PerfClock::now()), lambda_(std::forward<LambdaT>(lambda)) {}
-
-  ~ScopedTracer() {
-    Log([start = start_, lambda = std::move(lambda_),
-         end = PerfClock::now()](AsyncLog& log) {
-      log.SetScopedTraceTimes(start, end);
-      AsyncTrace async_trace(log);
-      lambda(async_trace);
-    });
-  }
-
- private:
-  PerfClock::time_point start_;
-  LambdaT lambda_;
-};
-
-/// \brief Helper that creates a ScopeTracer with automatic type deduction.
-/// \details Helps with automatic template type deduction, which has been
-/// supported for functions for a long time.
-/// C++17 will support deduction for classes, which will neutralize the utility
-/// of a helper function like this.
-/// \todo Determine which traces to keep for submission purposes.
-template <typename LambdaT>
-auto MakeScopedTracer(LambdaT&& lambda) -> ScopedTracer<LambdaT> {
-  return ScopedTracer<LambdaT>(std::forward<LambdaT>(lambda));
-}
-
-template <typename... Args>
-void AsyncLog::LogSummary(const std::string& message, const Args... args) {
-  auto tracer = MakeScopedTracer([message](AsyncTrace& trace) {
-    std::string sanitized_message = message;
-    std::replace(sanitized_message.begin(), sanitized_message.end(), '"', '\'');
-    std::replace(sanitized_message.begin(), sanitized_message.end(), '\n', ';');
-    trace("LogSummary", "message", "\"" + sanitized_message + "\"");
-  });
-  std::unique_lock<std::mutex> lock(log_mutex_);
-  *summary_out_ << message;
-  LogArgs(summary_out_, args...);
-  *summary_out_ << "\n";
-
-  if (copy_summary_to_stdout_) {
-    std::cout << message;
-    LogArgs(&std::cout, args...);
-    std::cout << "\n";
-  }
-}
-
-#if USE_NEW_LOGGING_FORMAT
-template <typename T>
-void AsyncLog::LogDetail(const std::string& key, const T& value,
-                         const std::string file_name,
-                         const unsigned int line_no) {
-  auto tracer = MakeScopedTracer([key](AsyncTrace& trace) {
-    std::string sanitized_key = key;
-    std::replace(sanitized_key.begin(), sanitized_key.end(), '"', '\'');
-    std::replace(sanitized_key.begin(), sanitized_key.end(), '\n', ';');
-    trace("LogDetail", "key", "\"" + sanitized_key + "\"");
-  });
-  std::unique_lock<std::mutex> lock(log_mutex_);
-  std::vector<std::ostream*> detail_streams{detail_out_, &std::cout};
-  if (!copy_detail_to_stdout_) {
-    detail_streams.pop_back();
-  }
-  auto time_ns = (log_detail_time_ - log_origin_).count();
-  for (auto os : detail_streams) {
-    *os << ":::MLLOG {" << "\"key\": " << ArgValueTransform(key) << ", "
-        << "\"value\": " << ArgValueTransform(value) << ", "
-        << "\"time_ms\": " << ArgValueTransform(time_ns / 1000000ULL) << "."
-        << std::setfill('0') << std::setw(6)
-        << ArgValueTransform(time_ns % 1000000ULL) << ", "
-        << "\"namespace\": \"mlperf::logging\", "
-        << "\"event_type\": \"POINT_IN_TIME\", " << "\"metadata\": {"
-        << "\"is_error\": " << ArgValueTransform(error_flagged_) << ", "
-        << "\"is_warning\": " << ArgValueTransform(warning_flagged_) << ", "
-        << "\"file\": \"" << file_name << "\", "
-        << "\"line_no\": " << ArgValueTransform(line_no) << ", "
-        << "\"pid\": " << ArgValueTransform(current_pid_) << ", "
-        << "\"tid\": " << ArgValueTransform(current_tid_) << "}}\n";
-    if (error_flagged_) {
-      os->flush();
-    }
-  }
-  error_flagged_ = false;
-  warning_flagged_ = false;
-}
-#else
-template <typename... Args>
-void AsyncLog::LogDetail(const std::string& message, const Args... args) {
-  auto tracer = MakeScopedTracer([message](AsyncTrace& trace) {
-    std::string sanitized_message = message;
-    std::replace(sanitized_message.begin(), sanitized_message.end(), '"', '\'');
-    std::replace(sanitized_message.begin(), sanitized_message.end(), '\n', ';');
-    trace("LogDetail", "message", "\"" + sanitized_message + "\"");
-  });
-  std::unique_lock<std::mutex> lock(log_mutex_);
-  std::vector<std::ostream*> detail_streams{detail_out_, &std::cout};
-  if (!copy_detail_to_stdout_) {
-    detail_streams.pop_back();
-  }
-  for (auto os : detail_streams) {
-    *os << "\"pid\": " << current_pid_ << ", " << "\"tid\": " << current_tid_
-        << ", " << "\"ts\": " << (log_detail_time_ - log_origin_).count()
-        << "ns : ";
-    if (error_flagged_) {
-      *os << "ERROR : ";
-    } else if (warning_flagged_) {
-      *os << "WARNING : ";
-    }
-    *os << message;
-    LogArgs(os, args...);
-    *os << "\n";
-    if (error_flagged_) {
-      os->flush();
-    }
-  }
-  error_flagged_ = false;
-  warning_flagged_ = false;
-}
-#endif
-
-}  // namespace logging
-
-// Export some things out of the logging namespace to simplify call sites.
-
-const auto GlobalLogger = logging::GlobalLogger;
-const auto Log = logging::Log;
-
-using PerfClock = logging::PerfClock;
-
-using LogBinaryAsHexString = logging::LogBinaryAsHexString;
-
-using AsyncLog = logging::AsyncLog;
-
-using AsyncSummary = logging::AsyncSummary;
-template <typename LambdaT>
-void LogSummary(LambdaT&& lambda) {
-  logging::LogSummary(std::forward<LambdaT>(lambda));
-}
-
-using AsyncDetail = logging::AsyncDetail;
-template <typename LambdaT>
-void LogDetail(LambdaT&& lambda) {
-  logging::LogDetail(std::forward<LambdaT>(lambda));
-}
-
-using AsyncTrace = logging::AsyncTrace;
-
-template <typename LambdaT>
-using ScopedTracer = logging::ScopedTracer<LambdaT>;
-
-template <typename LambdaT>
-auto MakeScopedTracer(LambdaT&& lambda) -> ScopedTracer<LambdaT> {
-  return ScopedTracer<LambdaT>(std::forward<LambdaT>(lambda));
-}
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_LOGGING_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf
deleted file mode 100644
index 1b825514b..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf.conf
+++ /dev/null
@@ -1,164 +0,0 @@
-# The format of this config file is 'key = value'.
-# The key has the format 'model.scenario.key'. Value is mostly int64_t.
-# Model maybe '*' as wildcard. In that case the value applies to all models.
-# All times are in milli seconds
-
-# Set performance_sample_count for each model.
-# User can optionally set this to higher values in user.conf.
-resnet50.*.performance_sample_count_override = 1024
-ssd-mobilenet.*.performance_sample_count_override = 256
-retinanet.*.performance_sample_count_override = 64
-bert.*.performance_sample_count_override = 10833
-dlrm.*.performance_sample_count_override = 204800
-dlrm-v2.*.performance_sample_count_override = 204800
-rnnt.*.performance_sample_count_override = 2513
-gptj.*.performance_sample_count_override = 13368
-mixtral-8x7b.*.performance_sample_count_override = 15000
-llama2-70b.*.performance_sample_count_override = 24576
-llama2-70b-interactive.*.performance_sample_count_override = 24576
-llama3_1-405b.*.performance_sample_count_override = 8313
-llama3_1-405b-interactive.*.performance_sample_count_override = 8313
-llama3_1-8b.*.performance_sample_count_override = 13368
-llama3_1-8b-edge.*.performance_sample_count_override = 5000
-llama3_1-8b-interactive.*.performance_sample_count_override = 13368
-stable-diffusion-xl.*.performance_sample_count_override = 5000
-rgat.*.performance_sample_count_override = 788379
-pointpainting.*.performance_sample_count_override = 1024
-deepseek-r1.*.performance_sample_count_override = 4388
-whisper.*.performance_sample_count_override = 1633
-# set to 0 to let entire sample set to be performance sample
-3d-unet.*.performance_sample_count_override = 0
-
-# Set seeds.
-*.*.qsl_rng_seed = 1780908523862526354
-*.*.sample_index_rng_seed = 14771362308971278857
-*.*.schedule_rng_seed = 18209322760996052031
-
-# Set seeds for TEST_05 (not needed from v5.0 onwards)
-*.*.test05_qsl_rng_seed = 7975553102935885558
-*.*.test05_sample_index_rng_seed = 11403566307062068064
-*.*.test05_schedule_rng_seed = 15816800565822761601
-
-*.SingleStream.target_latency_percentile = 90
-pointpainting.SingleStream.target_latency_percentile = 99.9
-*.SingleStream.min_duration = 600000
-
-*.MultiStream.target_latency_percentile = 99
-*.MultiStream.samples_per_query = 8
-*.MultiStream.min_duration = 600000
-*.MultiStream.min_query_count = 662
-retinanet.MultiStream.target_latency = 528
-
-# 3D-UNet uses equal issue mode because it has non-uniform inputs
-3d-unet.*.sample_concatenate_permutation = 1
-
-# R-GAT uses equal issue mode because it may have non-uniform inputs
-rgat.*.sample_concatenate_permutation = 1
-
-# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario
-gptj.*.sample_concatenate_permutation = 1
-llama2-70b.*.sample_concatenate_permutation = 1
-llama2-70b-interactive.*.sample_concatenate_permutation = 1
-mixtral-8x7b.*.sample_concatenate_permutation = 1
-llama3_1-405b.*.sample_concatenate_permutation = 1
-llama3_1-405b-interactive.*.sample_concatenate_permutation = 1
-llama3_1-8b.*.sample_concatenate_permutation = 1
-llama3_1-8b-edge.*.sample_concatenate_permutation = 1
-llama3_1-8b-interactive.*.sample_concatenate_permutation = 1
-deepseek-r1.*.sample_concatenate_permutation = 1
-whisper.*.sample_concatenate_permutation = 1
-
-*.Server.target_latency = 10
-*.Server.target_latency_percentile = 99
-*.Server.target_duration = 0
-*.Server.min_duration = 600000
-resnet50.Server.target_latency = 15
-retinanet.Server.target_latency = 100
-bert.Server.target_latency = 130
-dlrm.Server.target_latency = 60
-dlrm-v2.Server.target_latency = 60
-rnnt.Server.target_latency = 1000
-gptj.Server.target_latency = 20000
-stable-diffusion-xl.Server.target_latency = 20000
-# Benchmarks that measure token latencies
-llama2-70b.*.use_token_latencies = 1
-llama2-70b-interactive.*.use_token_latencies = 1
-mixtral-8x7b.*.use_token_latencies = 1
-llama3_1-405b.*.use_token_latencies = 1
-llama3_1-405b-interactive.*.use_token_latencies = 1
-llama3_1-8b.*.use_token_latencies = 1
-llama3_1-8b-edge.*.use_token_latencies = 1
-llama3_1-8b-interactive.*.use_token_latencies = 1
-deepseek-r1.*.use_token_latencies = 1
-whisper.*.use_token_latencies = 1
-
-# gptj benchmark infers token latencies
-gptj.*.infer_token_latencies = 1
-gptj.*.token_latency_scaling_factor = 69
-# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0
-llama2-70b.Server.target_latency = 0
-llama2-70b.Server.ttft_latency = 2000
-llama2-70b.Server.tpot_latency = 200
-
-# Target Latencies for interactive setting
-llama2-70b-interactive.Server.target_latency = 0
-llama2-70b-interactive.Server.ttft_latency = 450
-llama2-70b-interactive.Server.tpot_latency = 40
-
-mixtral-8x7b.Server.target_latency = 0
-mixtral-8x7b.Server.ttft_latency = 2000
-mixtral-8x7b.Server.tpot_latency = 200
-
-llama3_1-405b.Server.target_latency = 0
-llama3_1-405b.Server.ttft_latency = 6000
-llama3_1-405b.Server.tpot_latency = 175
-
-# Target Latencies for interactive setting
-llama3_1-405b-interactive.Server.target_latency = 0
-llama3_1-405b-interactive.Server.ttft_latency = 4500
-llama3_1-405b-interactive.Server.tpot_latency = 80
-
-
-llama3_1-8b.Server.target_latency = 0
-llama3_1-8b.Server.ttft_latency = 2000
-llama3_1-8b.Server.tpot_latency = 100
-
-# Target Latencies for interactive setting
-llama3_1-8b-interactive.Server.target_latency = 0
-llama3_1-8b-interactive.Server.ttft_latency = 500
-llama3_1-8b-interactive.Server.tpot_latency = 30
-
-deepseek-r1.Server.target_latency = 0
-deepseek-r1.Server.ttft_latency = 2000
-deepseek-r1.Server.tpot_latency = 80
-
-*.Offline.target_latency_percentile = 90
-*.Offline.min_duration = 600000
-
-# In Offline scenario, we always have one query. But LoadGen maps this to
-# min_sample_count internally in Offline scenario. If the dataset size is larger
-# than 24576 we limit the min_query_count to 24576 and otherwise we use
-# the dataset size as the limit
-
-resnet50.Offline.min_query_count = 24576
-retinanet.Offline.min_query_count = 24576
-dlrm-v2.Offline.min_query_count = 24576
-bert.Offline.min_query_count = 10833
-gptj.Offline.min_query_count = 13368
-rnnt.Offline.min_query_count = 2513
-3d-unet.Offline.min_query_count = 43
-stable-diffusion-xl.Offline.min_query_count = 5000
-llama2-70b.Offline.min_query_count = 24576
-llama3_1-405b.Offline.min_query_count = 8313
-llama3_1-8b.Offline.min_query_count = 13368
-llama3_1-8b-edge.Offline.min_query_count = 5000
-mixtral-8x7b.Offline.min_query_count = 15000
-rgat.Offline.min_query_count = 788379
-deepseek-r1.Offline.min_query_count = 4388
-whisper.Offline.min_query_count = 1633
-
-# These fields should be defined and overridden by user.conf.
-*.SingleStream.target_latency = 10
-*.MultiStream.target_latency = 80
-*.Server.target_qps = 1.0
-*.Offline.target_qps = 1.0
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h
deleted file mode 100644
index 7859e0139..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/mlperf_conf.h
+++ /dev/null
@@ -1,167 +0,0 @@
-const char* mlperf_conf =
-"# The format of this config file is 'key = value'.\n"
-"# The key has the format 'model.scenario.key'. Value is mostly int64_t.\n"
-"# Model maybe '*' as wildcard. In that case the value applies to all models.\n"
-"# All times are in milli seconds\n"
-"\n"
-"# Set performance_sample_count for each model.\n"
-"# User can optionally set this to higher values in user.conf.\n"
-"resnet50.*.performance_sample_count_override = 1024\n"
-"ssd-mobilenet.*.performance_sample_count_override = 256\n"
-"retinanet.*.performance_sample_count_override = 64\n"
-"bert.*.performance_sample_count_override = 10833\n"
-"dlrm.*.performance_sample_count_override = 204800\n"
-"dlrm-v2.*.performance_sample_count_override = 204800\n"
-"rnnt.*.performance_sample_count_override = 2513\n"
-"gptj.*.performance_sample_count_override = 13368\n"
-"mixtral-8x7b.*.performance_sample_count_override = 15000\n"
-"llama2-70b.*.performance_sample_count_override = 24576\n"
-"llama2-70b-interactive.*.performance_sample_count_override = 24576\n"
-"llama3_1-405b.*.performance_sample_count_override = 8313\n"
-"llama3_1-405b-interactive.*.performance_sample_count_override = 8313\n"
-"llama3_1-8b.*.performance_sample_count_override = 13368\n"
-"llama3_1-8b-edge.*.performance_sample_count_override = 5000\n"
-"llama3_1-8b-interactive.*.performance_sample_count_override = 13368\n"
-"stable-diffusion-xl.*.performance_sample_count_override = 5000\n"
-"rgat.*.performance_sample_count_override = 788379\n"
-"pointpainting.*.performance_sample_count_override = 1024\n"
-"deepseek-r1.*.performance_sample_count_override = 4388\n"
-"whisper.*.performance_sample_count_override = 1633\n"
-"# set to 0 to let entire sample set to be performance sample\n"
-"3d-unet.*.performance_sample_count_override = 0\n"
-"\n"
-"# Set seeds.\n"
-"*.*.qsl_rng_seed = 1780908523862526354\n"
-"*.*.sample_index_rng_seed = 14771362308971278857\n"
-"*.*.schedule_rng_seed = 18209322760996052031\n"
-"\n"
-"# Set seeds for TEST_05 (not needed from v5.0 onwards)\n"
-"*.*.test05_qsl_rng_seed = 7975553102935885558\n"
-"*.*.test05_sample_index_rng_seed = 11403566307062068064\n"
-"*.*.test05_schedule_rng_seed = 15816800565822761601\n"
-"\n"
-"*.SingleStream.target_latency_percentile = 90\n"
-"pointpainting.SingleStream.target_latency_percentile = 99.9\n"
-"*.SingleStream.min_duration = 600000\n"
-"\n"
-"*.MultiStream.target_latency_percentile = 99\n"
-"*.MultiStream.samples_per_query = 8\n"
-"*.MultiStream.min_duration = 600000\n"
-"*.MultiStream.min_query_count = 662\n"
-"retinanet.MultiStream.target_latency = 528\n"
-"\n"
-"# 3D-UNet uses equal issue mode because it has non-uniform inputs\n"
-"3d-unet.*.sample_concatenate_permutation = 1\n"
-"\n"
-"# R-GAT uses equal issue mode because it may have non-uniform inputs\n"
-"rgat.*.sample_concatenate_permutation = 1\n"
-"\n"
-"# LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario\n"
-"gptj.*.sample_concatenate_permutation = 1\n"
-"llama2-70b.*.sample_concatenate_permutation = 1\n"
-"llama2-70b-interactive.*.sample_concatenate_permutation = 1\n"
-"mixtral-8x7b.*.sample_concatenate_permutation = 1\n"
-"llama3_1-405b.*.sample_concatenate_permutation = 1\n"
-"llama3_1-405b-interactive.*.sample_concatenate_permutation = 1\n"
-"llama3_1-8b.*.sample_concatenate_permutation = 1\n"
-"llama3_1-8b-edge.*.sample_concatenate_permutation = 1\n"
-"llama3_1-8b-interactive.*.sample_concatenate_permutation = 1\n"
-"deepseek-r1.*.sample_concatenate_permutation = 1\n"
-"whisper.*.sample_concatenate_permutation = 1\n"
-"\n"
-"*.Server.target_latency = 10\n"
-"*.Server.target_latency_percentile = 99\n"
-"*.Server.target_duration = 0\n"
-"*.Server.min_duration = 600000\n"
-"resnet50.Server.target_latency = 15\n"
-"retinanet.Server.target_latency = 100\n"
-"bert.Server.target_latency = 130\n"
-"dlrm.Server.target_latency = 60\n"
-"dlrm-v2.Server.target_latency = 60\n"
-"rnnt.Server.target_latency = 1000\n"
-"gptj.Server.target_latency = 20000\n"
-"stable-diffusion-xl.Server.target_latency = 20000\n"
-"# Benchmarks that measure token latencies\n"
-"llama2-70b.*.use_token_latencies = 1\n"
-"llama2-70b-interactive.*.use_token_latencies = 1\n"
-"mixtral-8x7b.*.use_token_latencies = 1\n"
-"llama3_1-405b.*.use_token_latencies = 1\n"
-"llama3_1-405b-interactive.*.use_token_latencies = 1\n"
-"llama3_1-8b.*.use_token_latencies = 1\n"
-"llama3_1-8b-edge.*.use_token_latencies = 1\n"
-"llama3_1-8b-interactive.*.use_token_latencies = 1\n"
-"deepseek-r1.*.use_token_latencies = 1\n"
-"whisper.*.use_token_latencies = 1\n"
-"\n"
-"# gptj benchmark infers token latencies\n"
-"gptj.*.infer_token_latencies = 1\n"
-"gptj.*.token_latency_scaling_factor = 69\n"
-"# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0\n"
-"llama2-70b.Server.target_latency = 0\n"
-"llama2-70b.Server.ttft_latency = 2000\n"
-"llama2-70b.Server.tpot_latency = 200\n"
-"\n"
-"# Target Latencies for interactive setting\n"
-"llama2-70b-interactive.Server.target_latency = 0\n"
-"llama2-70b-interactive.Server.ttft_latency = 450\n"
-"llama2-70b-interactive.Server.tpot_latency = 40\n"
-"\n"
-"mixtral-8x7b.Server.target_latency = 0\n"
-"mixtral-8x7b.Server.ttft_latency = 2000\n"
-"mixtral-8x7b.Server.tpot_latency = 200\n"
-"\n"
-"llama3_1-405b.Server.target_latency = 0\n"
-"llama3_1-405b.Server.ttft_latency = 6000\n"
-"llama3_1-405b.Server.tpot_latency = 175\n"
-"\n"
-"# Target Latencies for interactive setting\n"
-"llama3_1-405b-interactive.Server.target_latency = 0\n"
-"llama3_1-405b-interactive.Server.ttft_latency = 4500\n"
-"llama3_1-405b-interactive.Server.tpot_latency = 80\n"
-"\n"
-"\n"
-"llama3_1-8b.Server.target_latency = 0\n"
-"llama3_1-8b.Server.ttft_latency = 2000\n"
-"llama3_1-8b.Server.tpot_latency = 100\n"
-"\n"
-"# Target Latencies for interactive setting\n"
-"llama3_1-8b-interactive.Server.target_latency = 0\n"
-"llama3_1-8b-interactive.Server.ttft_latency = 500\n"
-"llama3_1-8b-interactive.Server.tpot_latency = 30\n"
-"\n"
-"deepseek-r1.Server.target_latency = 0\n"
-"deepseek-r1.Server.ttft_latency = 2000\n"
-"deepseek-r1.Server.tpot_latency = 80\n"
-"\n"
-"*.Offline.target_latency_percentile = 90\n"
-"*.Offline.min_duration = 600000\n"
-"\n"
-"# In Offline scenario, we always have one query. But LoadGen maps this to\n"
-"# min_sample_count internally in Offline scenario. If the dataset size is larger\n"
-"# than 24576 we limit the min_query_count to 24576 and otherwise we use\n"
-"# the dataset size as the limit\n"
-"\n"
-"resnet50.Offline.min_query_count = 24576\n"
-"retinanet.Offline.min_query_count = 24576\n"
-"dlrm-v2.Offline.min_query_count = 24576\n"
-"bert.Offline.min_query_count = 10833\n"
-"gptj.Offline.min_query_count = 13368\n"
-"rnnt.Offline.min_query_count = 2513\n"
-"3d-unet.Offline.min_query_count = 43\n"
-"stable-diffusion-xl.Offline.min_query_count = 5000\n"
-"llama2-70b.Offline.min_query_count = 24576\n"
-"llama3_1-405b.Offline.min_query_count = 8313\n"
-"llama3_1-8b.Offline.min_query_count = 13368\n"
-"llama3_1-8b-edge.Offline.min_query_count = 5000\n"
-"mixtral-8x7b.Offline.min_query_count = 15000\n"
-"rgat.Offline.min_query_count = 788379\n"
-"deepseek-r1.Offline.min_query_count = 4388\n"
-"whisper.Offline.min_query_count = 1633\n"
-"\n"
-"# These fields should be defined and overridden by user.conf.\n"
-"*.SingleStream.target_latency = 10\n"
-"*.MultiStream.target_latency = 80\n"
-"*.Server.target_qps = 1.0\n"
-"*.Offline.target_qps = 1.0\n"
-"\n"
-"";
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml
deleted file mode 100755
index 6f0ae06f0..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/pyproject.toml
+++ /dev/null
@@ -1,7 +0,0 @@
-[build-system]
-requires = ["setuptools>=42", "wheel", "pybind11==2.11.1"]
-build-backend = "setuptools.build_meta:__legacy__"
-
-[tool.cibuildwheel]
-environment = "CFLAGS='-std=c++14'"
-build = "cp3{7,8,9,10,11,12,13}-*"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h
deleted file mode 100644
index 6c594efe0..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_dispatch_library.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Defines the QueryDispatchLibrary interface.
-
-#ifndef MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
-#define MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
-
-#include <string>
-
-#include "system_under_test.h"
-
-namespace mlperf {
-
-/// \addtogroup LoadgenAPI
-/// @{
-
-/// \brief The interface a client implements for the LoadGen over the network to
-/// test. The API inherits the System_under_test.h API When working in LON mode
-/// the QueryDispatchLibrary class is used and natively Upcasted to the
-/// QueryDispatchLibrary class.
-
-class QueryDispatchLibrary : public SystemUnderTest {
- public:
-  virtual ~QueryDispatchLibrary() = default;
-};
-
-/// @}
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_QUERY_DISPATCH_LIBRARY_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h
deleted file mode 100644
index e740be99e..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Defines the structs involved in issuing a query and responding to
-/// a query.
-/// \details These are broken out into their own files since they are exposed
-/// as part of the C API and we want to avoid C clients including C++ code.
-
-#ifndef MLPERF_LOADGEN_QUERY_SAMPLE_H_
-#define MLPERF_LOADGEN_QUERY_SAMPLE_H_
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include <iostream>
-
-namespace mlperf {
-
-/// \addtogroup LoadgenAPI
-/// @{
-
-/// \brief Represents a unique identifier for a sample of an issued query.
-/// \details As currently implemented, the id is a pointer to an internal
-/// loadgen struct whose value will never be zero/null.
-typedef uintptr_t ResponseId;
-constexpr ResponseId kResponseIdReserved = 0;
-
-/// \brief An index into the QuerySampleLibrary corresponding to a
-/// single sample.
-typedef size_t QuerySampleIndex;
-
-/// \brief Represents the smallest unit of input inference can run on.
-/// A query consists of one or more samples.
-struct QuerySample {
-  ResponseId id;
-  QuerySampleIndex index;
-};
-
-/// \brief Represents a single response to QuerySample
-struct QuerySampleResponse {
-  ResponseId id;
-  uintptr_t data;
-  size_t size;  ///< Size in bytes.
-  int64_t n_tokens;
-
- public:
-  QuerySampleResponse(ResponseId id, uintptr_t data, size_t size,
-                      int64_t n_tokens)
-      : id(id),
-        data(data),
-        size(size),
-        n_tokens(n_tokens){
-            // std::cout << "Initialized with 4 arguments, n_tokens: " <<
-            // n_tokens <<"\n";
-        };
-  QuerySampleResponse(ResponseId id, uintptr_t data, size_t size)
-      : id(id),
-        data(data),
-        size(size),
-        n_tokens(0){
-            // std::cout << "Initialized with 3 arguments, n_tokens: " <<
-            // n_tokens <<"\n";
-        };
-  QuerySampleResponse()
-      : id(0),
-        data(0),
-        size(0),
-        n_tokens(0){
-            // std::cout << "Initialized with 0 arguments, n_tokens: " <<
-            // n_tokens <<"\n";
-        };
-};
-
-/// \brief A latency in nanoseconds, as recorded by the loadgen.
-typedef int64_t QuerySampleLatency;
-
-/// @}
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_QUERY_SAMPLE_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h
deleted file mode 100644
index 7258068cb..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/query_sample_library.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Defines the QuerySampleLibrary interface.
-
-#ifndef MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
-#define MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
-
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "query_sample.h"
-
-namespace mlperf {
-
-/// \addtogroup LoadgenAPI
-/// @{
-
-/// \brief The interface a client implements to coordinate with the loadgen
-/// which samples should be loaded.
-class QuerySampleLibrary {
- public:
-  virtual ~QuerySampleLibrary() {}
-
-  /// \brief A human readable name for the model.
-  virtual const std::string& Name() = 0;
-
-  /// \brief Total number of samples in library.
-  virtual size_t TotalSampleCount() = 0;
-
-  /// \brief The number of samples that are guaranteed to fit in RAM.
-  virtual size_t PerformanceSampleCount() = 0;
-
-  /// \brief Loads the requested query samples into memory.
-  /// \details Paired with calls to UnloadSamplesFromRam.
-  /// In the MultiStream scenarios:
-  ///   * Samples will appear more than once.
-  ///   * SystemUnderTest::IssueQuery will only be called with a set of samples
-  ///     that are neighbors in the vector of samples here, which helps
-  ///     SUTs that need the queries to be contiguous.
-  /// In all other scenarios:
-  ///   * A previously loaded sample will not be loaded again.
-  virtual void LoadSamplesToRam(
-      const std::vector<QuerySampleIndex>& samples) = 0;
-
-  /// \brief Unloads the requested query samples from memory.
-  /// \details In the MultiStream scenarios:
-  ///   * Samples may be unloaded the same number of times they were loaded;
-  ///     however, if the implementation de-dups loaded samples rather than
-  ///     loading samples into contiguous memory, it may unload a sample the
-  ///     first time they see it unloaded without a refcounting scheme, ignoring
-  ///     subsequent unloads. A refcounting scheme would also work, but is not
-  ///     a requirement.
-  /// In all other scenarios:
-  ///   * A previously unloaded sample will not be unloaded again.
-  virtual void UnloadSamplesFromRam(
-      const std::vector<QuerySampleIndex>& samples) = 0;
-};
-
-/// @}
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_QUERY_SAMPLE_LIBRARY_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt
deleted file mode 100644
index e47c59fd7..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-pybind11
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc
deleted file mode 100644
index f7c61af43..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.cc
+++ /dev/null
@@ -1,856 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "results.h"
-
-#include "early_stopping.h"
-#include "utils.h"
-
-namespace mlperf {
-namespace loadgen {
-
-void PerformanceSummary::ProcessLatencies() {
-  if (pr.sample_latencies.empty()) {
-    return;
-  }
-
-  sample_count = pr.sample_latencies.size();
-
-  QuerySampleLatency accumulated_sample_latency = 0;
-  for (auto latency : pr.sample_latencies) {
-    accumulated_sample_latency += latency;
-  }
-  sample_latency_mean = accumulated_sample_latency / sample_count;
-
-  std::sort(pr.sample_latencies.begin(), pr.sample_latencies.end());
-
-  target_latency_percentile.sample_latency =
-      pr.sample_latencies[sample_count * target_latency_percentile.percentile];
-  sample_latency_min = pr.sample_latencies.front();
-  sample_latency_max = pr.sample_latencies.back();
-  for (auto& lp : latency_percentiles) {
-    assert(lp.percentile >= 0.0);
-    assert(lp.percentile < 1.0);
-    lp.sample_latency = pr.sample_latencies[sample_count * lp.percentile];
-  }
-
-  query_count = pr.queries_issued;
-
-  // Count the number of overlatency queries. Only for Server scenario. Since in
-  // this scenario the number of samples per query is 1, sample_latencies are
-  // used.
-  if (settings.scenario == TestScenario::Server) {
-    QuerySampleLatency max_latency = settings.target_latency.count() + 1;
-    overlatency_query_count =
-        pr.sample_latencies.end() -
-        std::lower_bound(pr.sample_latencies.begin(), pr.sample_latencies.end(),
-                         max_latency);
-  }
-
-  if (settings.use_token_latencies) {
-    ProcessTokenLatencies();
-  }
-
-  // MultiStream only after this point.
-  if (settings.scenario != TestScenario::MultiStream) {
-    return;
-  }
-
-  // Calculate per-query stats.
-  size_t query_count = pr.queries_issued;
-  assert(pr.query_latencies.size() == query_count);
-  std::sort(pr.query_latencies.begin(), pr.query_latencies.end());
-  QuerySampleLatency accumulated_query_latency = 0;
-  for (auto latency : pr.query_latencies) {
-    accumulated_query_latency += latency;
-  }
-  query_latency_mean = accumulated_query_latency / query_count;
-  query_latency_min = pr.query_latencies.front();
-  query_latency_max = pr.query_latencies.back();
-  target_latency_percentile.query_latency =
-      pr.query_latencies[query_count * target_latency_percentile.percentile];
-  for (auto& lp : latency_percentiles) {
-    lp.query_latency = pr.query_latencies[query_count * lp.percentile];
-  }
-}
-
-void PerformanceSummary::ProcessTokenLatencies() {
-  constexpr auto nTokenInvalid = std::numeric_limits<int64_t>::min();
-  token_count = 0;
-  for (auto n_tokens : pr.token_results.tokens_per_sample) {
-    if (n_tokens != nTokenInvalid) token_count += n_tokens;
-  }
-  if (pr.token_results.first_token_latencies.empty()) {
-    return;
-  }
-  QuerySampleLatency accumulated_first_token_latency = 0;
-  for (auto latency : pr.token_results.first_token_latencies) {
-    accumulated_first_token_latency += latency;
-  }
-  first_token_latency_mean = accumulated_first_token_latency / sample_count;
-  QuerySampleLatency accumulated_tpot = 0;
-  for (auto latency : pr.token_results.time_per_output_token_arr) {
-    accumulated_tpot += latency;
-  }
-  time_per_output_token_mean = accumulated_tpot / sample_count;
-  std::sort(pr.token_results.first_token_latencies.begin(),
-            pr.token_results.first_token_latencies.end());
-  std::sort(pr.token_results.time_per_output_token_arr.begin(),
-            pr.token_results.time_per_output_token_arr.end());
-
-  token_target_latency_percentile.sample_latency =
-      pr.token_results
-          .first_token_latencies[sample_count *
-                                 token_target_latency_percentile.percentile];
-  first_token_latency_min = pr.token_results.first_token_latencies.front();
-  first_token_latency_max = pr.token_results.first_token_latencies.back();
-  for (auto& lp : token_latency_percentiles) {
-    assert(lp.percentile >= 0.0);
-    assert(lp.percentile < 1.0);
-    lp.sample_latency =
-        pr.token_results.first_token_latencies[sample_count * lp.percentile];
-  }
-
-  target_tpot_percentile.sample_latency =
-      pr.token_results
-          .time_per_output_token_arr[sample_count *
-                                     target_tpot_percentile.percentile];
-  time_per_output_token_min =
-      pr.token_results.time_per_output_token_arr.front();
-  time_per_output_token_max = pr.token_results.time_per_output_token_arr.back();
-  for (auto& lp : tpot_percentiles) {
-    assert(lp.percentile >= 0.0);
-    assert(lp.percentile < 1.0);
-    lp.sample_latency =
-        pr.token_results
-            .time_per_output_token_arr[sample_count * lp.percentile];
-  }
-
-  if (settings.scenario == TestScenario::Server) {
-    // TODO: Maybe another target latency needs to be added?
-    QuerySampleLatency max_latency = settings.target_latency.count() + 1;
-    overlatency_first_token_count =
-        pr.token_results.first_token_latencies.end() -
-        std::lower_bound(pr.token_results.first_token_latencies.begin(),
-                         pr.token_results.first_token_latencies.end(),
-                         max_latency);
-  }
-}
-
-bool PerformanceSummary::EarlyStopping(
-    std::string* recommendation, int64_t queries_issued,
-    std::vector<QuerySampleLatency>* sample_latencies,
-    std::vector<QuerySampleLatency>* query_latencies,
-    std::chrono::nanoseconds target_latency) {
-  recommendation->clear();
-
-  MinPassingQueriesFinder find_min_passing;
-  double confidence = 0.99;
-  double tolerance = 0.0;
-
-  ProcessLatencies();
-  switch (settings.scenario) {
-    case TestScenario::SingleStream: {
-      // TODO: Grab multistream percentile from settings, instead of hardcoding.
-      double multi_stream_percentile = 0.99;
-      int64_t t = 1;
-      int64_t h_min = find_min_passing(1, target_latency_percentile.percentile,
-                                       tolerance, confidence);
-      int64_t h = h_min;
-      if (queries_issued < h_min + 1) {
-        *recommendation =
-            " * Only processed " + std::to_string(queries_issued) +
-            " queries.\n * Need to process at least " +
-            std::to_string(h_min + 1) + " queries for early stopping.";
-        return false;
-      } else {
-        for (int64_t i = 2; i < queries_issued + 1; ++i) {
-          h = find_min_passing(i, target_latency_percentile.percentile,
-                               tolerance, confidence);
-          if (queries_issued < h + i) {
-            t = i - 1;
-            break;
-          }
-        }
-      }
-      QuerySampleLatency percentile_estimate =
-          (*sample_latencies)[queries_issued - t];
-      *recommendation =
-          " * Processed at least " + std::to_string(h_min + 1) + " queries (" +
-          std::to_string(queries_issued) + ").\n" + " * Would discard " +
-          std::to_string(t - 1) + " highest latency queries.\n" +
-          " * Early stopping " +
-          DoubleToString(target_latency_percentile.percentile * 100, 1) +
-          "th percentile estimate: " + std::to_string(percentile_estimate);
-      early_stopping_latency_ss = percentile_estimate;
-
-      // Early stopping estimate for 99%ile (used for infering multi-stream from
-      // single-stream)
-      t = 1;
-      h_min =
-          find_min_passing(1, multi_stream_percentile, tolerance, confidence);
-      h = h_min;
-      if (queries_issued < h_min + 1) {
-        *recommendation +=
-            "\n * Not enough queries processed for " +
-            DoubleToString(multi_stream_percentile * 100, 1) +
-            "th percentile\n" +
-            " early stopping estimate (would need to process at\n least " +
-            std::to_string(h_min + 1) + " total queries).";
-      } else {
-        for (int64_t i = 2; i < queries_issued + 1; ++i) {
-          h = find_min_passing(i, multi_stream_percentile, tolerance,
-                               confidence);
-          if (queries_issued < h + i) {
-            t = i - 1;
-            break;
-          }
-        }
-        percentile_estimate = (*sample_latencies)[queries_issued - t];
-        *recommendation +=
-            "\n * Early stopping " +
-            DoubleToString(multi_stream_percentile * 100, 1) +
-            "th percentile estimate: " + std::to_string(percentile_estimate);
-        early_stopping_latency_ms = percentile_estimate;
-      }
-      break;
-    }
-    case TestScenario::Server: {
-      int64_t t =
-          std::count_if((*sample_latencies).begin(), (*sample_latencies).end(),
-                        [=](auto const& latency) {
-                          return latency > target_latency.count();
-                        });
-      int64_t h = find_min_passing(t, target_latency_percentile.percentile,
-                                   tolerance, confidence);
-      if (queries_issued >= h + t) {
-        *recommendation = " * Run successful.";
-      } else {
-        *recommendation = " * Run unsuccessful.\n * Processed " +
-                          std::to_string(queries_issued) + " queries.\n" +
-                          " * Would need to run at least " +
-                          std::to_string(h + t - queries_issued) +
-                          " more queries,\n with the run being successful if "
-                          "every additional\n query were under latency.";
-        return false;
-      }
-      break;
-    }
-    case TestScenario::MultiStream: {
-      int64_t t = 1;
-      int64_t h_min = find_min_passing(1, target_latency_percentile.percentile,
-                                       tolerance, confidence);
-      int64_t h = h_min;
-      if (queries_issued < h_min + 1) {
-        *recommendation =
-            " * Only processed " + std::to_string(queries_issued) +
-            " queries.\n * Need to process at least " +
-            std::to_string(h_min + 1) + " queries for early stopping.";
-        return false;
-      } else {
-        for (int64_t i = 2; i < queries_issued + 1; ++i) {
-          h = find_min_passing(i, target_latency_percentile.percentile,
-                               tolerance, confidence);
-          if (queries_issued < h + i) {
-            t = i - 1;
-            break;
-          }
-        }
-      }
-      QuerySampleLatency percentile_estimate =
-          (*query_latencies)[queries_issued - t];
-      *recommendation =
-          " * Processed at least " + std::to_string(h_min + 1) + " queries (" +
-          std::to_string(queries_issued) + ").\n" + " * Would discard " +
-          std::to_string(t - 1) + " highest latency queries.\n" +
-          " * Early stopping " +
-          DoubleToString(target_latency_percentile.percentile * 100, 1) +
-          "th percentile estimate: " + std::to_string(percentile_estimate);
-      early_stopping_latency_ms = percentile_estimate;
-      break;
-    }
-    case TestScenario::Offline:
-      break;
-  }
-  return true;
-}
-
-bool PerformanceSummary::MinDurationMet(std::string* recommendation) {
-  recommendation->clear();
-  const double min_duration = DurationToSeconds(settings.min_duration);
-  bool min_duration_met = false;
-  switch (settings.scenario) {
-    case TestScenario::Offline:
-      min_duration_met = pr.max_latency >= min_duration;
-      break;
-    case TestScenario::Server:
-      min_duration_met = pr.final_query_scheduled_time >= min_duration;
-      break;
-    case TestScenario::SingleStream:
-    case TestScenario::MultiStream:
-      min_duration_met = pr.final_query_issued_time >= min_duration;
-      break;
-  }
-  if (min_duration_met) {
-    return true;
-  }
-
-  switch (settings.scenario) {
-    case TestScenario::SingleStream:
-    case TestScenario::MultiStream:
-      *recommendation =
-          "Decrease the expected latency so the loadgen pre-generates more "
-          "queries.";
-      break;
-    case TestScenario::Server:
-      *recommendation =
-          "Increase the target QPS so the loadgen pre-generates more queries.";
-      break;
-    case TestScenario::Offline:
-      *recommendation =
-          "Increase expected QPS so the loadgen pre-generates a larger "
-          "(coalesced) query.";
-      break;
-  }
-  return false;
-}
-
-bool PerformanceSummary::MinQueriesMet() {
-  return pr.queries_issued >= settings.min_query_count;
-}
-
-bool PerformanceSummary::MinSamplesMet() {
-  return sample_count >= settings.min_sample_count;
-}
-
-bool PerformanceSummary::HasPerfConstraints() {
-  return settings.scenario == TestScenario::Server;
-}
-
-bool PerformanceSummary::PerfConstraintsMet(std::string* recommendation) {
-  recommendation->clear();
-  bool perf_constraints_met = true;
-  switch (settings.scenario) {
-    case TestScenario::SingleStream:
-    case TestScenario::MultiStream:
-      break;
-    case TestScenario::Server:
-      ProcessLatencies();
-      if (!settings.use_token_latencies) {
-        if (target_latency_percentile.sample_latency >
-            settings.target_latency.count()) {
-          *recommendation = "Reduce target QPS to improve latency.";
-          perf_constraints_met = false;
-        }
-      } else {
-        if (token_target_latency_percentile.sample_latency >
-            settings.server_ttft_latency) {
-          *recommendation =
-              "TTFT constrain not met: Reduce target QPS to improve latency.";
-          perf_constraints_met = false;
-        }
-
-        if (target_tpot_percentile.sample_latency >
-            settings.server_tpot_latency) {
-          if (recommendation->empty()) {
-            *recommendation =
-                "TPOT constrain not met: Reduce target QPS to improve latency.";
-          } else {
-            recommendation->append(
-                "\n * TPOT constrain not met: Reduce target QPS to improve "
-                "latency.");
-          }
-          perf_constraints_met = false;
-        }
-      }
-      break;
-    case TestScenario::Offline:
-      break;
-  }
-  return perf_constraints_met;
-}
-
-void PerformanceSummary::LogSummary(AsyncSummary& summary) {
-  ProcessLatencies();
-
-  summary(
-      "================================================\n"
-      "MLPerf Results Summary\n"
-      "================================================");
-  summary("SUT name : ", sut_name);
-  summary("Scenario : ", ToString(settings.scenario));
-  summary("Mode     : ", ToString(settings.mode));
-
-  switch (settings.scenario) {
-    case TestScenario::SingleStream: {
-      summary(DoubleToString(target_latency_percentile.percentile * 100, 1) +
-                  "th percentile latency (ns) : ",
-              target_latency_percentile.sample_latency);
-      break;
-    }
-    case TestScenario::MultiStream: {
-      summary(DoubleToString(target_latency_percentile.percentile * 100, 1) +
-                  "th percentile latency (ns) : ",
-              target_latency_percentile.query_latency);
-      break;
-    }
-    case TestScenario::Server: {
-      // Subtract 1 from sample count since the start of the final sample
-      // represents the open end of the time range: i.e. [begin, end).
-      // This makes sense since:
-      // a) QPS doesn't apply if there's only one sample; it's pure latency.
-      // b) If you have precisely 1k QPS, there will be a sample exactly on
-      //    the 1 second time point; but that would be the 1001th sample in
-      //    the stream. Given the first 1001 queries, the QPS is
-      //    1000 queries / 1 second.
-      // TODO: make a more permanent solution
-      double qps_as_completed =
-          (sample_count - 1) / pr.final_query_all_samples_done_time;
-      summary("Completed samples per second    : ",
-              DoubleToString(qps_as_completed));
-      break;
-    }
-    case TestScenario::Offline: {
-      double samples_per_second = sample_count / pr.max_latency;
-      summary("Samples per second: ", samples_per_second);
-      break;
-    }
-  }
-
-  if (settings.use_token_latencies) {
-    switch (settings.scenario) {
-      case TestScenario::SingleStream: {
-        summary(DoubleToString(token_target_latency_percentile.percentile * 100,
-                               1) +
-                    "th first token percentile latency (ns) : ",
-                token_target_latency_percentile.sample_latency);
-        break;
-      }
-      case TestScenario::MultiStream: {
-        summary(DoubleToString(token_target_latency_percentile.percentile * 100,
-                               1) +
-                    "th first token percentile latency (ns) : ",
-                token_target_latency_percentile.sample_latency);
-        break;
-      }
-      case TestScenario::Offline: {
-        double tokens_per_second = token_count / pr.max_latency;
-        summary("Tokens per second: ", tokens_per_second);
-        break;
-      }
-      case TestScenario::Server:
-        double tps_as_completed =
-            token_count / pr.final_query_all_samples_done_time;
-        summary("Completed tokens per second: ",
-                DoubleToString(tps_as_completed));
-        break;
-    }
-  }
-
-  if (settings.infer_token_latencies) {
-    switch (settings.scenario) {
-      case TestScenario::SingleStream: {
-        break;
-      }
-      case TestScenario::MultiStream: {
-        break;
-      }
-      case TestScenario::Offline: {
-        double tokens_per_second = settings.token_latency_scaling_factor *
-                                   sample_count / pr.max_latency;
-        summary("Tokens per second (inferred): ", tokens_per_second);
-        break;
-      }
-      case TestScenario::Server:
-        double tps_as_completed = settings.token_latency_scaling_factor *
-                                  (sample_count - 1) /
-                                  pr.final_query_all_samples_done_time;
-        summary("Completed tokens per second (inferred): ",
-                DoubleToString(tps_as_completed));
-        break;
-    }
-  }
-
-  std::string min_duration_recommendation;
-  std::string perf_constraints_recommendation;
-  std::string early_stopping_recommendation;
-  std::string early_stopping_ttft_recommendation;
-  std::string early_stopping_tpot_recommendation;
-
-  bool min_duration_met = MinDurationMet(&min_duration_recommendation);
-  bool min_queries_met = MinQueriesMet() && MinSamplesMet();
-  bool early_stopping_met = true;
-  if (!settings.use_token_latencies) {
-    early_stopping_met = EarlyStopping(
-        &early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies,
-        &pr.query_latencies, settings.target_latency);
-  } else {
-    early_stopping_met =
-        EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued,
-                      &pr.token_results.time_per_output_token_arr,
-                      &pr.query_latencies,
-                      std::chrono::nanoseconds(settings.server_tpot_latency)) &&
-        EarlyStopping(&early_stopping_ttft_recommendation, pr.queries_issued,
-                      &pr.token_results.first_token_latencies,
-                      &pr.query_latencies,
-                      std::chrono::nanoseconds(settings.server_ttft_latency));
-  }
-  bool perf_constraints_met =
-      PerfConstraintsMet(&perf_constraints_recommendation);
-  bool all_constraints_met = min_duration_met && min_queries_met &&
-                             perf_constraints_met && early_stopping_met;
-  summary("Result is : ", all_constraints_met ? "VALID" : "INVALID");
-  if (HasPerfConstraints()) {
-    summary("  Performance constraints satisfied : ",
-            perf_constraints_met ? "Yes" : "NO");
-  }
-  summary("  Min duration satisfied : ", min_duration_met ? "Yes" : "NO");
-  summary("  Min queries satisfied : ", min_queries_met ? "Yes" : "NO");
-  summary("  Early stopping satisfied: ", early_stopping_met ? "Yes" : "NO");
-
-  if (!all_constraints_met) {
-    summary("Recommendations:");
-    if (!perf_constraints_met) {
-      summary(" * " + perf_constraints_recommendation);
-    }
-    if (!min_duration_met) {
-      summary(" * " + min_duration_recommendation);
-    }
-    if (!min_queries_met) {
-      summary(
-          " * The test exited early, before enough queries were issued.\n"
-          "   See the detailed log for why this may have occurred.");
-    }
-  }
-  // Early stopping results
-  if (settings.scenario == TestScenario::SingleStream ||
-      settings.scenario == TestScenario::Server ||
-      settings.scenario == TestScenario::MultiStream) {
-    if (!settings.use_token_latencies) {
-      summary("Early Stopping Result:");
-      summary(early_stopping_recommendation);
-    } else {
-      summary("TTFT Early Stopping Result:");
-      summary(early_stopping_ttft_recommendation);
-      summary("TPOT Early Stopping Result:");
-      summary(early_stopping_tpot_recommendation);
-    }
-  }
-
-  summary(
-      "\n"
-      "================================================\n"
-      "Additional Stats\n"
-      "================================================");
-
-  if (settings.scenario == TestScenario::SingleStream) {
-    double qps_w_lg = (sample_count - 1) / pr.final_query_issued_time;
-    double qps_wo_lg = 1 / QuerySampleLatencyToSeconds(sample_latency_mean);
-    summary("QPS w/ loadgen overhead         : " + DoubleToString(qps_w_lg));
-    summary("QPS w/o loadgen overhead        : " + DoubleToString(qps_wo_lg));
-    summary("");
-  } else if (settings.scenario == TestScenario::Server) {
-    // Scheduled samples per second as an additional stat
-    double qps_as_scheduled =
-        (sample_count - 1) / pr.final_query_scheduled_time;
-    summary("Scheduled samples per second : ",
-            DoubleToString(qps_as_scheduled));
-  } else if (settings.scenario == TestScenario::MultiStream) {
-    summary("Per-query latency:  ");
-    summary("Min latency (ns)                : ", query_latency_min);
-    summary("Max latency (ns)                : ", query_latency_max);
-    summary("Mean latency (ns)               : ", query_latency_mean);
-    for (auto& lp : latency_percentiles) {
-      summary(
-          DoubleToString(lp.percentile * 100) + " percentile latency (ns)   : ",
-          lp.query_latency);
-    }
-  }
-
-  if (settings.scenario != TestScenario::MultiStream) {
-    summary("Min latency (ns)                : ", sample_latency_min);
-    summary("Max latency (ns)                : ", sample_latency_max);
-    summary("Mean latency (ns)               : ", sample_latency_mean);
-    for (auto& lp : latency_percentiles) {
-      summary(
-          DoubleToString(lp.percentile * 100) + " percentile latency (ns)   : ",
-          lp.sample_latency);
-    }
-  }
-  if (settings.use_token_latencies) {
-    summary("");
-    if (settings.scenario == TestScenario::SingleStream) {
-      double tps_w_lg = token_count / pr.final_query_issued_time;
-      double tps_wo_lg =
-          ((double)token_count) /
-          (QuerySampleLatencyToSeconds(sample_latency_mean) * sample_count);
-      summary("TPS w/ loadgen overhead         : " + DoubleToString(tps_w_lg));
-      summary("TPS w/o loadgen overhead        : " + DoubleToString(tps_wo_lg));
-
-    } else if (settings.scenario == TestScenario::Server) {
-      double tps_as_completed =
-          token_count / pr.final_query_all_samples_done_time;
-      summary("Completed tokens per second                 : ",
-              DoubleToString(tps_as_completed));
-    }
-
-    if (settings.scenario != TestScenario::Offline) {
-      summary("Min First Token latency (ns)                : ",
-              first_token_latency_min);
-      summary("Max First Token latency (ns)                : ",
-              first_token_latency_max);
-      summary("Mean First Token latency (ns)               : ",
-              first_token_latency_mean);
-      for (auto& lp : token_latency_percentiles) {
-        summary(DoubleToString(lp.percentile * 100) +
-                    " percentile first token latency (ns)   : ",
-                lp.sample_latency);
-      }
-      summary("");
-      summary("Min Time per Output Token (ns)                : ",
-              time_per_output_token_min);
-      summary("Max Time per Output Token (ns)                : ",
-              time_per_output_token_max);
-      summary("Mean Time per Output Token (ns)               : ",
-              time_per_output_token_mean);
-      for (auto& lp : tpot_percentiles) {
-        summary(DoubleToString(lp.percentile * 100) +
-                    " percentile time to output token (ns)   : ",
-                lp.sample_latency);
-      }
-    }
-  }
-
-  summary(
-      "\n"
-      "================================================\n"
-      "Test Parameters Used\n"
-      "================================================");
-  settings.LogSummary(summary);
-}
-
-void PerformanceSummary::LogDetail(AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-  ProcessLatencies();
-
-  // General validity checking
-  std::string min_duration_recommendation;
-  std::string perf_constraints_recommendation;
-  std::string early_stopping_recommendation;
-  std::string early_stopping_ttft_recommendation;
-  std::string early_stopping_tpot_recommendation;
-  bool min_duration_met = MinDurationMet(&min_duration_recommendation);
-  bool min_queries_met = MinQueriesMet() && MinSamplesMet();
-  bool perf_constraints_met =
-      PerfConstraintsMet(&perf_constraints_recommendation);
-  bool early_stopping_met = true;
-  if (!settings.use_token_latencies) {
-    early_stopping_met = EarlyStopping(
-        &early_stopping_recommendation, pr.queries_issued, &pr.sample_latencies,
-        &pr.query_latencies, settings.target_latency);
-  } else {
-    early_stopping_met =
-        EarlyStopping(&early_stopping_tpot_recommendation, pr.queries_issued,
-                      &pr.token_results.time_per_output_token_arr,
-                      &pr.query_latencies,
-                      std::chrono::nanoseconds(settings.server_tpot_latency)) &&
-        EarlyStopping(&early_stopping_ttft_recommendation, pr.queries_issued,
-                      &pr.token_results.first_token_latencies,
-                      &pr.query_latencies,
-                      std::chrono::nanoseconds(settings.server_ttft_latency));
-  }
-  bool all_constraints_met = min_duration_met && min_queries_met &&
-                             perf_constraints_met && early_stopping_met;
-
-  MLPERF_LOG(detail, "result_validity",
-             all_constraints_met ? "VALID" : "INVALID");
-  if (HasPerfConstraints()) {
-    MLPERF_LOG(detail, "result_perf_constraints_met", perf_constraints_met);
-  }
-  MLPERF_LOG(detail, "result_min_duration_met", min_duration_met);
-  MLPERF_LOG(detail, "result_min_queries_met", min_queries_met);
-  MLPERF_LOG(detail, "early_stopping_met", early_stopping_met);
-  if (!all_constraints_met) {
-    std::string recommendation;
-    if (!perf_constraints_met) {
-      recommendation += perf_constraints_recommendation + " ";
-    }
-    if (!min_duration_met) {
-      recommendation += min_duration_recommendation + " ";
-    }
-    if (!min_queries_met) {
-      recommendation +=
-          "The test exited early, before enough queries were issued.";
-    }
-    std::replace(recommendation.begin(), recommendation.end(), '\n', ' ');
-    MLPERF_LOG(detail, "result_invalid_reason", recommendation);
-  }
-  std::replace(early_stopping_recommendation.begin(),
-               early_stopping_recommendation.end(), '\n', ' ');
-  if (!settings.use_token_latencies) {
-    MLPERF_LOG(detail, "early_stopping_result", early_stopping_recommendation);
-  } else {
-    std::replace(early_stopping_ttft_recommendation.begin(),
-                 early_stopping_ttft_recommendation.end(), '\n', ' ');
-    std::replace(early_stopping_tpot_recommendation.begin(),
-                 early_stopping_tpot_recommendation.end(), '\n', ' ');
-    MLPERF_LOG(detail, "early_stopping_ttft_result",
-               early_stopping_ttft_recommendation);
-    MLPERF_LOG(detail, "early_stopping_tpot_result",
-               early_stopping_tpot_recommendation);
-  }
-  // Report number of queries
-  MLPERF_LOG(detail, "result_query_count", query_count);
-  if (settings.scenario == TestScenario::Server) {
-    MLPERF_LOG(detail, "result_overlatency_query_count",
-               overlatency_query_count);
-  }
-
-  auto reportPerQueryLatencies = [&]() {
-    MLPERF_LOG(detail, "result_min_query_latency_ns", query_latency_min);
-    MLPERF_LOG(detail, "result_max_query_latency_ns", query_latency_max);
-    MLPERF_LOG(detail, "result_mean_query_latency_ns", query_latency_mean);
-    for (auto& lp : latency_percentiles) {
-      std::string percentile = DoubleToString(lp.percentile * 100);
-      MLPERF_LOG(detail,
-                 "result_" + percentile + "_percentile_per_query_latency_ns",
-                 lp.query_latency);
-    }
-  };
-
-  // Per-scenario performance results.
-  switch (settings.scenario) {
-    case TestScenario::SingleStream: {
-      double qps_w_lg = (sample_count - 1) / pr.final_query_issued_time;
-      double qps_wo_lg = 1 / QuerySampleLatencyToSeconds(sample_latency_mean);
-      MLPERF_LOG(detail, "result_qps_with_loadgen_overhead", qps_w_lg);
-      MLPERF_LOG(detail, "result_qps_without_loadgen_overhead", qps_wo_lg);
-      MLPERF_LOG(detail, "early_stopping_latency_ss",
-                 early_stopping_latency_ss);
-      MLPERF_LOG(detail, "early_stopping_latency_ms",
-                 early_stopping_latency_ms);
-      break;
-    }
-    case TestScenario::MultiStream: {
-      reportPerQueryLatencies();
-      MLPERF_LOG(detail, "early_stopping_latency_ms",
-                 early_stopping_latency_ms);
-      break;
-    }
-    case TestScenario::Server: {
-      // Subtract 1 from sample count since the start of the final sample
-      // represents the open end of the time range: i.e. [begin, end).
-      // This makes sense since:
-      // a) QPS doesn't apply if there's only one sample; it's pure latency.
-      // b) If you have precisely 1k QPS, there will be a sample exactly on
-      //    the 1 second time point; but that would be the 1001th sample in
-      //    the stream. Given the first 1001 queries, the QPS is
-      //    1000 queries / 1 second.
-      double qps_as_scheduled =
-          (sample_count - 1) / pr.final_query_scheduled_time;
-      MLPERF_LOG(detail, "result_scheduled_samples_per_sec", qps_as_scheduled);
-      double qps_as_completed =
-          (sample_count - 1) / pr.final_query_all_samples_done_time;
-      MLPERF_LOG(detail, "result_completed_samples_per_sec", qps_as_completed);
-      break;
-    }
-    case TestScenario::Offline: {
-      double samples_per_second = sample_count / pr.max_latency;
-      MLPERF_LOG(detail, "result_samples_per_second", samples_per_second);
-      break;
-    }
-  }
-
-  // Detailed latencies
-  MLPERF_LOG(detail, "result_min_latency_ns", sample_latency_min);
-  MLPERF_LOG(detail, "result_max_latency_ns", sample_latency_max);
-  MLPERF_LOG(detail, "result_mean_latency_ns", sample_latency_mean);
-  for (auto& lp : latency_percentiles) {
-    MLPERF_LOG(detail,
-               "result_" + DoubleToString(lp.percentile * 100) +
-                   "_percentile_latency_ns",
-               lp.sample_latency);
-  }
-  // Detailed first token latencies
-  if (settings.use_token_latencies) {
-    if (settings.scenario != TestScenario::Offline) {
-      MLPERF_LOG(detail, "result_first_token_min_latency_ns",
-                 first_token_latency_min);
-      MLPERF_LOG(detail, "result_first_token_max_latency_ns",
-                 first_token_latency_max);
-      MLPERF_LOG(detail, "result_first_token_mean_latency_ns",
-                 first_token_latency_mean);
-      for (auto& lp : token_latency_percentiles) {
-        MLPERF_LOG(detail,
-                   "result_first_token_" + DoubleToString(lp.percentile * 100) +
-                       "_percentile_latency_ns",
-                   lp.sample_latency);
-      }
-      double tps_w_lg = ((double)token_count) / pr.final_query_issued_time;
-      double tps_wo_lg =
-          ((double)token_count) / (sample_latency_mean * sample_count);
-      MLPERF_LOG(detail, "result_token_throughput_with_loadgen_overhead",
-                 tps_w_lg);
-      MLPERF_LOG(detail, "result_token_throughput", tps_wo_lg);
-      for (auto& lp : tpot_percentiles) {
-        MLPERF_LOG(detail,
-                   "result_time_per_output_token_" +
-                       DoubleToString(lp.percentile * 100) + "_percentile_ns",
-                   lp.sample_latency);
-      }
-      MLPERF_LOG(detail, "result_time_to_output_token_min",
-                 time_per_output_token_min);
-      MLPERF_LOG(detail, "result_time_to_output_token_max",
-                 time_per_output_token_max);
-      MLPERF_LOG(detail, "result_time_to_output_token_mean",
-                 time_per_output_token_mean);
-      double tps_as_completed =
-          token_count / pr.final_query_all_samples_done_time;
-      MLPERF_LOG(detail, "result_completed_tokens_per_second",
-                 tps_as_completed);
-    } else {
-      double tokens_per_second = token_count / pr.max_latency;
-      MLPERF_LOG(detail, "result_tokens_per_second", tokens_per_second);
-    }
-  }
-
-  if (settings.infer_token_latencies) {
-    switch (settings.scenario) {
-      case TestScenario::Server: {
-        double completed_tokens_per_second =
-            (sample_count - 1) * settings.token_latency_scaling_factor /
-            pr.final_query_all_samples_done_time;
-        MLPERF_LOG(detail, "result_inferred_completed_tokens_per_second",
-                   completed_tokens_per_second);
-        break;
-      }
-      case TestScenario::Offline: {
-        double tokens_per_second = sample_count *
-                                   settings.token_latency_scaling_factor /
-                                   pr.max_latency;
-        MLPERF_LOG(detail, "result_inferred_tokens_per_second",
-                   tokens_per_second);
-        break;
-      }
-      case TestScenario::SingleStream: {
-        break;
-      }
-      case TestScenario::MultiStream: {
-        break;
-      }
-    }
-  }
-  MLPERF_LOG(detail, "num_errors", detail.async_log().GetErrorCount());
-#endif
-}
-}  // namespace loadgen
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h
deleted file mode 100644
index 6befea2c0..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/results.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Defines PerformanceResult and PerformanceSummary.
-
-#ifndef MLPERF_LOADGEN_RESULTS_H_
-#define MLPERF_LOADGEN_RESULTS_H_
-
-#include <string>
-#include <vector>
-
-#include "query_sample.h"
-#include "test_settings_internal.h"
-
-namespace mlperf {
-namespace loadgen {
-
-/// \brief Contains the performance results for benchmarks that have
-/// token based metrics
-struct TokenPerformanceResults {
-  std::vector<QuerySampleLatency> first_token_latencies;
-  std::vector<QuerySampleLatency> time_per_output_token_arr;
-  std::vector<int64_t> tokens_per_sample;
-};
-
-/// \brief Provides performance results that are independent of scenario
-/// and other context.
-struct PerformanceResult {
-  std::vector<QuerySampleLatency> sample_latencies;
-  std::vector<QuerySampleLatency> query_latencies;
-  size_t queries_issued;
-  double max_latency;
-  double final_query_scheduled_time;         // seconds from start.
-  double final_query_issued_time;            // seconds from start.
-  double final_query_all_samples_done_time;  // seconds from start.
-  TokenPerformanceResults token_results;
-};
-
-/// \brief Wraps PerformanceResult with relevant context to change how
-/// it's interpreted and reported.
-struct PerformanceSummary {
-  std::string sut_name;
-  TestSettingsInternal settings;
-  PerformanceResult pr;
-
-  // Set by ProcessLatencies.
-  size_t sample_count = 0;
-  size_t query_count = 0;
-  size_t overlatency_query_count = 0;
-  QuerySampleLatency sample_latency_min = 0;
-  QuerySampleLatency sample_latency_max = 0;
-  QuerySampleLatency sample_latency_mean = 0;
-  QuerySampleLatency query_latency_min = 0;
-  QuerySampleLatency query_latency_max = 0;
-  QuerySampleLatency query_latency_mean = 0;
-
-  /// \brief The latency at a given percentile.
-  struct PercentileEntry {
-    const double percentile;
-    QuerySampleLatency sample_latency = 0;
-    QuerySampleLatency query_latency = 0;  // MultiStream only.
-  };
-
-  // Latency target percentile
-  PercentileEntry target_latency_percentile{settings.target_latency_percentile};
-  PercentileEntry latency_percentiles[6] = {{.50}, {.90}, {.95},
-                                            {.97}, {.99}, {.999}};
-
-  // Early stopping percentile estimates for SingleStream and MultiStream
-  QuerySampleLatency early_stopping_latency_ss = 0;
-  QuerySampleLatency early_stopping_latency_ms = 0;
-
-  // Set by ProcessTokenLatencies
-  size_t token_count = 0;
-  size_t overlatency_first_token_count = 0;
-  QuerySampleLatency first_token_latency_min = 0;
-  QuerySampleLatency first_token_latency_max = 0;
-  QuerySampleLatency first_token_latency_mean = 0;
-  QuerySampleLatency time_per_output_token_min = 0;
-  QuerySampleLatency time_per_output_token_max = 0;
-  QuerySampleLatency time_per_output_token_mean = 0;
-
-  // Latency token target percentile
-  PercentileEntry token_target_latency_percentile{
-      settings.target_latency_percentile};
-  PercentileEntry token_latency_percentiles[6] = {{.50}, {.90}, {.95},
-                                                  {.97}, {.99}, {.999}};
-  PercentileEntry target_tpot_percentile{settings.target_latency_percentile};
-  PercentileEntry tpot_percentiles[6] = {{.50}, {.90}, {.95},
-                                         {.97}, {.99}, {.999}};
-
-#if defined(_WIN32) || defined(WIN32) || defined(_WIN64) || defined(WIN64)
-  // MSVC complains if there is no explicit constructor.
-  // (target_latency_percentile above depends on construction with settings)
-  PerformanceSummary(const std::string& sut_name_arg,
-                     const TestSettingsInternal& settings_arg,
-                     const PerformanceResult& pr_arg)
-      : sut_name(sut_name_arg), settings(settings_arg), pr(pr_arg){};
-#endif
-  void ProcessLatencies();
-  void ProcessTokenLatencies();
-
-  bool MinDurationMet(std::string* recommendation);
-  bool EarlyStopping(std::string* recommendation, int64_t queries_issued,
-                     std::vector<QuerySampleLatency>* sample_latencies,
-                     std::vector<QuerySampleLatency>* query_latencies,
-                     std::chrono::nanoseconds target_latency);
-  bool MinQueriesMet();
-  bool MinSamplesMet();
-  bool HasPerfConstraints();
-  bool PerfConstraintsMet(std::string* recommendation);
-  void LogSummary(AsyncSummary& summary);
-  void LogDetail(AsyncDetail& detail);
-};
-}  // namespace loadgen
-}  // namespace mlperf
-
-#endif
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py
deleted file mode 100644
index 6254eea17..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/setup.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-# \file
-#  \brief MLPerf Inference LoadGen python module setup.
-#  \details Creates a module that python can import.
-#  All source files are compiled by python"s C++ toolchain  without depending
-#  on a loadgen lib.
-#
-#  This setup.py can be used stand-alone, without the use of an external
-#  build system. This will polute your source tree with output files
-#  and binaries. Use one of the gn build targets instead if you want
-#  to avoid poluting the source tree.
-
-from setuptools import Extension, setup
-from pathlib import Path
-from pybind11 import get_include
-from pybind11.setup_helpers import Pybind11Extension, build_ext
-from version_generator import generate_loadgen_version_definitions
-import subprocess
-
-generated_version_source_filename = "generated/version_generated.cc"
-generate_loadgen_version_definitions(generated_version_source_filename, ".")
-
-public_headers = [
-    "loadgen.h",
-    "query_sample.h",
-    "query_sample_library.h",
-    "system_under_test.h",
-    "test_settings.h",
-    "issue_query_controller.h",
-    "early_stopping.h",
-    "query_dispatch_library.h"
-]
-
-lib_headers = [
-    "logging.h",
-    "test_settings_internal.h",
-    "trace_generator.h",
-    "utils.h",
-    "version.h",
-    "results.h",
-    "bindings/c_api.h",
-    "version_generator.py",
-    "mlperf_conf.h"
-]
-
-lib_sources = [
-    "early_stopping.cc",
-    "issue_query_controller.cc",
-    "loadgen.cc",
-    "logging.cc",
-    "test_settings_internal.cc",
-    "utils.cc",
-    "version.cc",
-    "results.cc",
-]
-
-lib_bindings = [
-    "bindings/c_api.cc",
-    "bindings/python_api.cc",
-]
-
-this_directory = Path(__file__).parent
-mlperf_loadgen_headers = public_headers + lib_headers
-mlperf_loadgen_sources_no_gen = lib_sources + lib_bindings
-mlperf_loadgen_sources = mlperf_loadgen_sources_no_gen + [
-    generated_version_source_filename
-]
-mlperf_long_description = (
-    this_directory /
-    "README.md").read_text(
-        encoding="utf-8")
-
-with open("VERSION.txt", "r") as f:
-    version = f.read()
-version_split = version.split(".")
-
-if len(version_split) < 2:
-    print("Version is incomplete. Needs a format like 4.1.1 in VERSION file")
-
-
-try:
-    with open("mlperf.conf", 'r') as file:
-        conf_contents = file.read()
-
-    # Escape backslashes and double quotes
-    conf_contents = conf_contents.replace('\\', '\\\\').replace('"', '\\"')
-
-    # Convert newlines
-    conf_contents = conf_contents.replace('\n', '\\n"\n"')
-
-    formatted_content = f'const char* mlperf_conf =\n"{conf_contents}";\n'
-
-    with open("mlperf_conf.h", 'w') as header_file:
-        header_file.write(formatted_content)
-
-except IOError as e:
-    raise RuntimeError(f"Failed to generate header file: {e}")
-
-mlperf_loadgen_module = Pybind11Extension(
-    "mlperf_loadgen",
-    define_macros=[
-        ("MAJOR_VERSION",
-         version_split[0]),
-        ("MINOR_VERSION",
-         version_split[1])
-    ],
-    include_dirs=[".", get_include()],
-    sources=mlperf_loadgen_sources,
-    depends=mlperf_loadgen_headers,
-)
-
-setup(name="mlcommons_loadgen",
-      version=version,
-      description="MLPerf Inference LoadGen python bindings",
-      url="https://mlcommons.org/",
-      cmdclass={"build_ext": build_ext},
-      ext_modules=[mlperf_loadgen_module],
-      packages=['mlcommons_loadgen'],
-      package_dir={'mlcommons_loadgen': '.'},
-      include_package_data=True,
-      long_description=mlperf_long_description,
-      long_description_content_type='text/markdown')
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h
deleted file mode 100644
index 843453962..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/system_under_test.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Defines the SystemUnderTest interface.
-
-#ifndef MLPERF_LOADGEN_SYSTEM_UNDER_TEST_H
-#define MLPERF_LOADGEN_SYSTEM_UNDER_TEST_H
-
-#include <string>
-#include <vector>
-
-#include "query_sample.h"
-
-namespace mlperf {
-
-/// \addtogroup LoadgenAPI
-/// @{
-
-/// \brief The interface a client implements for the loadgen to test.
-/// \todo Add hook for an untimed warm up period for the SUT.
-/// \todo Add hook for an untimed warm up period for the loadgen logic.
-/// \todo Support power hooks for cool-down period before runing performance
-/// traffic.
-/// \todo Support power hooks for correlating test timeline with power
-/// measurment timeline.
-class SystemUnderTest {
- public:
-  virtual ~SystemUnderTest() {}
-
-  /// \brief A human-readable string for logging purposes.
-  virtual const std::string& Name() = 0;
-
-  /// \brief Lets the loadgen issue N samples to the SUT.
-  /// \details The SUT may either a) return immediately and signal completion
-  /// at a later time on another thread or b) it may block and signal
-  /// completion on the current stack. The load generator will handle both
-  /// cases properly.
-  /// Note: The data for neighboring samples may or may not be contiguous
-  /// depending on the scenario.
-  virtual void IssueQuery(const std::vector<QuerySample>& samples) = 0;
-
-  /// \brief Called immediately after the last call to IssueQuery
-  /// in a series is made.
-  /// \details This doesn't necessarily signify the end of the
-  /// test since there may be multiple series involved during a test; for
-  /// example in accuracy mode.
-  /// Clients can use this to flush any deferred queries immediately, rather
-  /// than waiting for some timeout.
-  /// This is especially useful in the server scenario.
-  virtual void FlushQueries() = 0;
-};
-
-/// @}
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_SYSTEM_UNDER_TEST_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h
deleted file mode 100644
index 584d073bb..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings.h
+++ /dev/null
@@ -1,329 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Provides ways for a client to change the behavior and
-/// constraints of the load generator.
-/// \details Note: The MLPerf specification takes precedent over any of the
-/// comments in this file if there are inconsistencies in regards to how the
-/// loadgen *should* work.
-/// The comments in this file are indicative of the loadgen implementation.
-
-#ifndef MLPERF_LOADGEN_TEST_SETTINGS_H
-#define MLPERF_LOADGEN_TEST_SETTINGS_H
-
-#include <cstdint>
-#include <string>
-
-namespace mlperf {
-
-/// \addtogroup LoadgenAPI
-/// @{
-
-/// \addtogroup LoadgenAPITestSettings Test Settings
-/// \brief This page contains a description of all the scenarios, modes,
-/// and log settings as implemented by the LoadGen.
-/// @{
-
-///
-/// \enum TestScenario
-/// * **SingleStream**
-///  + Issues queries containing a single sample.
-///  + The next query is only issued once the previous one has completed.
-///  + Internal LoadGen latency between queries is not included in the
-///    latency results.
-///  + **Final performance result is:** a percentile of the latency.
-/// * **MultiStream**
-///  + Issues queries containing N samples.
-///   - N is specified by \link
-///   mlperf::TestSettings::multi_stream_samples_per_query
-///   multi_stream_samples_per_query \endlink.
-///  + The next query is only issued once the previous one has completed.
-///  + The samples of each query are guaranteed to be contiguous with respect
-///    to the order they were loaded in the QuerySampleLibrary.
-///  + Latency is tracked and reported on a per-query and per-sample basis.
-///  + The latency of a query is the maximum latency of its samples, including
-///    any cross-thread communication within the loadgen.
-///  + Internal LoadGen latency between queries is not included in the
-///    latency results.
-///  + **Final performance result is:** a percentile of the query latency.
-/// * **Server**
-///  + Sends queries with a single sample.
-///  + Queries have a random poisson (non-uniform) arrival rate that, when
-///    averaged, hits the target QPS.
-///  + There is no limit on the number of outstanding queries, as long as
-///    the latency constraints are met.
-///  + **Final performance result is:** PASS if the a percentile of the latency
-///    is under a given threshold. FAIL otherwise.
-///   - Threshold is specified by \link
-///   mlperf::TestSettings::server_target_latency_ns server_target_latency_ns
-///   \endlink.
-/// * **Offline**
-///  + Sends all N samples to the SUT inside of a single query.
-///  + The samples of the query are guaranteed to be contiguous with respect
-///    to the order they were loaded in the QuerySampleLibrary.
-///  + **Final performance result is:** samples per second.
-///
-enum class TestScenario {
-  SingleStream,
-  MultiStream,
-  Server,
-  Offline,
-};
-
-///
-/// \enum TestMode
-/// * **SubmissionRun**
-///  + Runs accuracy mode followed by performance mode.
-///  + TODO: Implement further requirements as decided by MLPerf.
-/// * **AccuracyOnly**
-///  + Runs each sample from the QSL through the SUT a least once.
-///  + Outputs responses to an accuracy json that can be parsed by a model +
-///    sample library specific script.
-/// * **PerformanceOnly**
-///  + Runs the performance traffic for the given scenario, as described in
-///    the comments for TestScenario.
-/// * **FindPeakPerformance**
-///  + Determines the maximumum QPS for the Server scenario.
-///  + Not applicable for SingleStream, MultiStream or Offline scenarios.
-///
-enum class TestMode {
-  SubmissionRun,
-  AccuracyOnly,
-  PerformanceOnly,
-  FindPeakPerformance,
-};
-
-///
-/// \brief Top-level struct specifing the modes and parameters of the test.
-///
-struct TestSettings {
-  TestScenario scenario = TestScenario::SingleStream;
-  TestMode mode = TestMode::PerformanceOnly;
-
-  // ==================================
-  /// \name SingleStream-specific
-  /**@{*/
-  /// \brief A hint used by the loadgen to pre-generate enough samples to
-  ///        meet the minimum test duration.
-  double single_stream_expected_latency_ns = 1000000;
-  /// \brief The latency percentile reported as the final result.
-  double single_stream_target_latency_percentile = 0.90;
-  /**@}*/
-
-  // ==================================
-  /// \name MultiStream-specific
-  /**@{*/
-  /// \brief A hint used by the loadgen to pre-generate enough samples to
-  ///        meet the minimum test duration.
-  /// \brief MultiStream latency is for query (not sample) latency
-  double multi_stream_expected_latency_ns = 8000000;
-  /// \brief The latency percentile for MultiStream mode.
-  double multi_stream_target_latency_percentile = 0.99;
-  /// \brief The number of samples in each query.
-  /// \details How many samples are bundled in a query
-  uint64_t multi_stream_samples_per_query = 8;
-  /**@}*/
-
-  // ==================================
-  /// \name Server-specific
-  /**@{*/
-  /// \brief The average QPS of the poisson distribution.
-  /// \details note: This field is used as a FindPeakPerformance's lower bound.
-  /// When you run FindPeakPerformanceMode, you should make sure that this value
-  /// satisfies performance constraints.
-  double server_target_qps = 1;
-  /// \brief The latency constraint for the Server scenario.
-  uint64_t server_target_latency_ns = 100000000;
-  /// \brief The latency percentile for server mode. This value is combined with
-  /// server_target_latency_ns to determine if a run is valid.
-  /// \details 99% is the default value, which is correct for image models. GNMT
-  /// should be set to 0.97 (97%) in v0.5.(As always, check the policy page for
-  /// updated values for the benchmark you are running.)
-  double server_target_latency_percentile = 0.99;
-  /// \brief If this flag is set to true, LoadGen will combine samples from
-  /// multiple queries into a single query if their scheduled issue times have
-  /// passed.
-  bool server_coalesce_queries = false;
-  /// \brief The decimal places of QPS precision used to terminate
-  /// FindPeakPerformance mode.
-  int server_find_peak_qps_decimals_of_precision = 1;
-  /// \brief A step size (as a fraction of the QPS) used to widen the lower and
-  /// upper bounds to find the initial boundaries of binary search.
-  double server_find_peak_qps_boundary_step_size = 1;
-  /// \brief The maximum number of outstanding queries to allow before earlying
-  /// out from a performance run. Useful for performance tuning and speeding up
-  /// the FindPeakPerformance mode.
-  uint64_t server_max_async_queries = 0;  ///< 0: Infinity.
-  /// \brief The number of issue query threads that will be registered and used
-  /// to call SUT's IssueQuery(). If this is 0, the same thread calling
-  /// StartTest() will be used to call IssueQuery(). See also
-  /// mlperf::RegisterIssueQueryThread().
-  uint64_t server_num_issue_query_threads = 0;
-  /**@}*/
-
-  // ==================================
-  /// \name Offline-specific
-  /**@{*/
-  /// \brief Specifies the QPS the SUT expects to hit for the offline load.
-  /// The loadgen generates 10% more queries than it thinks it needs to meet
-  /// the minimum test duration.
-  double offline_expected_qps = 1;
-  /// \brief Affects the order in which the samples of the dataset are chosen.
-  /// If false it concatenates a single permutation of the dataset (or part
-  /// of it depending on QSL->PerformanceSampleCount()) several times up to the
-  /// number of samples requested.
-  /// If true it concatenates a multiple permutation of the dataset (or a
-  /// part of it depending on QSL->PerformanceSampleCount()) several times
-  /// up to the number of samples requested.
-  bool sample_concatenate_permutation = false;
-  /**@}*/
-
-  // ==================================
-  /// \name Test duration
-  /// The test runs until **both** min duration and min query count have been
-  /// met. However, it will exit before that point if **either** max duration or
-  /// max query count have been reached.
-  /**@{*/
-  uint64_t min_duration_ms = 10000;
-  uint64_t max_duration_ms = 0;  ///< 0: Infinity.
-  uint64_t min_query_count = 100;
-  uint64_t max_query_count = 0;  ///< 0: Infinity.
-  /**@}*/
-
-  // ==================================
-  /// \name Random number generation
-  /// There are 4 separate seeds, so each dimension can be changed
-  /// independently.
-  /**@{*/
-  /// \brief Affects which subset of samples from the QSL are chosen for
-  /// the performance sample set and accuracy sample sets.
-  uint64_t qsl_rng_seed = 0;
-  /// \brief Affects the order in which samples from the performance set will
-  /// be included in queries.
-  uint64_t sample_index_rng_seed = 0;
-  /// \brief Affects the poisson arrival process of the Server scenario.
-  /// \details Different seeds will appear to "jitter" the queries
-  /// differently in time, but should not affect the average issued QPS.
-  uint64_t schedule_rng_seed = 0;
-  /// \brief Affects which samples have their query returns logged to the
-  /// accuracy log in performance mode.
-  uint64_t accuracy_log_rng_seed = 0;
-
-  /// \brief Probability of the query response of a sample being logged to the
-  /// accuracy log in performance mode
-  double accuracy_log_probability = 0.0;
-
-  /// \brief Target number of samples that will have their results printed to
-  /// accuracy log in performance mode for compliance testing
-  uint64_t accuracy_log_sampling_target = 0;
-
-  /// \brief Variables for running test05 from native config. A boolean that
-  /// determines whether or not to run test05 and three random seed to run the
-  /// test
-  bool test05 = false;
-  uint64_t test05_qsl_rng_seed = 0;
-  uint64_t test05_sample_index_rng_seed = 0;
-  uint64_t test05_schedule_rng_seed = 0;
-
-  /// \brief Load mlperf parameter config from file.
-  int FromConfig(const std::string &path, const std::string &model,
-                 const std::string &scenario, int conf_type = 1);
-  /**@}*/
-
-  // ==================================
-  /// \name Performance Sample modifiers
-  /// \details These settings can be used to Audit Performance mode runs.
-  /// In order to detect sample caching by SUT, performance of runs when only
-  /// unique queries (with non-repeated samples) are issued can be compared with
-  /// that when the same query is repeatedly issued.
-  /**@{*/
-  /// \brief Prints measurement interval start and stop timestamps to std::cout
-  /// for the purpose of comparison against an external timer
-  bool print_timestamps = false;
-  /// \brief Allows issuing only unique queries in Performance mode of any
-  /// scenario \details This can be used to send non-repeat & hence unique
-  /// samples to SUT
-  bool performance_issue_unique = false;
-  /// \brief If true, the same query is chosen repeatedley for Inference.
-  /// In offline scenario, the query is filled with the same sample.
-  bool performance_issue_same = false;
-  /// \brief Offset to control which sample is repeated in
-  /// performance_issue_same mode.
-  /// Value should be within [0, performance_sample_count)
-  uint64_t performance_issue_same_index = 0;
-  /// \brief Overrides QSL->PerformanceSampleCount() when non-zero
-  uint64_t performance_sample_count_override = 0;
-  /// \brief Measure token latencies
-  bool use_token_latencies = false;
-  /// Token latency parameters
-  uint64_t server_ttft_latency = 100000000;
-  uint64_t server_tpot_latency = 100000000;
-  /// \brief Infer token latencies
-  bool infer_token_latencies = false;
-  uint64_t token_latency_scaling_factor;
-  /**@}*/
-};
-
-///
-/// \enum LoggingMode
-/// Specifies how and when logging should be sampled and stringified at
-/// runtime.
-/// * **AsyncPoll**
-///  + Logs are serialized and output on an IOThread that polls for new logs at
-///  a fixed interval. This is the only mode currently implemented.
-/// * **EndOfTestOnly**
-///  + TODO: Logs are serialzied and output only at the end of the test.
-/// * **Synchronous**
-///  + TODO: Logs are serialized and output inline.
-enum class LoggingMode {
-  AsyncPoll,
-  EndOfTestOnly,
-  Synchronous,
-};
-
-///
-/// \brief Specifies where log outputs should go.
-///
-/// By default, the loadgen outputs its log files to outdir and
-/// modifies the filenames of its logs with a prefix and suffix.
-/// Filenames will take the form:
-/// "<outdir>/<datetime><prefix>summary<suffix>.txt"
-///
-/// Affordances for outputing logs to stdout are also provided.
-///
-struct LogOutputSettings {
-  std::string outdir = ".";
-  std::string prefix = "mlperf_log_";
-  std::string suffix = "";
-  bool prefix_with_datetime = false;
-  bool copy_detail_to_stdout = false;
-  bool copy_summary_to_stdout = false;
-};
-
-///
-/// \brief Top-level log settings.
-///
-struct LogSettings {
-  LogOutputSettings log_output;
-  LoggingMode log_mode = LoggingMode::AsyncPoll;
-  uint64_t log_mode_async_poll_interval_ms = 1000;  ///< TODO: Implement this.
-  bool enable_trace = true;
-};
-
-/// @}
-
-/// @}
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_TEST_SETTINGS_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc
deleted file mode 100644
index 3f2cd8847..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.cc
+++ /dev/null
@@ -1,800 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "test_settings_internal.h"
-
-#include <fstream>
-#include <map>
-#include <sstream>
-#include <string>
-
-#include "logging.h"
-#include "mlperf_conf.h"
-#include "utils.h"
-
-namespace mlperf {
-namespace loadgen {
-
-TestSettingsInternal::TestSettingsInternal(
-    const TestSettings &requested_settings, size_t qsl_performance_sample_count)
-    : requested(requested_settings),
-      scenario(requested.scenario),
-      mode(requested.mode),
-      samples_per_query(1),
-      target_qps(1),
-      max_async_queries(0),
-      target_duration(std::chrono::milliseconds(requested.min_duration_ms)),
-      min_duration(std::chrono::milliseconds(requested.min_duration_ms)),
-      max_duration(std::chrono::milliseconds(requested.max_duration_ms)),
-      min_query_count(requested.min_query_count),
-      max_query_count(requested.max_query_count),
-      min_sample_count(0),
-      qsl_rng_seed(requested.qsl_rng_seed),
-      sample_index_rng_seed(requested.sample_index_rng_seed),
-      schedule_rng_seed(requested.schedule_rng_seed),
-      accuracy_log_rng_seed(requested.accuracy_log_rng_seed),
-      accuracy_log_probability(requested.accuracy_log_probability),
-      accuracy_log_sampling_target(requested.accuracy_log_sampling_target),
-      print_timestamps(requested.print_timestamps),
-      performance_issue_unique(requested.performance_issue_unique),
-      performance_issue_same(requested.performance_issue_same),
-      performance_issue_same_index(requested.performance_issue_same_index),
-      performance_sample_count(0),
-      sample_concatenate_permutation(false),
-      use_token_latencies(requested.use_token_latencies),
-      server_ttft_latency(requested.server_ttft_latency),
-      server_tpot_latency(requested.server_tpot_latency),
-      infer_token_latencies(requested.infer_token_latencies),
-      token_latency_scaling_factor(requested.token_latency_scaling_factor) {
-  // Target QPS, target latency, and max_async_queries.
-  switch (requested.scenario) {
-    case TestScenario::SingleStream:
-      target_qps = static_cast<double>(std::nano::den) /
-                   requested.single_stream_expected_latency_ns;
-      max_async_queries = 1;
-      target_latency_percentile =
-          requested.single_stream_target_latency_percentile;
-      break;
-    case TestScenario::MultiStream:
-      target_qps = static_cast<double>(std::nano::den) /
-                   requested.multi_stream_expected_latency_ns;
-      max_async_queries = 1;
-      target_latency_percentile =
-          requested.multi_stream_target_latency_percentile;
-      break;
-    case TestScenario::Server:
-      if (requested.server_target_qps >= 0.0) {
-        target_qps = requested.server_target_qps;
-      } else {
-        LogDetail([server_target_qps = requested.server_target_qps,
-                   target_qps = target_qps](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-          std::stringstream ss;
-          ss << "Invalid value for server_target_qps requested."
-             << " requested: " << server_target_qps << " using: " << target_qps;
-          MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
-#else
-          detail.Error("Invalid value for server_target_qps requested.",
-                       "requested", server_target_qps, "using", target_qps);
-#endif
-        });
-      }
-      target_latency =
-          std::chrono::nanoseconds(requested.server_target_latency_ns);
-      target_latency_percentile = requested.server_target_latency_percentile;
-      max_async_queries = requested.server_max_async_queries;
-      break;
-    case TestScenario::Offline:
-      // target_latency_percentile is not used in Offline, but set it to
-      // 0.99 anyway to avoid garbage value.
-      target_latency_percentile = 0.99;
-      if (requested.offline_expected_qps >= 0.0) {
-        target_qps = requested.offline_expected_qps;
-      } else {
-        LogDetail([offline_expected_qps = requested.offline_expected_qps,
-                   target_qps = target_qps](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-          std::stringstream ss;
-          ss << "Invalid value for offline_expected_qps requested."
-             << " requested: " << offline_expected_qps
-             << " using: " << target_qps;
-          MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
-#else
-          detail.Error("Invalid value for offline_expected_qps requested.",
-                       "requested", offline_expected_qps, "using", target_qps);
-#endif
-        });
-      }
-      max_async_queries = 1;
-      break;
-  }
-
-  // Performance Sample Count: TestSettings override QSL ->
-  // PerformanceSampleCount
-  performance_sample_count = (requested.performance_sample_count_override == 0)
-                                 ? qsl_performance_sample_count
-                                 : requested.performance_sample_count_override;
-
-  // Sample by concatentating several permutations of the dataset
-  // sample_concatenate_permutation
-  sample_concatenate_permutation =
-      (requested.sample_concatenate_permutation == 0)
-          ? false
-          : requested.sample_concatenate_permutation;
-
-  // Samples per query.
-  if (requested.scenario == TestScenario::MultiStream) {
-    samples_per_query = requested.multi_stream_samples_per_query;
-  }
-
-  // In the offline scenario, coalesce all queries into a single query.
-  if (requested.scenario == TestScenario::Offline) {
-    // TODO: Should the spec require a max duration for large query counts?
-    // kSlack is used to make sure we generate enough samples for the SUT
-    // to take longer than than the minimum test duration required by the
-    // MLPerf spec.
-    constexpr double kSlack = 1.1;
-    uint64_t target_sample_count =
-        kSlack * DurationToSeconds(target_duration) * target_qps;
-    samples_per_query =
-        (requested.performance_issue_unique)
-            ? performance_sample_count
-            : std::max<uint64_t>(min_query_count, target_sample_count);
-    min_query_count = 1;
-    target_duration = std::chrono::milliseconds(0);
-  }
-
-  // FIXME: Only do this for 3D-UNet SingleStream, for v2.0
-  // TODO: consolidate after v2.0
-  // make min_queries to be multiple of performance_sample_count
-  // performance_sample_count == 0 makes it to be equal to loaded_samples.size()
-  if (sample_concatenate_permutation &&
-      requested.scenario == TestScenario::SingleStream) {
-    // set slack larger for 3D-UNet KiTS19 distribution, i.e. 50% latency << 90%
-    // latency
-    constexpr double kSlack = 2.0;
-    uint64_t expected_queries =
-        kSlack * DurationToSeconds(target_duration) * target_qps;
-    min_query_count =
-        min_query_count > expected_queries ? min_query_count : expected_queries;
-    min_query_count += qsl_performance_sample_count -
-                       (min_query_count % qsl_performance_sample_count);
-  }
-
-  min_sample_count = min_query_count * samples_per_query;
-
-  // Validate TestSettings
-  if (requested.performance_issue_same &&
-      (requested.performance_issue_same_index >= performance_sample_count)) {
-    LogDetail([performance_issue_same_index =
-                   requested.performance_issue_same_index,
-               performance_sample_count =
-                   performance_sample_count](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-      std::stringstream ss;
-      ss << "Sample Idx to be repeated in performance_issue_same mode"
-         << " cannot be greater than loaded performance_sample_count."
-         << " performance_issue_same_index: " << performance_issue_same_index
-         << " performance_sample_count: " << performance_sample_count;
-      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
-#else
-      detail.Error(
-          "Sample Idx to be repeated in performance_issue_same mode"
-          " cannot be greater than loaded performance_sample_count.",
-          "performance_issue_same_index", performance_issue_same_index,
-          "performance_sample_count", performance_sample_count);
-#endif
-    });
-  }
-
-  if (requested.performance_issue_unique && requested.performance_issue_same) {
-    LogDetail([performance_issue_unique = requested.performance_issue_unique,
-               performance_issue_same =
-                   requested.performance_issue_same](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-      std::stringstream ss;
-      ss << "Performance_issue_unique and performance_issue_same, both"
-         << " cannot be true at the same time."
-         << " performance_issue_unique: " << performance_issue_unique
-         << " performance_issue_same: " << performance_issue_same;
-      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", ss.str());
-#else
-      detail.Error(
-          "Performance_issue_unique and performance_issue_same, both"
-          " cannot be true at the same time.",
-          "performance_issue_unique", performance_issue_unique,
-          "performance_issue_same", performance_issue_same);
-#endif
-    });
-  }
-}
-
-std::string ToString(TestScenario scenario) {
-  switch (scenario) {
-#if USE_NEW_LOGGING_FORMAT
-    case TestScenario::SingleStream:
-      return "SingleStream";
-    case TestScenario::MultiStream:
-      return "MultiStream";
-#else
-    case TestScenario::SingleStream:
-      return "Single Stream";
-    case TestScenario::MultiStream:
-      return "Multi Stream";
-#endif
-    case TestScenario::Server:
-      return "Server";
-    case TestScenario::Offline:
-      return "Offline";
-  }
-  assert(false);
-  return "InvalidScenario";
-}
-
-std::string ToString(TestMode mode) {
-  switch (mode) {
-#if USE_NEW_LOGGING_FORMAT
-    case TestMode::SubmissionRun:
-      return "SubmissionRun";
-    case TestMode::AccuracyOnly:
-      return "AccuracyOnly";
-    case TestMode::PerformanceOnly:
-      return "PerformanceOnly";
-    case TestMode::FindPeakPerformance:
-      return "FindPeakPerformance";
-#else
-    case TestMode::SubmissionRun:
-      return "Submission";
-    case TestMode::AccuracyOnly:
-      return "Accuracy";
-    case TestMode::PerformanceOnly:
-      return "Performance";
-    case TestMode::FindPeakPerformance:
-      return "Find Peak Performance";
-#endif
-  }
-  assert(false);
-  return "InvalidMode";
-}
-
-void LogRequestedTestSettings(const TestSettings &s) {
-  LogDetail([s](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "requested_scenario", ToString(s.scenario));
-    MLPERF_LOG(detail, "requested_test_mode", ToString(s.mode));
-
-    // Scenario-specific
-    switch (s.scenario) {
-      case TestScenario::SingleStream:
-        MLPERF_LOG(detail, "requested_single_stream_expected_latency_ns",
-                   s.single_stream_expected_latency_ns);
-        MLPERF_LOG(detail, "requested_single_stream_target_latency_percentile",
-                   s.single_stream_target_latency_percentile);
-        break;
-      case TestScenario::MultiStream:
-        MLPERF_LOG(detail, "requested_multi_stream_expected_latency_ns",
-                   s.multi_stream_expected_latency_ns);
-        MLPERF_LOG(detail, "requested_multi_stream_target_latency_percentile",
-                   s.multi_stream_target_latency_percentile);
-        MLPERF_LOG(detail, "requested_multi_stream_samples_per_query",
-                   s.multi_stream_samples_per_query);
-        break;
-      case TestScenario::Server:
-        MLPERF_LOG(detail, "requested_server_target_qps", s.server_target_qps);
-        MLPERF_LOG(detail, "requested_server_target_latency_ns",
-                   s.server_target_latency_ns);
-        MLPERF_LOG(detail, "requested_server_target_latency_percentile",
-                   s.server_target_latency_percentile);
-        MLPERF_LOG(detail, "requested_server_coalesce_queries",
-                   s.server_coalesce_queries);
-        MLPERF_LOG(detail,
-                   "requested_server_find_peak_qps_decimals_of_precision",
-                   s.server_find_peak_qps_decimals_of_precision);
-        MLPERF_LOG(detail, "requested_server_find_peak_qps_boundary_step_size",
-                   s.server_find_peak_qps_boundary_step_size);
-        MLPERF_LOG(detail, "requested_server_max_async_queries",
-                   s.server_max_async_queries);
-        MLPERF_LOG(detail, "requested_server_num_issue_query_threads",
-                   s.server_num_issue_query_threads);
-        break;
-      case TestScenario::Offline:
-        MLPERF_LOG(detail, "requested_offline_expected_qps",
-                   s.offline_expected_qps);
-        break;
-    }
-
-    // Overrides
-    MLPERF_LOG(detail, "requested_min_duration_ms", s.min_duration_ms);
-    MLPERF_LOG(detail, "requested_max_duration_ms", s.max_duration_ms);
-    MLPERF_LOG(detail, "requested_min_query_count", s.min_query_count);
-    MLPERF_LOG(detail, "requested_max_query_count", s.max_query_count);
-    MLPERF_LOG(detail, "requested_qsl_rng_seed", s.qsl_rng_seed);
-    MLPERF_LOG(detail, "requested_sample_index_rng_seed",
-               s.sample_index_rng_seed);
-    MLPERF_LOG(detail, "requested_schedule_rng_seed", s.schedule_rng_seed);
-    MLPERF_LOG(detail, "requested_accuracy_log_rng_seed",
-               s.accuracy_log_rng_seed);
-    MLPERF_LOG(detail, "requested_accuracy_log_probability",
-               s.accuracy_log_probability);
-    MLPERF_LOG(detail, "requested_accuracy_log_sampling_target",
-               s.accuracy_log_sampling_target);
-    MLPERF_LOG(detail, "requested_print_timestamps", s.print_timestamps);
-    MLPERF_LOG(detail, "requested_performance_issue_unique",
-               s.performance_issue_unique);
-    MLPERF_LOG(detail, "requested_performance_issue_same",
-               s.performance_issue_same);
-    MLPERF_LOG(detail, "requested_performance_issue_same_index",
-               s.performance_issue_same_index);
-    MLPERF_LOG(detail, "requested_performance_sample_count_override",
-               s.performance_sample_count_override);
-    MLPERF_LOG(detail, "requested_sample_concatenate_permutation",
-               s.sample_concatenate_permutation);
-    // Token latencies specific values
-    if (s.use_token_latencies) {
-      MLPERF_LOG(detail, "requested_use_token_latencies",
-                 s.use_token_latencies);
-      if (s.scenario != TestScenario::Offline) {
-        MLPERF_LOG(detail, "requested_server_ttft_latency",
-                   s.server_ttft_latency);
-        MLPERF_LOG(detail, "requested_server_tpot_latency",
-                   s.server_tpot_latency);
-      }
-    }
-#else
-    detail("");
-    detail("Requested Settings:");
-    detail("Scenario : " + ToString(s.scenario));
-    detail("Test mode : " + ToString(s.mode));
-
-    // Scenario-specific
-    switch (s.scenario) {
-      case TestScenario::SingleStream:
-        detail("single_stream_expected_latency_ns : ",
-               s.single_stream_expected_latency_ns);
-        detail("single_stream_target_latency_percentile : ",
-               s.single_stream_target_latency_percentile);
-        break;
-      case TestScenario::MultiStream:
-        detail("multi_stream_expected_latency_ns : ",
-               s.multi_stream_expected_latency_ns);
-        detail("multi_stream_target_latency_percentile : ",
-               s.multi_stream_target_latency_percentile);
-        detail("multi_stream_samples_per_query : ",
-               s.multi_stream_samples_per_query);
-        break;
-      case TestScenario::Server:
-        detail("server_target_qps : ", s.server_target_qps);
-        detail("server_target_latency_ns : ", s.server_target_latency_ns);
-        detail("server_target_latency_percentile : ",
-               s.server_target_latency_percentile);
-        detail("server_coalesce_queries : ", s.server_coalesce_queries);
-        detail("server_find_peak_qps_decimals_of_precision : ",
-               s.server_find_peak_qps_decimals_of_precision);
-        detail("server_find_peak_qps_boundary_step_size : ",
-               s.server_find_peak_qps_boundary_step_size);
-        detail("server_max_async_queries : ", s.server_max_async_queries);
-        detail("server_num_issue_query_threads : ",
-               s.server_num_issue_query_threads);
-        break;
-      case TestScenario::Offline:
-        detail("offline_expected_qps : ", s.offline_expected_qps);
-        break;
-    }
-
-    // Overrides
-    detail("min_duration_ms : ", s.min_duration_ms);
-    detail("max_duration_ms : ", s.max_duration_ms);
-    detail("min_query_count : ", s.min_query_count);
-    detail("max_query_count : ", s.max_query_count);
-    detail("qsl_rng_seed : ", s.qsl_rng_seed);
-    detail("sample_index_rng_seed : ", s.sample_index_rng_seed);
-    detail("schedule_rng_seed : ", s.schedule_rng_seed);
-    detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
-    detail("accuracy_log_probability : ", s.accuracy_log_probability);
-    detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
-    detail("print_timestamps : ", s.print_timestamps);
-    detail("performance_issue_unique : ", s.performance_issue_unique);
-    detail("performance_issue_same : ", s.performance_issue_same);
-    detail("performance_issue_same_index : ", s.performance_issue_same_index);
-    detail("performance_sample_count_override : ",
-           s.performance_sample_count_override);
-    detail("");
-#endif
-  });
-}
-
-void TestSettingsInternal::LogEffectiveSettings() const {
-  LogDetail([s = *this](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "effective_scenario", ToString(s.scenario));
-    MLPERF_LOG(detail, "effective_test_mode", ToString(s.mode));
-
-    MLPERF_LOG(detail, "effective_samples_per_query", s.samples_per_query);
-    MLPERF_LOG(detail, "effective_target_qps", s.target_qps);
-    MLPERF_LOG(detail, "effective_target_latency_ns", s.target_latency.count());
-    MLPERF_LOG(detail, "effective_target_latency_percentile",
-               s.target_latency_percentile);
-    MLPERF_LOG(detail, "effective_max_async_queries", s.max_async_queries);
-    MLPERF_LOG(detail, "effective_target_duration_ms",
-               s.target_duration.count());
-    MLPERF_LOG(detail, "effective_min_duration_ms", s.min_duration.count());
-    MLPERF_LOG(detail, "effective_max_duration_ms", s.max_duration.count());
-    MLPERF_LOG(detail, "effective_min_query_count", s.min_query_count);
-    MLPERF_LOG(detail, "effective_max_query_count", s.max_query_count);
-    MLPERF_LOG(detail, "effective_min_sample_count", s.min_sample_count);
-    MLPERF_LOG(detail, "effective_qsl_rng_seed", s.qsl_rng_seed);
-    MLPERF_LOG(detail, "effective_sample_index_rng_seed",
-               s.sample_index_rng_seed);
-    MLPERF_LOG(detail, "effective_schedule_rng_seed", s.schedule_rng_seed);
-    MLPERF_LOG(detail, "effective_accuracy_log_rng_seed",
-               s.accuracy_log_rng_seed);
-    MLPERF_LOG(detail, "effective_accuracy_log_probability",
-               s.accuracy_log_probability);
-    MLPERF_LOG(detail, "effective_accuracy_log_sampling_target",
-               s.accuracy_log_sampling_target);
-    MLPERF_LOG(detail, "effective_print_timestamps", s.print_timestamps);
-    MLPERF_LOG(detail, "effective_performance_issue_unique",
-               s.performance_issue_unique);
-    MLPERF_LOG(detail, "effective_performance_issue_same",
-               s.performance_issue_same);
-    MLPERF_LOG(detail, "effective_performance_issue_same_index",
-               s.performance_issue_same_index);
-    MLPERF_LOG(detail, "effective_performance_sample_count",
-               s.performance_sample_count);
-    MLPERF_LOG(detail, "effective_sample_concatenate_permutation",
-               s.sample_concatenate_permutation);
-#else
-    detail("");
-    detail("Effective Settings:");
-
-    detail("Scenario : " + ToString(s.scenario));
-    detail("Test mode : " + ToString(s.mode));
-
-    detail("samples_per_query : ", s.samples_per_query);
-    detail("target_qps : ", s.target_qps);
-    detail("target_latency (ns): ", s.target_latency.count());
-    detail("target_latency_percentile : ", s.target_latency_percentile);
-    detail("max_async_queries : ", s.max_async_queries);
-    detail("target_duration (ms): ", s.target_duration.count());
-    detail("min_duration (ms): ", s.min_duration.count());
-    detail("max_duration (ms): ", s.max_duration.count());
-    detail("min_query_count : ", s.min_query_count);
-    detail("max_query_count : ", s.max_query_count);
-    detail("min_sample_count : ", s.min_sample_count);
-    detail("qsl_rng_seed : ", s.qsl_rng_seed);
-    detail("sample_index_rng_seed : ", s.sample_index_rng_seed);
-    detail("schedule_rng_seed : ", s.schedule_rng_seed);
-    detail("accuracy_log_rng_seed : ", s.accuracy_log_rng_seed);
-    detail("accuracy_log_probability : ", s.accuracy_log_probability);
-    detail("accuracy_log_sampling_target : ", s.accuracy_log_sampling_target);
-    detail("print_timestamps : ", s.print_timestamps);
-    detail("performance_issue_unique : ", s.performance_issue_unique);
-    detail("performance_issue_same : ", s.performance_issue_same);
-    detail("performance_issue_same_index : ", s.performance_issue_same_index);
-    detail("performance_sample_count : ", s.performance_sample_count);
-#endif
-  });
-}
-
-void TestSettingsInternal::LogAllSettings() const {
-  LogRequestedTestSettings(requested);
-  LogEffectiveSettings();
-}
-
-void TestSettingsInternal::LogSummary(AsyncSummary &summary) const {
-  summary("samples_per_query : ", samples_per_query);
-  summary("target_qps : ", target_qps);
-  if (!use_token_latencies) {
-    summary("target_latency (ns): ", target_latency.count());
-  } else {
-    summary("ttft_latency (ns): ", server_ttft_latency);
-    summary("tpot_latency (ns): ", server_tpot_latency);
-  }
-  summary("max_async_queries : ", max_async_queries);
-  summary("min_duration (ms): ", min_duration.count());
-  summary("max_duration (ms): ", max_duration.count());
-  summary("min_query_count : ", min_query_count);
-  summary("max_query_count : ", max_query_count);
-  summary("qsl_rng_seed : ", qsl_rng_seed);
-  summary("sample_index_rng_seed : ", sample_index_rng_seed);
-  summary("schedule_rng_seed : ", schedule_rng_seed);
-  summary("accuracy_log_rng_seed : ", accuracy_log_rng_seed);
-  summary("accuracy_log_probability : ", accuracy_log_probability);
-  summary("accuracy_log_sampling_target : ", accuracy_log_sampling_target);
-  summary("print_timestamps : ", print_timestamps);
-  summary("performance_issue_unique : ", performance_issue_unique);
-  summary("performance_issue_same : ", performance_issue_same);
-  summary("performance_issue_same_index : ", performance_issue_same_index);
-  summary("performance_sample_count : ", performance_sample_count);
-  if (sample_concatenate_permutation) {
-    summary(
-        "WARNING: sample_concatenate_permutation was set to true. \n"
-        "Generated samples per query might be different as the one in the "
-        "setting.\n"
-        "Check the generated_samples_per_query line in the detailed log for "
-        "the real\n"
-        "samples_per_query value");
-  }
-}
-
-}  // namespace loadgen
-
-int TestSettings::FromConfig(const std::string &path, const std::string &model,
-                             const std::string &scenario, int conf_type) {
-  std::map<std::string, std::string> kv;
-  static int configCount = 0;
-
-  if (conf_type == 1) {
-    if (configCount == 0) {
-      // Only allow userConf as the single configFile and loadgen loads the
-      // mlperfConf automatically for perf and accuracy runs
-      FromConfig("", model, scenario, 0);
-    }
-
-    else {
-      LogDetail([](AsyncDetail &detail) {
-        std::stringstream ss;
-        ss << "Multiple conf files are used. This is not valid for official "
-              "submission.";
-        MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
-      });
-    }
-    configCount++;
-  }
-
-  // lookup key/value pairs from config
-  auto lookupkv = [&](const std::string &model, const std::string &scenario,
-                      const std::string &key, uint64_t *val_l, double *val_d,
-                      double multiplier = 1.0) {
-    std::map<std::string, std::string>::iterator it;
-    std::string found;
-    // lookup exact key first
-    it = kv.find(model + "." + scenario + "." + key);
-    if (it != kv.end()) {
-      found = it->second;
-    } else {
-      // lookup key with model wildcard
-      it = kv.find("*." + scenario + "." + key);
-      if (it != kv.end()) {
-        found = it->second;
-      } else {
-        it = kv.find(model + ".*." + key);
-        if (it != kv.end()) {
-          found = it->second;
-        } else {
-          it = kv.find("*.*." + key);
-          if (it != kv.end()) {
-            found = it->second;
-          } else {
-            return false;
-          }
-        }
-      }
-    }
-    // if we get here, found will be set
-    if (val_l) {
-      *val_l = strtoull(found.c_str(), nullptr, 0) *
-               static_cast<uint64_t>(multiplier);
-    }
-    if (val_d) *val_d = strtod(found.c_str(), nullptr) * multiplier;
-    return true;
-  };
-
-  int line_nr = 0;
-  int errors = 0;
-  // Declare the input stream before the if-else block
-  std::unique_ptr<std::istream> fss;
-  std::string line;
-
-  if (conf_type != 0) {
-    // dirt simple config parser
-    fss = std::make_unique<std::ifstream>(path);
-    if (!static_cast<std::ifstream *>(fss.get())->is_open()) {
-      LogDetail([p = path](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-        std::stringstream ss;
-        ss << "can't open file " << p;
-        MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
-#else
-        detail.Error("can't open file ", p);
-#endif
-      });
-      return -ENOENT;
-    }
-  } else {
-    // Convert unsigned char array to std::string
-    std::string config_str(mlperf_conf);
-    fss = std::make_unique<std::istringstream>(config_str);
-  }
-  while (std::getline(*fss, line)) {
-    line_nr++;
-    std::istringstream iss(line);
-    std::string s, k;
-    int looking_for = 0;  // 0=key, 1=equal, 2=value
-    while (iss >> s) {
-      if (s == "#" && looking_for != 2) {
-        // done with this line
-        break;
-      }
-      if (looking_for == 2) {
-        // got key and value
-        const char *start = s.c_str();
-        char *stop;
-        (void)strtoul(start, &stop, 0);
-        if (start + s.size() == stop) {
-          kv[k] = s;
-          continue;
-        }
-        (void)strtod(start, &stop);
-        if (start + s.size() == stop) {
-          kv[k] = s;
-          continue;
-        }
-        errors++;
-        LogDetail([l = line_nr](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-          std::stringstream ss;
-          ss << "value needs to be integer or double, line=" << l;
-          MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
-#else
-          detail.Error("value needs to be integer or double, line=", l);
-#endif
-        });
-        break;
-      }
-      if (looking_for == 1 && s != "=") {
-        errors++;
-        LogDetail([l = line_nr](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-          std::stringstream ss;
-          ss << "expected 'key=value', line=" << l;
-          MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
-#else
-          detail.Error("expected 'key=value', line=", l);
-#endif
-        });
-        break;
-      }
-      if (looking_for == 0) k = s;
-      looking_for++;
-    }
-  }
-  if (errors != 0) return -EINVAL;
-
-  uint64_t val;
-
-  // keys that apply to all scenarios
-  if (lookupkv(model, scenario, "mode", &val, nullptr)) {
-    switch (val) {
-      case 0:
-        mode = TestMode::SubmissionRun;
-        break;
-      case 1:
-        mode = TestMode::AccuracyOnly;
-        break;
-      case 2:
-        mode = TestMode::PerformanceOnly;
-        break;
-      case 3:
-        mode = TestMode::FindPeakPerformance;
-        break;
-      default:
-        LogDetail([](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-          std::stringstream ss;
-          ss << "Invalid value passed to Mode key in config.";
-          MLPERF_LOG_ERROR(detail, "error_invalid_config", ss.str());
-#else
-          detail.Error("Invalid value passed to Mode key in config.");
-#endif
-        });
-        break;
-    }
-  }
-
-  if (conf_type == 0) {
-    lookupkv(model, scenario, "qsl_rng_seed", &qsl_rng_seed, nullptr);
-    lookupkv(model, scenario, "sample_index_rng_seed", &sample_index_rng_seed,
-             nullptr);
-    lookupkv(model, scenario, "schedule_rng_seed", &schedule_rng_seed, nullptr);
-    lookupkv(model, scenario, "accuracy_log_probability", nullptr,
-             &accuracy_log_probability, 0.01);
-    if (lookupkv(model, scenario, "test05", &val, nullptr))
-      test05 = (val == 1) ? true : false;
-    lookupkv(model, scenario, "test05_qsl_rng_seed", &test05_qsl_rng_seed,
-             nullptr);
-    lookupkv(model, scenario, "test05_sample_index_rng_seed",
-             &test05_sample_index_rng_seed, nullptr);
-    lookupkv(model, scenario, "test05_schedule_rng_seed",
-             &test05_schedule_rng_seed, nullptr);
-  }
-
-  // keys that can be overriden in user.conf but will make the results eligible
-  // only for open submissions
-
-  // keys to measure token metrics
-  if (lookupkv(model, scenario, "use_token_latencies", &val, nullptr)) {
-    use_token_latencies = (val == 1) ? true : false;
-  }
-  if (use_token_latencies) {
-    lookupkv(model, "Server", "ttft_latency", &server_ttft_latency, nullptr,
-             1000 * 1000);
-    lookupkv(model, "Server", "tpot_latency", &server_tpot_latency, nullptr,
-             1000 * 1000);
-  }
-
-  // keys to infer token metrics
-  if (lookupkv(model, scenario, "infer_token_latencies", &val, nullptr)) {
-    infer_token_latencies = (val == 1) ? true : false;
-  }
-  if (infer_token_latencies) {
-    lookupkv(model, scenario, "token_latency_scaling_factor",
-             &token_latency_scaling_factor, nullptr, 1);
-  }
-  // keys that apply to SingleStream
-  lookupkv(model, "SingleStream", "target_latency_percentile", nullptr,
-           &single_stream_target_latency_percentile, 0.01);
-
-  // keys that apply to MultiStream
-  lookupkv(model, "MultiStream", "target_latency_percentile", nullptr,
-           &multi_stream_target_latency_percentile, 0.01);
-  lookupkv(model, "MultiStream", "samples_per_query",
-           &multi_stream_samples_per_query, nullptr, 1);
-
-  // keys that apply to Server
-  lookupkv(model, "Server", "target_latency_percentile", nullptr,
-           &server_target_latency_percentile, 0.01);
-  lookupkv(model, "Server", "target_latency", &server_target_latency_ns,
-           nullptr, 1000 * 1000);
-
-  // keys that can be overriden in user.conf (the provided values still need to
-  // pass the submission checker rules)
-  if (lookupkv(model, scenario, "performance_issue_unique", &val, nullptr))
-    performance_issue_unique = (val == 0) ? false : true;
-  if (lookupkv(model, scenario, "performance_issue_same", &val, nullptr))
-    performance_issue_same = (val == 0) ? false : true;
-  lookupkv(model, scenario, "performance_issue_same_index",
-           &performance_issue_same_index, nullptr);
-
-  if (lookupkv(model, scenario, "sample_concatenate_permutation", &val,
-               nullptr))
-    sample_concatenate_permutation = (val == 1) ? true : false;
-  if (lookupkv(model, "Server", "coalesce_queries", &val, nullptr))
-    server_coalesce_queries = (val == 0) ? false : true;
-  if (lookupkv(model, "Server", "max_async_queries", &val, nullptr))
-    server_max_async_queries = int(val);
-
-  lookupkv(model, scenario, "min_duration", &min_duration_ms, nullptr);
-  lookupkv(model, scenario, "max_duration", &max_duration_ms, nullptr);
-  lookupkv(model, scenario, "min_query_count", &min_query_count, nullptr);
-  lookupkv(model, scenario, "max_query_count", &max_query_count, nullptr);
-  lookupkv(model, scenario, "performance_sample_count_override",
-           &performance_sample_count_override, nullptr);
-  lookupkv(model, "SingleStream", "target_latency", nullptr,
-           &single_stream_expected_latency_ns, 1000 * 1000);
-  lookupkv(model, "MultiStream", "target_latency", nullptr,
-           &multi_stream_expected_latency_ns, 1000 * 1000);
-  lookupkv(model, "Server", "target_qps", nullptr, &server_target_qps);
-  lookupkv(model, "Offline", "target_qps", 0, &offline_expected_qps);
-
-  if (lookupkv(model, scenario, "print_timestamps", &val, nullptr))
-    print_timestamps = (val == 0) ? false : true;
-
-  // keys that are used in audit.conf
-  lookupkv(model, scenario, "accuracy_log_rng_seed", &accuracy_log_rng_seed,
-           nullptr);
-  lookupkv(model, scenario, "accuracy_log_sampling_target",
-           &accuracy_log_sampling_target, nullptr);
-  return 0;
-}
-
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h
deleted file mode 100644
index ab2773bd1..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/test_settings_internal.h
+++ /dev/null
@@ -1,182 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief The internal representation of user-provided settings.
-
-#ifndef MLPERF_LOADGEN_TEST_SETTINGS_INTERNAL_H
-#define MLPERF_LOADGEN_TEST_SETTINGS_INTERNAL_H
-
-#include <chrono>
-#include <cmath>
-#include <string>
-
-#include "logging.h"
-#include "test_settings.h"
-
-namespace mlperf {
-
-namespace logging {
-class AsyncSummary;
-}
-
-namespace loadgen {
-
-using AsyncSummary = logging::AsyncSummary;
-
-std::string ToString(TestScenario scenario);
-std::string ToString(TestMode mode);
-
-/// \brief takes the user-friendly TestSettings and normalizes it
-/// for consumption by the loadgen.
-/// \details It does things like remove scenario-specific naming and introduce
-/// the concept of target_duration used to pre-generate queries.
-struct TestSettingsInternal {
-  explicit TestSettingsInternal(const TestSettings &requested_settings,
-                                size_t qsl_performance_sample_count);
-  void LogEffectiveSettings() const;
-  void LogAllSettings() const;
-  void LogSummary(AsyncSummary &summary) const;
-
-  const TestSettings requested;
-  const TestScenario scenario;  // Copied here for convenience.
-  const TestMode mode;          // Copied here for convenience.
-
-  uint64_t samples_per_query;
-  double target_qps;
-  std::chrono::nanoseconds target_latency{0};
-  double target_latency_percentile;  // Single, multistream, and server modes.
-  uint64_t max_async_queries;
-
-  // Target duration is used to generate queries of a minimum duration before
-  // the test run.
-  std::chrono::milliseconds target_duration{0};
-
-  // Min duration/query_count/sample_count are used to validate the test
-  // duration at the end of the run.
-  std::chrono::milliseconds min_duration{0};
-  std::chrono::milliseconds max_duration{0};
-  uint64_t min_query_count;
-  uint64_t max_query_count;
-  uint64_t min_sample_count;  // Offline only.
-
-  uint64_t qsl_rng_seed;
-  uint64_t sample_index_rng_seed;
-  uint64_t schedule_rng_seed;
-  uint64_t accuracy_log_rng_seed;
-  double accuracy_log_probability;
-  uint64_t accuracy_log_sampling_target;
-  bool print_timestamps;
-  bool performance_issue_unique;
-  bool performance_issue_same;
-  uint64_t performance_issue_same_index;
-  uint64_t performance_sample_count;
-
-  bool sample_concatenate_permutation;
-  bool use_token_latencies = false;
-  int64_t server_ttft_latency;
-  int64_t server_tpot_latency;
-
-  bool infer_token_latencies = false;
-  int64_t token_latency_scaling_factor;
-};
-
-/// \brief A namespace of collections of FindPeakPerformance helper functions,
-/// mainly about binary search.
-namespace find_peak_performance {
-
-constexpr char const *kNotSupportedMsg =
-    "Finding peak performance is only supported in Server scenarios.";
-
-template <TestScenario scenario>
-TestSettingsInternal MidOfBoundaries(
-    const TestSettingsInternal &lower_bound_settings,
-    const TestSettingsInternal &upper_bound_settings) {
-  TestSettingsInternal mid_settings = lower_bound_settings;
-  if (scenario == TestScenario::Server) {
-    assert(lower_bound_settings.target_qps < upper_bound_settings.target_qps);
-    mid_settings.target_qps =
-        lower_bound_settings.target_qps +
-        (upper_bound_settings.target_qps - lower_bound_settings.target_qps) / 2;
-  } else {
-    LogDetail([](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
-#else
-      detail(kNotSupportedMsg);
-#endif
-    });
-  }
-  return mid_settings;
-}
-
-template <TestScenario scenario>
-bool IsFinished(const TestSettingsInternal &lower_bound_settings,
-                const TestSettingsInternal &upper_bound_settings) {
-  if (scenario == TestScenario::Server) {
-    uint8_t precision = lower_bound_settings.requested
-                            .server_find_peak_qps_decimals_of_precision;
-    double l =
-        std::floor(lower_bound_settings.target_qps * std::pow(10, precision));
-    double u =
-        std::floor(upper_bound_settings.target_qps * std::pow(10, precision));
-    return l + 1 >= u;
-  } else {
-    LogDetail([](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
-#else
-      detail(kNotSupportedMsg);
-#endif
-    });
-    return true;
-  }
-}
-
-template <TestScenario scenario>
-std::string ToStringPerformanceField(const TestSettingsInternal &settings) {
-  if (scenario == TestScenario::Server) {
-    return std::to_string(settings.target_qps);
-  } else {
-    LogDetail([](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
-#else
-      detail(kNotSupportedMsg);
-#endif
-    });
-    return ToString(settings.scenario);
-  }
-}
-
-template <TestScenario scenario>
-void WidenPerformanceField(TestSettingsInternal *settings) {
-  if (scenario == TestScenario::Server) {
-    settings->target_qps =
-        settings->target_qps *
-        (1 + settings->requested.server_find_peak_qps_boundary_step_size);
-  } else {
-    LogDetail([](AsyncDetail &detail) {
-#if USE_NEW_LOGGING_FORMAT
-      MLPERF_LOG_ERROR(detail, "error_invalid_test_settings", kNotSupportedMsg);
-#else
-      detail(kNotSupportedMsg);
-#endif
-    });
-  }
-}
-
-}  // namespace find_peak_performance
-}  // namespace loadgen
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_TEST_SETTINGS_INTERNAL_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn
deleted file mode 100644
index d73bf831a..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/BUILD.gn
+++ /dev/null
@@ -1,25 +0,0 @@
-static_library("mlperf_loadgen_tests_loadgen_test_main") {
-  sources = [ "loadgen_test.h", "loadgen_test_main.cc" ]
-  configs += [ "//build/config/compiler:exceptions" ]
-}
-
-executable("mlperf_loadgen_perftests") {
-  sources = [ "perftests_null_sut.cc" ]
-  deps = [ "..:mlperf_loadgen" ]
-}
-
-executable("mlperf_loadgen_tests_basic") {
-  sources = [ "basic.cc" ]
-  deps = [ "..:mlperf_loadgen",
-           ":mlperf_loadgen_tests_loadgen_test_main"  ]
-  configs += [ "//build/config/compiler:exceptions" ]
-}
-
-source_set("mlperf_loadgen_perftests_py") {
-  sources = [ "perftests_null_sut.py" ]
-  deps = [ "../..:loadgen_pymodule_wheel_lib" ]
-}
-
-source_set("docs") {
-  sources = [ "README.md" ]
-}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md
deleted file mode 100644
index 41056b457..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/README.md
+++ /dev/null
@@ -1,42 +0,0 @@
-# Building and Running the Tests {#ReadmeTests}
-
-The unit and performance tests are only supported via gn/ninja at the moment.
-
-See the [top-level build readme](@ref ReadmeBuild) for details but, from a clean checkout, you must first run:
-
-    make bootstrap_gn_ninja
-    third_party/gn/gn gen out/Release --args="is_debug=false"
-
-This will build the gn and ninja build tools and create a release project.
-
-## Unit Tests
-
-To build:
-
-    third_party/ninja/ninja -C out/Release mlperf_loadgen_tests_basic
-
-To run all tests:
-
-    out/Release/mlperf_loadgen_tests_basic .
-
-To run specific tests:
-
-    out/Release/mlperf_loadgen_tests_basic <regex>
-    e.g.:
-    out/Release/mlperf_loadgen_tests_basic SingleStream
-
-## Performance Tests
-
-To build:
-
-    third_party/ninja/ninja -C out/Release mlperf_loadgen_perftests
-
-To run all tests:
-
-    out/Release/mlperf_loadgen_perftests .
-
-To run specific tests:
-
-    out/Release/mlperf_loadgen_perftests <regex>
-    e.g.:
-    out/Release/mlperf_loadgen_tests_basic ServerPool
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc
deleted file mode 100644
index 97c6a0bb1..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/basic.cc
+++ /dev/null
@@ -1,314 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Basic functionality unit tests.
-
-#include <algorithm>
-#include <deque>
-#include <future>
-#include <iostream>
-#include <queue>
-#include <unordered_set>
-#include <vector>
-
-#include "../loadgen.h"
-#include "../query_sample_library.h"
-#include "../system_under_test.h"
-#include "../test_settings.h"
-#include "loadgen_test.h"
-
-/// \brief Correctness unit tests.
-namespace unit_tests {
-
-/// \defgroup LoadgenTestsBasic Test Coverage: Basic
-
-/// \brief Implements the client interfaces of the loadgen and
-/// has some basic sanity checks that are enabled for all tests.
-/// \details It also forwards calls to overrideable *Ext methods and implements
-/// the TestProxy concept.
-struct SystemUnderTestBasic : public mlperf::QuerySampleLibrary,
-                              public mlperf::SystemUnderTest {
-  const std::string& Name() const override { return name_; }
-
-  size_t TotalSampleCount() override { return total_sample_count_; }
-  size_t PerformanceSampleCount() override { return performance_sample_count_; }
-
-  void LoadSamplesToRam(
-      const std::vector<mlperf::QuerySampleIndex>& samples) override {
-    for (auto s : samples) {
-      samples_load_count_.at(s)++;
-      loaded_samples_.push_back(s);
-    }
-    LoadSamplesToRamExt(samples);
-  }
-  virtual void LoadSamplesToRamExt(
-      const std::vector<mlperf::QuerySampleIndex>& samples) {}
-
-  void UnloadSamplesFromRam(
-      const std::vector<mlperf::QuerySampleIndex>& samples) override {
-    for (auto s : samples) {
-      FAIL_IF(loaded_samples_.front() != s) &&
-          FAIL_EXP(loaded_samples_.front()) && FAIL_EXP(s);
-      loaded_samples_.pop_front();
-      size_t prev_load_count = samples_load_count_.at(s)--;
-      FAIL_IF(prev_load_count == 0) && FAIL_EXP(prev_load_count);
-    }
-    UnloadSamplesFromRamExt(samples);
-  }
-  virtual void UnloadSamplesFromRamExt(
-      const std::vector<mlperf::QuerySampleIndex>& samples) {}
-
-  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
-    std::vector<mlperf::QuerySampleResponse> responses;
-    query_sizes_.push_back(samples.size());
-    samples_between_flushes_.back() += samples.size();
-    responses.reserve(samples.size());
-    for (auto s : samples) {
-      FAIL_IF(samples_load_count_.at(s.index) == 0) &&
-          FAIL_MSG("Issued unloaded sample:") && FAIL_EXP(s.index);
-      samples_issue_count_.at(s.index)++;
-      issued_samples_.push_back(s.index);
-      responses.push_back({s.id, 0, 0});
-    }
-    mlperf::QuerySamplesComplete(responses.data(), responses.size());
-    IssueQueryExt(samples);
-  }
-  virtual void IssueQueryExt(const std::vector<mlperf::QuerySample>& samples) {}
-
-  void FlushQueries() override {
-    samples_between_flushes_.push_back(0);
-    FlushQueriesExt();
-  }
-  virtual void FlushQueriesExt() {}
-
-  virtual void RunTest() {
-    samples_load_count_.resize(total_sample_count_, 0);
-    samples_issue_count_.resize(total_sample_count_, 0);
-    samples_between_flushes_.resize(1, 0);
-    mlperf::StartTest(this, this, test_settings_, log_settings_);
-  }
-
-  virtual void EndTest() {}
-
- protected:
-  mlperf::TestSettings test_settings_;
-  mlperf::LogSettings log_settings_;
-
-  std::string name_{"BasicSUT"};
-  size_t total_sample_count_;
-  size_t performance_sample_count_;
-  std::vector<mlperf::QuerySampleIndex> issued_samples_;
-  std::deque<mlperf::QuerySampleIndex> loaded_samples_;
-  std::vector<size_t> samples_load_count_;
-  std::vector<size_t> samples_issue_count_;
-
-  std::vector<size_t> query_sizes_;
-  std::vector<size_t> samples_between_flushes_;
-};
-
-/// \brief Provides common test set up logic.
-struct SystemUnderTestAccuracy : public SystemUnderTestBasic {
-  virtual void SetUpTest(size_t samples_per_query,
-                         size_t samples_per_query_remainder,
-                         size_t accuracy_remainder,
-                         mlperf::TestScenario scenario) {
-    performance_sample_count_ =
-        samples_per_query * 16 + samples_per_query_remainder;
-    total_sample_count_ = performance_sample_count_ * 32 + accuracy_remainder;
-
-    log_settings_.log_output.prefix_with_datetime = false;
-
-    test_settings_.scenario = scenario;
-    test_settings_.mode = mlperf::TestMode::AccuracyOnly;
-    test_settings_.multi_stream_samples_per_query = samples_per_query;
-
-    double qps = 1e3;
-    test_settings_.server_target_qps = qps;
-  }
-};
-
-/// \brief Verifies all samples from the QSL are included at least once
-/// in accuracy mode.
-/// \ingroup LoadgenTestsBasic
-struct TestAccuracyIncludesAllSamples : public SystemUnderTestAccuracy {
-  void EndTest() override {
-    std::sort(issued_samples_.begin(), issued_samples_.end());
-
-    FAIL_IF(issued_samples_.size() < total_sample_count_) &&
-        FAIL_EXP(issued_samples_.size()) && FAIL_EXP(total_sample_count_);
-    FAIL_IF(issued_samples_.front() != 0) && FAIL_EXP(issued_samples_.front());
-    FAIL_IF(issued_samples_.back() != total_sample_count_ - 1) &&
-        FAIL_EXP(issued_samples_.back()) && FAIL_EXP(total_sample_count_);
-
-    mlperf::QuerySampleIndex prev = -1;
-    size_t discontinuities = 0;
-    size_t dupes = 0;
-    for (auto s : issued_samples_) {
-      if (s == prev) {
-        dupes++;
-      } else if (s - prev > 1) {
-        discontinuities++;
-      }
-      prev = s;
-    }
-
-    FAIL_IF(discontinuities != 0) && FAIL_EXP(discontinuities);
-    FAIL_IF(dupes != 0) && FAIL_EXP(dupes);
-  }
-};
-
-REGISTER_TEST_ALL_SCENARIOS(AccuracyIncludesAllSamples,
-                            TestProxy<TestAccuracyIncludesAllSamples>(), 4, 0,
-                            0);
-
-/// \brief Verifies samples from the QSL aren't included too many times.
-/// \details This is a regression test for:
-/// https://github.com/mlperf/inference/pull/386
-/// The root cause was using different values for samples_per_query while
-/// generating queries for the GNMT dataset.
-/// \ingroup LoadgenTestsBasic
-struct TestAccuracyDupesAreLimitted : public SystemUnderTestAccuracy {
-  void SetUpTest(bool, mlperf::TestScenario scenario) {
-    SystemUnderTestAccuracy::SetUpTest(4, 0, 0, scenario);
-    total_sample_count_ = 3003;
-    performance_sample_count_ = 1001;
-  }
-
-  void EndTest() override {
-    std::sort(issued_samples_.begin(), issued_samples_.end());
-
-    FAIL_IF(issued_samples_.size() < total_sample_count_) &&
-        FAIL_EXP(issued_samples_.size()) && FAIL_EXP(total_sample_count_);
-    FAIL_IF(issued_samples_.front() != 0) && FAIL_EXP(issued_samples_.front());
-    FAIL_IF(issued_samples_.back() != total_sample_count_ - 1) &&
-        FAIL_EXP(issued_samples_.back()) && FAIL_EXP(total_sample_count_);
-
-    std::vector<size_t> issue_counts(total_sample_count_, 0);
-    for (auto s : issued_samples_) {
-      issue_counts.at(s)++;
-    }
-
-    const size_t max_count = 1;
-    for (size_t i = 0; i < issue_counts.size(); i++) {
-      FAIL_IF(issue_counts[i] > max_count) && FAIL_EXP(i) &&
-          FAIL_EXP(max_count) && FAIL_EXP(issue_counts[i]);
-    }
-  }
-};
-
-REGISTER_TEST_ALL_SCENARIOS(TestAccuracyDupesAreLimitted,
-                            TestProxy<TestAccuracyDupesAreLimitted>(), true);
-
-/// \brief Verifies offline + accuracy doesn't hang if the last set
-/// in the accuracy series is smaller than others.
-/// \ingroup LoadgenTestsBasic
-struct TestOfflineRemainderAccuracySet : public SystemUnderTestAccuracy {
-  void SetUpTest() {
-    SystemUnderTestAccuracy::SetUpTest(4, 0, 7, mlperf::TestScenario::Offline);
-  }
-
-  void EndTest() override {
-    auto& flush_samples = samples_between_flushes_;
-
-    FAIL_IF(flush_samples.size() < 3) && FAIL_EXP(flush_samples.size()) &&
-        BAD_TEST_MSG("Test should generate multiple query sets.") && ABORT_TEST;
-
-    // The last counter will be 0, since a test ends with a call to
-    // FlushQuery.
-    FAIL_IF(flush_samples.back() != 0) && FAIL_EXP(flush_samples.back()) &&
-        FAIL_MSG(
-            "Detected stray calls to IssueQuery after the last call to "
-            "FlushQuery.");
-    flush_samples.pop_back();
-
-    // Verify the test ran with a smaller last accuracy set.
-    size_t first_size = flush_samples.front();
-    size_t last_size = flush_samples.back();
-    FAIL_IF(first_size <= last_size) && FAIL_EXP(first_size) &&
-        FAIL_EXP(last_size) && BAD_TEST_MSG();
-
-    flush_samples.pop_back();  // Don't check the last set for equality.
-    for (size_t query_size : flush_samples) {
-      FAIL_IF(query_size != first_size) && FAIL_EXP(query_size) &&
-          FAIL_EXP(first_size);
-    }
-  }
-};
-
-REGISTER_TEST(Offline_RemainderAccuracySets,
-              TestProxy<TestOfflineRemainderAccuracySet>());
-
-/// \brief Verifies all queries only contain samples that are contiguous,
-/// even if the set size is not a multiple of samples_per_query.
-/// \ingroup LoadgenTestsBasic
-struct TestMultiStreamContiguousRemainderQuery
-    : public SystemUnderTestAccuracy {
-  void SetUpTest(mlperf::TestScenario scenario) {
-    SystemUnderTestAccuracy::SetUpTest(4, 1, 0, scenario);
-    first_qsl_offsets_.resize(total_sample_count_, kBadQslOffset);
-
-    auto spq = test_settings_.multi_stream_samples_per_query;
-    FAIL_IF(performance_sample_count_ % spq == 0) &&
-        FAIL_EXP(performance_sample_count_) && FAIL_EXP(spq) &&
-        BAD_TEST_MSG("There is no remainder.");
-  }
-
-  void LoadSamplesToRamExt(
-      const std::vector<mlperf::QuerySampleIndex>& samples) override {
-    FAIL_IF(loaded_samples_.size() != samples.size()) &&
-        FAIL_MSG("Contiguous sample order is likely ambiguous.");
-    for (size_t i = 0; i < samples.size(); i++) {
-      auto& offset = first_qsl_offsets_.at(samples.at(i));
-      // Samples may be loaded into multiple slots for padding purposes,
-      // so make sure to only index the first time a sample appears in a
-      // loaded set.
-      if (offset == kBadQslOffset) {
-        offset = i;
-      }
-    }
-  }
-
-  void UnloadSamplesFromRamExt(
-      const std::vector<mlperf::QuerySampleIndex>& samples) override {
-    FAIL_IF(!loaded_samples_.empty()) &&
-        FAIL_MSG("Contiguous sample order is likely ambiguous.");
-    for (size_t i = 0; i < samples.size(); i++) {
-      first_qsl_offsets_.at(samples.at(i)) = kBadQslOffset;
-    }
-  }
-
-  void IssueQueryExt(const std::vector<mlperf::QuerySample>& samples) override {
-    size_t expected_offset = first_qsl_offsets_[samples[0].index];
-    for (auto s : samples) {
-      FAIL_IF(loaded_samples_[expected_offset] != s.index) &&
-          FAIL_MSG("Samples are not contiguous.");
-      expected_offset++;
-    }
-  }
-
-  void FlushQueriesExt() override {}
-
-  void EndTest() override {}
-
- private:
-  static const size_t kBadQslOffset;
-  std::vector<size_t> first_qsl_offsets_;
-};
-
-constexpr size_t TestMultiStreamContiguousRemainderQuery::kBadQslOffset =
-    std::numeric_limits<size_t>::max();
-
-REGISTER_TEST(MultiStream_RemainderQueryContiguous,
-              TestProxy<TestMultiStreamContiguousRemainderQuery>(),
-              mlperf::TestScenario::MultiStream);
-}  // namespace unit_tests
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h
deleted file mode 100644
index 777029b99..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test.h
+++ /dev/null
@@ -1,198 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief A minimal test framework.
-
-#ifndef MLPERF_LOADGEN_TESTS_LOADGEN_TEST_H_
-#define MLPERF_LOADGEN_TESTS_LOADGEN_TEST_H_
-
-#include <algorithm>
-#include <exception>
-#include <functional>
-#include <iostream>
-#include <map>
-#include <vector>
-
-#define REGISTER_TEST(name, ...) \
-  static Test::StaticRegistrant test##name(#name, __VA_ARGS__);
-
-#define REGISTER_TEST_SCENARIO(name, scenario, test, ...) \
-  static Test::StaticRegistrant t##name##scenario(        \
-      #name "_" #scenario, test, __VA_ARGS__, mlperf::TestScenario::scenario)
-
-#define REGISTER_TEST_ALL_SCENARIOS(name, test, ...)             \
-  REGISTER_TEST_SCENARIO(name, SingleStream, test, __VA_ARGS__); \
-  REGISTER_TEST_SCENARIO(name, MultiStream, test, __VA_ARGS__);  \
-  REGISTER_TEST_SCENARIO(name, Server, test, __VA_ARGS__);       \
-  REGISTER_TEST_SCENARIO(name, Offline, test, __VA_ARGS__);
-
-#define FAIL_IF(exp)                                              \
-  [&]() {                                                         \
-    const bool v = exp;                                           \
-    if (v) {                                                      \
-      std::cerr << "\n   ERROR: (" << __FILE__ << "@" << __LINE__ \
-                << ") : " #exp;                                   \
-      Test::AddFailure();                                         \
-    }                                                             \
-    return v;                                                     \
-  }()
-
-#define FAIL_MSG(...)                                                      \
-  [&]() {                                                                  \
-    std::cerr << "\n    Info: (" << __FILE__ << "@" << __LINE__ << ") : "; \
-    Test::Log(__VA_ARGS__);                                                \
-    return true;                                                           \
-  }()
-
-#define FAIL_EXP(exp)                                                      \
-  [&]() {                                                                  \
-    std::cerr << "\n    Info: (" << __FILE__ << "@" << __LINE__ << ") : "; \
-    std::cerr << #exp << " is " << (exp);                                  \
-    return true;                                                           \
-  }()
-
-#define BAD_TEST_MSG(...)                                        \
-  [&]() {                                                        \
-    FAIL_MSG("The test isn't testing what it claims to test. "); \
-    Test::Log(__VA_ARGS__);                                      \
-    return true;                                                 \
-  }()
-
-#define ABORT_TEST                                     \
-  [&]() {                                              \
-    FAIL_MSG("ABORTING");                              \
-    throw std::logic_error("ABORT_TEST encountered."); \
-    return false;                                      \
-  }();
-
-/// \brief Testing utilities.
-namespace testing {
-
-/// \brief Wraps a test class as a functor for easy registration.
-/// Forwards registration args to a SetUpTest method.
-/// \details Calls SetUpTest, RunTest, and EndTest.
-template <typename TestT>
-struct TestProxy {
-  template <typename... Args>
-  void operator()(Args&&... args) {
-    TestT test;
-    test.SetUpTest(std::forward<Args>(args)...);
-    test.RunTest();
-    test.EndTest();
-  }
-};
-
-/// \brief A collection of methods for registering and running tests.
-class Test {
-  /// \brief Maps registered test names to a callback.
-  using TestMap = std::multimap<const char*, std::function<void()>>;
-
-  /// \brief The registered tests.
-  /// \details Wraps a static local to avoid undefined initialization order
-  /// and guarantee it is initialized before the first test registers itself.
-  static TestMap& tests() {
-    static TestMap tests_;
-    return tests_;
-  }
-
-  /// \brief The number of errors the current test has encountered.
-  static size_t& test_fails() {
-    static size_t test_fails_ = 0;
-    return test_fails_;
-  }
-
- public:
-  /// \brief Registers a test before main() starts during static initialization.
-  struct StaticRegistrant {
-    template <typename... Args>
-    StaticRegistrant(Args&&... args) {
-      Test::Register(std::forward<Args>(args)...);
-    }
-  };
-
-  /// \brief Registers a test at runtime.
-  template <typename TestF, typename... Args>
-  static void Register(const char* name, TestF test, Args&&... args) {
-    std::function<void()> test_closure =
-        std::bind(test, std::forward<Args>(args)...);
-    tests().insert({std::move(name), std::move(test_closure)});
-  }
-
-  /// \brief Runs all currently registered tests that match the given filter.
-  static int Run(std::function<bool(const char*)> filter) {
-    // Determine which tests are enabled.
-    std::vector<TestMap::value_type*> enabled_tests;
-    for (auto& test : tests()) {
-      if (filter(test.first)) {
-        enabled_tests.push_back(&test);
-      }
-    }
-    const size_t enabled = enabled_tests.size();
-    std::cout << enabled << " of " << tests().size() << " tests enabled.\n";
-
-    // Run the tests.
-    std::vector<const char*> failures;
-    for (size_t i = 0; i < enabled; i++) {
-      const char* name = enabled_tests[i]->first;
-      std::cout << "[" << (i + 1) << "/" << enabled << "] : " << name << " : ";
-      std::cout.flush();
-      test_fails() = 0;
-      try {
-        enabled_tests[i]->second();  // Run the test.
-      } catch (std::exception& e) {
-        constexpr bool TestThrewException = true;
-        FAIL_IF(TestThrewException) && FAIL_EXP(e.what());
-      }
-      if (test_fails() > 0) {
-        failures.push_back(name);
-        std::cerr << "\n FAILED: " << name << "\n";
-      } else {
-        std::cout << "SUCCESS\n";
-      }
-    }
-
-    // Summarize.
-    if (enabled_tests.empty()) {
-      std::cerr << "Check your test filter.\n";
-    } else if (failures.empty()) {
-      std::cout << "All " << enabled << " tests passed! \\o/\n";
-    } else {
-      std::cout << failures.size() << " of " << enabled << " tests failed:\n";
-      for (auto failed_test_name : failures) {
-        std::cout << "  " << failed_test_name << "\n";
-      }
-    }
-    return failures.size();
-  }
-
-  /// \brief Used by test macros to flag test failure.
-  static void AddFailure() { test_fails()++; }
-
-  /// \brief Base case for the variadic version of Log.
-  static void Log() {}
-
-  /// \brief Used by test macros to log an arbitrary list of args.
-  template <typename T, typename... Args>
-  static void Log(T&& v, Args&&... args) {
-    std::cerr << v;
-    Log(std::forward<Args>(args)...);
-  }
-};
-
-}  // namespace testing
-
-// The testing namespace exists for documentation purposes.
-// Export the testing namespace for all files that define tests.
-using namespace testing;
-
-#endif  // MLPERF_LOADGEN_TESTS_LOADGEN_TEST_H_
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc
deleted file mode 100644
index 3dc5afa80..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/loadgen_test_main.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief A main entry point a test binary can use if it just wants to execute
-/// Test::Run on all statically registered tests.
-
-#include <regex>
-
-#include "loadgen_test.h"
-
-int main(int argc, char* argv[]) {
-  if (argc <= 1) {
-    std::cerr << "Usage: " << argv[0] << " <include_regex> <exclude_regex>\n";
-    return -1;
-  }
-  std::regex include_regex(argc >= 2 ? argv[1] : ".*");
-  std::regex exclude_regex(argc >= 3 ? std::regex(argv[2]) : std::regex());
-  auto test_filter = [&](const char* test_name) {
-    return (std::regex_search(test_name, include_regex) &&
-            !std::regex_search(test_name, exclude_regex));
-  };
-  return Test::Run(test_filter);
-}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc
deleted file mode 100644
index 56d562c3e..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.cc
+++ /dev/null
@@ -1,230 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Performance tests using a null backend.
-
-#include <future>
-
-#include "../loadgen.h"
-#include "../query_sample_library.h"
-#include "../system_under_test.h"
-#include "../test_settings.h"
-
-/// \brief Performance unit tests.
-namespace perf_tests {
-
-/// \defgroup LoadgenTestsPerformance Test Coverage: Performance
-
-/// \brief A simple SUT implemenatation that immediately completes
-/// issued queries sychronously ASAP.
-class SystemUnderTestNull : public mlperf::SystemUnderTest {
- public:
-  SystemUnderTestNull() = default;
-  ~SystemUnderTestNull() override = default;
-  const std::string& Name() override { return name_; }
-  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
-    std::vector<mlperf::QuerySampleResponse> responses;
-    responses.reserve(samples.size());
-    for (auto s : samples) {
-      responses.push_back({s.id, 0, 0});
-    }
-    mlperf::QuerySamplesComplete(responses.data(), responses.size());
-  }
-
-  void FlushQueries() override {}
-
- private:
-  std::string name_{"NullSUT"};
-};
-
-/// \brief A stub implementation of QuerySampleLibrary.
-class QuerySampleLibraryNull : public mlperf::QuerySampleLibrary {
- public:
-  QuerySampleLibraryNull() = default;
-  ~QuerySampleLibraryNull() = default;
-  const std::string& Name() const override { return name_; }
-
-  size_t TotalSampleCount() override { return 1024 * 1024; }
-
-  size_t PerformanceSampleCount() override { return 1024; }
-
-  void LoadSamplesToRam(
-      const std::vector<mlperf::QuerySampleIndex>& samples) override {
-    return;
-  }
-
-  void UnloadSamplesFromRam(
-      const std::vector<mlperf::QuerySampleIndex>& samples) override {
-    return;
-  }
-
- private:
-  std::string name_{"NullQSL"};
-};
-
-/// \brief Runs single stream traffic.
-/// \ingroup LoadgenTestsPerformance
-void TestSingleStream() {
-  SystemUnderTestNull null_sut;
-  QuerySampleLibraryNull null_qsl;
-
-  mlperf::LogSettings log_settings;
-  log_settings.log_output.prefix_with_datetime = true;
-
-  mlperf::TestSettings ts;
-
-  mlperf::StartTest(&null_sut, &null_qsl, ts, log_settings);
-}
-
-/// \brief A SUT implementation that completes queries asynchronously using
-/// std::async.
-class SystemUnderTestNullStdAsync : public mlperf::SystemUnderTest {
- public:
-  SystemUnderTestNullStdAsync() { futures_.reserve(1000000); }
-  ~SystemUnderTestNullStdAsync() override = default;
-  const std::string& Name() const override { return name_; }
-  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
-    futures_.emplace_back(std::async(std::launch::async, [samples] {
-      std::vector<mlperf::QuerySampleResponse> responses;
-      responses.reserve(samples.size());
-      for (auto s : samples) {
-        responses.push_back({s.id, 0, 0});
-      }
-      mlperf::QuerySamplesComplete(responses.data(), responses.size());
-    }));
-  }
-
-  void FlushQueries() override {}
-
- private:
-  std::string name_{"NullStdAsync"};
-  std::vector<std::future<void>> futures_;
-};
-
-/// \brief Tests server traffic using SystemUnderTestNullStdAsync.
-/// \ingroup LoadgenTestsPerformance
-void TestServerStdAsync() {
-  SystemUnderTestNullStdAsync null_std_async_sut;
-  QuerySampleLibraryNull null_qsl;
-
-  mlperf::LogSettings log_settings;
-  log_settings.log_output.prefix_with_datetime = true;
-  log_settings.log_output.copy_summary_to_stdout = true;
-
-  mlperf::TestSettings ts;
-  ts.scenario = mlperf::TestScenario::Server;
-  ts.server_target_qps = 2000000;
-  ts.min_duration_ms = 100;
-
-  mlperf::StartTest(&null_std_async_sut, &null_qsl, ts, log_settings);
-}
-
-/// \brief A SUT implementation that completes queries asynchronously using
-/// an explicitly managed thread pool.
-class SystemUnderTestNullPool : public mlperf::SystemUnderTest {
- public:
-  SystemUnderTestNullPool() {
-    samples_.reserve(kReserveSampleSize);
-    next_poll_time_ = std::chrono::high_resolution_clock::now() + poll_period_;
-    for (size_t i = 0; i < thread_count_; i++) {
-      threads_.emplace_back(&SystemUnderTestNullPool::WorkerThread, this);
-    }
-  }
-
-  ~SystemUnderTestNullPool() override {
-    {
-      std::unique_lock<std::mutex> lock(mutex_);
-      keep_workers_alive_ = false;
-    }
-    cv_.notify_all();
-    for (auto& thread : threads_) {
-      thread.join();
-    }
-  }
-
-  const std::string& Name() const override { return name_; }
-
-  void IssueQuery(const std::vector<mlperf::QuerySample>& samples) override {
-    std::unique_lock<std::mutex> lock(mutex_);
-    samples_.insert(samples_.end(), samples.begin(), samples.end());
-  }
-
-  void FlushQueries() override {}
-
- private:
-  void WorkerThread() {
-    std::vector<mlperf::QuerySample> my_samples;
-    my_samples.reserve(kReserveSampleSize);
-    std::unique_lock<std::mutex> lock(mutex_);
-    while (keep_workers_alive_) {
-      next_poll_time_ += poll_period_;
-      auto my_wakeup_time = next_poll_time_;
-      cv_.wait_until(lock, my_wakeup_time,
-                     [&]() { return !keep_workers_alive_; });
-      my_samples.swap(samples_);
-      lock.unlock();
-
-      std::vector<mlperf::QuerySampleResponse> responses;
-      responses.reserve(my_samples.size());
-      for (auto s : my_samples) {
-        responses.push_back({s.id, 0, 0});
-      }
-      mlperf::QuerySamplesComplete(responses.data(), responses.size());
-
-      lock.lock();
-      my_samples.clear();
-    }
-  }
-
-  static constexpr size_t kReserveSampleSize = 1024 * 1024;
-  const std::string name_{"NullPool"};
-  const size_t thread_count_ = 4;
-  const std::chrono::milliseconds poll_period_{1};
-  std::chrono::high_resolution_clock::time_point next_poll_time_;
-
-  std::mutex mutex_;
-  std::condition_variable cv_;
-  bool keep_workers_alive_ = true;
-  std::vector<std::thread> threads_;
-
-  std::vector<mlperf::QuerySample> samples_;
-};
-
-/// \brief Tests server traffic using SystemUnderTestNullPool.
-/// \ingroup LoadgenTestsPerformance
-void TestServerPool() {
-  SystemUnderTestNullPool null_pool;
-  QuerySampleLibraryNull null_qsl;
-
-  mlperf::LogSettings log_settings;
-  log_settings.log_output.prefix_with_datetime = true;
-  log_settings.log_output.copy_summary_to_stdout = true;
-
-  mlperf::TestSettings ts;
-  ts.scenario = mlperf::TestScenario::Server;
-  ts.server_target_qps = 2000000;
-  ts.min_duration_ms = 100;
-
-  mlperf::StartTest(&null_pool, &null_qsl, ts, log_settings);
-}
-
-/// @}
-
-}  // namespace perf_tests
-
-int main(int argc, char* argv[]) {
-  perf_tests::TestSingleStream();
-  perf_tests::TestServerStdAsync();
-  perf_tests::TestServerPool();
-  return 0;
-}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py
deleted file mode 100644
index 115372e18..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tests/perftests_null_sut.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-"""Python version of perftests_null_sut.cc.
-"""
-
-from __future__ import print_function
-from absl import app
-import mlperf_loadgen
-
-
-def load_samples_to_ram(query_samples):
-    del query_samples
-    return
-
-
-def unload_samples_from_ram(query_samples):
-    del query_samples
-    return
-
-
-def issue_query(query_samples):
-    responses = []
-    for s in query_samples:
-        responses.append(mlperf_loadgen.QuerySampleResponse(s.id, 0, 0))
-    mlperf_loadgen.QuerySamplesComplete(responses)
-
-
-def flush_queries():
-    pass
-
-
-def main(argv):
-    del argv
-    settings = mlperf_loadgen.TestSettings()
-    settings.scenario = mlperf_loadgen.TestScenario.SingleStream
-    settings.mode = mlperf_loadgen.TestMode.PerformanceOnly
-
-    sut = mlperf_loadgen.ConstructSUT(issue_query, flush_queries)
-    qsl = mlperf_loadgen.ConstructQSL(
-        1024 * 1024, 1024, load_samples_to_ram, unload_samples_from_ram
-    )
-    mlperf_loadgen.StartTest(sut, qsl, settings)
-    mlperf_loadgen.DestroyQSL(qsl)
-    mlperf_loadgen.DestroySUT(sut)
-
-
-if __name__ == "__main__":
-    app.run(main)
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb
deleted file mode 100644
index ab834d17a..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/tools/mlperf-trace.ipynb
+++ /dev/null
@@ -1,441 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Tool to extract usefull information from mlperf trace"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<Figure size 3600x2400 with 0 Axes>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "%matplotlib inline\n",
-    "# Ignore warnings\n",
-    "import warnings\n",
-    "warnings.filterwarnings('ignore')\n",
-    "\n",
-    "import json\n",
-    "import os\n",
-    "import seaborn as sns\n",
-    "from operator import itemgetter\n",
-    "import pandas as pd\n",
-    "import numpy as np\n",
-    "import matplotlib.pyplot as plt\n",
-    "\n",
-    "figsize=(10, 5)\n",
-    "font=10\n",
-    "\n",
-    "plt.figure(dpi=600)\n",
-    "plt.rc('xtick', labelsize=font) \n",
-    "plt.rc('font', size=font)\n",
-    "sns.set(font_scale=1.4, style=\"whitegrid\");"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def trace_to_df(fname):\n",
-    "    with open(fname, \"r\") as f:\n",
-    "        j = json.load(f)\n",
-    "    if type(j) == dict:\n",
-    "        j = j['traceEvents']\n",
-    "        \n",
-    "    result = []\n",
-    "    for item in j:\n",
-    "        name = item['name']\n",
-    "        if name not in [\"Latency\", \"Sample\", \"QuerySamplesComplete\", \"IssueQuery\"]:\n",
-    "            continue\n",
-    "\n",
-    "        args = item.get('args')\n",
-    "        d = {\"ts\": item['ts'], \"name\": name, \"dur\": item.get(\"dur\")}\n",
-    "\n",
-    "        if name == \"Latency\":\n",
-    "            d[\"issue_delay\"] = args[\"issue_delay\"]\n",
-    "            d[\"issue_to_done\"] = args[\"issue_to_done\"] / 1e3\n",
-    "            result.append(d)\n",
-    "        elif name == \"Sample\":\n",
-    "            if args:\n",
-    "                d[\"issue_start_ns\"] = args[\"issue_start_ns\"]\n",
-    "                d[\"complete_ns\"] = args[\"complete_ns\"]\n",
-    "                d[\"issue_to_done\"] = (args[\"complete_ns\"] - args[\"issue_start_ns\"]) / 1e3\n",
-    "                result.append(d)\n",
-    "        elif name == \"QuerySamplesComplete\":\n",
-    "            result.append(d)\n",
-    "        elif name == \"IssueQuery\":\n",
-    "            result.append(d)\n",
-    "\n",
-    "    df = pd.DataFrame(result)\n",
-    "    df = df.sort_values(by=[\"ts\"])\n",
-    "    return df\n",
-    "\n",
-    "BINS = 10"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>ts</th>\n",
-       "      <th>dur</th>\n",
-       "      <th>issue_delay</th>\n",
-       "      <th>issue_to_done</th>\n",
-       "      <th>issue_start_ns</th>\n",
-       "      <th>complete_ns</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>count</th>\n",
-       "      <td>2.000000e+04</td>\n",
-       "      <td>10000.000000</td>\n",
-       "      <td>5.000000e+03</td>\n",
-       "      <td>10000.000000</td>\n",
-       "      <td>5.000000e+03</td>\n",
-       "      <td>5.000000e+03</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>mean</th>\n",
-       "      <td>4.894584e+07</td>\n",
-       "      <td>17.731682</td>\n",
-       "      <td>7.001508e+04</td>\n",
-       "      <td>6112.554491</td>\n",
-       "      <td>7.001508e+04</td>\n",
-       "      <td>6.182570e+06</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>std</th>\n",
-       "      <td>2.839099e+07</td>\n",
-       "      <td>25.578639</td>\n",
-       "      <td>9.666462e+04</td>\n",
-       "      <td>2254.077235</td>\n",
-       "      <td>9.666462e+04</td>\n",
-       "      <td>2.263719e+06</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>min</th>\n",
-       "      <td>4.102560e+03</td>\n",
-       "      <td>1.152000</td>\n",
-       "      <td>8.810000e+02</td>\n",
-       "      <td>2754.967000</td>\n",
-       "      <td>8.810000e+02</td>\n",
-       "      <td>2.780383e+06</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>25%</th>\n",
-       "      <td>2.463025e+07</td>\n",
-       "      <td>3.974750</td>\n",
-       "      <td>5.806250e+04</td>\n",
-       "      <td>4100.473000</td>\n",
-       "      <td>5.806250e+04</td>\n",
-       "      <td>4.166623e+06</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>50%</th>\n",
-       "      <td>4.881766e+07</td>\n",
-       "      <td>7.364000</td>\n",
-       "      <td>6.159800e+04</td>\n",
-       "      <td>6089.880000</td>\n",
-       "      <td>6.159800e+04</td>\n",
-       "      <td>6.155939e+06</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>75%</th>\n",
-       "      <td>7.373552e+07</td>\n",
-       "      <td>27.441000</td>\n",
-       "      <td>6.835175e+04</td>\n",
-       "      <td>7337.257000</td>\n",
-       "      <td>6.835175e+04</td>\n",
-       "      <td>7.408272e+06</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>max</th>\n",
-       "      <td>9.832065e+07</td>\n",
-       "      <td>508.552000</td>\n",
-       "      <td>6.522433e+06</td>\n",
-       "      <td>22234.101000</td>\n",
-       "      <td>6.522433e+06</td>\n",
-       "      <td>2.414005e+07</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                 ts           dur   issue_delay  issue_to_done  \\\n",
-       "count  2.000000e+04  10000.000000  5.000000e+03   10000.000000   \n",
-       "mean   4.894584e+07     17.731682  7.001508e+04    6112.554491   \n",
-       "std    2.839099e+07     25.578639  9.666462e+04    2254.077235   \n",
-       "min    4.102560e+03      1.152000  8.810000e+02    2754.967000   \n",
-       "25%    2.463025e+07      3.974750  5.806250e+04    4100.473000   \n",
-       "50%    4.881766e+07      7.364000  6.159800e+04    6089.880000   \n",
-       "75%    7.373552e+07     27.441000  6.835175e+04    7337.257000   \n",
-       "max    9.832065e+07    508.552000  6.522433e+06   22234.101000   \n",
-       "\n",
-       "       issue_start_ns   complete_ns  \n",
-       "count    5.000000e+03  5.000000e+03  \n",
-       "mean     7.001508e+04  6.182570e+06  \n",
-       "std      9.666462e+04  2.263719e+06  \n",
-       "min      8.810000e+02  2.780383e+06  \n",
-       "25%      5.806250e+04  4.166623e+06  \n",
-       "50%      6.159800e+04  6.155939e+06  \n",
-       "75%      6.835175e+04  7.408272e+06  \n",
-       "max      6.522433e+06  2.414005e+07  "
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df = trace_to_df('/tmp/mlperf_log_trace.json')\n",
-    "df.describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoIAAAFKCAYAAACJoz5RAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAstklEQVR4nO3deZxcVZ338Q8ECIQASVgNyjIIP8AoPkRGUfRBkSXgOiIMjAsoKIroIAgzKItsowODKCIoI4IzqICODgjigiwqKtCAEiA/MLI8EEFIAoJgB0ieP84tUhSVdHelu6vS9/N+vfp1U/eeunWqTi/fnHvOuSssWrQISZIk1c+K3a6AJEmSusMgKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1tVK3KyBJdRIRmwB3A/tn5nndrc3z9ULdIuJdwDeAjTLz0W7UYbAiYhpwC/DKzJzZ5epIHTEISmNIROxH+SO6fWb+psvVGZSIWAc4Eng7sBHwJHAD8MXMvLybdRurIuKjwJM9GETHAScAZ/V6CATIzJkRcTlwPPAP3a6P1AkvDUvqmogISo/KIcCVwMeAfwPWAy6LiM91r3Zj2keB/drsvxdYDfivUa3NYm8BtgK+2qXX78TZwDsj4qXdrojUCXsEJXVFRKwMfBeYDLwhM69vOnYacAFwZET0ZebFo1y31TPzr6P5mp2qetFWysz+ZT1XZi4C/rbsterYB4AbM/OPXazDUP0UmE8J1p/pblWkoTMISmNcRKwPnATsSulpexS4CTg8M2+rymwLnAhsB6wBPARcC3woM5+KiB2Bq4A3ZubVTefehDZjyiJii+p8OwGrA3cAJ2Xmd5uq9i5gGnBMcwgEyMxnI+LDVZ0/C1xcnXfY69F0OX0n4B3A3sB61XPvBA7LzNNaPtNXAL8DPpqZZ7EEETEJOB14J7AI+F/gC23KXV297x1b9p8H7JiZm7S8z38FngA+AWwKvBm4OiIOq15rS2AicBflEvt/Np3zHmDj6t+Ne4zem5mbLOVz3IbSU7sDMI5y6f7ozPxFU5n9KJ/jjpTL/O8FJgA/oXwfPbykz6l6/qrAbsB/tDm2CPhsZh7Xsv8e4OrM3K96vBLwL8D7gJcAT1Wfwecz83+anjeY708iYi1KuHsXsCHwCHAN8KnMfAAgM5+u2u+dGAS1HPLSsDT2fRfYEzifckmwEUS2AIiIdSm9GpsB/065PHseJaStPtQXi4itgN8CL6/OdxgwF7g4It7TVPSt1fab7c6TmY9RgtNWEbHZCNaj4QxgW0poPiYz7wJ+DbQr+x5gAXDhUl5/har+76X0bn4GmEpph2X1XuBwSvA6FPhTtf9QYCZlzNqnKIH+nIg4qOm5/wzcD8yqzvPeat+S3sdWwC+A/wOcAhxXvY+fRcQb2jzldGAbSoA/i9LOXx7Ee5oOrALcOIiyS3Is5b1fA3y8+vcs4O8bBQb7fRERq1fnORT4OSV0f4USolsvA/dRvk8nL0Pdpa6wR1Aaw6oeqR0oPRinNh1qHnv3WmAKsGtmNv8RPrbDl/0iMAd4VWY+Ve07MyJ+AnwuIi6oLkFuDTyWmfcu5Vy/q7ZbA7NHqB4NT1B6355p2vdN4KyI2DozbweIiBWBfYDLMnPeUl7/bcAbgCMz89+r554F/GyI76OdjYHNM/NPLfu3yMwnmx6fUb3fwylj2cjMH0TEicAjmfnfg3itk4BVgelVOCYivkEJWKcBr2opPxfYufHZVp/XxyNirSrcL8mW1XZZLgu/Bbg8Mw9cSpnBfl98ihJo92oZmnBSFfKb/RFYgTK+8bplqL806uwRlMa2pyg9VztGxJQllGn8cX5LNW6vY9VrvBm4CFg9ItZpfAFXUC6vbVEVXwN4fIBTNo6vMYL1aDinJQRC6fHrp/SaNewIvJiBJ1TsDiyk9IoB5ZI3cOZQ3ssS/KBNCKQRAiNi5YiYUr3fq4DNqsucQ1KNP9wVuLQRAqvXeYTSazy9GnrQ7OstAfsXlMvJGw/wcmtX2/lDrWeTx4CXVZd+X2CI3xd7Are1G5/a8v6a67zOMtRd6gqDoDSGVRMIjqSMvXooIn4ZEUdFxEuail1DuXx8LDA3Ii6NiAOrS2ND9VJKz8hxwMMtX42xX+tV28cZOOA1jv95BOvR8IIex8ycD1wC7NvUC/QeYB5w2QB12Bh4MDNbw+6dg3sLS9W2dzQi3h4RN1L+AzCX8n5Prg4POQgC61LG+WWbY3dU201a9t/X8rgRkgZ72bS1t20ojqG8z4yI2yLitIho7rEcyvfFZpTL7EOpc2tAlHqeQVAa4zLzdGBzyqWux4CjgTuqiRdk5qLMfDfwasr4rnWArwG3RkTjj+KS/sCNa3nc+J3yBWDnJXw1/rjeDqwVERstpfqvqLaNy4UjUY+Gp2jvm5T1Dd9QTWh4F3BRZi5YSr2HarDvq+EFdY2IHYDvU9ZhPAjYg/I+G2NCR+v3/bNL2D9QwHuk2g5lnN3zPp/MvJYS4N4P3EyZNHJ9RBxRFenk+2IwGnV+ZKmlpB7kGEGpBjLzbkrIOz0iXkxZu+/TwNVNZa4HrgeOiYgZwOXAgZQxYo1enUktp2693NcIbM9k5kBj4S4F9qX8sT6x9WBErEmZfXpT03IiI1GPgVxB6ZF8L7A+sCaDW2fvXmDniFijpVew3WXL+cDftdk/0OXUZntSln7ZJTOfWwImIt7Ypuxge64epgTLaHOsMabvniHUcWkaPYybUkJcs/m0tHlErAK8qPUkVS/uN4FvRsRqlO/jz0bEfzC074vZlAlTg7Ep5TOdNcjyUs+wR1AawyJiQvXH8DmZeT8l2EyqykxuM/j9pmo7qdreS+npaZ0l+tGWc/+ZMibtwIjYsE191m16+D3gNuBfWi7fNcamnUXpaTmp6dBI1GOpqnGDF1CC1geBP2TmYCYEXE75HfuRptddETi4TdnZwJbN9aqWbHndYOtJ+VwW0fR7vZrF+oE2Zf/KIHreqjGNVwBvbZ65XY21ez9lzb+HhlDHpemjBNnWySdQPp/WNv8QLT2CEbF28+NqMsgsymSX1Yb4ffFdynjDd7cp1/rzMh2YVYVQablij6A0tm0B/DwiLqaErn7KJIatKDNJofxBPzgivk/5g7sasD8lWHwXylIu1TkOqdZ0m02Zodk6zg5K8PkV8PuIOKcqux7l0vPWVEtvVOuvvYuyNMcvI+JcShiYTOkp/D/Aic3rv41EPQbpm5RlRHahjC8bjEur1/+3an2+2yjrFLabtHMu8EngxxHx9aqeB1XPWXMIr/dJ4KcR8V/V6xwIPAhs0FL2RuCjEXEsZcziE5l56RLO+xnK+/5lRJxJCWsHUv6TsOcg6zagzFwQEVdQLs8e1XL4P4GzI+J7lKWOtqFMYmm9FHtHRFxLWefwkarcAcAPM/OJqsxgvy9OoQwD+HZE7EL53pwEzKCMRbwGnlsY/f+yfN0NRXqOPYLS2Pb/KL1Zr6f0rJ1CmRn5wcxsDI6/hnJJeC/K0hpHUcLDmzLzt03nOoSyLt5BlEu591FC5PNkZlJ6dS6hXPY9k9JjtxJlfGJr2W0o68ztTFmn7RRKCHx/Zj6v/EjVYyCZeQvw++rhYJZcITMXUpaQuQD4J8rn/6cl1PWOqo5rUZZkeRvlUvRNrWWX8npXV+eeQhkG8AHK2ohfalP8eBYHx29V5ZZ03jsoSxDdTJl49FnK98ebqzF5w+lc4FURsWnL/nOAz1N6Bf+Dcil2Z0rPZrPTKTO6j6R8T+1GWSppn0aBwX5fVHeWeUN1fDfK5/gxyhqMz82gpsxCnkKZRS0td1ZYtMhJTpJ6R0S8nLLkyL2UW88tbe25URMRNwALMnMol2s1BNWl85mU5WqO7HZ9BiMiLgEWZuY7ul0XqRP2CErqKZl5K2WSSADfryYFdFVEvJLSizQcdwXRElS9qEcDH6kWQ+9pETGNMtTCW8tpuWWPoCQtQfWHfjplfOCLgE1b7twhScs1ewQlacn2pNzPdzXgHw2BksYaewQlSZJqyuVjOtDX1zce2I4yA3BJq+hLkiT1gnGU4S03TJ8+vb/5gEGwM9tRZjVKkiQtL14P/LJ5h0GwM38C2GKLLVhlleGd0Dhz5kymTRvsXY00GmyT3mJ79Bbbo/fYJr2lF9pjwYIF3HnnnVDll2YGwc48C7DKKqswfvz4YT/5SJxTy8Y26S22R2+xPXqPbdJbeqg9XjCczVnDkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNeWdRXrY439dwJP9z3S7GsNiwviVWGP14b0dnyRJWjYGwR72ZP8zXHnDfd2uxrDYabuNDIKSJPUYLw1LkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSppgyCkiRJNWUQlCRJqimDoCRJUk0ZBCVJkmrKIChJklRTBkFJkqSaMghKkiTVlEFQkiSpplbqdgUaImIiMAvYENguM29sOvY+4ChgE2A2cHxmXtjy/JWB44H3A5OAG4BPZOYtLeU2AL4I7AYsAn4I/HNmPjIS70uSJKlX9VKP4HG0CaYRsSdwPvB9YAbwM+DbETGjpegXgIOBY4G3AwuAKyNiatO5VgKuAF4OvA84AHgtcElErDDM70eSJKmn9USPYERMAw4CPgl8teXwCcDFmfmv1eOrImIr4LPAj6rnb1g9/+OZeU617zfA3cA/A0dUz30XsA0wLTNvq8rNAX5FCZmXj8T7kyRJ6kW90iN4JvBl4M7mnRGxKbAl8J2W8t8CtouIdavHuwDjgOcuF2fm45TLvrs3PW934NZGCKzKXQfc21JOkiRpzOt6EIyI9wIvBU5sc3irant7y/5GkIumcg9l5tw25baIiBWbyrWeq1Fuy6HUW5IkaXnX1SAYEWsBpwBHZOYTbYpMrraPtuyfX22nNJVrLdMotzIwcRDlprTZL0mSNGZ1e4zgicBdmXlBl+vRkZkzZ47Iefv6+gBYbc11mTNnzoi8xmibO3cC99/9cLer0bFGm6g32B69xfboPbZJb+nl9uhaEIyIl1EmeOwcEZOq3Y2eu4kRsQaLe/4mAQ82Pb3RUziv2s6vyrSaDDwNPDGIcvPa7F+qadOmMX78+KE+ban6+vqYPn06AA/Ne5KpU58c1vN3y9prr8P6m2/U7Wp0pLlN1H22R2+xPXqPbdJbeqE9+vv7l9h51c1Lw5tTguhVlIA2H7i0OnYV8AvgjurxVi3P3braZrW9A1gvIlov724N3JmZC5vKtZ6rUW5WB+9BkiRpudXNIPhL4I0tX4dWxw4CDsjMuykBbe+W5+4D3JCZjWuNPwEWAns1ClQLVL+V5y8Jcznw8mr5mUa511AWqnbpGEmSVCtduzRc3cnj6uZ9EY1JwPQ13VnkGODCiJgN/JSyWPQuwB5N53ogIs4GPh8Rz1CWgzkcWAE4veklvgf8HvhuRPwr5f2fAvyaak1CSZKkuuj68jEDycyLgf2BPYEfA7sC+2Zma3A7FDiLMgHlEmA14M2ZOafpXM9Qbi03E/hv4BvAb4C3ZeaiEX4rkiRJPaXbs4afJzOvpvTite4/n3KbuaU992ngX6qvpZV7kBdeapYkSaqdnu8RlCRJ0sgwCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNWUQVCSJKmmhhwEI2LXiFhhJCojSZKk0dNJj+CPgPsj4pSI2Ga4KyRJkqTR0UkQfAfwK+Bg4KaI+H1EHB4RU4e1ZpIkSRpRQw6CmXlJZu4FrA8cCDwMfA64NyJ+EhHviYgJw1xPSZIkDbOOJ4tk5uOZeW5m7gRsDBwFrAecDzwUEd+MiJ2GqZ6SJEkaZsM1a3gcsDIwHlgBeAp4M/DTiLg5IqYN0+tIkiRpmKzU6RMjYi1gL+A9wOuAZ4DLgH+ptguBtwFfAL4BbLeslZUkSdLwGXIQjIh3UMLf7sCqwA3AJ4BvZ+a8luI/iIh1gK+0Oc8/AJ8EtgQmAg8A3wdOyMzHmsrNAE4Ctq7KnJ6ZZ7Q53+GUCSwbALcBR2bmlS1l1gBOAfas6n4VcEhm3jOkD0GSJGkM6OTS8P8Arwa+CGydma/OzDPbhMCG3wMXtNk/BbgW+BCwW3W+DwAXNwpExPbAJcDNwAxKz+LpEXFQ84mqEHgycCawB3AXcFmb5W2+TemlPATYG5gKXOnkFkmSVEedXBreBbgyMxcNpnBmXg9c32b/f7bsujoi/gZ8NSKmZuYc4Bjgpsz8YFXmqojYCDg2Ir6WmQsjYjzwGUpP4akAEXENcCvwacrlayLi1ZSQuEdmXl7tuxWYDexHm15LSZKksayT5WN+NtgQ2IFHqu0qVcB7E3BhS5lvUS7/bls9fi2wFvCdpjo+C1wEzGi6C8ruwGPAFU3l7qOsibj78L4NSZKk3tfJLea+EBF3LeX4nRFxyhDONy4iVo2I6ZQewEuqMXubAasAt7c85bZqu2W13ara3tGm3ERgw6ZyszJzYZtyWyJJklQznVwa3oMX9tI1uxB4N/CpQZ5vLqVHD0pv3b7VvydX20dbys+vtlOayvVn5lNLKXd/Va71XI1yU9rsH9DMmTM7edqA+vr6AFhtzXWZM2fOiLzGaJs7dwL33/1wt6vRsUabqDfYHr3F9ug9tklv6eX26CQIvgS4ZynH763KDNaOwARgGmWs36URsXMH9Rp106ZNY/z48cN6zr6+PqZPnw7AQ/OeZOrUJ4f1/N2y9trrsP7mG3W7Gh1pbhN1n+3RW2yP3mOb9JZeaI/+/v4ldl51EgT/Amy6lON/R1lQelAy85bqn9dFRB9wI/BOFl8SntTylEZPYWOW8nxgfESsmpl/G6BcuyQyuamMJElSbXSyfMzPgQ9Xs3efJyI2AT5clenELZSFqF9Kmc27gMVjABu2rrazqm1jbGC7co9T1h5slIumySPN5WYhSZJUM50EwWMoPYkzI+KLEfGh6utLlDUDVwSO7rA+21fP/2Nm9lMC5V4tZfYBHgRuqh5fR5kNvHejQESMq553RdMM58spvYu7NpV7CbBDdUySJKlWhnxpODPviojXURZvPqTl8DWUO3XkQOeJiB8DV1Jm7f4NeCVlgsnvgR9UxY4Hro2IcyiLUr8OOBA4uDH7NzP7I+JE4OSIeJgSEA+gzDpuTDwhM38bEZcBX4+IwyiXuI8H7gPOG9qnIEmStPzr6F7DmXkbsGN1+7i/q3bPzsy5QzjN9ZRb1TXGG94DnA2clpkLqtf5dUS8nXLXkPcBc4BDM/PslvqcGhEAHwfWp4TLPTLzdy2vuQ9wKmXx6PGUW8y9OzPHxowMSZKkIegoCDZk5iMsXgR6qM89mkFcQq7uAjLgpdvqriKnDlDmccoYxg8PspqSJEljVkdBsBqDtyulN3Ay0DoBY1FmnrCMdZMkSdIIGnIQjIhXAd8DXswLA2DDIsAgKEmS1MM66RH8CrAa8A7gF5n56HBWSJIkSaOjkyD4CuDTmXnpcFdGkiRJo6eTdQTvZ8mXhCVJkrSc6CQIfg44MCLWHO7KSJIkafR0cml4CvBX4A8R8V3g/wHPtpRZlJmnLGvlJEmSNHI6CYKfa/r3QUsoswgwCEqSJPWwToLgpgMXkSRJUq/r5F7D945ERSRJkjS6Or7FXERsDuwIrAdckJn3RMQqwAbAg437BUuSJKk3dXJnkRWBs4EPUpaRWQT8GrgHWAW4FTge+I9hq6UkSZKGXSfLxxwFfAA4GtiepjUFM/MJyu3n/mFYaidJkqQR00kQ3B84NzNPBv7Q5vitwObLVCtJkiSNuE6C4IuB65dy/Clgjc6qI0mSpNHSSRB8ENh4KcenA84sliRJ6nGdBMHvAR+pZg03LAKIiBnA+4CLhqFukiRJGkGdBMHjgPuAm4ELKCHwqIj4DfBD4HfAvw1XBSVJkjQyhhwEM/MvwGuBk4H1gb8BOwATKSHxDZn51DDWUZIkSSOgowWlM/NvlCB48vBWR5IkSaOlk0vDkiRJGgM6ubPIuYMotigzP9hBfSRJkjRKOrk0/CaqWcJNxgEvqrYPA39dxnpJkiRphA05CGbmJu32R8TKwIeBfwZ2XqZaSZIkacQN2xjBzHw6M78M/AT48nCdV5IkSSNjJCaL/A54wwicV5IkScNoJILgzsCTI3BeSZIkDaNOZg0fs4RDkyg9gdsCn1uGOkmSJGkUdDJr+Lgl7J8PzAYOAs7ptEKSJEkaHZ3MGnYRakmSpDHAUCdJklRTnYwR3KiTF8rM+zp5niRJkkZGJ2ME7+GFdxYZjHEdPEeSJEkjpJMgeADwceAlwLeAO6v9AewD3Ad8CVg4HBWUJEnSyOgkCL4IGA+8NDPnNx+IiGOBXwEbZOa/DUP9JEmSNEI6mSxyEPC11hAIkJlzKUvHfGRZKyZJkqSR1UkQXBuYuJTjq1dlJEmS1MM6CYK/AT4REdNbD0TEq4BPAL9d1opJkiRpZHUyRvBjwNXA9RFxA3BXtX9zYDtgHnDIsNROkiRJI2bIPYKZeTvwcsrM4EnAntXXJOCLwMsz87bhq6IkSZJGQic9gmTmQ8Ch1ZckSZKWQx0FwYaI2BxYD5iZmY8N8bnvBv4JmA5MAWYDZwFfzcyFTeVmACcBWwMPAKdn5hltznc4cDCwAXAbcGRmXtlSZg3gFEoP5qrAVcAhmXnPUOouSZI0FnR0r+GI2Dci7gNmAddSwhwRsU5E3BkRew3iNIcB/cCngLcAP6Bcbv580+tsD1wC3AzMAL4BnB4RB7XU53DgZOBMYA/KuMXLImKbltf8NvA2yhjGvYGpwJURMWHQb16SJGmM6ORew+8C/hv4KXA6cGrjWGY+EhF3AO8DLhrgVG/NzIebHl8VEROBj0XEZzKzHzgGuCkzP9hUZiPg2Ij4WmYujIjxwGcoPYWnVnW8BrgV+DSwV7Xv1ZSQuEdmXl7tu5XSE7kf8JWhfhaSJEnLs056BD8N/CwzdwXOb3P8t0BrT9wLtITAhpspl2ynVAHvTcCFLWW+Rbn8u231+LXAWsB3ms79LCWIzoiIFarduwOPAVc0lbuPcieU3QeqryRJ0ljTSRDcCvj+Uo7/GVi3s+rwesryM38GNgNWAW5vKdOYkbxlU30A7mhTbiKwYVO5Wc3jD5vKbYkkSVLNdDJZ5K8s/c4imwGPDPWk1WLU+wOfzcxnI2JydejRlqKNW9tNqbaTgf7MfGop5e6vyrWeq1FuSpv9A5o5c2YnTxtQX18fAKutuS5z5swZkdcYbXPnTuD+u9t1Ai8fGm2i3mB79Bbbo/fYJr2ll9ujkyD4c2C/iPhi64GImAocCPzvUE4YERsA3wOup2mySK+bNm0a48ePH9Zz9vX1MX16uWnLQ/OeZOrUJ4f1/N2y9trrsP7mG3W7Gh1pbhN1n+3RW2yP3mOb9JZeaI/+/v4ldl51OkbwRcCNwEeBRcDuEfE5ygSNhcBnB3uyiFgL+BHwJPC2zHy6OtTo0ZvU8pRGT+G8pnLjI2LVQZRrPVej3Lw2+yVJksa0Tu4schfwOuBB4DhgBeCTwBHALcAO1SSMAVXh7RLKWoS7ZebcpsOzgQUsHgPYsHW1nVVtG2MD25V7nLL2YKNcNE0eaS43C0mSpJoZUhCMiHHV8i0PZeYuwDrAq4HtgfUzc6fMvHOQ51qJMrP3FcCMzLy3+Xi1fMzPqZZ/abIPJYTeVD2+jjIbeO/melbPuyIzF1W7L6f0CO7aVO4lwA7VMUmSpFoZ6hjBFSk9dUcCp2XmfOCGDl/7TOCtlJ7ECRHxmqZjt2fmX4DjgWsj4hzgAkpP5IHAwY3Zv5nZHxEnAidHxMOUgHgAZdLKvo0TZuZvI+Iy4OsRcRjQOP99wHkdvgdJkqTl1pB6BKvxe3Mo4wKXVaNn7t+BX7d8bVu93q+BtwPbAT+mBLxDM/PslnqdChwFfJwy3nBLysLRv2t5zX2AH1IWj76Y0rP45swcGzMyJEmShqCTWcPfoMwaPisz/9bpC2fmJoMsdzmDuHRbhcFTByjzOPDh6kuSJKnWOgmCdwLjgFkRcT7wR6B1DT8yc6BbzEmSJKmLOgmC/93076OXUGYRA99rWJIkSV00qCAYEV8Czs/MPuCN1e6JlJ7AZ0eobpIkSRpBg+0R/BjwG6AvM6+JiLUp9wPeOTOvGbHaSZIkacR0cmeRhtaFmSVJkrQcWZYgKEmSpOWYQVCSJKmmhjJr+O8i4u+rf69VbbeMiCfaFc7M65epZpIkSRpRQwmCn62+mp3RptwKlOVjxnVaKUmSJI28wQbB/Ue0FpIkSRp1gwqCmXn+SFdEkiRJo8vJIpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1dRK3XzxiHgpcDjwGmAaMCszp7UpNwM4CdgaeAA4PTPPaFPucOBgYAPgNuDIzLyypcwawCnAnsCqwFXAIZl5z/C9M0mSpN7X7R7BlwF7AH8Abm9XICK2By4BbgZmAN8ATo+Ig1rKHQ6cDJxZnfMu4LKI2KbllN8G3gYcAuwNTAWujIgJw/SeJEmSlgtd7REELs3M/wWIiPOAV7UpcwxwU2Z+sHp8VURsBBwbEV/LzIURMR74DKWn8NTqfNcAtwKfBvaq9r2aEhL3yMzLq323ArOB/YCvjMi7lCRJ6kFd7RHMzIVLO14FvDcBF7Yc+hbl8u+21ePXAmsB32k697PARcCMiFih2r078BhwRVO5+4BfVcckSZJqo9uXhgeyGbAKL7xsfFu13bLablVt72hTbiKwYVO5WW0C6G1N55IkSaqFbl8aHsjkavtoy/751XZKU7n+zHxqKeXur8q1nqtRbkqb/Us1c+bMoT5lUPr6+gBYbc11mTNnzoi8xmibO3cC99/9cLer0bFGm6g32B69xfboPbZJb+nl9uj1INjTpk2bxvjx44f1nH19fUyfPh2Ah+Y9ydSpTw7r+btl7bXXYf3NN+p2NTrS3CbqPtujt9gevcc26S290B79/f1L7Lzq9UvDjR69SS37Gz2F85rKjY+IVQdRrvVcjXLz2uyXJEkas3o9CM4GFrB4DGDD1tV2VrVtjA1sV+5xytqDjXLRNHmkudwsJEmSaqSng2Bm9gM/p1r+pck+wIPATdXj6yizgfduFIiIcdXzrsjMRdXuyyk9grs2lXsJsEN1TJIkqTa6fWeRCSxetmVjYM2I2LN6fENm3gscD1wbEecAFwCvAw4EDm7M/s3M/og4ETg5Ih6mBMQDKLOO9228Xmb+NiIuA74eEYcBf6nOfx9w3oi+WUmSpB7T7cki6wEXt+xrPN4fOC8zfx0Rb6fcNeR9wBzg0Mw8u/lJmXlqRAB8HFifsiTMHpn5u5bz7wOcSlk8ejzlFnPvzsyxMStDkiRpkLoaBKv7+7aO12tX7nIGcem2uqvIqQOUeRz4cPUlSZJUWz09RlCSJEkjxyAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaYMgpIkSTVlEJQkSaopg6AkSVJNGQQlSZJqyiAoSZJUUwZBSZKkmjIISpIk1ZRBUJIkqaZW6nYFVA8LFy7ioXlPdrsaHVltzXWfV/cJ41dijdVX6WKNJEkaHgZBjYr+p5/lut/P6XY1OjJnzhymTl0cBHfabiODoCRpTPDSsCRJUk0ZBCVJkmqqdpeGI2Jz4AxgB+Ap4DvAkZm5fA5gkyRJ6lCtgmBETAKuAu4F9gTWA04D1gX+sXs1kyRJGn21CoLAh4HJwCsz8xGAiHgGuCAiTsjM27paO0mSpFFUtzGCuwNXNkJg5XtAPzCjO1WSJEnqjrr1CG4FnNu8IzP7I2I2sOUQzjMOYMGCBcNYtcX6+/sBeObpBay04sIReY3R9uwzTy+372XVlVd4Xt0XLOjngYdGpu1H07gVV+DZhYu6XY0hGz9xCg889Ojz9q22ykqsPmHl7lRIz/3OUu+wTXpLt9ujKa+Maz1WtyA4GXi0zf75wJQhnOdFAHfeeecwVOmFZs6c+dy/NxtKrXrYo3++Z7l9L5tNWZMyr6j48wN/7F5lBMCDT8zrdhXUpPl3lnqDbdJbeqg9XgTMbt5RtyA4XG4AXg/8CXi2y3WRJElamnGUEHhD64G6BcH5wKQ2+ycDswZ7kunTp/cDvxymOkmSJI202e121m2yyB2UcYLPiYjxwGYMIQhKkiSNBXULgpcDO0XE2k373gmMr45JkiTVxgqLFi1/swY7VS0oPRO4BziBxQtKX5mZLigtSZJqpVY9gpn5KPAm4Angf4AvABcCH+hitSRJkrqiVj2CkiRJWqxWPYKSJElazCAoSZJUUwZBSZKkmqrbgtI9KyI2B84AdqDcz+w7wJGZ+WRXKzbGRMRLgcOB1wDTgFmZOa1NuRnAScDWwAPA6Zl5RptyhwMHAxsAt1Ha7MqRewdjR0S8G/gnYDrlFo+zgbOAr2bmwqZytsUoiYh/AD5Juff6RMrn/X3ghMx8rKmcbdIFETGRsubthsB2mXlj07H3AUcBm1B+lo7PzAtbnr8ycDzwfsrNFW4APpGZt4xC9Zd7EbEf8I02h87MzI81lVuufj7sEewB1bI2VwFrAHsChwH7AOd2sVpj1cuAPYA/ALe3KxAR2wOXADcDMyg/+KdHxEEt5Q4HTgbOrM55F3BZRGwzYrUfWw4D+oFPAW8BfgB8Cfh8o4BtMeqmANcCHwJ2A75IWVXh4kYB26SrjqNNB05E7AmcTwntM4CfAd+uAkmzL1CCx7HA24EFwJURMXUE6zwW7QZs3/R1auPA8vjz4azhHhARRwLHABtn5iPVvn2BC4BpmXlbN+s3lkTEio3epog4D3hVa49gRPwImJKZr27a9zXgrcCGmbmwuiPNQ8DXMvOIqsw44FZgZmbuNSpvaDkWEetm5sMt+04DPgJMysx+26L7IuJDwFcpn/cc26Q7ImIa8BtKj+1XaeoRjIg7gFubP9eI+Anl5+jvq8cbAvcCH8/Mr1T71gDuBs5ttJOWrKlHcN3G3+o2ZZa7nw97BHvD7pRFrZu/sb5H6S1p/R+dlkHzJcd2qh/QN1HWl2z2LUr3/bbV49cCa1Eu4TfO/SxwETAjIlYYrjqPVa0hsHIzsCowxbboGY3fS6vYJl11JvBl4M7mnRGxKeVS/ndayn8L2C4i1q0e7wKMo6ntMvNx4IeUv0FaRsvrz4dBsDdsRctlyszsp4zz2LIrNaqvzYBVeOFl40avbKM9GvesvqNNuYmUMTwautcD84A/Y1t0TUSMi4hVI2I65WrFJZl5D7ZJV0TEe4GXAie2Odz4rJfUJtFU7qHMnNum3BYRYR4YvJkR8WxE3B0Rx0ZE43L9cvnzYcP3hsnAo232z6eM2dHomVxtH23ZP7/aTmkq15+ZTw1QToMUEa8C9ge+UP3v2LbonrmUSWs3An8C9q322yajLCLWAk4BjsjMJ9oUGUqbtJZplFuZEkC0dH+ijK/cjzJO8PvA0cB/VseXy58PZw1L6rqI2IAyHOJ6miaLqGt2BCZQZtZ/Brg0Inbuao3q60Tgrsy8oNsVqbvM/DHw46ZdP42Ix4DjIuKELlVrmdkj2BvmU6byt5pMuUym0dP4H9mklv2N/+nNayo3PiJWHaCcBlD1ePwIeBJ4W2Y+XR2yLbokM2/JzOsy82vAO4E3VlvbZBRFxMuAg4CjI2JStcJEo+duYjXZYyht0lqmUe5poF1vowZ2UbXdluX058Mg2BvuYPGYAeC5QaebUdaM0uiZTVlSYauW/VtX20Z7NMZ2tCv3OGXtKA2g+kV4CbAesFvL+CXbojfcAiykjFGzTUbX5pQrd1dRwsN84NLq2FXAL1j6Zw2Q1fYOYL2IaL3suDVw50AT6TQoy+XPh0GwN1wO7BQRazfteycwvjqmUVJN0vk50Dp9fx/gQeCm6vF1wGPA3o0C1fT/vYArMtN1mQZQDbC+CHgFMCMz720+blv0jO0pfyv+aJuMul9SemObvw6tjh0EHJCZd1MCxt4tz90HuKFpdv5PKIG+eYmZiZRlTfw707l/BBYBfcvrz4djBHvDV4FDgP+txhmsB5wGXJiZbRc9VmciYgKLl0rYGFizWowVyi/Neykr718bEedQ1nJ8HXAgcHDjf83VGncnAidHxMOUH/ADKL24+6LBOJPyR+gIYEJEvKbp2O2Z+Rdsi1EVET8GrqTMXvwb8ErKgt+/pyz4DbbJqKmWFLu6eV9EYxIwfU13FjkGuDAiZgM/pSwWvQtloeLGuR6IiLOBz0fEM5Q1BQ8HVgBOH7l3MXZUPx8/B2ZSQvUM4KPA1zPzj1Wx5e7nwwWle0REbEG5q8LrWXyLuSO8xdzwiohNKAuotrN/Zp5Xldudsur7VsAcykzWL7U53+GUEL8+5Y/nEd5Ca3Ai4h5KGG/njZl5dVXOthgl1X9E3w5sWu26hzKJ57QqmDfK2SZdEhE7Ui4Lt95i7v288BZz32l57srACZRZr2ux+BZzN49G3Zd3EXE6Jfy9mNKRdhfVnUOqlQ4a5Zarnw+DoCRJUk05RlCSJKmmDIKSJEk1ZRCUJEmqKYOgJElSTRkEJUmSasogKEmSVFMGQUmSpJoyCEqSJNXU/we2dyrPLzUr8gAAAABJRU5ErkJggg==\n",
-      "text/plain": [
-       "<Figure size 720x360 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAApsAAAFKCAYAAABSGJRzAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA3OklEQVR4nO3debgcVZn48W8EuRDZQtiMIy4sr2DUGQIjICiKigFFHFnEUQSF0ZHFBRQXREDEhYyiAu4KjiCL/lQQxAUQHRDBwAAB8oIIZDAjsoRFYS6Y5PfHqSZFc29yb9+u23f5fp4nT6er3q46dbq679unzjk1ZenSpUiSJElNeEqvCyBJkqSJy2RTkiRJjTHZlCRJUmNMNiVJktQYk01JkiQ1xmRTkiRJjTHZlMagiDg6IsbUvGQRsTQivtLrckgjERFTIuLaiPhEr8syFBFxTkSc3etySCOxcq8LIE0Ww0ge92+0IGNEREwFPgj8KjN/1ePiNCIidgQuAfbJzDN7XJwhiYinAe8H9gI2Bh4DrgW+BpyemWPqR1AH9gE2Ab7Q64IM0aeA30fEizLz2l4XRuqEyaY0et7a9vzfgG2At7ctvxz4LvDp0ShUD00FPl79/1c9LIcqEbEBcBGwOfA94EvAasC/AP8JzI6It2bmkt6VcsQ+AHw/M+/pdUGGIjOvjojfA4fz5O8QaVww2ZRGSWZ+t/48Il4J/HP78pq/N18q6QlOoySab8jMc2vLvxARJ1ASnv8GThjNQkXEKsCSzBzRZyIi/gn4R+DIbpRrFJ0FHBsRB2Xmg70ujDRcJpvSGBQRRwMfz8wptWW3A/MpLZ5zgOcDtwKHZubFEbE7cCywGXAjcGBmzm3b7mbAccBOwNOAm4BPZub3h1G2vSktks8FEjgiMy9si1mritkD2BC4E/g2cHxmLo6IZwO3VeEfj4hWC+dpwH8A1wF7ZOYPqu1Fdex/yMxNa/v5T+Clmfms2rKtgWOAlwCrAHOBj2XmJW1lfDrwCeC1wDTgj8AXM/PLtZgdKZfB3ww8B3g3sC5wGfDOzPzDUOutts3VgaOBNwIzgAeBG4CjMvPXVcwmwPHADsA6wL3Ab4GDM/N/a/W3f2ae2rb9pcAxmXn0MI91G2Bn4FttiWbLh4HXAx+KiJMy85GGyrEjpc7fAmxKafl/BvBPEXEF8I3MPLRtX9OB/wU+n5lHDFD2lt2BxcDFba8/mrbPW7V8P8p5+5zMvL1atiXlM7Q1sAZwF/Br4N8y85EqZgpwMOXqxaaU9/g8ymflnrZ9vIpSt1sBU4CbgS9n5jdqYb+gfOZ3Bs5ZzvFJY5IDhKTx5bmUy5vnAx8C1gbOjYg3A18EzgCOquLOiYiVWi+MiM2B3wEvAD4LHEZJYs6JiLcMcf8vAb4MnA18FFgVOC8itq/tZzVKsrAfpTvAwZQ/7kcDX63C7gb+vfr/DymXB99arZ8HLAJeWtvvS4ElwCZVwtKyA+UPfWvfLwN+Q0nQjgWOAPqAn1dJTCtufeAK4DXAKcB7qv2eEhEDtXp9kHIpeQ6lD902wOmD1tLyfRk4hHLc7wY+Q6mPF1VleyrwM2B74OQq5hRgA0pyOizDONbXVY/fGWg7VaviGZS63a7BcrR8hJKQf4HyPi4EfgTsHRHtDSV7A08drOw12wE3tpLCDo5hPUritzHlM3QwcCowk/LjreXLwOcon7f3UPq77gFcEhGr1rb3Vsp7vUG1vQ8CVwK7tu36RuARyudPGnds2ZTGl00pLXm/AYiImyh/rL4FbJ6Zt1XL76ckbi8Hflm99guUP9hb1f7YnhwRPwc+HRFDGfwxE9guM39b7edU4BZKa2sr4Xwf8Dxgy8ycXy37WkTcBhwXESdkZkbE9yl/lK8boIvBZTwx2dwB+CmwY7X8rIh4JvAs4JPVa6ZUx/xfwKtax1KNoL+G0lLYSpKOoyShL8jMu6tlX4mIrwMfqVru7q/tf1XgRZn5aLXNRZRLyzMzc94K6qzda4GvZ+b7B1m/BeXHwp5tLc7HDXM/9dcN5Vi3qNYtbxBKa90WlL6dTZSjZQ3KOf231oKI+A5lgM+rgQtqsW8BrsnMG1ZQhudRWro7tR0l2d45M39fW95qmScitgPeCbwtM79TW34h5YfQvpTPw5rAScDVwA71BLg6lx+XmX+PiP9h2XskjSu2bErjy82tRLPyu+rxV61Es235cwEiYh3glZQWyadFxLqtf8CFlMuUmw1h/79vJZoAmXkvpbXrJRExrVq8FyXhu6dtP62kd8ch7Oc3wAury/FQEsyLKS1jrSR0h1oslJbBqMozvbbfNSmtUS+OiKnVH/I9KK3DS9vK+HPKgJgXt5XnO61Es22fzx3CsbR7oCrLMwZZ3+qTt3M1MrxjwzzWNarHh5azyda6NZYTM9JytHynnmhWfkG5XP74QJmIeC6wLWUA04pMp7Sad+qB6vG1VQv0QPYC/gpc2Hac8ymX3F9exb2acm5+ur2ldZAffYsoXTikcceWTWl8WVB/kpkPlO6M/E9bXOuPYisB3ITSH+zo6t9A1qf0wVyeWwZYdnP1+CzKH8TNKInf3QPEtvazIr+h/BjePiKuq7b9a2B1YM8qZgfgL7XW01ay/M3lbHc60E+pl7fz5JkABivjgrbnrYRlGsP3AUrf1AURcQ0l2f/PzEyAzLwtIj5HmX7oLVUr73nAd6vkfjjWY+jHWk8k7x8ktpVk/qXBcrTc2h5Q9ff9LnBQRKyRmQ9RWjUXU7qXDMWUFYcM6lLg+5SWzPdHxKXAucAZtcR4M8p5etcg22gd58bV41BbxqcA433aKU1SJpvS+LJ4mMtbf1hbVzE+zxMvP9YN93LwYJ5CaYX81CDr/ziEbfye0kftpZR+qQ9RLoWvARxdtdTuQGlBre8XSl/WwS6V3l1tD0py8q1B4tovx66ofocsM8+JiN9QBtu8GjgU+GBE7JeZZ1Qxh0XEt4Ddqpj/AI6MiJdl5o0MknTU++hWWnUylGO9kTKA5oXU+sG2eWH12HoPmyhHy2D9Kr9DSdj/hZK0/yvwi8z88yDxdfcw8A+EwZK4JxxH1eK4Z0T8M6U7xKso/TE/HBHbZOZfKMd6L/CmQbbZacvqNJYNqpPGFZNNaXJoJQd/z8xfLjdy+TYdYFmrRfGO6vFWYI0h7GfQVprMfKwaefxSYC3g8qpV6wrKlFCvp/Rf+3rtZa2WsIeWt++IuJuSvK48wrroWJUYfRX4akSsTekecAylC0Ar5gZKAvapiHghJYF+H3AgyxKWtds2/ay258M51vMog3L2ZYBks0og38yy0dc0VI7lysx5EXE18Naqz/JmlLobipsoswq0WwQQEWu39RttP45WGa6kDOQ5KiJmU37AHUjpP3wrJQm9IjP/upyytM7XmZRL7IOqBkQ9k8F/KEpjmn02pUmganG5BDhwoL6C1SjbodgqIratvW46JQG5PDNbicdZwNYRscsA+1kjIvqqpw9Xj4Ndiv4NMIvyh/vX1XE8Qmn1PILSqlhPiuYCf6Bc3nxSn8LWMWbmYsql0N0j4kWDxTUhIlaq9UOlKs/9lBartauYNQcYbX0TpaVv7eo1D1Ja6V7aFvfutm0P+Vgz8wpK/8n9I6J9NDSURGoz4DOt+S6bKMcQnUbp+/hBShL7wyG+7jJgi2rGhLpW4vf4cVT9Zd/WVs5p7YN3KAN8YFnCfRblb+tR7Tuv3v/W+f5zSv/cD7WXZ4B9bEEZpHb5wIcljW22bEqTx79T/theV40AvpXSf+zFlD9mmwxhG/OAn0TElyh/5P+Ncmn7w7WYEyjT6Pw4Ik6jJIGrUVpw9qRMvXR7NU/jDcCbIuJmyqXH2zKzNbjpNyybxqmeVP6akmw+SG3kdGYuiYh3UPpA3lhdhr6TMl3QyyjJaWtwxocoA5V+W9XFDZSk9x+BN1D+sDdhDeBPEfGDquwPUqazeQ1lZDLAKyizBHyf0od2CmVqnzUoiUzLNyiJyjcoCfhLGXiQ13COdV9KF4hzI+IMynuwKuWS9csog3Y+37b9JsqxIt+jTEP1RuDUYUxl9GNKK+grKIOVWn5O6Zf7zSiT1y+m9C29G9ioFvc2Sn/RH1I+P6tRbi/bSqbJzF9HxMnAB6oW6Z9R+glvQhkkdVRV5gcj4j2UbgW/r+r7Xsr8uc+g1HnLqyg/Nn42xOOUxhRbNqVJohqAshVlQMO+LJvDcWXgY0PczGXVa/amTCXUD+zemoy82s8jlKTiM5TE40TK5dnNKRN61/vWvQO4ndIn8Xssm3sTyiTmfwf+j3LJsqU1Evyy9tsmVuXYhnJZ+t2UBO7twH1VeVpxf6Ek2d+g9FM8iXKJekPK/KNNeZhS7y+g1PmJlPfk8Gr/UJLQnwK7UBKqT1ASzt3bpkI6ljIYag/KHI0rAbPbdzicY83Mu6rYY4B/okyX9QVKovnxzHxCS19T5ViRauqkn1ZPhzIKvfW6ayktkXu1LX+MkvDeSqnvQ6tyntS2iUsp5+JelHr5COV8fkXtRxKZeTDl3F6H0iL8aUrf27OpTShfTYT/Wsr5+RFK/W1L6dJQtxfww8x8AGkcmrJ0qYPbJEkDq7pdXE5pnNg2M+/scZEAiIhzKD8sntX+o2MFr9uH0tf3WR2M7h91Ue5Y9HtgVmZe0+vySJ2wZVOSNKjM/BPlMv9UytyRa/e2RI/fjWg3ypRRQ040K2dSWjDf2+1yNeTDwPdNNDWe2bIpSRoXIuI5lD6ub6dcbt50rLS0ShqcA4QkSePFy4BvU25isJ+JpjQ+2LIpSZKkxtiy2YG5c+f2AVtT7tE72J1FJEmSxoKVgKcDV82aNat/tHdustmZrVk2/YokSdJ40H6b31FhstmZ/wXYbLPNWGWVVRrbybx585g5c2Zj2x8vrIfCeiish8J6KKyHwnoorIeivR4effRRbr75Zqjyl9FmstmZxQCrrLIKfX19K4odkaa3P15YD4X1UFgPhfVQWA+F9VBYD8Ug9dCTrn/OsylJkqTGmGxKkiSpMSabkiRJaozJpiRJkhpjsilJkqTGmGxKkiSpMSabkiRJaozJpiRJkhpjsilJkqTGeAehMWz1tdblrvse7nUxumJq38qs8bTmbu0pSZLGJpPNMWzx0ilcdNWCXhejK3baeiOTTUmSJiEvo0uSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGrNyr3YcEXsC/wrMAtYBbgW+DHw1M5dUMacCbxvg5Xtm5vfbtnc4cBCwIXADcERmXtQWswZwArAHsCpwCXBIZt7etQOTJEnS43rZsnkY0A98AHgt8CPgi8Bn2uL+CGzb9u/iekCVaB4PnAzsCtwCnB8RL2rb1veA3YBDgL2BGcBFETG1WwclSZKkZXrWsgm8LjPvrj2/JCJWBw6OiCMzs79a/khmXjHYRiKiDzgSODEz51TLLgWuBz4K7FUtezElEd01My+oll1PaVHdDzilmwcnSZKkHrZstiWaLddQLm+vM4xNbQesBZxZ2/Zi4GxgdkRMqRbvAjwAXFiLWwBcVq2TJElSl/WyZXMgOwD3AX+pLds4Iu4HngbMAz6dmWfV1m9ePd7Utq0bgNWBZwB3VnHzW/1B2+J27krpJUmS9ARjJtmMiK2A/YFjqpZJKC2dV1ESwrWAA4AzI2K1zDy1ipkG9GfmI22bXFQ9rkNJNqcB9w+w60UMryX1cfPmzevkZUO22prrsXDhwkb3MVruvXcqd942UGP20MydO7eLpRm/rIfCeiish8J6KKyHwnooxlI9jIlkMyI2BH4AXEltgFBmfqEt9McRcTFwDHDqqBVwEDNnzqSvr6+x7d94ywJmzJjR2PZH0/Tp67LBpht19Nq5c+cya9asLpdo/LEeCuuhsB4K66GwHgrroWivh/7+/sYbyJan5/NsRsRawE+Bh4HdMvOxFbzkHGCjiFiver4I6IuIVdviplWP99Xi1h5ge9NqMZIkSeqiniabVYJ4LrA+8JrMvLeDzbT6am7etnwL4CHgT7W4qA0YqsfN72C/kiRJWoGeJZsRsTJlxPgLgdmZeccQXjOFMpXRHbXR7JdTRpnvXYtbqYq7MDOXVosvoLRs7lyLeyawfbVOkiRJXdbLPpsnA68DPghMjYhtautupFzePo0yEfsfKIniAcCOwFtbgZnZHxHHAcdHxN3A1VXcxsCba3G/i4jzgW9GxGHAg8CxwALGQP9PSZKkiaiXyWarhfGzA6x7OXAdpcXySMpl9scoieRumXlePTgz50QEwKHABpTR67tm5rVt290HmEOZwL2PcrvKPTPz4W4ckCRJkp6oZ8lmZj57CGGvH8b25lASyeXFPAS8s/onSZKkhvV8NLokSZImLpNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNWblXu04IvYE/hWYBawD3Ap8GfhqZi6pxc0GPglsAfwJODEzvzTA9g4HDgI2BG4AjsjMi9pi1gBOAPYAVgUuAQ7JzNu7fXySJEnqbcvmYUA/8AHgtcCPgC8Cn2kFRMS2wLnANcBs4NvAiRHxrvqGqkTzeOBkYFfgFuD8iHhR2z6/B+wGHALsDcwALoqIqV0+NkmSJNHDlk3gdZl5d+35JRGxOnBwRByZmf3AUcDVmfmOWsxGwMcj4muZuSQi+oAjKS2ecwAi4lLgeuCjwF7VshdTEtFdM/OCatn1lBbV/YBTGj5eSZKkSadnLZttiWbLNZTL2+tUSeQrgLPaYs6gXCrfsnq+HbAWcGZt24uBs4HZETGlWrwL8ABwYS1uAXBZtU6SJEldNtYGCO0A3Af8BdgYWAW4sS3mhurxedXj5tXjTQPErQ48oxY3v94ftBb3PCRJktR1vbyM/gQRsRWwP3BMZi6OiGnVqvvbQhdVj+tUj9OA/sx8ZDlxd1Zx7dtqxa0zwPIVmjdvXicvG7LV1lyPhQsXNrqP0XLvvVO587aBGrOHZu7cuV0szfhlPRTWQ2E9FNZDYT0U1kMxluphTCSbEbEh8APgSmoDhMa6mTNn0tfX19j2b7xlATNmzGhs+6Np+vR12WDTjTp67dy5c5k1a1aXSzT+WA+F9VBYD4X1UFgPhfVQtNdDf39/4w1ky9Pzy+gRsRbwU+BhYLfMfKxa1WqZXLvtJa0Wz/tqcX0RseoQ4tq31Yq7b4DlkiRJGqGeJptVgngusD7wmsy8t7b6VuBRlvXJbNmiepxfPbb6ag4U9xBlbs5WXNQGDNXj5iNJkqSu61myGRErU0aMvxCYnZl31NdXUx9dTDV1Uc0+wJ+Bq6vnl1NGme9d2/ZK1esuzMyl1eILKC2bO9finglsX62TJElSl/Wyz+bJwOuADwJTI2Kb2robM/NB4Fjg1xHxdeB04CXAgcBBrVHlmdkfEccBx0fE3ZQk9ADKaPY3tzaYmb+LiPOBb0bEYUBr+wuAUxs9UkmSpEmql8lmq4XxswOseznwq8z8bUS8nnJ3oH2BhcD7MvMr9eDMnBMRAIcCG1CmM9o1M69t2+4+wBzKBO59lNtV7pmZD3fnkCRJklTXs2QzM589xLgLGMJl7uruQXNWEPMQ8M7qnyRJkhrW89HokiRJmrhMNiVJktQYk01JkiQ1xmRTkiRJjTHZlCRJUmNMNiVJktQYk01JkiQ1ZtjJZkTsPMD9xSVJkqQn6aRl86fAnRFxQkS8qNsFkiRJ0sTRSbK5O3AZcBBwdURcFxGHR8SMrpZMkiRJ496wk83MPDcz96Lcg/xA4G7g08AdEfHziHhLREztcjklSZI0DnU8QCgzH8rMb2XmTsCzgI8A6wOnAXdFxHciYqculVOSJEnjULdGo68EPBXoA6YAjwCvBH4REddExMwu7UeSJEnjyMqdvjAi1gL2At4CvAT4O3A+8KHqcQmwG/B54NvA1iMtrCRJksaXYSebEbE7JcHcBVgVuAp4D/C9zLyvLfxHEbEucMoIyylJkqRxqJOWzf8H/An4AnBaZs5fQfx1wOkd7EeSJEnjXCfJ5quBizJz6VCCM/NK4MoO9iNJkqRxbtjJZmb+somCSJIkaeLp5HaVn4+IW5az/uaIOGFkxZIkSdJE0MnUR7sCZy1n/VnA6zorjiRJkiaSTpLNZwK3L2f9HVWMJEmSJrlOks0HgecsZ/1zKZO6S5IkaZLrJNm8GHhnRGzUviIing28s4qRJEnSJNfJ1EdHAbOBeRHxbeCGavlMYD9gMfCxrpROkiRJ41onUx/dEhEvAU4GDmlbfSlwSGZmNwonSZKk8a2je6Nn5g3AjtWtKJ9bLb41M+/tWskkSZI07nWUbLZk5j3APV0qiyRJkiaYjpLNiFgJ2JnSqjkNmNIWsjQzPzHCskmSJGmcG3ayGRFbAT8A/oEnJ5ktSwGTTUmSpEmuk5bNU4DVgN2B32Tm/d0skCRJkiaOTpLNFwIfzczzul0YSZIkTSydTOp+J4NfPpckSZIe10my+WngwIhYs9uFkSRJ0sTSyWX0dYC/AX+IiO8D/0O5a1Dd0sw8YaSFkyRJ0vjWSbL56dr/3zVIzFLAZFOSJGmS6yTZfE63dh4RmwCHA9tQ7q0+PzNntsWcCrxtgJfvmZnfb4s9HDgI2JByz/YjMvOitpg1KInwHsCqwCWUW2ze3oVDkiRJUk0n90a/o4v7fz6wK/A7Sv/RwfqQ/hH417ZlN9efVInm8cBHgKuBA4HzI+LFmXltLfR7wJaU+7o/CBwLXBQRL8jMh0d2OJIkSarr+HaVEbEpsCOwPnB6Zt4eEatQWhX/nJmPDmEz52Xmj6vtnQpsNUjcI5l5xXLK0gccCZyYmXOqZZcC1wMfBfaqlr2YktzumpkXVMuuB24F9qPMISpJkqQuGfZo9Ih4SkR8DZgPfJXSMvjcavUqlATvkKFsKzOXDHf/g9gOWAs4s7btxcDZwOyIaE3VtAvwAHBhLW4BcFm1TpIkSV3UydRHHwHeDnwM2JbanJuZ+VfKrSz/pSulW2bjiLg/Ih6LiGsiYu+29ZtXjze1Lb8BWB14Ri1u/gBJ7g3A87paYkmSJHV0GX1/4FuZeXxETB9g/fXAa0dWrCe4BriKkhCuBRwAnBkRq2XmqVXMNKA/Mx9pe+2i6nEdymT004D7B9jHoipmWObNmzfclwzLamuux8KFCxvdx2i5996p3Hnb3R2/fu7cuV0szfhlPRTWQ2E9FNZDYT0U1kMxluqhk2TzH4Arl7P+EWCNzorzZJn5hbZFP46Ii4FjgFO7tZ9OzJw5k76+vsa2f+MtC5gxY0Zj2x9N06evywabbtTRa+fOncusWbO6XKLxx3oorIfCeiish8J6KKyHor0e+vv7G28gW55OLqP/GXjWctbPAro5Yn0g5wAbRcR61fNFQF9ErNoWN616vK8Wt/YA25tWi5EkSVKXdJJs/gD492o0estSgIiYDexLGZgzmlp9NTdvW74F8BDwp1pc1AYM1ePmN1c8SZKkyamTZPNoYAGlL+XplETzIxFxBfAT4FrgU90qYLsqUdwLuCMzW50AL6eMMt+7FrdSFXdhZi6tFl9AadncuRb3TGD7ap0kSZK6qJNJ3R+MiO2A9wN7Av9HSdZupSSiJ2Tm/w1lWxExlWVTDj0LWDMi9qieX1U9nkaZiP0PlETxAMr8nm+tlak/Io4Djo+IuymTuh8AbAy8uRb3u4g4H/hmRBzGskndF9Dj/p+SJEkTUUeTulfJ5PHVv5FYn9L/sq71fH/gXEqL5ZFV7GOURHK3zDyvrUxzIgLgUGADyuj1XdvuHgSwDzCHMoF7H+V2lXt69yBJkqTu6/gOQt1Q3Y+8vf9ku9cPY3tzKInk8mIeAt5Z/ZMkSVKDhp1sRsS3hhC2NDPf0UF5JEmSNIF00rL5CqrR5zUrAU+vHu8G/jbCckmSJGkC6GSA0LMHWh4RT6Vcmn4v8KoRlUqSJEkTQidTHw0oMx/LzJOAnwMndWu7kiRJGr+6lmzWXAu8tIHtSpIkaZxpItl8FeA0QpIkSepoNPpRg6xam9KiuSXw6RGUSZIkSRNEJ6PRjx5k+SLKXYTeBXy90wJJkiRp4uhkNHoTl94lSZI0AZk4SpIkqTGd9NncqJMdZeaCTl4nSZKk8auTPpu38+Q7CA3FSh28RpIkSeNYJ8nmAcChwDOBM4Cbq+UB7AMsAL4ILOlGASVJkjR+dZJsPh3oAzbJzEX1FRHxceAyYMPM/FQXyidJkqRxrJMBQu8CvtaeaAJk5r2UaY/+faQFkyRJ0vjXSbI5HVh9OeufVsVIkiRpkusk2bwCeE9EzGpfERFbAe8BfjfSgkmSJGn866TP5sHAr4ArI+Iq4JZq+abA1sB9wCFdKZ0mjCVLlnLXfQ939NrV1lyv49c2YWrfyqzxtFV6XQxJksaFTu4gdGNEvAD4EDAb2KNadQfwBeCzmfnn7hVRE0H/Y4u5/LqFHb124cKFzJgxdpLNnbbeyGRTkqQh6qRlk8y8C3hf9U+SJEkaUEfJZktEbAqsD8zLzAe6UyRJkiRNFB3dGz0i3hwRC4D5wK+BWdXydSPi5ojYq4tllCRJ0jg17GQzIt4IfBe4CfgAMKW1LjPvqZbv260CSpIkafzqpGXzo8AvM3Nn4LQB1v8OeNGISiVJkqQJoZNkc3Pgh8tZ/xdgvc6KI0mSpImkk2Tzbyz/DkIbA/d0VhxJkiRNJJ0kmxcD+0XEkyYajIgZwIHAz0ZaMEmSJI1/nfbZfDrwe+DdwFJgl4j4NHA9sAQ4pmsllCRJ0rg17GQzM28BXgL8GTiaMhr9/cAHgf8Gts/MBd0roiRJksarYU3qHhErAc8A7srMV0fENGATStL6x8y8u4EySpIkaZwa7h2EngLcChwBfC4zFwFXdb1UkiRJmhCGdRk9Mx8DFlL6aUqSJEnL1ckAoW9TRqOv2u3CSJIkaWIZ7mV0gJuBlYD5EXEa8EfgkfagzDx7hGWTJEnSONdJsvnd2v8/NkjMUmCFyWZEbAIcDmwDzATmZ+bMAeJmA58EtgD+BJyYmV8aIO5w4CBgQ+AG4IjMvKgtZg3gBGAPYFXgEuCQzLx9ReWVJEnS8AzpMnpEfDEiZlVPX179ex3wytrz+r9XDHH/zwd2Bf4A3DjIvrcFzgWuAWZTLuOfGBHvaos7HDgeOLna5i3A+RHRfp/27wG7AYcAewMzgIsiYuoQyyxJkqQhGmrL5sHAFcDczLw0IqZT7oH+qsy8dAT7Py8zfwwQEacCWw0QcxRwdWa+o3p+SURsBHw8Ir6WmUsiog84ktLiOafa3qWUSeY/CuxVLXsxJRHdNTMvqJZdTxlhvx9wygiORZIkSW06GSDUMmWkO8/MJctbXyWRrwDOalt1BuVS+ZbV8+2AtYAza9teTLmUPzsiWmXdBXgAuLAWtwC4rFonSZKkLhpJsjkaNgZW4cmX2G+oHp9XPW5ePd40QNzqlInoW3HzB0hyb6htS5IkSV3SyQCh0TStery/bfmi6nGdWlx/ZraPiq/H3VnFtW+rFbfOAMuXa968ecN9ybCstuZ6LFy4sNF9jJb+WGdExzKW6uHee6dy5229uVnW3Llze7LfscZ6KKyHwnoorIfCeijGUj0MJ9l8bkT8c/X/tarH50XEXwcKzswrR1SycWDmzJn09fU1tv0bb1nAjBkzGtv+aOrrW7XjY1m4cOGYqofp09dlg003GvX9zp07l1mzZq04cIKzHgrrobAeCuuhsB6K9nro7+9vvIFseYaTbB5T/at70vRDlL6cSylzcY5Uq2Vy7bblrRbP+2pxfRGxamb+3wriBsoSptViJEmS1CVDTTb3b7QUg7sVeJTS1/LC2vItqsf51WOrr+bmlCmS6nEPUebmbMW9KiKmZObStrj5SJIkqauGlGxm5mlNF2SQ/fZHxMWUqYs+X1u1D/Bn4Orq+eWUUeZ7UyWbEbFS9boLa4nlBZSplHamSl4j4pnA9sB7Gj0YSZKkSainA4SqidRbUw49C1gzIvaonl+VmXcAxwK/joivA6cDLwEOBA5qjSqvktLjgOMj4m5KEnoAZTT7m1v7y8zfRcT5wDcj4jDgwWr7C4BTGz1YSZKkSajXo9HXB85pW9Z6vj9wamb+NiJeT7k70L7AQuB9mfmV+osyc05EABwKbECZzmjXzLy2bfv7AHMoE7j3UW5XuWdmPty1o5IkSRLQ42Szuh/5CieHr+72c8EQ4uZQEsnlxTwEvLP6J0mSpAaN9UndJUmSNI6ZbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqzMq9LsCKRMR+wLcHWHVyZh5ci5sNfBLYAvgTcGJmfmmA7R0OHARsCNwAHJGZFzVQdE1QS5Ys5a77Hh71/a625npd3e/UvpVZ42mrdG17kiQNZMwnmzWvAR6oPf9z6z8RsS1wLvAd4DDgJcCJEfFYZn6lFnc4cDzwEeBq4EDg/Ih4cWZe2/whaCLof2wxl1+3cNT3u3DhQmbM6F6yudPWG5lsSpIaN56SzbmZec8g644Crs7Md1TPL4mIjYCPR8TXMnNJRPQBR1JaPOcARMSlwPXAR4G9Gi6/JEnSpDPu+2xWSeQrgLPaVp1BuVS+ZfV8O2At4MxWQGYuBs4GZkfElOZLK0mSNLmMp5bNeRGxHrAAOBX4ZGb+HdgYWAW4sS3+hurxecDvgc2r5zcNELc68Azgzu4XW5IkafIaD8nm/wIfB64EFgOzgY8BzwH2A6ZVcfe3vW5R9bhO9TgN6M/MR5YTN6xkc968ecMJH7bV1lyPhQtHv29gE/pjnREdy1iqh5Eey0h0c7/33juVO2+7u2vbG01z587tdRHGBOuhsB4K66GwHoqxVA9jPtnMzJ8BP6st+kVEPAAcHRGf6FGxAJg5cyZ9fX2Nbf/GWxYwY8aMxrY/mvr6Vu34WMrAmLFTDyM5lpHodj1Mn74uG2y6Ude2N1rmzp3LrFmzel2MnrMeCuuhsB4K66For4f+/v7GG8iWZ7z22Ty7etySZS2Ta7fFtFo876seFwF9EbHqCuIkSZLUJeM12ay7FXiUZX0yW7aoHudXj62+mgPFPUSZm1OSJEldNF6TzTcBSynTIfUDF/PkqYv2oczFeXX1/HLKPJ17twIiYqXqdRdm5tKmCy1JkjTZjPk+mxHxM0oyOQ9YQhkg9G7gm5n5xyrsWODXEfF14HTKpO4HAgdl5hKAzOyPiOOA4yPibkoSegBlNPubR/GQJEmSJo0xn2xSLn+/HfgHSnlvAY4ATmwFZOZvI+L1lLsD7QssBN5Xv3tQFTcnIgAOBTagTHu0q3cPkiRJasaYTzYz873Ae4cQdwFwwRDi5gBzRlwwSZIkrdB47bMpSZKkccBkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUmJV7XQBJvbFkyVLuuu/hXhdj2FZbc70nlXtq38qs8bRVelQiSdLymGxKk1T/Y4u5/LqFvS7GsC1cuJAZM56YbO609UYmm5I0RnkZXZIkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGJNNSZIkNcZkU5IkSY0x2ZQkSVJjTDYlSZLUGO+NLmncW7JkKXfd9/CKA8eBqX0re593SROKyaakca//scVcft3CXhejK3baeiOTTUkTipfRJUmS1BhbNiVpDBlql4DV1lxvTHcdsDuApBaTTUkaQ4baJWDhwoXMmDF2k027A0hqmXTJZkRsCnwJ2B54BDgTOCIzx+63tiRJ0jg1qZLNiFgbuAS4A9gDWB/4HLAe8KbelUySJGlimlTJJvBOYBrwj5l5D0BE/B04PSI+kZk39LR0kjRBjNZ0VKPRd9X+p9LITLZkcxfgolaiWfkB8C1gNmCyKUldMFrTUY1G31X7n0ojM9mSzc0pieXjMrM/Im4FnjeM7awE8Oijj3axaE+2ZMliVn7Kkkb3MVoW//2xjo9l1adOGVP1MJJjGYlu10OvjmOkBqqH8XosAxnqsYy1z0W70XpPRqMe/v7Yo/T3r9ToPrqhv7+/10UYE6yHol4PtXylJyfylKVLl/Zivz0REY8BH8vMT7ct/y/gL5n5L0PZzty5c7cHftNAESVJkpqyw6xZs/5rtHc62Vo2u+UqYAfgf4HFPS6LJEnS8qwEPJ2Sv4y6yZZsLgLWHmD5NGD+UDcya9asfmDUfxlIkiR16NZe7Xiy3a7yJkq/zcdFRB+wMcNINiVJkjQ0ky3ZvADYKSKm15a9Aeir1kmSJKmLJtsAobWBecDtwCdYNqn7RZnppO6SJEldNqlaNjPzfuAVwF+B/wd8HjgLeHsPiyVJkjRhTaqWTUmSJI2uSdWyKUmSpNFlsilJkqTGmGxKkiSpMZNtUvcxLyI2Bb4EbA88ApwJHJGZD/e0YCsQEXsC/wrMAtahTB77ZeCrmbmkFjcb+CSwBfAn4MTM/NIA2zscOAjYELiBUgcXtcWsAZwA7AGsClwCHJKZt7fF9aROI2J1yvytzwC2zszf19btC3wEeDalro7NzLPaXv9U4FjgbZSbEVwFvCcz/7stbkPgC8BrgKXAT4D3ZuY9bXH/TJl9YRZwH/CNar+N3QUrIt4KvJfyfj8MXA3s0yrbZDgfImJ3ynu9OfA34DLgQ5l5S1vchDknImIT4HBgG2AmMD8zZw4QN2bf/6GWbST1EBErAYcBu1b7WRm4Hjim/fgmcj0MED8LuBJ4JDNXb1vXk8/AUD6fKzKMz8WqwIeAtwL/ANwDXJCZB7bFjZvzwZbNMaSamukSYA3KiXEYsA/wrR4Wa6gOA/qBDwCvBX4EfBH4TCsgIrYFzgWuAWYD3wZOjIh31TdUfYCOB06mfAnfApwfES9q2+f3gN2AQ4C9gRnARRExtbatteldnR7NAD/oImIP4DTgh5R6+CXwvepDXPd5yhfJx4HXA49Sjm9GbVsrAxcCLwD2BQ4AtgPOjYgptbjnVvu5j/L+HE95rz7ZheMcUER8lPKD4/9RjvMdlC/Evmr9hD8fImInyvHPB/6lKtvzgF9GxJq1uIl2Tjyf8l79AbhxoICx/P4PtWxDsKJ6WI2SwPw3sD/wJsof8F9ExGvbyjSR66G+z6dQvjfuHiRk1D8Dw/h8rshQPhdPofz93Lcqz6uBD1Jm0anHjavzwZbNseWdlFtn/mOt5efvwOkR8YnMvKGnpVu+12Vm/cvhkqpl7+CIODIz+4GjgKsz8x21mI2Aj0fE1zJzSXVHpyMpv5bmAETEpZRf+x8F9qqWvZjyAds1My+oll1P+cW5H3BKtY+e1GlEzATeBbwf+Grb6k8A52Tmh6vnl0TE5sAxwE+r1z+jev2hmfn1atkVwG2UlsIPVq99I/AiYGbrWCJiIaX1bDbLblbwAeB+YM/qvbgoItYCjoqIz2bmfd07eoiIoCTbb8jMn9RW/aj2/8lwPuwD3AG8LTOXVvu7A/gd8BKq95uJd06cl5k/rvZ9KrDVADFj+f1fYdm6VA+PAM/JzEWtBRHxc2Azyh/8n1TLJno91B0IrEVJdA6tr+jhZ2CFn88u1sP+wLbAFpn5p9ry02v1MO7OB1s2x5ZdKBPM15v5f0BpMRzuL6hR1ZZotlxDabZfp/pwvIIyr2ndGZRLAFtWz7ejfNGcWdv2YuBsYHbtV+kuwAOUX6+tuAWUL5NdatvvVZ2eDJwE3FxfGBHPobRsndkWfwawdUSsVz1/NbAStfrKzIcof3zaj+/6epKUmZdTEpz2uB9VX6j1fbbel27bH7ijLdF83CQ6H54KPNRKNCv3V49TYGKeEytKQsby+z+Msq3QiuohMxfXE81q2VJKS+eM2uIJXQ8tEbEupbXuPZQWy3aj/hkYxudzhYZYDwdSEts/LSdm3J0PJptjy+a0Na1XH4JbKSf7eLMD5fLEXyj3n1+FJ186aH0ZtI6vde/6mwaIW53S/7EVN3+AD+8NPLGuRr1Oo/RT3AQ4boDVreMbrB6iFndXZt47QNxm1aWWVtxAl2Mer4eIeBqwUXtc1WfnYZqph22A6yLiyIj4c0Q8FhFXRsTLqvWT5Xw4Fdg8Ig6JiLUj4tnAHMrxtPpWTZZzom4sv/9DLVsjqvdxO554zJOlHj4D/FdmXjjI+l58Bob6+RyxKP1RtwRuj4jTIuKvEfG3iPhR1YLYMu7OB5PNsWUay1o96hZRBt2MGxGxFaV16/PVL65p1ar720Jbv+pbxzcN6M/MR4YQ176tVly9rka1TqtLMCcAH8zMvw4QMpx6aI9pxT2V8oWyorjWttYeZJ/tcd20IfAqyjlwKPA64EHgwirhmhTnQ2ZeQumr+clqH7cBzwFeVWtNmSznRN1Yfv+HWramHEJJYP6jtmzC10PVH3Af4H3LCevFZ2A062E65TiOoHyHvpHS1/1FwAVR+qK2yjSuzgeTTXVdlFGAP6CMJvzMCsInmuOAWzLz9BVGTmxPoXzxvzEzz65aKnajJJwf6GnJRlFEbAd8B/gm5RLUnsASykCF1XpZNo09Vcv/Z4E5mfmbXpdntEQZlX8K8LnM/GOvy9NDrZzsr8DumfmzzDyT8r3xfOANPSvZCJlsji2LWPZrq24a5XL0mFe17P2Uchlit8x8rFrV+vWzdttLWr+W7qvF9UWZ+mFFce3basXV62rU6jQink/pvP6x6pLp2iz7pb16lCkohlMP7TGtuMdYNjJxKMd3/yD7bI/rpkXAvVmbjiTLFBpXUKb8mPDnQ+WLwCWZ+b7MvCQzv0/psP9PlGlNWmVigHJNtHOibiy//0MtW1dFxAuBH1MG0R3Rtnqi18OBwNOBU2rfnatCGSld+2HWi8/AaNbD/ZRpmi6rt1pmmTbvQcp3Z6tM4+p8MNkcW25iWV8M4PFOuRtTpk4Z06oT/1xgfeA1bf1qbqV0+N687WVbVI+t42v1QRko7iHKtCCtuKh1hK7H1etqNOt0U8oMD5dQPpiLgPOqdZcAv2H5xweQ1eNNwPoR0X5pYgvg5lofnCcdXy1uPkBm/g1Y0B4XEc8CptLMubW8Ud2rMjnOh9b+/7u+IDPvpMybt3GtTLSXi4l3TtSN5fd/qGXrmojYGPgZZR7at7YNKIOJXw/PAzagHEfru/MI4GnV/z9VK/dofwaG+vkcseoH+e2DrF5KlYCvoExj8nww2RxbLgB2iojptWVvoIyMu2Dgl4wNVV+Ss4EXArMz8476+qp/2sVUUzLU7AP8mfIlC3A5ZfTc3rVtr1S97sLal/AFlF9YO9finkmZkLZeV6NZp/8FvLztX6v/0buAAzLzNsqHcu+21+4DXFUb1f9zyuXWx+srylRSr+PJx/eCahqOVtw2lImH2+N2j4hV2vbZz7KBKt30E2B6RDw+QrHqkL8tMHeSnA9QRr/Oqi+o/pitS/VHZRKdE48by+//MMrWFVW3o59X2949MwcahT3R6+EknvzdeRrwf9X/T6riRv0zMIzPZ7f8BNi+3s0mysTzawFzq0Xj7nyYsnRp+w8o9Up16WAe5Y/QJygthJ+jTEnwpt6VbMUi4qvAv1HmOWvva3RjZj5YdQD/NWWE7umUeQaPBQ7KzK/UttWarPbDlBP4AEpH6Rdn5rW1uJ9QLkceRrnEcCylSf8F1S/EntdpROxIadV8/A5CUe62dBbl1/ovKBMTv4cyF9pPa689iXKp9TBK0nI4ZV62F2TmwipmZeD3lE7lH6a0rJ4A3AW8JJfN7fhcSgvbxZS7REQV96XM/FADx/0U4LfAepR53x6qjmNrylxuf5gM50NEHEyp75Mol0inU+bHWw94fqv1f6KdE1EmjG5NrXIQpYXk/dXzqzLzjrH8/g+1bCOtB8pMHb+tlr+F8h49LjOvmAz10N44Ub3maODwfPIdhEb9MzDUz2c36qFKBq+lvMefpySLx1Peyy1bXdPG2/lgy+YYkpn3UwYR/JVy15HPU07wt/ewWEPV+uX0WcqXZ/3flgCZ+VvKh3RryiWjA4D3tZ+kWSap/QhlFPNPKZdYdq1/gCr7UH4FngKcQ/l19cqs3V5rLNZpZp5DGaW9B6UedgbePMCX1vsod9I4jtI9YTXK8S2sbevvlNuxzQO+S7mjwxWU/rJLa3F/BF5JSXLOpyQ8/0FJBLuuupy1K+WLqfX+AOyYmX+oYibD+XAyZcLkHSh98U6k3D3k5fVuJhPwnFif8h6cA+wIPLP2/OXV/sfs+z/UsnWhHjagjDRenXJ+tH93TpZ6GI5R/wwM4/O5IkP5XPxP9f8p1fKTKA04r8xlYyDG3flgy6YkSZIaY8umJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqjMmmJEmSGmOyKUmSpMaYbEqSJKkxJpuSJElqzP8HVvNfDeBqWn8AAAAASUVORK5CYII=\n",
-      "text/plain": [
-       "<Figure size 720x360 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "df1 = df[df[\"name\"].isin([\"IssueQuery\"])]\n",
-    "df1['delta'] = df1['ts'].diff()\n",
-    "ax = df1['dur'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
-    "ax.set_title('IssueQuery duration (usec)');\n",
-    "plt.show()\n",
-    "ax = df1['delta'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
-    "ax.set_title('Time between IssueQuery (usec)');\n",
-    "\n",
-    "# df1['delta'].describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# for SingleStream\n",
-    "if False:\n",
-    "    df1 = df[df[\"name\"].isin([\"QuerySamplesComplete\"])]\n",
-    "    ax = df1['dur'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
-    "    ax.set_title('Inference time (usec)');\n",
-    "    plt.show()\n",
-    "    ax = df1['dur'].plot(figsize=figsize)\n",
-    "    ax.set(ylim=(0, 100))\n",
-    "    ax.set_title('Individual inference time (usec)');"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "scrolled": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoMAAAFtCAYAAAB8yGDhAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAABA20lEQVR4nO3deZwcZbX/8U8WMkmAkIWwBAkgwgGM4DXyExQF5SKyCFxlEQQFRUAQRETZd81FiQsoKKAsV5TVK8tlEWVfJQ4IJJAjW4hhIIQkLJIwZJnfH+fppNL0bD29Tdf3/Xrl1emq6uqnz9TUnH7qeU4N6OjoQERERETyaWC9GyAiIiIi9aNkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERybHC9GyAi+WFmw4GfArsDawHnuvvRdW1UAzKz9YEXgIPc/bL6tmZFZvZd4AhgY3dfXO/2dMXMdgGuBjZw9zn1bo9Io1LPoIj0mJkdaGYdZrZVmbs4BjgEuBg4APhdxRrXD5nZ4WZ2YL3b0VNmtipwAnBOoyeCAO5+M/Ac0WYR6YR6BkWklrYDHnf3U+vdkAZxOPAacFnR8heBYcCiWjeoG18j2vU/9W5IL1wInGNmp7v7m/VujEgjUjIoIrW0BjCvUjszswHAUHdfWKl9NgJ37wDeqXc7SvgacIu7v13vhvTCdcB5wN7Ab+rcFpGGpGRQRPrEzC4DvgRsCJwP/CewELgcOM7dl5jZdsBdmdcU7oO5gbvPMLMW4Hhgf2A80Vt2DXCSuy8oet2FaV8nA0Zcdr7MzFYDTgP2JMYjzgIuBSa5+5L0+vWJsXgnAHPTe74PeAI43N2nFH22jYEzgO2BEcC/gJuz4xzNbG3gLGBXYBTwPHCeu/+qm7jNANYriseL7r5+qTGDZnZ6+nybAScBnyd6Di9Kz8cR8f8MEf/J7n5O0Xv2KM6dtHcDYPP0Htnl72lrZl0HcIa7n56erwKcDnwxtfdNYBpwqrvfm3ndlkTcPwEMAVqBU9z9rqL9r532twswFngZ+AtwjLu/BeDur5rZE8B/oWRQpCSNGRSRShgI3EYkWMcC9wDfJRI1gKeJMYKzgOnp/wcAc1Lv3p+A7wM3A0cSCcrhwPVpfdaniITkj8BRwHQzG0YkiAcCVwDfAu4kEoULS7R3n/R+FxJJ5frA/5rZSoUNzOyDwCPAjsAl6b2uJZKwwjZrAA8DnwMuAL4NTAUuMLOTu4nZ0SXicXQ3rwG4kvgifzzwEJHYfpdIgl4BjgOeAX5sZp/JtLW3cS728fT49x60sTO/Su/7p/S+PwLmAFtk2rktcB8wGjgzfZ4W4Pb0paKw3VrEz+crxLFwJHG5/f8BY4retxXYugefUSSX1DMoIpWwEnCtu5+Znv/azB4Fvg78yt1nA1eY2fHAa+5+ReGFZrYfkUx92t3vySz/O5HY7QDcnnmvTYCPuPs/MtuemFk+PS2+yMxeAH5gZue4u2f2sS6wkbvPT6934AYi8fu/tM35xDnyQ+7+Qua9Tsrs5wdEovKhzGzVX5vZxcCJZvZLd3+9VMDc/Xoz+0FxPHqg1d2/ntpyETAD+DHRc/bDtPxKoI24rHtnet2+9C7OxTZJj8/3oq3FdgUudvdjSq1MydqFwP3ADulyOWb2a+AxYBLLk9Kzid7Fj7v73zK7Ob1E0vc80Wu7NhEXEclQMigilXJx0fP7iN6u7uwN/BOYZmarZ5bfA3QAn2bFJOXBbCKY2cf9wGtF+/grkbBtB2STwT8WEsFMWwHeD2BmY4FtgfOziSAsG89XSFz2JHq5Oore93bgYOBjwJ87++BlWnapM12C/ztxqfu3meWvpwT3/ZnX9TbOxcYAS4E3+tD2N4CPmdk67v5SifVbEJf+fwyMMbPsur8AR6byRO8Ql31vLUoEgeU/o4zCz3p1lAyKvIeSQRGphEXu/nLRsvlEb0x3NiYSgM7qwK1R9Py5TvaxRS/2MTP7xN3np8Sj0N5CEjW1k/1BjFEbRfS+fa2H71sJM4uev0HE/5USy9fMPO9tnEsZkP4VJ1s99T1iLOlMM3uMGFrwu0yv7cbp8belXpyMAd4lxnB29fPJKvQUlttukaamZFBEKmFpH147EHiKGG9XSnFPTqmZwwOJy6H/3ck+ii9tLulku96MKSuMub6SGFNYyrRe7K+nSrW9s/hnP09v41zstbS/1Vje0wadJFhmNqh4mbtfa2b3EUXHP0uMw/y+mR3o7n9geUyPJ8b5lTIntaE3Ckn+a718nUguKBkUkXp7DpgI3FHi8l5v9rGqu/+1gm0CmNDFNnOAt4DBfXjfWvZU9TXOT6fHDVgxGSz8f2TR9uuV2knqwbwQuNDMRhITcM4A/sDyuL/VVUzN7F1iJnJXP5+sQpuLe09FBM0mFpH6u5q4nPnN4hVm1pLuetGTfWxpZjuX2MeqqaRKj7n7a8RYugNTSZXs/gakbZYQNez2MLMtiveRxh125216dim9Evoa5wfS40ezC1Mh59eIWd5Zhxe9x6BU/if72teJsjQj06JW4FngmFLtKcTU3ZcSYzV3MrOPldiuuId3IvBwH75siDQ19QyKSL1dQUzEOD+VFbmfuBxpxKSHvYC7u9nHOUTJlxvM7HIiqRhG9BztBXyImHXbG0emtrSa2YXEpebxRE3FjdI2xxOTUx5KM4inEcndh4kJDkO7eY+/A4eb2WnE5I5/u/tNvWxnT/Upzu4+08z+Qcw6vqho9W+A483sN8Rn+hTLx/8VrAq8ZGZ/BB4nevY+Qcxw/mV6j6Vm9nViLOFTZnYJUX5nHDGhZwAx0QWipM4OwN3p5/MUkex+gYj9DFhW/mdz4Nfdh0gkn5QMikhdpQTgC0SNva8S48kWEsnXBURB6O72sTDVoDuBSGwOIC7hPkMUhO715UF3fzLdg/ks4FAiufwXcFNmm1dTz9QpwB5Er9s84pLqd3vwNmcSZW6OISZEvJjdfyVVIs7E2MizzWzloruQnElMqNmTiP+twE7Aq5ltFhDlenYAdiPKEb1A1KU8N9POe1PcTyF6F0cQP78prDiT+uUU+7OIsjkjiXGPt7Pi2MAvEhNOru7B5xPJpQEdHeo1FxGR7qVLt88Tdwzp8g4rjSL1Zt6dvWuMiKxIYwZFRKRH0i3ezga+Z2YNf2XJzHYBPkAUqxaRTqhnUERERCTH1DMoIiIikmNKBkVERERyrOHHfDSi1tbWFmBL4GU6v5OBiIiISCMYBKwNTJk4cWJ78Uolg+XZkuU3thcRERHpDz5J1BhdgZLB8rwMsPHGGzNkyJA+7Wjq1KlMmNDTOyo1L8VBMQDFABQDUAxAMQDFACoXg3fffZd//vOfkPKXYkoGy7MEYMiQIbS09OouVyVVYh/NQHFQDEAxAMUAFANQDEAxgIrHoOTQNk0gEREREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmm29E1sLfefpcF7Yvr3YyKGN4ymFVX7tt9nEVERKTylAw2sAXti7ljysx6N6Mitt9yvJJBERGRBqTLxCIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHJMyaCIiIhIjikZFBEREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4Nruebm9kHgGOBrYAJwHR3n1C0zWXAV0u8fC93v65o22OBI4C1gGnAce5+R9E2qwLnAHsCQ4G7gCPdfUYFPpKIiIhIv1LvnsEPArsAzwJPdbHd88DWRf/uzG6QEsFJwPlpn88AN5vZFkX7uhLYDTgS2AcYB9xhZsP7+mFERERE+pu69gwCN7n7DbCsB/CjnWy30N0f7mwnZtYCnAz83N0np2X3AE8CJwF7p2UfIxLFXdz9lrTsSeA54EDggr5/JBEREZH+o649g+6+tEK7+jiwGnBVZt9LgGuAncxsQFq8M/AGcFtmu5nAA2mdiIiISK7Uu2ewpzY0s9eBlYGpwNnufnVm/abp8emi100DVgHWAWal7aaXSEKnATtWutEiIiIija7eYwZ74jFikskexKSPWcBVZnZgZptRQLu7Lyx67fz0ODqz3esl3mN+ZhsRERGR3Gj4nkF3P7do0Q1mdidwBnBZ7Vu03NSpUyuyn9bW1pLLh40YS1tbW0Xeo97mzh3OrBfmdLlNZ3HIE8VAMQDFABQDUAxAMYDaxKDhk8FOXAtcYGZj3X0O0bPXYmZD3f2dzHaj0uO89DgfGF9if6My2/TYhAkTaGlp6e3LVtDa2srEiRNLrps9bwHjxi3o0/4bxZgxq7PmRqVCH7qKQ14oBooBKAagGIBiAIoBVC4G7e3tXXZg9YfLxD1RGCu4adHyzYC3gJcy21lmQkl2u+nVa56IiIhIY+p3yWBK5PYGXky9ggAPErOE98lsNyhtd5u7d6TFtwAjyUwWMbN1gW3SOhEREZFcqfcdSIazvKTLesAIM9szPZ+SHi8nCkU/SyRyBwPbAQcU9uPu7Wb2A2CSmc0BHk3bbQjsl9nub2Z2M/BbM/su8CZwJjCTOo8/FBEREamHeo8ZXIMY/5dVeH4QcCPR43dy2nYRkejt5u43ZV/k7pPNDOAoYE2iXMwu7v540f73BSYTBaZbiNvR7eXuzTE4T0RERKQX6poMpvsBF4/fK7Z7L/Y3mUj0utrmLeDQ9E9EREQk1/rdmEERERERqRwlgyIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHJMyaCIiIhIjikZFBEREckxJYMiIiIiOaZkUERERCTH6npvYpH+6K2332VB++KK73fYiLHMnreg4vvtzPCWway68pCavZ+IiDQmJYMivbSgfTF3TJlZ8f22tbUxblztksHttxyvZFBERHSZWERERCTPlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTLOJpSaWLu3osmxKrcuq9MWixUvq3QQREZGKUTIoNdG+aAkPPtHW6fpal1Xpi49vPq7eTRAREakYXSYWERERyTElgyIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHKsrnUGzewDwLHAVsAEYLq7T8isHwR8F9gF2Ixo75PAGe5+R9G+ZgDrlXibse7+Wma7VYFzgD2BocBdwJHuPqNiH0xERESkn6h3z+AHiUTvWeCpEuuHAScC/wAOAr4EvAT8xcx2LbH9dcDWRf9eL9rmSmA34EhgH2AccIeZDe/bRxERERHpf+p9B5Kb3P0GADO7DPho0fqFwAbuPr+wwMxuBzYmegz/r2j72e7+cGdvZmYfI5LPXdz9lrTsSeA54EDggr58GBEREZH+pq49g+6+tJv1S7KJYFrWQfQUlnNPsJ2BN4DbMvubCTyQ1omIiIjkSr17BnvNzAYCHweeLrH6y2Z2MLAEuB84wd0fzazflBiXWJyETgN2rEZ7RURERBpZv0sGibF+BhxStPxG4G/ATGIiyQnAfWa2pbsXxiOO4r1jCAHmA6N725CpU6f29iUltba2llw+bMRY2traKvIe9dZuo7v9LP3ls/bks5SrljGYO3c4s16YU7P366nOfh/yRDFQDEAxAMUAahODfpUMmtm2wI+Bye5+X3adux+VeXqfmd0KTAeOB75SjfZMmDCBlpaWPu2jtbWViRMnllw3e94Cxo1b0Kf9N4qWlqGMG9f5lf22trYu1zeS7j5LuWodgzFjVmfNjcbX7P16oqvfh7xQDBQDUAxAMYDKxaC9vb3LDqx6zybuMTPbHLgBuB44rrvt3X0ucCeQjeJ8YGSJzUcB8/rcSBEREZF+pl8kg2a2IfBn4FHggDSJpBxPx+5sQNHyzYheRBEREZFcafhk0MzWAm4HXgH2cPd3e/i61YHtgSmZxbcQPYM7ZrZbF9gmrRMRERHJlXrfgWQ4y0u6rAeMMLM90/MpwKtEGZg1gGOAzcxs2esLNQXNbF9gV+BWoij1+sSl5Bbg7Mz2fzOzm4Hfmtl3gTeBM4lJJ5dV4zOKiIiINLJ6TyBZA7i2aFnh+UHA3cAW6fn1JV5fuNz7AlF38KfE+L83gHuAPd29+PLvvsBkosB0C3E7ur3cvTlmaoiIiIj0Ql2TwXQ/4OLxe8W6W1/oIfx0D9/zLeDQ9E9EREQk1xp+zKCIiIiIVI+SQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmmZFBEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiO9ToZNLMdzWxANRojIiIiIrVVTs/grcAsMzvHzLaodINEREREpHbKSQb3AB4AjgAeNbMnzOxYMxtX0ZaJiIiISNX1Ohl09xvdfW9gTeAbwBzgbOBFM7vdzPY3s+EVbqeIiIiIVEHZE0jc/S13v8TdtwfWA04E1gAuB2ab2f+Y2fYVaqeIiIiIVEGlZhMPAlYCWoABwELgP4G/mNljZjahQu8jIiIiIhU0uNwXmtlqwN7A/sAngMXAzcDx6XEpsBvwM+BSYMu+NlZEREREKqvXyaCZ7UEkgDsDQ4EpwLeBK919XtHm15vZ6sAFfWyniIiIiFRBOT2D/wu8BJwLXO7u07vZ/gng92W8j4iIiIhUWTnJ4GeBO9y9oycbu/sjwCNlvI+IiIiIVFmvk0F3/2s1GiIiIiIitVfO7eh+ZmbPdLH+n2Z2Tt+aJSIiIiK1UE5pmV2Aq7tYfzXw+fKaIyIiIiK1VM6YwXWBGV2sfzFt0y0z+wBwLLAVMAGY7u7vqUloZjsBPwQ2Iyav/Nzdf1Fiu2OJ2+StBUwDjnP3O4q2WRU4B9iTmA19F3Cku3f1mURERESaUjk9g28CG3Sx/v1E0eme+CDR0/gs8FSpDcxsa+BG4DFgJ6Jm4c/N7LCi7Y4FJgHnp30+A9xsZlsU7fJKov7hkcA+wDjgDt1CT0RERPKonGTwTuBQMxtfvMLM1gcOTdv0xE3uvq677wk82sk2pwKPuvvX3f0ud/8B8FvgNDMbmN63BTiZ6DGc7O53ErUQnwdOyrTvY0SieLC7X+nuNwP/BYwHDuxhm0VERESaRjnJ4KnE5eWpZnaumR2S/p1H1BQcCJzSkx25+9Ku1qck7zO8d4ziH4hLwR9Jzz8OrAZcldn3EuAaYCczG5AW7wy8AdyW2W4m8EBaJyIiIpIrvU4G3f0Z4vZzjxKXWn+d/n0LaAU+6e5eofZtCAzhvZeQp6XHTdLjpunx6RLbrQKsk9lueokkdFpmXyIiIiK5Uda9id19GrBdutXc+9Pi59x9bsVaFkalx9eLls9Pj6Mz27W7e/FYxex2s9J2xfsqbDe6xPIuTZ06tbcvKam1tbXk8mEjxtLW1laR96i3dhvd7WfpL5+1J5+lXLWMwdy5w5n1wpyavV9Pdfb7kCeKgWIAigEoBlCbGJSVDBa4+2vAaxVqS78zYcIEWlpa+rSP1tZWJk6cWHLd7HkLGDduQZ/23yhaWoYybty4Tte3tbV1ub6RdPdZylXrGIwZszprbvSeob911dXvQ14oBooBKAagGEDlYtDe3t5lB1ZZyaCZDQJ2JHoFRwEDijbpcPezytl3kULP3sii5YUew3mZ7VrMbKi7v9PNdqX++o3KbCMiIiKSG71OBs3so8Afgffx3iSwoAOoRDL4HPAuMdbvtszyzdLj9PRYGCu4KVGCJrvdW0RtwsJ2O5jZgKJ7K2+W2ZeIiIhIbpTTM3gBMAzYA7jP3V+vZIOy3L3dzO4E9gZ+llm1L/AKy8vRPEjMEt6HlAym3su9gdsyid8txGzoHUnJpZmtC2wDfLtan0NERESkUZWTDG4OnOTuN/X1zVOh50JJl/WAEWa2Z3o+xd1fBM4E7jWzi4HfEzOZvwEcUZgVnJLGHwCTzGwOkSQeTMxG3q/wfu7+NzO7GfitmX2XKKB9JjATuKyvn0dERESkvyknGZxF55eHe2sN4NqiZYXnBwGXuftDZrY7cXeRrwBtwHfc/dfZF7n7ZDMDOApYkygXs4u7P160/32ByUQPZwtxO7q93L05ZmqIiIiI9EI5yeDZwPfM7CJ3f7Mvb57uB9xtYunutxCXeLvbbjKR6HW1zVvEXVIO7VkrRURERJpXOcngaOBt4Fkzuw74F7CkaJsOdz+nr40TERERkeoqt2ew4LBOtukAlAyKiIiINLhyksENKt4KEREREamLXieDaYaviIiIiDSBsm9HZ2YbAdsRM4J/7+4zzGwIsBbwiru/W5kmioiIiEi1lHMHkoHAr4GvEzOBO4CHgBnAEOBJonbfTyrWShERERGpioFlvOZE4GvAKcDWZErDuPu/iVvVfaEirRMRERGRqionGTwIuMTdJwHPllj/JLBRn1olIiIiIjVRTjL4PuCRLtYvBFYtrzkiIiIiUkvlJIOvEPcR7sxEQDOORURERPqBcpLBPwLfTLOJCzoAzGwn4v7B11SgbSIiIiJSZeUkg6cDM4HHgN8TieCJZvYw8H/A48B/V6qBIiIiIlI9vU4G3f1N4OPAJGBN4B1gG2AVIlH8lLsvrGAbRURERKRKyio67e7vEMngpMo2R0RERERqqZzLxCIiIiLSJMq5A8klPdisw92/XkZ7RERERKSGyrlM/BnS7OGMQcDa6XEO8HYf2yUiIiIiNdDrZNDd1y+13MxWAg4FjgZ26FOrRERERKQmKjZm0N0XufsvgduBX1ZqvyIiIiJSPdWYQPI48Kkq7FdEREREKqwayeAOwIIq7FdEREREKqyc2cSndrJqJNEj+BHg7D60SURERERqpJzZxKd3snw+8BxwGHBxuQ0SERERkdopZzaxClWLiIiINAkldiIiIiI5Vs6YwfHlvJG7zyzndSIiIiJSPeWMGZzBe+9A0hODyngNZnY3sG0nq09w97PN7HTgtBLrv+fuk4v29xXgRGB9Yozjme5+dTltExEREenvykkGDwaOAtYF/gD8My03YF9gJnAesLQSDQQOB0YULTsgLb8ls2whcau8rBezT8xsT+ByYrbz7cAewJVm9qa731qh9oqIiIj0G+Ukg2sDLcAH3H1+doWZnQY8AKzl7v9dgfbh7k8VLzOz84An3f2JzOKl7v5wN7s7C7jW3U9Iz+8ys02BMwAlgyIiIpI75UwgOQy4qDgRBHD3uURZmW/2tWGdMbONgC2BK3r5ug2ATYCrilb9AdjSzMZWpoUiIiIi/Uc5PYNjgFW6WL9y2qZa9icuQf+haPkwM3sVGA08C/zC3c/PrN80PRb3NE5LjwbMqXBbRURERBpaOT2DDwPfNrOJxSvM7KPAt4G/9bVhXfgycI+7z8osexY4jhizuBvwEPDLNLGkYFR6fL1of4UeztEVb6mIiIhIgyunZ/BbwN3AI2Y2BXgmLS9cvp0HHFmR1hUxs62ADYFJ2eXuXnzJ+BYzAzjOzM5x97er0Z6pU6dWZD+tra0llw8bMZa2traKvEe9tdvobj9Lf/msPfks5aplDObOHc6sFxqvM7yz34c8UQwUA1AMQDGA2sSgnDuQPGVmHwKOB3YC9kyrXgTOBX7s7q9Urokr2B94B7iuB9teAxwIbAZMYXkP4Egg275Cj+G83jZmwoQJtLS09PZlK2htbWXixPd0sgIwe94Cxo1b0Kf9N4qWlqGMGzeu0/VtbW1drm8k3X2WctU6BmPGrM6aG5VVNrRquvp9yAvFQDEAxQAUA6hcDNrb27vswCqnZxB3nw18J/2rCTMbDOwD3OTub5axi6fT46bA9MzyzdKj96F5IiIiIv1Sn25HZ2YbmdknzGy1SjWoCzsCq9PzWcRfImoPTgNw9xeIJHCfou32Baa4e+NdLxMRERGpsrJ6Bs1sP6Jw8zpp0Q7AnWa2OvAgcLK7X1OZJi6zPzCXEvUAzayVKCbtwBAi4ftyakf2OuupwNVm9hzwF2B34LPALhVuq4iIiEi/0OueQTP7ItE79zTwPWBAYZ27v5aWf6VSDUzvuQoxS/gad19UYpNngaOB64mxgpsAX3P3H2Y3cvdrgYOIcY5/Jnob99PdR0RERCSvyukZPAn4q7vvaGZjgMlF6/9GhYtOu/u/ifqFna0vvvTb1b4uJ3oRRURERHKvnDGDmwJ/6mL9q4Du5iEiIiLSD5STDL5N13cg2RB4rbzmiIiIiEgtlZMM3gkcaGZDileY2TjgG8R4PBERERFpcOUkgycBawN/Bw4HOoCdzexs4EnivsFnVKyFIiIiIlI1vU4G3f0Z4BPEXTxOJ2YTHwN8H/gHsI27z6xcE0VERESkWno1m9jMBhG1BWe7+2fNbBTwASKpfF6Fm0VERET6l96WlhkIPAccB/zU3ecT9/0VERERkX6oV5eJU8HnNmKcoIiIiIj0c+VMILmUmE08tNKNEREREZHaKucOJP8EBgHTzexy4HlgYfFGVbg3sYiIiIhUWDnJ4BWZ/5/SyTYdxD2CRURERKSB9SgZNLPzgMvdvRX4dFq8CtEjuKRKbRMRERGRKutpz+C3gIeBVne/x8zGEPcg3sHd76la60RERESkqsqZQFIwoGKtEBEREZG66EsyKCIiIiL9nJJBERERkRzrzWzi95vZ/0v/Xy09bmJm/y61sbs/0qeWiYiIiEjV9SYZPCP9y/pFie0GEKVlBpXbKBERERGpjZ4mgwdVtRUiIiIiUhc9Sgbd/fJqN0REREREak8TSERERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmmZFBEREQkx5QMioiIiORYb4pO14WZHQhcWmLV+e7+rcx2OwE/BDYDXgJ+7u7vKYptZscCRwBrAdOA49z9jio0XURERKTh9aeewc8BW2f+TS6sMLOtgRuBx4CdiOTx52Z2WHYHKRGcBJwP7AI8A9xsZlvU4gOIiIiINJqG7xnMaHX31zpZdyrwqLt/PT2/y8zGA6eZ2UXuvtTMWoCTiR7DyQBmdg/wJHASsHeV2y8iIiLScPpTz2BJKcn7DHB10ao/EJeCP5KefxxYDbiqsIG7LwGuAXYyswHVb62IiIhIY+lPPYNTzWwsMBO4DPihuy8GNgSGAE8VbT8tPW4C/B3YND1/usR2qwDrALMq32wRERGRxtUfksGXgdOAR4AlxJjAU4ANgAOBUWm714teNz89jk6Po4B2d1/YxXZKBkVERCRXGj4ZdPc/A3/OLPqLmb0BnG5mZ9WpWQBMnTq1IvtpbW0tuXzYiLG0tbVV5D3qrd1Gd/tZ+stn7clnKVctYzB37nBmvTCnZu/XU539PuSJYqAYgGIAigHUJgYNnwx24hrgdGI8YOFy8MiibQo9hvPS43ygxcyGuvs7XWzXYxMmTKClpaW3L1tBa2srEydOLLlu9rwFjBu3oE/7bxQtLUMZN25cp+vb2tq6XN9Iuvss5ap1DMaMWZ01Nxpfs/fria5+H/JCMVAMQDEAxQAqF4P29vYuO7D6/QQS4DngXZaPCSzYLD1OT4+FsYKltnuLqE0oIiIikiv9NRn8EtBBlJtpB+7kvaVh9gVeAR5Nzx8E3gD2KWxgZoPS625z945qN1pERESk0TT8ZWIz+zOR7E0FlhITSA4Hfuvuz6fNzgTuNbOLgd8DnwC+ARzh7ksB3L3dzH4ATDKzOUSSeDAxG3m/Gn4kERERkYbR8MkgcXn3a8D7iPY+AxwH/Lywgbs/ZGa7E3cX+QrQBnzH3X+d3ZG7TzYzgKOANYnxhru4++PV/xgiIiIijafhk0F3Pxo4ugfb3QLc0oPtJpO5lZ2IiIhInvXXMYMiIiIiUgEN3zMoItWxdGkHs+c1VumiYSPGltWm4S2DWXXlIVVokYhI81MyKJJT7YuW8OATjVXoO2ot9j4Z3H7L8UoGRUTKpMvEIiIiIjmmZFBEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiOKRkUERERyTElgyIiIiI5pmRQREREJMeUDIqIiIjkmJJBERERkRxTMigiIiKSY0oGRURERHJMyaCIiIhIjikZFBEREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJscH1bkB3zGwv4MvARGA08BzwK+BCd1+atrkM+GqJl+/l7tcV7e9Y4AhgLWAacJy731G1DyAiIiLSwPpDz+B3gXbge8CuwPXAecCPirZ7Hti66N+d2Q1SIjgJOB/YBXgGuNnMtqhe80VEREQaV8P3DAKfd/c5med3mdkqwLfM7GR3b0/LF7r7w53txMxagJOBn7v75LTsHuBJ4CRg7+o0X0RERKRxNXzPYFEiWPAYMJS4bNxTHwdWA67K7HsJcA2wk5kN6Es7RURERPqj/tAzWMongXnAq5llG5rZ68DKwFTgbHe/OrN+0/T4dNG+pgGrAOsAs6rSWhEREZEG1e+SQTP7KHAQcEbq2YPoKZxCJHarAQcDV5nZMHe/LG0zCmh394VFu5yfHkfTy2Rw6tSpvf8AJbS2tpZcPmzEWNra2iryHvXWbqO7/Sz95bP25LOUq5YxqObn6Ity2jR37nBmvVDqIkL/1Nk5IU8UA8UAFAOoTQz6VTJoZmsBfwQeITOBxN3PLdr0BjO7EzgDuKxa7ZkwYQItLS192kdraysTJ04suW72vAWMG7egT/tvFC0tQxk3blyn69va2rpc30i6+yzlqnUMqvU5+qLcGIwZszprbjS+Ci2qva7OCXmhGCgGoBhA5WLQ3t7eZQdWw48ZLDCz1YBbgQXAbu6+qJuXXAuMN7Ox6fl8oMXMhhZtNyo9zqtYY0VERET6iX6RDKYE7kZgDeBz7j63jN0UxgpuWrR8M+At4KXyWygiIiLSPzV8Mmhmg4kZv5sDO7n7iz14zQCiVMyLmdnIDwJvAPtkthuUtrvN3Tsq3XYRERGRRtcfxgyeD3we+D4w3My2yqx7irjMezlwJfAsMJKYQLIdcEBhQ3dvN7MfAJPMbA7waNpuQ2C/qn8KERERkQbUH5LBHdPjj0us+zTwBNHjdzJxGXkRkejt5u43ZTd298lmBnAUsCYx+3gXd3+8Ok0XERERaWwNnwy6+/o92Gz3XuxvMjC57AaJiIiINJGGHzMoIiIiItWjZFBEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiOKRkUERERyTElgyIiIiI5pmRQREREJMeUDIqIiIjkWMPfm1hEpDtLl3Ywe96CejejIlZZbfV6N0FEckbJoIj0e+2LlvDgE231bkZFfPj9w+vdBBHJGV0mFhEREckxJYMiIiIiOaZkUERERCTHlAyKiIiI5JiSQREREZEcUzIoIiIikmNKBkVERERyTMmgiIiISI4pGRQRERHJMSWDIiIiIjmmZFBEREQkx5QMioiIiOSYkkERERGRHBtc7wbUmpltBPwC2AZYCFwFHOfuC+raMBEREZE6yFUyaGYjgbuAF4E9gTWAnwJjgS/Vr2UiImHo0GHMntf/v5sObxnMqisPqXczRKQHcpUMAocCo4APu/trAGa2GPi9mZ3l7tPq2joRyb1FSzq4Y8rMejejz7bfcrySQZF+Im9jBncG7igkgskfgXZgp/o0SURERKR+8tYzuClwSXaBu7eb2XPAJr3YzyCAd999tyKNam9vL7l88aJ3GTxwaUXeo96WLF7U5WcZutKAfvNZu/ss5ap1DKr1Ofqi3Bg04mcp19Ili5vis7z7bjsvzS7vHNmyymhemv16ZRvUB8OGDGbl4SvV/H07+9uQJ4pBZWKQyVcGlVo/oKOjo89v0l+Y2SLgFHc/u2j5/cCr7v6FnuyntbV1G+C+KjRRREREpFo+OXHixPuLF+atZ7BSpgCfBF4GltS5LSIiIiJdGQSsTeQv75G3ZHA+MLLE8lHA9J7uZOLEie3AezJrERERkQb1XGcr8jaB5Gli3OAyZtYCbEgvkkERERGRZpG3ZPAWYHszG5NZ9l9AS1onIiIikit5m0AyEpgKzADOYnnR6TvcXUWnRUREJHdy1TPo7q8DnwH+Dfwv8DPgauBrdWyWiIiISN3kqmdQRERERFaUq55BEREREVmRkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiOKRlsEGY2oOgxdz8bxeC9FIOgOCgGoBiAYgCKAVQ+BrkPaKNw945UFHukma3s7ksBzGxQfVtWO4oBmNkQM1vXzDY2s5GZGAyod9tqLfuZC3EoXt7sFAPFoFheY6DjoLoxUJ3BBmBmewN7AbsALwLPAH919/My2wzM/vCbjWIAZnYgsC+wA/Av4A3g/4AfuPuCtE1TxyDLzFYHNgG2BZyIyTPuPi+tH+DuTX0CUwwUAwAzGwdMBHYG/gnMAh5095fS+qY/L+g4qG4MlAzWmZltADwG3AHcBGwBbAZsTtwp5SR3v6Z+Law+xWBZDJ4AriHujmPAB4E9gAHAKe5+ftq26U96AGZ2K/EHcAEwHngZeJBIkK9y9/Y6Nq8mFAPFAMDM7gPeD8wF3kecF9uAG4BzM18Wm/bcoOOgujFQMlhnZnYpsDqwT+YXeh1gO2A/YGvgT0Qy0FavdlaTYgBmdgGwAfAFd1+Ylo0EPgR8FTgQuAc41N2frVMza8bMziJ6io8A/pEWHwHsDqwK3A9c4O5/r0sDa0AxUAwAzOwMYB/gQHd/2MyGAfsDOwH/AcwAfuTut9WvldWl46D6MVAyWEdpLNylwEjgi8BSoCMzTmwjIhHYB7jJ3Y+pU1OrRjFYNt7jXOIb3w7uviB72cfMxgK7AUcTl4gOKCTNzcjMhhKJ71/c/eSidesC3wb2JGJxuLs/22w9IoqBYgAxhhi4GZjq7t8pWjcW+BKRGA4krqDc3oQx0HFQgxgoGawzMzsJOAyY6O6vpmUrufui9P9BwPeAScBe7v7HujW2ShQDMLOvAj8Gdnb31rSsOAZfBi4DvunuF9arrbVgZtcBS9197/R8JWBJJkHeAbgCeA7YtTBmppkoBooBgJn9BtjQ3T+dnhfH4GPARcBQYDt3f7luja0SHQfVj4FmE9ff5cT4j/vN7FMA7r7IzAaa2RB3X+LuZwN/Bz5az4ZWkWIQYz6eA243s8/DshgMSknhEnf/H+DPwEfq2dAauRf4opkdYmaD3H2Ruy9NJ0Dc/S/A54jB1J+pZ0OrSDFQDADuBLY1s9PTFYPiGPwN+E9gFHF1pRnpOKhyDJQM1lH6xZ4FHAq8Bvwq/cKv4+5L3f3dwnbAC8AH6tjcqlAMgrvPJWYSPwhcZmaXmNmGKQks9A4OAF4F1qtjU2vlUuBq4vLH0WY2HpZ/SUjbzCAS6C3q0sLqUwwUA9z9D8CPiOEy55nZh9PywpfFgcSEgieATaw5S63k/jigyjFQMlhHhe5dd78XOAmYDnwFuMrMTjCzUWkixReJwcJX1a2xVaIYLOfuLwLfJ37pPwHcbWbnmtkmZvYfwEHE7OLf1K+V1WOZYuPu/hZwKnFimwSca2ZfMLO1fXkJjSHEwOnX69HealAMFIOsTGL3S+LqwR5EDI4xsw+kL4tLgRHAOkBbs4yV03FQ2xhozGCNmdmqwJZEnaABwAvufmla10LMGv08MUtsTaInaBFwu7t/ox5trjTFYNlM4c8S3fmrAE8CV7j7S2l84GeJGOxE9ATOBd4CbnT3o+vR5lpIcRkILEonP8zsIOC/gcHAQ0R9rZeBXYH13X2D+rS2OhQDxcBi4siawHBgji+vI7cH8YVxHWAe8eV5FnEuHenuG9elwVWS9+MAahcDJYM1ZmbXELNGVyH+wI8n/sj/DPiFuy80szWJy6FrpvW3AS96KjnS3ykGYGY3ABsTyXAb8GFgEPA/wPnuPt3MVgFGEyf+dYCHiT8MTVdPy0oXHb/H3X+S2eY44FPE8TAO+D3wB3d/uPYtrjzFQDGATovP3wqc4ctLbx1EnEM3B9YCrgSud/fH6tHmStNxUPsYKBmsITPbnZgs8WV3v9nM3kckBHsRl0afB45Il0yLX9sUU+UVAzCzXYlf2t3c/R4zWw0YC+wNHAu8DZzo7r8r8dqmiEGWdV10/G3gdHe/Im27CpFAd7j7v+vT4spTDBQD6Lb4/EAiIfx52naou79jmaoDzUDHQX1ioGSwhszsZ8Qlvz19xfsKDiUum55GFFo+AZgMDHT3JXVoatUoBmBmpxDf5nYpTJDJrFuX6P7fD/ipux9bhybWlPWs6PiNRNHxf1nMpGu2Y0IxUAywnhWfvw84zN29SWOg46AOMdAEkhrIDAKeSSQB66XlgwHc/R13vw84BLiEqLk3oZkOcMVgBf8CPk1841+Bu/+LmFl9KvBlM9upxm2rqTQ+chDQASybHenuL7n774lC2xcQE2q+A9Bsx4RioBjAsnPkYmIySHbiwOvp3HgCcW5YHZhkZsObMAY6DuoUAyWDNZC5rHcv8ct+RFq+2MwGpx8+7v48cDwxI+g7pfbVXykGK/gr8BRwnJmtBSsky7j728BPgFeAb1pzlooAlp3EnJgsNKowO9KW1856hugt/i1RTqHp6qgpBooBLDtHthJjpTdNy7IxmEMUnT8H+C/ggPq0tHp0HNQvBkoGa8jjzhJnAUeZ2S1pSvhid1+S+UHPA64FVre4B2VTyXsM0pi/WcAPiZlf15vZR4vHAaZLRFcTZQJG1L6lNaWi44oBKAag4vOg4wDqEAONGawyMxvs7ouLlu0PnA6sDfwUOMuXF1dehRg8vMDd96xxc2smxeBMYrZw08egMNi7aNnWxGf/f0TtwJ8Ar7j7m2a2OjGzeJG7717zBtdIuvyxNJ3wzgZWI74IXOzuL2W3I2ZMDnT3verT2spLvb4DUgy2IW5JOAK4jpzEAHQcZFkUE/4lcRnwBuCH7v5cZv0AoodwTXf/XF0aWSV5Pw7qeT5QMlhlFvfdvdbd/5lZthIxAHTf9K8D+BPwDjF77EPAVumSab9nUTtwbWCEuz+Rlg0gkqCvETdbX0pzx+BnxHHwYGbZAGAjoqD2YcAawN3EZfS1iJh90t1fqHmD68DMPg18i7g88hJwC/Brotbax4nLIgd5k9yb2sxGe9H9Q81sW+AootRQG00eg1JyeBwY8LxnZgSb2QeJIvO7ECW4/hf4FTCMiMvPgK+7+3W1b3Ft5PA4qOv5QMlgFZnZN4ALiZuMv2BFZUHMbCwxXXxbYE+i1t6TRNJwRz3aXGlmth1x+5ztiGEJc0k1sYjxMS1EvaztiaSoGWNwMPELbO7+XInjYDjRQ7odkRi/QVwq+l93n1KHJleVqeg4FneUuRfYvPjcYDGz/mvAzsQfgbVozhjoOIhbyz0ATPSoLZo9DoYQyeAONHHxeR0HjXE+UDJYRWY2B/g5cHYaEzeA6PIdD8zwVE08s/1q7v5G7VtaPWb2EnGj9b8RB/A2xIG9gPim+xuPGbSF7ZsxBnOAc4H/TsfBYOKm8psD09z9laLtm6puWDFT0XHM7AHiXtz7FA8fyGyzFrAhzRsDHQdxHMwBvpQ9DrLDiyxKy6xGFBVuuuLzOg4a43wwuBI7kfcys58Ss0Ev8eXTvo8Gvk5k9kMsagkdX/hhFpKg4p6j/srMDgPaiQLKhYTvGjM7mZgRdwrwSTM7uHA5uAlj8FPiNkG/zRwHpwH7E7/UQ83sWuBUd/e0fvF799QcLIqOf5bSRcdPAw4ws0LR8dlFr22KWdUWdxb4MHGZpz0tW5vo/RhJ9I5c6e4ziXNI9rXNEgMdB8uPg0+w/DjYmJgpvE4aE/Zzd3+WuNfsi5nXNksMdBw0yPlAPYNVkC7/zgYmufvJadmpRDJY6CV7H/Bl4lvhbh7TxZuKmR0AnAx8xN3fTgfusiLSaQLFJcT4h8+5+9P1a23lpUkgrwLnuvt30rIziLI61wH3EOMCv0l889/Z3f9ep+bWhKnoOGY2E/i9u5+Qnm9L9BxvDrwJLCTuMvAjd7/YmrOoro4Ds1nA7zLHwWeIqyXrE+PDWog7E51BTCJpuj/WOg4a53yg0jLVsQ3xLW4/MzsvjQv5NlFIeF93P4c4yA8nuv73rldDq+xZ4hf9MDMb5u4d6TJpoYjmQ0Q19QVEdf1mY8AjwDfM7EYz+xjwDaJH9Ah3vxL4BdFL2A4cXLeWVpmp6DgAZvZN4ovgSma2WVp8EfACMWZqDaK+5kzgRDPbuJlioOMgmNmXiXP/+y1mjQKcDzwO/D933wDYB7gKOIY4NpqGjoPQSOcDJYNV4O5/Iga93kp0gT9AFBm+pjAWzKOq/DXEfSi3slR0uclMAX4HfBfYM02UwFMRzfT/x4iBs582s6app2dmK7v7A8RxcDox1uMh4u4jNxR+od19kbv/jYjVZmY2rFkuf2S5io4X3E3MCNwfmJyGCKxE/I7c7+7vuvtVxHCStYk7TjQNHQfLPETcReKDwM/M7G6iJ/B4YgIdKRk6iojTV+rTzOrQcbDM3TTI+UDJYJW4+z3EN7pTiUKidxBdvsu+FaXZYs8TYzebJhksfMNJA6CPJu6leSnwSzPbxFJx6YwnieLKzfSt7ydm9ll3n050+R9CXOa4iRj/kz0OBgOzCi9sxstBBZ7zouPu/rS770oMDVgD+AIxZOAld+/I9Iy8QMy2X7lJvxzk9jhI46Gfd/dvAUcCM4irCNcTdUaXZpKh+cTVhSElzpv9Xp6PA2is84HGDFaQxRT5CcAniVmiN6fl44CR7v5Uujy6NC1fj0gSL3f3s+rV7kpKk0Z2JC77/jvzWQ8jLo8OIwqm3k3ccmdz4EdEr+nxdWhyxZnZocTYn+eJYQFT0vKRxO2FXig6DjYgblH3O3c/vT6tri3LWdHxzqTfi8Xu/pui5WOI5OAxdz+qHm2rNFMB/kJB6TW9qGSUme1DjIe7smj5KOI4cHc/pGYNrTGdD0I9zwdKBiskfZO7lKgF1AGMIS4Pf8lXrBo+KH3r2ZAYB7Gvu7+vHm2utPQt5m3gJHefXGL9eOKy6beIntAWooTAXe6+Xw2bWjUpBnOJX9ytidlfX/WiwtGZ42ATotfwi+6+Xq3bWwumouPAshP6Ku7+YtHyAakXYCWPW04NIeqOXkbUpmyKouOmAvyY2WPEbNnt3f3h7GSAEsdBC3EcXEpzHQc6H9B45wMlgxViZucRs57OIsYB/gfR43Wfu+9ftO0goqL8x4DD3P36mja2SszsQmAr4kT3Wlq2CZEgrwrcBTzh7q+b2W7AfKLA8j+9k9pK/U0mBtsSJSIuJi5xHFCiV2Qg0SO4CXCou99U4+ZWnanoOGa2BTFR6gBiqMhjwMleogB52v7bxBem6939ezVvcBWYCvBjZl8lErvZxFWRA9z9X6V6TNP2pxBfnq9z9+Nq2tgq0fmgcc8HSgYrwKI20hTgSE+3B0o9RMcAk4BPufuDtmJV8S2A8c2SAJjZB4DpxCXie9JA4G8RA6DXJWYMjyQGyx6W7S1tFma2EXGS/0IhwU9/BM8l/gic6O5vFF0iHgP8h7v/tU7NripT0XHM7ElgHvAoMUxie+AfwFe8qGCsma0LfI8oSv6VZhk/airAj5nNI+41+wJwBXAjUV/vPV+EzWw0kTSNAw5pouNA54MGPR9oAkllfJYoJfMMLOvmXUyMfZhB3FKITCL4PqKCfFMkgsmxxPE0OiWCo4kK8jcR3/Y3I7r/twJusKim3mwuAm4mZpEXXAdcTcwG+yJAJhEc5O5zmzgRzBYd/6W7X5PGu4wjLgGdAlxuZu8vvMYzRcfr0eZKs6gruRT4hketycOBnxDHwr5pm2WTx9IfwnOAo5ooAeisAP8DxNWCl8zs3OzkgCY8Dn5K9IL9xt2vJnp6diQmmg1L2yz7e5wmTZwNfLuJjgOdDxr4fKBksDL+TdxKZwZE0pd6fxYTycCyQa8W5VNuIsaJNYV0EruR+FxXm9mfiG++twI/cPdHgFfd/Vric38I+Ei92lsNaRzM08SJbtltotx9vrsfRCSEF5rZ18xsYPrC0Eyzp0t5m7h/5jyIE3pKgN9090OJOy+sDdxlZptmX9gMfwDT7/qOwP8Qk4lw96Xu/mvij9+X07JlY8bS8395zCLt99Ll36OJckovp2WnEn/4pxNDaS4lYvFo6l1fpkmOg3WJ0iknFYbPEJdGryBmkR4Iy78kFrj7QndfUMOmVpvOBw18PlAyWAEe9QJ3TZcACz/Awi/2bcAHzGyr9HwvorbUebVvaXWkA/oWYgzEgcAGRG/gLaQyKhlPEL0E69awiVXn7u3ufri7P5ldnvmWdw4x9uVE4IPNcHLrgbwXHV83/Vvsy+8zWzjn/gH4qJl9MLP9p8zsZmuu8hkqwB+fdRpRYqxw5ehNdz+MSIQnm9kh6UtiM/9N1vmggc8HzXzg1ZS7z0iPxX/kHwJeArYzszVYfmuhebVtYfW5+5tEkendiMvGj6Re0sLsqAHERJJBvDdJbEq+vLj0VOIP3WLgT2b2H3VtWG3ktug4gLtPI7703QHLkoDCl8T7ibqan0vrRhC9ZaOKxw31Zx4F+A8gxwX43f0bxK0mF6TnHZkk4GdED+nxwCbFvYNNppXoDc3z+eB8GvR8oGSwitIPu52YMfp54DhgkLufUd+WVU/6tjcTuMjdH02LC8fZCKLncEkaN5Mb6XLIs8SkonWJnpCmlOkdLxQdv5+cFR235UWDf5S+CKzwRdHdZwN/BnZNi3YnZlh+sbYtrR5bXjD3fuLuESeTvwL8QwDc/ZXs8kzyM5X4mS8E/s/ilpVNyaNu4NHAg+TsfFDg7pPcfWrqCW2o84FmE1dRpkfs88TU+QFEPbk/1bdltZe+BR5J/FE4xN1vrHOT6sbMjgGedPe/1Lst1ZKSoYGF3h8zO4T4pjucqJd1D01adLwgJUNLOhsSYGb/RfSUfAr4PXCTN0kpmYJMQli4LLYGMMbdn7YmL8BfkH4XlpY6DgoxMLPdiQlojwL7NdGY0c5uxPBN4CSW34Sgac8HRTF43N3/nJYvO/7T87qeD5QM1oDFnSceB55298/VuTl1YWYbE5cM72qWX/LeshI1pJpJ+qN3EnBxZrLASpmE8P1Ez/A3iS9GQ2m+ouOlYrCssHBmu4HEDeqnECU21nL3sbVubzV0dhwQiXH2j18zF+Dv0XFQ9JpvAke7u9WomVVlnd+IYW93fzn93L/K8qskzXo+KI7BvcSx/nJmu7qfD5QM1kg6Ga7s7q/Xuy31kn4xBntmtq00DzM7G/g+cZnnUuBcX15OqbjI8OeJcaNvAdO9eYqOdxWDUknhX4HPAHu5+x9r3d5q6CYGKxRYtuUF+LciCq9fX/sWV15vjoPMFaSViF7TV0rutJ+x0jdi+DFwr7/3RgyF88GbxK33muV80OObUaTt63Y+UDIoIn2WSojcS5SNmE9c7pkBTC4MCUjjw4Y065eBXsRgoC8vH7EJccvK0+vR5krrbQzS/ycQBfhvrk+rK6uc46DZWPc3YtjW3R8o/nLQTHoQg1I3o9iU6DU8tdbtHVzrNxSRpvQRYC1i9uyVwMHAHsAFZrYvMMmj7E67mQ0lak22AA800aXznsZgiUVdyg8Tl46aaYxcb2IwhEiUBhFlqJpFOTEYSnP9LpS8EYNF8e1DiMumD2TGkq5LJMcvdrbDfqi7GOwCPJhJBNcjekdPq0djNZtYRCphOjEQ/JpUKmQyMWHoSqLn5wYzOzuVTBhO1NXauYn++EHvYrAyMVh8jybrHepNDFZJy3fP8XFQiEGz/S709kYMN5KKbzeR3sbgemKoRF2OA10mFpGKMLMh7v5udtJIWv45opjsNsAbRN3NHYhB0k0xa7JAMVAMQDEAMLP13X1GifHC2xEzxz/h7g+b2deJ+xKv5U1Wf7c/xUDJoIhURVHpkBHAF4ib0m8DHOvuP61n+2pBMVAMQDHISkMkngEuAC4hyulc7E1cf7dYI8ZAyaCIVFWmhMhg4CpgC3ffqLvXNRPFQDEAxSAzc/oSwICHibqKa9e5aTXTqDHQBBIRqar0x28g8J9Ej8ie3byk6SgGigEoBhl/IsbIbU0T3XWnlxoqBuoZFJGaMLPxwP7uPqnebakXxUAxAMXAdCOGhouBkkERERGpKd2IobFioGRQREREJMdUZ1BEREQkx5QMioiIiOSYkkERERGRHFMyKCIiIpJjSgZFREREckzJoIiIiEiO/X/icxdMgxke1QAAAABJRU5ErkJggg==\n",
-      "text/plain": [
-       "<Figure size 720x360 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAnYAAAFKCAYAAACQBBKyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAACHR0lEQVR4nO2dd5wURdrHf7MLLDkHEUkCFiKY1hzOHFAx5/CennreneE8MZzhxHzmnD3jmdBTCZJEohIUlpyKuOSwwO4Cm0O/f1T3THdPdZye6ZnZ5/v56DLd1RW6u6qefup5noooigKCIAiCIAgi88kJuwIEQRAEQRBEMJBgRxAEQRAEkSWQYEcQBEEQBJElkGBHEARBEASRJZBgRxAEQRAEkSWQYEcQBEEQBJElkGBHEBkAY6wXY0xhjN2oO/YYY8xXvCLGWCFj7BMX6U5Vyz3VTzlJqM9UxtjUBMo5gzE2jzFWobarrd+8shnG2CeMscKw62GGMdaCMbaNMXZz2HVxA2NsDmPs+bDrQTQsSLAjiIBhjN2oCg3HhV0XIgZjrBWAbwHUA7gDwA0AykKtVIgwxgaoHwe9wq6LB/4OoAbAf8OuiEv+DeB2xth+YVeEaDg0CrsCBEH45ikAz/q8lkEIOJnG2QlcexiAdgCe5JyPDKg+mcwAAMMATAVQaDp3K9Lsw58x1hjA3QDe4ZxXh1wdt4wAsAfA7QD+FW5ViIYCCXYEkaFwzmsB1Pq8tirg6qSEBCf0zurf0iDqAoilQc551mn9OOc1YddBwgUAOgH4JuyKuIVzXs8Y+x+APzLGhnHOM/FjisgwSLAjiBSg2o9dDaAPgLcAnAmgAsCnAB7gnNfp0rYF8CqASwAoAEYCeEWS52MAhnHOI+rvHwEcCqAn51wxpf0ZQB/OeW/1dyGAqZzzG3VpDgDwJoCzIJYovwAwXlJu3LXq8akAwDk/Vf3dBMDDAM4D0BdAUwCLATzDOR8hv1P2SMroBWAdgAcB7ALwTwAHAFgE4G+c8zm6605Rs5nCGAOAT7U2MMaOBvA4gBMBNAFQAOBfnPMpurIfg9BwDQLwAIDzIYRE7Z6erbY3X73kVwD/5Jwv0OXxCdy/BxEAf4PQnjGIZzIfwBOc81906a4F8A8AAwFUAvgZwP2c83U29/FGAB+b7gcA3MQ5/0St56mc8166axQA76n5Pw7gQIj7fBvnfAFj7FYA9wPoDuA3Na+1pnId77MNFwPYyjlfasozrq7q8ceg6x/qsTMQe4Z5ALYCGMc5v0OXJg/iPboeQA8AOyGEyYc55+WmMq6G0CIOglgiXgLgBZNGeCLE0n8+gDku2kkQCZFWqnaCyHJyIASlXQDuBTANwFAAf9YSqJP5SAj7ry8APAJgf4iJ34mvISbV4/UHGWOdAZwKYLjVhYyxZgAmATgHQrh7GmLyTcTwuzWA2wDMgBB4Hoa4Bz8wxgYnkK+MqyCEivcg7lkvAN+ry3eAaM/r6r+fgbi/7wEAY+wUAL8AaA/gCQihLQ/ATxZOI8MhlnQfAfCamse1EM+2EkLIfAxC8PmFMdbfdL3je6DyPsSz2Kbm+TSAEgB/0BIwxv4J4HMI4XYogBcBnARgBmOsk/xWAQCmS+7HDepxO06A+Mj4r9pGBuBHxthfIITLdyDemeMAfKK/0Md9lpU910U6KYyxAQDGAGim1v3vEDaXJ+rSRAD8APEujQFwJ4RQ9zcAI9TzWtpHAHwF8fH1OMRS62qIPqSnQP17IggiBZDGjiBSR2MA33LOn1B/v8sYmwfgZogJEQAuhJi4H+CcPw8AjLF3ILQkToyE0P5cBWCm7vjlAHIhBD8r/gzgIABXcc6/Uct9H0JD5JdiCO1hdNmXMfYmgHkQQsi4BPI20x1AP855sVoOh7gf5wD4kXM+kTHWBsBdACZyzqeq6SIQAt6vAM7SNJ2MsXch2v4MhEChZwXn/DJdm1pACGCfcM7/pDv+IQAO4FEA1+qud3wPVEHnFgBvc85v1137iiZcMMZ6AHgSwGO6vMAY+xrAUghB6yHZzeKcr2WM/WK+Hy7oD+BgzvkataxiiPv3BMT9L1WPNwLwIGOsL+d8tc/7HEXNrw+EsOWXsyAEycGc85264//U/fsaAOcCOI1zPk1X/lwIAfosCEG0D4QwNwrApRJNaxTO+WbGWDWETSNBJB3S2BFEavnA9PsXCM2OxnkQTg2aoAd10njLKWPO+V4AYwFcwRjT9+2rIISRBTaXnwdgO4D/6fKrAPAfp3Jt6lOnCXWMsSaMsfYQWrzpiC1XBsV3mlCnoi1VHihLrOMwCK3TlwA6MMY6MsY6qvWcCOBYxlhz0zXvmH6fBaHB+1K7Xs0jV63HaZJynd6Dy9W/w8wX6pbZL4X4OB9uKrcUYslbVm6iTNGEOpXf1L/fa0Kd6bjWJj/3WU97ABGIjwW/aPW72NQ/9FwJYCWApaZ7Og1CM6fd00sg5s8n9UIdYHg+eooBdEyg7gThGtLYEUTqqOGcbzUdK4YQCjR6AtimCml6Vros42sAl0Fo/aYyxvaHWJp70uG6ngDWSIy73ZYrhTF2C4Tm6GCIiVnDV/w9Gzbof3DOi1W7sXby5FEOUv9+aJOmAwC9bdUa03ktj4kW15vvqZv3oA+A7SbNkhmt3BUW59daHE+EDabfmrC00eK41iY/91lGxOG8HcMhtKIfAHiWMTYZwmv1G9URSasnA1BkkYfmgNNH/bvUIp2ZCIJ/5wlCCgl2BJE6UuERNwbAXggt3VQIDUQO7JdhvWI1QeUC0C9JXQcxiY4G8ByAHRBevDfBuDQZBHUWx50EAU1z80/EbKHMmCf5Cos8bgSw2aE8ILj3QCt3MOTe0eZ6BoHVfXa6/37us55dEO+dTFC3ex+jcM4rVDu/P0BoqM+BsGO9hzF2sqqhzgGwDML+TsYWmzra0RbCCYMgkg4JdgSRXqwHcBZjrJVJa3eQ1QV61MlrFIDLGGN3QAh4CznnVlodfbmHMcZyTFo7WbnFEBOVmZ4waomuUH9fpF+eYozd5NySlKFp3/Zyzt3YMdrlUZRAHrI8z2WMdeKcWwk8WrkbOOfLfJSRSg1SQveZc17HGFsF1QPZhN37aM6nHuKDZyqA+xljfwXwNsSy9hdqPfMBTLJYUtXQ2nMIHBw6GGPdIDyAl9ulI4igIBs7gkgvxkL0y79qB1R7oNstr4jna4h4XzdBeCe60daNBdAFMdsuzVP2FknaNQCOU8OZaGkvgHBg0KNpcfSehAdC2CelCwUQnoz3qDtTGHDwLNWYAOGt+pD+nnjMw4xm6/iYJD/tfn4HcY8fNRvsq+mcbLq0+HtOy9VBEMR9ngHgKMnxNQDaMMYO1eXXFab3jDHWQXLtPPVvW/XvcIh+8FdzQsZYnq7uP0BoXh9ljOWa0pmfhWZPOhMEkQJIY0cQ6cVoiAns32qMtqUQ8bvae8hjAoQW42X1txvB7gOIWFufMsbyIZYUrwcgC2T8HwgBcDxj7BsIe6PrEW97NgpCEzJK1SJ2gwgbwQEc7qE9SUMNIHszRPiRZYyxjwBsgggxcwqEUGrrhMA536OG+/gCwHzG2FcQjig9IDwsl0Is03qp11Q1PtvfVA9MzYP4eIjYcc+onq3/BPACgJ6MsREQAmZvABdBCCmP2RQzH0IwfJCJ2IkVAH6zi3/nlyDuM4SX802MsUNMsey+hljq/4Ex9jqA5hCC2UoAR+rS/Uv1Nh4DsdNGOwB/gRBwf1TTfA7xbr+lLtv+qtaNQZg1XAERw3ENY+wJiPv7K2Psewj7wCMhQt7oP8TOUtvqO1QLQXiBNHYEkUaoS0UXQggJ10HELtsK4I8e8qgB8D2AVhATdaGLa8oBnAHgJwgB718QGob7JWknQIQrOQgikPLxELsCbDKl+1S9fgBEzLTLIBwp0mo7L875dAjN5mwIwfNNAH8CsBtCYHCTx3AIwWQDxL15HcKOcDnUeHk+uBnAPRCa0Ochnkl7CA9NrdwXIQT/aoi4ei9DaKqmQsRos6vzdojgx+0gBPuvEAviHDgB3OcxEHaaV5ry3QXR5nKI+/RHiLh/o03Xj4QwDfijWvbdEMLtiZzz9Wpe9RAfI/dBvLcvQIQ1OQ5iyXaRrtzH1bwaQ4R7eQpCAJygpVG17ZcD+Ix2nSBSRURRyFGHIAiCSH8YYw9CaOP6pOm2ZwYYY5dCaAH7SDyhCSIpkMaOIAiCyBReg3BE+L+wK+KSBwG8SUIdkUpIY0cQBEEQBJElkMaOIAiCIAgiS2jwXrEFBQWNABwAYFN+fr4syCdBEARBEERa4CS3NHjBDiKI5WoAJxcUFGxySkwQBEEQBBEiB0DsL90X8WGmSLAD0FX9+4ttKoIgCIIgiPShK0iwk7IVAA466CA0aRIXND5QlixZgoEDBya1jHSlIbcdaNjtp7Y3zLYDDbv9DbntQMNuf7LbXl1djZUrVwKq/GKGBDt126MmTZogLy8v6YWloox0pSG3HWjY7ae2N1wacvsbctuBht3+FLW9TnaQvGIJgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAgiC9hStA/XDxuHouKKsKtChAgJdgRBEASRBYyfvR6l+6rxy4JNYVeFCBES7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCCILUBQl7CoQaQAJdgRBEASRVUTCrgARIiTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEEQWEaFoJw0aEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAiCILIEEuwIgiAIgiCyBBLsCIIgCIIgsgQS7AiCIAgiC1CUsGtApAMk2BEEQRBEFkHhTho2JNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCY2cEjDGrgBwHYB8AO0BrAHwDoD3OOf1unSDATwNYACAzQBe5Zy/IcnvXgC3A9gPwFIAD3DOJ5nStALwAoDLATQFMAXAnZzzQlO6fgDeAHASgAoAX6v5lbtoO0EQBEEQRFbhRmM3FEAVgPsAXABgBIDXATynJWCMHQ9gFID5AAYD+BjAq4yxv+gzUoW6ZwC8BeB8AKsAjGGMHWYq8ysAFwK4E8BVAPYHMIkx1lyXV1sIga8VhAA4FMA1AD5y0SaCIAiCyCoUULwTwoXGDsAQznmR7vcUxlhLAHcwxh7hnFcBeBTAPM75zbo0PQAMY4y9zzmvZ4zlAXgEQpP3IgAwxqYBWAzgYQBXqseOhRD6zuecj1WPLYbQFN4I4G21jNsAtANwOOd8p5quFsAXjLEnOedL/dwQgiAIgshsKN5JQ8ZRY2cS6jTmQyyRtlcFttMBDDel+RJiufVI9fcJANpALJdqedcB+AbAYMaY9iaeB6AUwHhdug0AZqjnoEs3SRPqVL6D0C4OdmoXQRAEQRBEtuHXeeJkALsB7ADQB0ATAMtMaTSNWX/178Hq3+WSdC0BdNOlW6G339Ol66/7fbC5TFV7uMaUjiAIgiAIokHgZinWAGPsKAA3AXicc17HGGunnioxJS1W/7ZX/7YDUMU5r7BJt0lNZ85LS9de99ttOlcsWbLE6yW+KCgoSEk56UhDbjvQsNtPbW+4NOT2p7rtO7aXAAA2bdqIgoKSlJYtg559OHgS7Bhj+0Esd/4OnfNENjBw4EDk5eUltYyCggLk5+cntYx0pSG3HWjY7ae2N8y2Aw27/WG0fd6mxQDfhwMO6I78/D4pLdsMPfvktb2qqspWGeV6KZYx1gbAOADlAC7knNeopzSNW1vTJZomb7cuXR5jrKmLdOa8tHS7db/dpiMIgiCI7IecYgm4FOxUYWwUgM4AzuWc79KdXgOgGjEbOo0B6t8V6l/Ntk6Wbi9E7DstHdM5U+jTrdD9Xm7OS3Xk6GNKRxAEQRANhgg5xTZoHAU7xlgjCM/VQwEM5pyv159XHRYmQw1XouMaANsAzFN/z4Twdr1Kl3euet14zrn2rTEWQhN3ji5dd4ggxGN1+Y8FcAZjrIPu2CUA8kzpCIIgCIIgGgRubOzeAjAEwP0AmjPGjtOdW8Y53wPgCQDTGWMfAPgCwIkAbgVwu+bdyjmvYow9BeAZxlgRhMB3C4SG7VotQ875b4yxMQA+ZIwNBaDlvwHAJ7qy34MIYDySMfYkhDbxZQDDOedmD12CIAiCIIisx81SrKY5ex7ALNN/RwIA53wWgIsAHA1gAoTA9g/O+bv6jNTAxA8BuAvCXq8/RCDihaYyrwHwI0Qw4m8hNH9n6rcK45yXQMTP2wfgewCvQMTS+5OLNhEEQRAEQWQdjho7znkvNxmpu0Q4LoGqwt2LDmn2QuwscZtDupUAznVTP4IgCIIgiGzHb4BigiAIgiAIIs0gwY4gCIIgsgCKdkIAJNgRBEEQRFZB0U4aNiTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQRBagKBTwhCDBjiAIgiCyC4p30qAhwY4gCIIgCCJLIMGOIAiCyDh2FJfT0iNBSCDBjiAIgsgo+PrduPmpifjpt/VhV4Ug0g4S7AiCIIiMYuP2fQCAZet2h1wTgkg/SLAjCIIgMgxagiUIK0iwIwiCIDKSCHl/GiF5lwAJdgRBEESGQT4T9kQo3kmDhgQ7giAMlFXU4LclW8OuBkE4QgIMQcRDgh1BEAZe/KIAT338O7bvLg+7KkSCjPplDdZv3RN2NQiCSCGNwq4AQRDpxdadZQCA6pq6kGtCJMoHI5YgJwKMfPGisKsSKLQSSxDWkMaOIAgTYtokw/TsoD4LpSDNxo7eUYKIhzR2BEEYiE2aNGv6pbK6Ftt30VJ28shCaZUgAoI0dgRBGNCmTBLr/PP8f+fijhenoLq2PuyqZDX08WGExF0CIMGOIAgT2v6bNGn6Z8maXQCycxk0HaBwJ/ZQ123YkGBHEIQBsl8KApI8UgG9owQRDwl2BEEYIJEkcaLCcbjVyFroHbWHNJoNGxLsCIIwQkuxBEEQGQsJdgRBGCDnicRJB4WJks1qG/r4sIVuS8OGBDtCypChI/HcZ3PCrgYRAgpJdglDdorJhV5RgrCGBDvCkl8Xbgm7CkQYaNoQmjb9kwbasjSoApFislpLS7iGBDuCIAxEtSEk1/kmplGim5gMSH6xh966hg0JdgRBGKBlxAAJ8R42CNmH3lGCiIMEO4IgTDQIkSCpkEYpydANJghLSLAjCMIA7RUbBJqdYphVyH7hh95QgoiHBDuCIAyQx2HipINMlQZVSBrZ3DaCSBQS7AiCMEKSXcKQA0pyIa0yQVhDgh1BEAbqKdxJVpAOWsNkoaTDUncaksWPnPAACXZE4IybVYiSvVVhV4PwCXnFBkA2S1XpBL2jcqjzNmhIsCMCZUvRPrz9v4V4lnatyGBIKAGAD0YuxvT5mxLO59IHRuPOF6cEUCOv0HMkiIZIo7ArQGQXNbX1AIA9ZdUh14TwC9kvCUZNXwsA+MMRB3i+Vi9S1dTWo3DrnoBqRQAgmZUgbCCNHREoZDSe+dAzTJx0WIlNhzokm4b+8UEQMkiwIwjCiEKG6dlANst12dw2gkgUEuwIgjAQ09iRaEekJ1FzgXCrQRBpCQl2DZAxM9bhq594UvJWSNuT8ZBXbHagZPVaLEl2UrL5kROuIeeJBsi73y8CAFxzNktaGaTtyWRodiAyA4q1KIeG34YNaewIgjBAXrFZQhbL51mtjCSIBCHBjiAIA7SjWHAk6x5WVtVixLQ1qK9PLwnn2c/m4JWv5qWsPPr2IIh4SLAjAoW+pDMfhSS7tOfTscvw4aglmLVkq2WaMLrijIVbMHnuxqSXQ+MMQVhDgh2RFOhLOoPRHGDoIaYt+8prAABV1XWWabLbeYIgCCtIsCMIwgAp7DIHkr0JgjDjyiuWMdYXwL0AjgMwEMAKzvlAU5pPAPxRcvkVnPP/mdLeC+B2APsBWArgAc75JFOaVgBeAHA5gKYApgC4k3NeaErXD8AbAE4CUAHgazW/cjdtI4KFtASZD4U7IdKf7NAq7yguBxSgc/vmgeRHoy8BuNfYHQLgfACrASyzSbcWwPGm/ybrE6hC3TMA3lLzXAVgDGPsMFNeXwG4EMCdAK4CsD+ASYyx5rq82kIIfK0gBMChAK4B8JHLdhFJgsIQZC4KGdkFB93CpJAtAYpvfmoibn56YuD5Zvp9IRLDbRy70ZzzkUBUM3eURboKzvlsq0wYY3kAHgHwKuf8RfXYNACLATwM4Er12LEQQt/5nPOx6rHFANYAuBHA22qWtwFoB+BwzvlONV0tgC8YY09yzpe6bB8REPTFmPnQXrHpjxvFeDYrz7O4aQSRMK40dpzz+oDKOwFAG4jlUi3vOgDfABjMGNOmkvMAlAIYr0u3AcAM9Rx06SZpQp3KdwCqAAwOqM4E0bCg3UOyimwW0LO5bQThl6CdJ/owxkoYYzWMsfmMsatM5w9W/y43HV8KoCWAbrp0KyQC5VIA/U35GZaGOedVEJo9fToi1dCAm7FEtSE0a6aMEdPWYN2W0kDzzGatFtnyEoQ1QQp28yEcLC6GsHfbBOBrxtiNujTtAFRxzitM1xarf9vr0pVIyijWpfGSjkgVNN5mPNliv5RJfDhqCe56aarn62yfEWleCaJBEthesZzz10yHRjLGJgN4HMAnQZWTLJYsWZKScgoKClJSjhvc1MVrfbfsrgYAVJSXx12bTm0Pg0xr/7x5BYF5HWZa2/UEVfcg+9uu3bsAAOvWFaIVdkjTVFSLBQ/FQ75BoS8vGWVv3rwHALBt+3YUFFQGnn9QuG17UPeoqEjoSNZv2ICCvN2B5JkImdzvEyXMtgcm2FnwLYC3GWOdOOdFEJq0PMZYU865vje2U/9qb2IxgB6S/Nrp0mjp2lqkW+GlogMHDkReXp6XSzxTUFCA/Pz8pJbhii83AYB9XdykkdBmYwkwfgeat2huuDZt2h4SGdV+9dkfdZSVj5Q3Mqrtenz2Af21erE4yP42eflcYP1m9O7dC/n53aVp9pZXA//bgoiHfBPG1I5kPfs1xSuBhXvQdb/9kJ8/IPD8g8BV2xN5xyTMWrsAWF2Gnj16ID+/dyB5+iVj+30AJLvtVVVVtsqoVAco1mzrDjYdHwBgL4DNunRM50yhT6cX2Jab81I9b/vAo2BHBIOShLXY14fPx0+/rQ88X4JoEGShraQSjWMXckXSFRc3ZmdJBd75biHq6oLyjSTShaQJdqpQdiWA9aq2DgBmQni7XqVLl6umG88516SCsRCauHN06bpDBCEeqytmLIAzGGMddMcuAZBnSkekiGTYZ038fQPe+GZBgDkSRPqQkCOAzQRO/gWEHW9+uwBjZxZi4aqdzomJjMLtzhPNEQsz0hNAa8bY5ervOerfTyGCCq+GEMpuAXAqgBu0fDjnVYyxpwA8wxgrAjBPTdcHwLW6dL8xxsYA+JAxNhTAHgBPANgAo73eexABjEcyxp4E0BnAywCGc87tAikTyYY+pQnCFb4EMBLaiATR3rt6+gLIOtza2HWGsJfTo/2+CcAoCE3cI2raGgih7ULO+Wj9RZzzFxljAHAXgC4QIUzO55wvNOV/DYAXIYIR50HsMHGFfqswznkJY+x0AK8D+B6xLcXud9kugiCIUElW6I6sDgmSxU1LGfTtnbW4EuzU/VmdXoOL3Baq7jrxokOavRA7S9zmkG4lgHPdlk0QBJFOJFtGyeb5O9P3ig0T7c5l9QdAAyXVzhNElqNQ7CyCiOJG8Ij2GR+dpqH2MxJFEkd7N+leZh8k2BEEQYSIH4cjN5NxQ1DENFTBNlAawHvS0CDBjkg7vp20EisKww+uaWbx6p0YMnQkdu+pRHllDW4YNh6LVhc5X0gQNmjzarKWFbNxtTIqtGZh24LAzW3J0TR2DeELoIFBgh0RKLFJyn8en41djvve+CWQ+gTJ6F/XAgCWF+7G+q17UbKvCv8da972mCC8odRrS7HBSinJiCmZNkRNPkiy84v2umXxW9JgIcGOSAo04DYctu0qo6/+BNDuXI6HLuPKLi/6SLKvL5rftlUbi1FUbN6CnHADdd3sgwQ7gnCJzDGkoY+JfP1u3PrMzxg3qzDsqmQsUaE4G9dMk4x2y+55dTr+9NRP4VYmY2noo1j2QYIdESxZPEbo59/oHJzF7XXD5qIyAGJ5mvBHsuS6bH4100nLdNXDYzD6l7VhV8Mz0aXYNLqXRDCQYEcESsMwao5E25fVdkyuaOjtTxw/S7EabkweslkRGHbTFEVBeWUt3h+xOOSaCLwIaRTuJHshwY5ICmEPuMkm29vnFbof/lF8fA25C3dCU3ayqa9Pz3vsSZhPzyYQCUCCHUF4RD9o0tzpTF29gro0nQDTAe0d8qOxSwd2labeaSGqKQ9ZHants5qTgQ8vQqsOWQsJdkSgNJRBgpYx3PPk15vxwJvpF74mXYh5uGaecLBi/W7c+MRPmDx3Q2oLTg+5DnV1qmAXdkV8EB3DaBDLOkiwI5JCKiepyupaLFu3K+nl0ADoH76+OOwqpC0JOU/YXONnRwuvrN+6BwCwdG1qnWfSpStmtMZO/UtL9tkHCXZEoIQxRrz5zUI88Oav2FFcntRyNG2kYQhv4INitjY/lZNd9L3yItml2X0PSzgIW5zSbOwyUK7LSA0x4Q4S7IiMZ+2WUgBARWVtUsuJaVYiFLXdRLZNEqmUU5IW7iQlbQjnuaeLlimInXbCJk1uJREgJNgRhFcitLOGGW2iLa+swazFW0KuTeKkcq5Luo1dJksdToTcNP3HXjrgReClUJzZCwl2hC3fTV6F8sqasKvhDoex9bOxyzBu5rpAi8zmr10/WpHXhs/HM5/Mwcbte5NQo+wkEa9YO3mioTgyeWXj9r0YPpEHkpdsNxonxs8qxCX3j0qyp7iLGsWM7JJYDyIMSLAjbPlkzDJ8NHqp6/Su9rB0cb2/i+1PfztpFd7+bpH//PXQ564BTWOxY7ewc6yoSu6yeNJJpY2djz1d00VoC3v3Aj+a84fenoHPx69AWUXiH6yKj7XYD0YuQW2dgpqauoTLNzPxd8072fmBaPcuPd4kIkhIsEszdu+pxJChI7Fg5Y7Q6mAWrvzYrvldmsiEj8cI9HJdBlTYJ26eRVyaNFmSSpSULsUiSQb4KWhEWE87EbvE6trgBCo//V97zvVJHOw87UCRtFoQYUGCXZrB14uwAWNmBLtkmCmks5ikF3jTxaYmmfh5FtkSQiEM54mgheJsMOy3IpH3K9Bn6yOkjBYaJZnvmKudSdJ6tCUSgQS7NCXMeTGJq6GW/Hfccnw1YYWvwlM1ccUmykjoS1DpSjYKEclGE1KStles92w901CFBD/CcywwcPw9W72pJJCPIld5pJnjhx8mz92A1ZtKwq5G2kGCHeFMCvYd/ObnlfjyJ57e00NaVy4J+BGys8ZuJ/U2dnYT7NadZfhs7DJPk75fAWHx6p2uPZszUSYIss5+9vnVBHiz88TiNTvxj1emYeT0tQHUyzmNthScLs+wdF8VKqvdm/3U1Nbjla/m44E3aFcbMyTYEXGEOSmTBizD0TSZ9eFWI1FSuhQL5wn2yY9+w7eTVmHrzrKk1+ehd2bgmU/mJL2cIPCjbUr2s527fDse+2CWpWBttRSrOR6t3VySzOpF0a9ApAPXDxuP+153L6Rp0RqqazN8sEkCJNilHenRycIjsyS7TLcls8Ndy4ypssWpJKW1d6Gxq60zTl7p9tqluj5BlDfs/Vn4+ff1CeVRrz6WHN1M+sSHs1GwYoflOxTVapsaEajtnYtMEo1gkAwK1S3q3FCjCnRNGucmqzoZS6OwK0CkIYkYJrvQPrgpOp0GGw39QBi1kwmzQmkIbSzuneiSmIu0cbc19H4SbgUSKZ1vKAbfUIwzj+npO4/YNoOSmlj0AW1sM3vFan0nCG9ZNyHyYnsJh/4S+ULzbm7ahAQ7M6SxIxxJZcdPt+UBGRHdHclmASab2+ZESveKdaGx89Mbsvn5pU3TJCZ2Th99Vpq5aBiUAAIXe9KYBzTUVlTVYsjQkZixKDU7z9TUCI1dHgl2cZBgR8QRro2d90ju0WuDrYp9/ukrd4aCJpPEvIXTZur1R0rDnThrub1UR1EUzFm2Le2M44MkXZYRZa4TTlWKaubq5Ro7P11nReFuY5+T5PH9lFVYu7k0liSBsVbGtl3C/vOrCSsCytEeTWOXR0uxcZBgRwRKwur9dHbBlzrAZbgAY4uf0DPZsUSd2gDFgqDe+ZmLt+KJD3/DiGlrtJwDyTc9Cbdtsg8Yp1BImsbOvORqddyJ35dtw31v/IKxutinMqXfxz8uw99fnhr9HfQ+tzkp7vuaYNwol8QYM3RH0ox0kGeCULb4trFL4Ppk37qYPU2srCCWTTIZy3elYd8WT7jZK9bqlOz4rtIKAMCO4vKE6uWGsDW06TBeAuZ62As4VmYcfpdiNU3Zph37dEfdOE+o9QnqHqb4XdBKyQl8y5bMhwQ7Iq2ILbH4CGMQdGVsmMeLAACbi5IffiIs/IzP0Yk+wyU7N5NTRVVtdFINoiw373yllz14XTyCRAPihiVYpctKv2yfX6d7EtVsxe/H57MS6tUeBRz9h2oQ5ESXmAPK0CXpItwDQE1tHYqKK8KuBgl2hIwERs2ABtx06qwydu+p9H3tnGXbUhKPLAxioRwSy+fbSSt1S4npycPvzMCtz/yccD5uNCfaubtfmaZek3hH+3XhZvzjlWmYNn9zwnmFhb9xIjipULbPbyR2UooWGiWovWJlFiJe9nn2KhBakXLtbZoI93pe/nIe/vTUT6itC7dyFO6EcMTL4JmopiZmo5d+6CfgROr3xIe/IRIBRr14USD1ShZenqQm0EXflQTHtc/GLgcAXHxKn8Qy8okbYWvVxpJAyoqFO7EuzGqulNZPcZEGsaW7Tdv3OlXRkTScY1OCdJ/fqK2Z/K4EHRJIlo8b4Spo54lU29hphKW9raiqxZ6yanRp3zx67Lel20SdwqlSFNLYEXGkh42d+wxSLQRGEElYoxj0YLRwZRFqVC+xdCDjl2LVvyl5t3wUlsrJrK6u3marp2Du0LotpRijM/53Il3eLz/OE1b4H1Piv4Y9bBUbnKNaqm3sQn4FHnz7V9zy9ETDMc0+MmzFBAl2aUrYL21opEkYA1siAQ6GAbBmUwkeeW8mPhy1NNB8/QzQfie1sCneU4lp8zbFn0jBc9Y0drbOEz6qEdQE+8RHv+GKB8c4FJZYGXe9NBXvfr/I/QUS27YwkGl2E62R38em1/h21mmRRJ4ytZ6/cqzICVgTme6s2VQad0zb/zfs6YEEOyKORPplop068K/IANG3LZ2qV1pWDQDYXLTPIaVH/DhPOHgEmqmuqcO6LfEDZKp57D+z8eIXBdhbLu4lAl6msmPDNrGNUrLe+URznbdih3XeIfWD6apdYIUXZ5IkYohjF3UgkhO04KPP77B+HQEATRrlWKaJHUuOEJI6r9j0lSDDnh5IsCOSgt84dmHb2G3cvhfPfTYnbm9OPZE009hp6v+cEOqkDa0/z9kg/uHRxu6t/y3EXS9NRXECzihO1NbVY/Qva1Fn80x3lghPtjrV6DmRsDteeW34AgBOz89LwBPB/JVFvuuU7pTsqwIAbN0ZzMfM356fhP+MXOL5OkWmsoODZBcwbuxBZVUJ+iM66hWbciO7FJeXAZBgl2akg7iQyBdXon0skXAnQfDa8Pn4deEWrHYwjE+H56SRtC9vH9fE5Dp3V/P1xQCAfRU1Pkpzx8hpa/D+iMUYP6vQMo05TIssjEXSsS0qvWevsJbfWjVvEkg+G7fvw8jp9l7YKwp346mPfosutwHyp+KksbPCa9+J1kE3ZlrHlYw/YXae2FFcjh9/XeupbD2xrdIaho2dLbQUS6QdCYRX0jq1X1f+wINmeq6A3Sn9Wmzyq+KWoCPI+ylbI2KxD6YV0Wj7SfzML6usUf9aL9tZaZhTeUuDKipZd1LqKJCkspw48dD9AQAH9WiXsjL//ekc/LZ0m1G7rN4SQ7iTkAYvu1KlGjv14MzFWwEAw96fhfd+WIxSVRvqufxUhzuJFhx/qHRfVUIhqTIdEuzSlHS2H7Bjwuz1AIBFq3f6uj4aNDMkY3G+QWiQ6iSChn67NN9bpiUBTYjODTgCuy/nCY/pc31uo+SWhSuL8OtCsSm5l/YEHQrCDfYCQQI1CUjQCDs2lx7tgyC1MlT82CQLVRPVvKVqKdZNGqmNnfg7flYh1m/bg33l4gPIb18MOoyLW2rr6qO7rWhcP2w8/vj4hJTVYduuMkNg4rBnBxLsiDgS6ZfGbW38Fx62DVtVtU3okEh6OU/Uh+iJZVWmWyFKuz5ZGrtH3psZDQbtqoQ4bXXqbmpOTgQjp6/BkKEjsU9z4nAgmdUbY1qWs7M7TfWHqNX7Vbh1TxLLtDkpfQ6W66LyLHw+TJkphuSz1DYP2/HOJSkXaNWCNmzbixuf+ClUR5pbn/kZf3rqp+jvsOcvEuyynJraOluj8aBxOyFZUR+CpkSG7S4ACL/j6kmrpViPQUqTrbHT42Zi1pK42b81aCIRYMLsQgDArgSWkeKeic983v1hseG3dBzxa1CWIFHDf1PrErERc1um3slFu9dbd5bpQl2401zpu+u+8mp8MHKxdeIEkUc7UWx/e0Vrd6J9ma/f7es6T9vtZTkk2KUZQU/Olz7wIx58e4anaxJZ0pQtYfqheG8VZqhLaE4kQ6Bx2lg6jeQ6XRy0NKqUy9cgOhmkwpXO5r229DlNqY1d/HJeWpGO75epSl620vJcpMRlXz9WLlgpwsL4uU2fjl2ObbvKfdVP/2Fnda2djZ0Zv2YmQdnY3fv6L67SmUtJxcdhpkCCXQNgeaG/L6Aw0G8g/exnc0Krh0xI0g9Y6TTHRcOdBG5j5/0a6w3O7UmFttGdLZIW7iT1k8TiNTtdC7h2t3fR6uSEOQnahjMRErHF9V2mzsY2Vg/JeR9KzERWVfT3Qvv3JC38ULRu1l6xcf/OUPvuoJQK2QAJdoQjqRzOUzGoTJ8v2WHAjN1SbCS9nCeUNNLY+V2ZS4UnnV0Rcbcumja193RzkbAHNAu6+p/3v/ELqtXt42S1m7Nsu/Tawq17ErJDapQbX1pozusSIUscT957JPXYlxQXDdLtoSpePmw2bt8r/QD4fsrqqNZv9pJtsWDbFvWMX7JP7Glq5bUMKASNI6b6p0TrnyGQYJemNFStsrndyRioX/i8wDGNbIhL150ntPEsEnBvTuTOu31sKdW4eGhRzJ4qOXVJhOWFuz05CeyrqMGu0grc+eKUhLTgOTlpOF2kVGNntKEDjO9U9HAS67R6Uwn+9vxkfD91ta5i4k9dvRINtG1GvhQbrI2dFouyZbPG0WOP/2c2PhzlPfCzHxrqnCkjDXsqEQZlFTWoqhFagDA7SNxgE1JdFABTCzZG70k6E67GzviAYlVIfEkxcGzLstD8pJME7xNFAW58Qnjs8QTMMuyE3OkLNvvON12prK5FTW2s/0fN+gw2drF/axovpzdG9s67fc22qxq5lRuKUbBiO1ZuKLZ8rY1OHrKlWN2/9XXxK5lGHY5i189dvh0jptkHft5TVo3rh43zUZyxTWRjF4MEOwIAcPUjY/H3l6aEXY20+epatGonXvpyHj4ZvTR6TB9WINHQBEESRLiTrTvL4u18/MSxS+ONwN3Z2Bl/p6tY5/c9atI4N+CahINVnMFE37tq3YfcFQ+Owd+enxyXecRKYFIPx5wI7MvyM4ZEHaVyInjsg9kY+tp0y3KCtrl1jcdil67dhdJ9iUVTAGgpVg8JdkQUzb4nFUtwltfHueCHgxa2pXhvfBT2CCK+l+iSIfDUS76UvVBUXIE///tnfDJmWWB1ctvM1Bq/23jFupyMg6S6pg41tXKjeaf74ree2SLYaZjvU6LLieZYfZrNGqAzeTCUp6uL6V/JsBfWhJdcK7WhDr1g5yrcSVDV9ZhPUGOpX+eJiqpaTJhdmPodM5IICXYhs2nHXrzw+Vzb4J9mvKTNONKkb9U57ubgU2Pnsz62eeq+4v2gbSG0eI1xtxA/dY3ONy4vTpex1Erzk0zB87J//oi/PjdJXh9dwdt3l8cF/vYbL1Im2A3/eaWra1P5rGYu2hK3m4C8Lql17QKsnSe0Dyu3fcDoZW+/bKqxUt0ZxyC0WaQ1LMVKzhvkoDTph4ngV2P37veL8Oa3C7FsXeZEj3CCBLuQefXr+Zg+fzNWbSgRBxzGqR27y3HJ/aPx02/rk1cp08CSysC3cXYSIc380RAiufLB0fctScZSrGSJyA9ubn3pvipTmASLvNJwpvBy64MSHJy0ANt3l0uP19crUW3eUx/9Fn/e5+3Naxwb8s1V27G7PC44bBjLW+WVNfj3p3Pw1Ee/YUvRPmwpit/NxkrwTuZwYbctl6iM+schSLesm7p9y0b9sjYuD8t4dAYB1NktNrBhPuDporqmTq6NM3vFenj4C1buwIX3jsTe8upoH5Rd/9VPHA+/4y0ObDpAgl3INMoVj2BywUZX6bUv918CMFYeN3MdPhq9NNStWMzECRfhVCMm2ElGO2Fj5y9fd3Zeird9TaN19Vcnt2zbVYbrh43HD1OtjaG92tily1JsNE3Ab5zf3F77ej4ufWA0AATaPxs3sl6KvfnpiXHBYYdP5NF/p6ovllWI9hbvrcJtz07Cbc/KtZpAMDKEWwcprf1GTVj8XfG1jO6xIW608w5RWQwfB0EIxF7z2FK0D9U1dY4fpJf980e8/EV8JINE+ur/Jq+CogBrNpVEn3+eRJv95YQVvvc9DxMS7EJGW+obP6sQG7fvdb4gwInw7e8W4Yepqw2DN5DoAJ7g1QmMMEF+rWtfb4alWL2dtG/nCec0F947ylNYCm3ZOFGv2Hjth7Gy2ibXc5Zvs84jenFCVUk9pgB8YWscubrk9sPU1ZZaPSD+Njttx6QXCNy8Lobg5gF0sJra+ujSv4w9ZdW6gLvWFbR6PvX1CuYu3+5pHNm+q8xVOu0D6relsfffLgSSF+cJJyHMjJNwaa6Qmxh8svqWVdRgyNCR+H7K6viTCVBRVYvbnp2E14cvcPWs3Hhda0oSN8S2P4vtk5vXJHvsTxu5ScQY6wvgXgDHARgIYAXnfKAk3WAATwMYAGAzgFc5529I0t0L4HYA+wFYCuABzvkkU5pWAF4AcDmApgCmALiTc15oStcPwBsATgJQAeBrNT/r0TCN0A+0ldW1+Fj1wmyortvmVptvw7R5m9Bjv1bovX8bd/kpCr4YvwInH9HNUz3qJLs5RAfQiH/52u2EM3PRVg95ir+RJKvstHtRV2fnhJA8w/FE8RegONFCE7v8I51XthsKt7r4OPRA0MPQc5/NwW9Lt2H0SxdJz1/36Di88o9TXNfLLPxNnbcJU+dtwiM3HYNjB3aVX2v67dY2VbvuxS8KcMqRB8Sdj777AXipO2Fot5Vcp/u3NNyJxYX647vVPYt/+m09Lj2tr+d6WlGiOqWtWL8bJx++v688zE2SBdC2Qr9DjhbSprEHwTDdcduSQwCcD2A1AKnrHGPseACjAMwHMBjAxwBeZYz9xZTuXgDPAHhLzXMVgDGMscNMWX4F4EIAdwK4CsD+ACYxxprr8moLIfC1ghAAhwK4BsBHLtsVOmYziKiRtNOAGuCA20IXUFKrh18S9op1uP7FLwpw10tTMWp6/HKgbPAqq6zF8J9X4sG3PNpJqFnJNvyORP9nzZaifRgydGRcINlkiDv6MCx6Zi7agiFDR0YDhzrnY3/eabuw9Vv3RG9LOn6XuFoGN/1NdHJO9m0w3+fGjYKVJvSTfBBt0Wu7rNijhr5wde8t0ui92Z3sBN1qumXvtP4D3Owz4eXd8boC4MZ5wsmWz7KPGpZovQupbpJqu1S0at44MBtuJ43dtl1lKNlbhbf/txDzuNjXV1F0S9LpGtvIB24Fu9Gc8+6c88sBzLNI8yiAeZzzmznnUzjnTwH4EMAwxlgOADDG8gA8AqHJe5FzPhnA9QDWAnhYy4gxdiyE0HcL5/wrzvkYAJcA6AHgRl2ZtwFoB+Aizvl4zvlnAO4CcBVj7BCXbQsV/UvtRkuXjHcvyMC2xw8SX8n7dWjukFJOvNAgvycfjIyPZm5nuuLbk9ji1jgF8dSWDsy2kMmJYyf+mp+jtnyyyWmJX3eZnb2RtvGA/j3Vt2brrrJoXmHLdcV7KqPaBjdky5ie67A7hNeubghi6+Gh7iguxw6bJWTbMrWlWF9XC8ora7CvvBrT5m3CRfeNwtad1sutrjV2uhsQFRalPgla/d23QJ9y3oodlmFwNAwWIm5sRx2kTQWKdDtAL0KqFy19eaX42GzetLFDSrvyjNi9+/P4Dtz6zM+44bHxGDerMJaHR3vmTMGVYMc5t33LVIHtdADDTae+hFhuPVL9fQKANhDLpVredQC+ATCYMaa9PucBKAUwXpduA4AZ6jno0k3inOutG78DUAWhNUx/9B00pCgm8cuf/l/0owd0AQD06NI6gRrp65LY9f9861ctJ28XOgxkTgOdFui0SWNTF0vCGBINWmqqVKNGomynSULPJz/qAzIbz0WXYi2srhVFv09muIPl/z0+AX98fILn67R6h11/vzi9l153FfBrEnLzUxNx89MTLc8nen+tAhRrfPzjMlzzr3HRDyu7LdicNEa/q1pGfY21PmDcUszoOJTI9/KE2YW25105T6gVGPPrWtzw2Pi48/Wmvit9JBZL3l6pqa3D4/+ZHX0OicbelGGXVeGWUulxRdEtnWfN511wzhN9ADRB/DKtNkv0V/8erP5dLknXEkA3XboVEoFyqS4vLZ2hTM55FYA1pnQZgWEQdfKs8ighvPPdQgwZOlJ6LkjTLD/j9QqdgXZC4RUkl2oDSSBeX+rfSCSCH39da5/WYuBK5lKsebBvogp21bUOXn+6ShXvsTZsd/J4/XTM0szdgStJFU++fGgswG95Uyy88o35xX7sKq3Au98vQl29v69Ru27upF3S1ySIZTwn4eJJLdyM7mZou7TInCcMJhsSpE03Ja528NT1IhB9OjY23bpZijVoJh0EaLes2liCucu3461vF6iFSCrkFVP9/bz79Uri1sDzVuxIMIfgceU84YJ26t8S0/Fi9W97Xboqzrk58qQ+3SY1nTkvLV173W+36RxZsiQ1GxUXFBjdtveUxr4kVvCYd2ppaWlcWgBYs00sL+3Zs1d63oqxMwul5QPApk2x5cKCggKUVRoHlV27drkua80GsfxSWlpiuKa8qh4zZs9BU7MGC8BjX26K/nvZcqPMP2/efEPd9Gi/KyoqoteW7mgirZd5uyyn9uzeJYTNoh1F0bTlZWI5Z8WKFdFdOqzy2rpNPNfNmzejoGBvNF1VTb3tdV7qqLFpkxBet2/fhoKCWNcqKxPlruCrgLJN0msBYMtuYe9SXl6O4pxY4NuFCxeiZbNcaTqtbus3xGKMbS4qQ4cWYphct24dWsM44MnaU6a7p/t25sWd9/KOO7Ft2zYUFMiXZ6ur1CDNS5Zgc8tG2L1XhNyora0N7DkF2RaNNWvWokl1zNFmXaH98ufefbFxY8vmmBbr5S/lFjb79sWW8QvmzUdj1UD9q2k7wTdXYsOmZrHzFu2THZ87t8DS2H3VqlUAYs9ElkepOm6uXr0KkXLrd7u4pAQAsHbtGjSp3gIAqDeNBYsXL4r+e/6CBdIxqqCgwCCMzps/H3mNc7B2W+x94pyjYnceqmtUm9aIvO1VlaJd+jmnaEeRIc3GTZtRUGCtZSwqivWtrVvldota2fX1uv1uFSVu3NTqXlsr3vlFixajdXPR77cWiz5fWVnp+P5uKBLtKisri0vL1bmtZM8+fPbDL9GPzr179mD16niPW+16vZBpznP1FmNfXrR4EVo3z8WCteVx16xcUyKt8+rVa1BVLdq4ZMkSbGklF4ns2v7GcHkEg2T0d7cEJdhlPAMHDkReXvzEEiQFBQXIz883HBs1bxawVXTSfv36AT+LDt6mdZu4tACQu3IHMHknWrduFXe+oqoWNbX1aN1CJ9x8aRz0Duh9MITsrDt2wAHAAjFQ5ufni3AE38cmiw4dOiA//0i4obLRFuDX3Wjbtm20fuWVNbjq4bEAIPeG09Wxf//+wPjYoHXEkUcA32yO1k2fVsu/2ZQpQEkNDj64P/p1bwcDanrxZa/EXWtVl44dOwCF5ejUqRPy84VfT/PpU4HdpaKOE2J1lOW1eOtSYNledOvWDfn5B0WffXllDfDtFld1kJ2fPHcjunVqAdazPX78dS2OOWQ/7LdrI7BoD/bv2hX5+QdjyZqdqKtX0KlDPVZu3oIePXsj/3Brr+C534uJbVtxDfr06ARsFAP+oYcdinatmkbTtdlYAozfgebNm0frtq1yHTCnJJqmQ4f2QGE5evXqhfz8Ho7t+fLXacCuEvTv3x+V1XWYUrARd199pO01GhNmF6J1i7yoXacx4/jJvkuXLsjPjznzb99djsqqWvTs2hp54ycCZeUYeMhAdO3YAlt27gNGb0Pjxo18PSdZ+XHpJGm8cuCBByL/0JhX4b7IJmCmdciT1q1i48aq3RxYbC08AECrVq2AHbsAAEcecUR054qxC34DNm9D23btgfXy/qlhaLd6/vAjjjDGDdNd17dvX2DaLjRtmgeUlcfnAWDk3JnA1iL069cP+f27WN7LNm3aAJsr0a9vX+Qfsh8AIOd/W4G6mLAzcOAgYKQQjo44/PCY3Zd5rNH9PuKII9Asr1F0PAaAg/v3R/9e7ZE7YgeAakQk9QaAvJ9+BvbV4pBDDgF+3A4A6NKlM7Ay9pEkxo1+8Q1S69ClSxdgxb7Yv5fF29FqZTf6YTuqVGEzJycSPZ43cRIAkQdjDI1m/Q5UVWPQoEHo2FYI7Ks3lQDjdqCFrs9b0XTtLmBiEVq2bIn8/HyDYMMYAyYWYVtxDb79dTeuOZsB2Ik2bdqgb9/ewPRd0rrX1yvAV5sNx6K02A5MjVlhDRo4CEvW7sKo3+bF5fPYl/LVqt4HHohGCxYBqMKgQQOxX4cWxgQu+njTpk2BPfFBtJ3uVyJUVVXZKqOCWorVNG5tTce1WXa3Ll0eY6ypi3TmvLR0+lHLbbq0xVIT7bQUK9Ef3/7CZFz36Djb62SBjZ3iL3lZ7YjFoIod21Pmfvuj+N0PnBXlm9XI9HZJfavbJVEFEvF2S3Rp7pWv5uHe139B6b4qvPfDYjz63kyd55oo68G3Z+CRd2eisUsbu+3FFloeta7bd5dbb2FlsUuJUzt37C7Hq1/PQ3llbbSoR96diUlz3AXqBoA3v12IZz753XV6M7c8PRF3vDgFQPJCU2SmpV4Mvd2Rvi2xZUd/LVRs1mK95OhkF+WmegZnAZeF3/3yVDzzye/SnSfsPEm37y6POnIYHFPMdXKoiBdnMGtTGw83J8H+YX5OZRUxraZd3lb3YcvOfVHbx2haeJtrohdleieVEJRgtwZANWI2dBoD1L8r1L/aOpss3V6I2HdaOqZzptCnW6H7vdycl+rI0ceULm0xBql0YRBrk0YLIGuHrJMHOalpDiD6dskielte72PrCTfOAUHaOrl5Tpb2KwHVQQucvHtPVTSERPFe49JETLBzF1kfEN6kZm55eiJuf2GKp8HdaWL6fupqTJqzMW4P1LQhQwb7OIHA5XVT523CF+Odh0jj1lWx3DV7Tr/96t7Xp1ufjNqoyV+48soaVFa7f6cBYzvMdrx+hNMtO8swa/FWSOQ6W/u/WYu3eC5Lhj6WpJdAyPpObCVbGwVOzTHLaw3tcWu7J6tiXV09bvv3pKh5UTStoqDeo82nAiUrY8YGItipDguTAVxpOnUNgG2IhUiZCeHtepWWgDGWq143nnOu3eGxEJq4c3TpukMEIR6ry38sgDMYYx10xy4BkGdKR6jIjY3NBv7+X3Sna52MgpPXxVLbebXSauvqsWh1zH4mKG9LbWKtqKrF2s1iGX3C7PWGNFpcJ+k+izr0E6h+I2z9VVahQ8w5u/1IKHMZWy/VaO9vTFGR4IyW6jnD4f3Sns9Lki2avKAZ7/udFNdvsw7BE+0jFrf+qofHxnbEcFzZMGqzAW9e4k7o+3NMW219T+yCexvztT+v19g55WjlaKGv56qNJVFtlyF2oWmZoqKqVpgpmFi5oTjuw9IOqwDTVnXUJ7Mazwwx6TyQqR7wdrjdeaI5YmFGegJozRi7XP09h3O+HsATAKYzxj4A8AWAEwHcCuB2zbuVc17FGHsKwDOMsSIIge8WCA3btVp5nPPfGGNjAHzIGBsKYI+a/wYAn+iq9h5EAOORjLEnAXQG8DKA4ZxzaSDlho5TRyreW5nQZCTrI/pDl/3zR3z37AVRWx3bxPE/Hcq2Th3EZuZW2a/bUop1W/bg9KO6x53TtCJDL+lqm0cyiC6X2bT9X+/OxIJVRZbnnTC3p0Z1DvGyvY8XXvqiAHx9Md5/6EzP19ruPGEKwBfUYJ/0HThM2e+xWjJX8SqoWm1BpR0Pol+Z0bKU1VSLfxath0Nebmone9SV1e7255WOd3aCXb3iKp3Te+P0sablH4lEDM/QqIGN/fvDUXJ7rajHvXrdYx/MwrJ1u+NspYe+ZqOBRfwHn9MHgVZ3swC4cFURtu2Sm44oUHy9j4m+wukYCcCt80RnAN+ajmm/bwLwCed8FmPsIohdJf4PwBYA/+Ccv6u/iHP+ImMMEIGEu0CEMDmfc77QlP81AF4E8DaEBm4KgCv0W4VxzksYY6cDeB3A94htKXa/y3aFRl1dPdZsNsbWWVa4yyK1Po34UvW7MbEshqNezb5qYwn6HdDWV96AOzu06po6S8HO3OG9TLB2KQOZfiwyueulqQDEzgs3DRFxsc31rlW/1AMTGFxko32p19kkthPqXAU+Nd2U0jLhGde6pX9HJKtyd5ZUYOo8/w4HtpOlUa5LCgtW7sDhB3VOYgnAByMcvPs9TkJmQbCouAJzV2wPNP6YZdmSIsxbrDlWIyoY2CXRCVvq33Wb7Z1KbIu0eYkMgp1tJvZlGDR2LrW0gPM2Y+ayY8pTcaVeow+IUFV2MQItcXgudfUKGuVGdDE6xfFH3p1pm6cbgddwidKANXbq/qyOPZlzPhYulkA55y9CCG12afZC7Cxxm0O6lQDOdSoz3fh64kp8PZGjTcuYB+vn4+JtXvaUVRu8XFeqm4P7JddhJDT6jvpAEnXd3HFsN/cOsI/py/War1bFcTMLcfOFA5HXONfxK/r7qaujgp1Vfma27NyHXSWVGNS3Y1ydE8Zs9JMElq41foxUqxo7za7Sysjb1tHF4ty/3rMZ1AMmGWP9v96bhe+fuwCNGwW32Xgq9+RVAAz7YCY2bt+HI5kQUL1OpO5LAmRTTkWlUZPm7Dwh8pq3YgeaNmmEQw7sIEnkq5KG/PX/jm5KIcnXEPfPptzPx6/A2cf2RLvWZj9DgVGws6qbNu5YLMValK0/rlg/CgDAfW/8YpGLPdruD3OWbceh6thnqINWsMslWy2pZ42dTrALamuzdCB7dr3NMAq3Cm1d6T7rpZPl63bjukfHYcbCmMFtohO/dKN4g6o+/ryX5Zto7Wwusf16TqR9pkuDmnNGTI2Ps+QXc/Nu+/ckPPRObB9bfZ2HDB1psM8z5OOiLG3SzQ3Y8nnt5lK8/d1CrN5YgpmLttqmfVMLSGpi5UbrDxSrtun3/0wZiZrYmRrj1sbKLc99NtdTes/NMTlP7C0TS6H1USEmeMHObueGOO1zxGlJUzDql7W6XWjkafSFu53jFdm/XZqEON25d39YZHnO4DxhkZP0qGTv67jrlPi8E9bQ2lz+4aillueiGjsXY5ii+HOESPwVTj+BkAS7kHDzdbBqk5j8lqyJLbt6+SKRaUucBLecSCQh4Upm92zOzm6QSKSP+QmVomfbrljQYf09qZFEmfdal5jyzNm2RM+YGevcF2pCe1dydPZupfuqXGtZ7No7bmahq9ACMxfFewHW1yu2e3daEbRnnp7o81GCXTI3E7YHnletxHrzMlvE8Md1hx0ydCS+nGBckVhRuBtDho7Ehm3GMuzMOfxoZByTSJ6J68ckkez0VTSPwUZvVqexwN25UdMtdsFxaoSLRu4scY60YCYIxVf8nAHsdbAfVRR/Np9JUTqHDAl2IbGv3L1XYPHeKjz2wSzsq6jxJFy8+338F59juJOENRTOam2p1tB0fex34nVxi9Wku7u0Eu/9sEi6P6QVI6evsaiU/LBsiyLZb+cTMbRJpZF6v8sqanD9sPH4yMJQ2isO+80DkA+ac1dst7/Ism2Je6iWV9Zg8Zp4+9RUrcKEPYlUVLlzCtAwa0m1rqv1by8T6Vc/ccNvLabmPO7svDNj4RaU7quSmHU4CEAu+qoxvIc35EKhOLZrTw0uuX+0wS7Uy9L1rMX22nDHukmOeX3PX/hceE+70Zglg6gjTSTiausu2f3dUmQdUkmBEpuz/FVRBPRPM2jniZCoqHIh2Knv6AxV6zF57gZPX/wFko4g05ZFDOddZy/FjcbOrog4wcZL2abUTmPoisLdKNlXheMGCo9VK03ixN83eKiFVd2Mf83U1NYjNzdHItj6lwSiS7Hq1k1lqkfhzEVbcOvFgxLSGAAul0ckD6HWIdzEtPnxQbSBxIUvBcBz/52LeSt24PPH5Wa5Wps320wGXsvUkwwvUi8sL9ztnMgCYbNlfAiJeFRrxD1XiT3Hs5/NwSEHdkALbVcIl/jtPm5tF/WPM2oWpv5jW4nob7MWb8GpRx7gKd8gcFTY+bzOD167bnS81CkKnMcrRZpm2AezHK9LBM9BkVMAaexSyIjZu1Ggaiscl0Qs3jUvgp1sEnGajO3s6eYu347Pxxn3ct22q8zUMSRfsKZjtk1PSENn+u0wid73xi94+uPYzgX6e+P0eJo28WcAbzmIaOEjXGos3dym+jrNPkV0c01wjQ2aLjKxIddGZRcNtSK9zv7mvvKVfN/SILRqhVuEbWu8mUIs8+XrduOZT+aYjvrEdAPqPAZQTTeC1NtEl1xNx7UlN/PzLiqpkI5/Mg2sp3q4cLKynvx116r/NodrsRoeEzMndnNxfBo3z0+Wd1h+BVFzkoiLECmQz3nVNfZ9LmwtejIgwS6FLFhbjsc+mA3AnTFq3PumuBsMFq/ZiXkrdkgHI6lAqd/9Ise6jMf/MxvDf14Z/b1uSylufeZnjNTZeEi9qDx0nLjO6+IrzXVeDrg1EJ7Pi3zLn1ZV0oKmBvm1bHae8Bp7zGnycOOUIXs+fpd1UuW1tlVna1m8twoLA9BKadTXK/j59w3YtMM6QG86MaB3++i/FejMKAKxo5Ko9wG89T9z5CtBbk5E+u7ahcBw059ciUiWAp+Liy3zjF08+hcLOzkL3PThAEzsosjGRqtdNKR23D77bsyRJgLHbyLFaqnbuqGGcCcRYaqxw2qLxQyCBLuQcPOeyzqeVYf+8dfYwPDQ2zMw7INZUsFm9C/xtl/GOEeyTimvn7ZctVwXfy8m1/nsyB7T241vdgPXlIL4PUndChz/HbccVR63NIot08jPX/uvcep583JyIkuxYiTUBLCoxk6ri++cBW7ul+z5+BbsfF0Vw+4jQP+Om6v3no13omOZpt/19cBrw+dHYx+GwYr17pdju3bUbYquKNFnEIiI7fACmsvIicgFOztc9R+DFk1R/xqTrN5U4nSpZyEvkf4nM7Oxyt8qQLFVDYZPXGm504yeNZtKHdP4xezEtKes2nkpFvL50YuAe8+r03HzUxNd1y9dIcEuJJwmt+27y7F6Y4nhmAL5QFVbV4/3flgcd1z2kq+WdEaD74SDMbIsf/3XnOxaLwsCcTZmTpWwC3Vgc+7lL+OX+1KhD7LTgr357QLXzhNunpH29bpx+148/p/Z8c4ffo1wVGy9m6NFyJ5+/HWlLkKZJMPbTkN7l/eV10j6pvj90eilGDJ0ZELla8J2kNtaeeW+1/3FHgOAHep+1EFoT50+As1l5OoC1kbTBNBr3Qx3ljsr6C6urK7F356fHDugmSOEJATIy41gzaYSWyeayXM3xm05J+vrqWiV/nk7yfSKooCvLzYcq6mt91RPt7a1aS7XkWCXKtbqdplYt6U0LoK3mc1F+zBtvjHKvkFtrCMuJIGK649b/VKshwE7KtgZJkJxLKJ7szztHuFtJda206aj7QQvLLY8N2H2+kAnAS2rz8evwNzl27FWtS/TljTMJR3er5On/GUfJ369mp/S2TomDV1dzO/5FjX8ysTf18ed037+EEA8w7DDnSSCf/MD+ZWyfUDtyInEC3ZBoK+fV0N4vSZv1cYSbNzuYYk92a+CJP+a2jrc/co0/PuT3237ZpVpT+/q2jrsM4UbSarAKvn2dN6GDNExTmNL0T7beurP2H0kmDcGII0dAQCGWE1O++rZIRNWrF54t8sWcRo71x5hIp3e1spun0c3eO0wxk24E8vLD7Ko6XY8/7l9QFnzI3OaFO0wp3n1q/mG4+YsWjY3ehwqANbr3luzHZDX+7t1Zxk+HRPuFs5WNT7q4C4AgEP7dYoTWBMJzmq+RWF7xYaBkwOQ27urKArWb/Vmm+j1HTVo3Fzwv8mrdGVZ1UH3b/1xj5Kd112HZLlrWntH72jTxcvW7cY1qrkIIByMUiHbGDR2Dn1H+qw9rEDZMfS16SjRrSqkey8mwS4E/C/DyDc5tnrh3Q5qTjZ2VtRFvS511+jc092UZ8Zc40RCcnj9uvfTWc3CEAAMn8jjjrnNO8g4fmbMAT65ydZK9szueGFK9N+T5xrtEu2qJntuT340G/+bvMrgnJAuaF7OOZEAouwbMN6H5GzBlRr0j9TLLXJssUVm5sPrt+11DFIbV3Yqb3fcMrGn5FIWrirC3OUikoJXhUAyP2zvf/OXlHw4G4RiFzZ2ZiJwuM8e2lBZHVu+dhN6JUxIsEsRQYzniiIX4qwmCz/agR9/XetaColp7GKvkexSu3d84m/rjWm9buKs+7dZKE1F35JtqfX5+Pg9f93eU3Od5/EdWGdaXkgUrYgH355hn87hBsrerxUmGxc9idqVWVVnV2kFvp200tPAv2N3zPPt20krDfZe5hXmSASGJbZEBm3zXqeZhO/JzEllZ0EQ9nNu4vYFNU7EZ2N0VhL/9lbYI+/OxOP/mZ1QveJrJOpqK+8koJOSjQtev5W0HPR5Oc1n1TXxDm3iY9W7gkN+3t6WXE/Yn28k2GUQVjZ2Vi+8exkp9sJOXyAPDivNX2JjJ3eesK7IhNlGwc6rLGq2+9DjeTAxLOsG70pR6SLqv+z56pd7AiGgmcyt7Yq52KCD2L/weQE+G7schRa2psZKiD/6zcs/G7vcaH9nqmAEEdslOr39bFxxphvxwJv+HRfSHa/ym3b8s7EWy/Mpip0WlHZlwUqrfZ1F/jW1ddiwLZgPBN/ox7UEi7d674O0g9TfI6e5QbYvdSQSnK21/nV0bGPIkh0Jdikj8Se9bVeZ9IWy1Nj5WIoF3NdUE+zGzyrEtf8aq16sLcVaZ6g/xePsRrwtRf770zmxfE3tSGRJLWi5ToG75+H2mXmM4mCgzEJr5LXJTnGlVm00PttYvKhgb265uqOGc1gDFzdNshQbMY2S5mze/k4ed02WNoNXYg3ItGmypq3cXGH5YaI9j/IUajFlzhFBPRIn7eCb3y7EotWJBVP2gqKIuWFXaSx0SUR3LtFdMOZbCbIBCHZaHvrx0CnfjdvjPVpzHMI8eKqph9isYXdzEuxSRBAB58fNKpTmY6mxczmLFBV72+hZ29dUL1DuVfe+1S9p+cHrxKd3b99ZWoE/PfUTtqk2XInsbxj4x3QiApvfuiTYBqcq2wqhiogJJatOsDZs3rCazPRtiRPsHOpr18/CHuCDxOgEIFPNxx/7ctoufCEzTXCBq9fE46t03aPj4g8m+SFpt2W5KRKCl2LNoa9clQsbT24fzldukXUH33FNdXn5cjyKONkC65IGaTcaMiTYZRiyzpaoQfbXJmN/pw7960IRcVw2sWvHbDX+kQjKK2uwq1QiUCrmn+7bNnnuRhQVV+An1W7PbT9Nxq4PfnFbh1TsOblT9nz0dbD9Eo4/p9lPBr0U6/aejZ1ZaHlOmzQikfgB3qm6dgJuOrxTiaC3i3TaeivopoYn/idnmTRuBz4PRfzj1WneC1SUuPArsViWri73RZD3zhjHznu+W3eWBRYzUi+cJrrPdrJpFG7xDYlgnrRVgOJkYtZYaDGONK9YDUXo9+OQdYLbX5iCnSXxgkP8lmLu6xlb6Yt4urRkbxU6tWtmOJaMpVhX6ZJkn+IKU5udYnr59Tr2vb2Qw3k32Vo5L+g/jsyaXidTBVuNXdgjfIJMnx+zuXWOI+bxfQjigQZAKj6SgNiezdFyk60phP+PDouh3F25sunI56M0Op54v17bwjMIDMqKNO/WJNiliKBsa2T5pDqEgjaRyTasl64kSg7KhDqR2Pjz9W8WWNbDvMSjD3g6duY6adgRGbm58d5rqUC2J6ZsIA5rDHGMG2XzPSHV6MjsLwMkEbtD7eNIeMXGO0/4LTfNx39PGJbFZO+px8ami9CbdAErGj3A5LWf5LdDUaz7qHDEc5GBD6TCpI+s1m/bg607Y3Zzyblf3vOsrqlDmWrXG1yuwUKCXYoIavCQhjtJssYurg6aYaupLgqMWjNFUfDpmGXo172dIZ3dNGnuvFoMJxnmJeQN6rJDTiSCd75zv79nWDZf42cVxh1z/Z4kIMS4vcBp4q3zKNEkyXfCm22MRZXrPWjsRCaxg3YfVmu3OW+VlikYDNlle3IGXF6oS7EB5KG9N1peidj8+kFRFEthyM0qj2+NnaLgv+OWo7kSe/e368ILucvDGEMzWejHg6kFm6wTIvY8//HqNIN3szzjBCuWICTYpYxgnrR+sl29sQR9u7dFbV2wb5Fs8tMHs9UG9TghU1GgtTMSEQ4V302JN961+9pJxB6iyOc+llGNXRJ7YyK2c9ozX7p2l8dCE0vuZm9GK2RCX+xIsBNcEB9N+onO3C7n98m6AttL7L/sMwmj84TsfLD9J1XfW+Z6+3FUcINZsKupSe4H+YZtex0c45JjJ1avAN/8vNJw7NnP5liklrNph0RwSrKw9IluV5y5y7dbxg90FOqQuuV9K8h5IkUkQ2O3bXeZeiz5Grt7dRuHaxoKs6ZCr7EDrAd6u44RhKGr1wkhSI2dfgsuP1i9J1U1dfjnW7/G0iVUijuclmLtzg97f1b8wQQ0dmN+XWvY0kdGIo8x+i5H4peSnGzs7Pp28b7MDUhsxhBTTKax8/hSWoXLSDU7SyoNv305Ktih3hfzUuxz/7XfXjBRHnpnhqsAzTJK91V53uVDIwgBXz/fhMFHo5fEHUtGbNNkQYJdighqItZPOtqLFrSNnflrwxxYVyvOzs4mWQbybvAaXiH6JR1A4XbLB27Gu0S3hzNck2CDXvlqnn3+njWCmjbX+7vx7g+LPV9jVQsZeo2d2SnIqb6KAqzZVILvJdrplZsrJVdkJkE7T1ja2aoEsfOEG8wmHYDYPD5RamrF/dDui/kD0q/gFBR2j2tHcQX++PgEX/kmYz/kJo1zUZMEk6PPxy2XHs+Nc2H2RtjmoyTYpYqAnrR+0smJAKs3ldhGvg8C824U9RZLscIgVxwbP6vQ4FHnljA2SU/F9OG2VTLbl1Qse/mhzqOmWEueYlMjA1a3LfpBAucPpTjHbUXB3a9Mw8c/LgUgdhhIJTMWbUlJOfp2m7VPycCN/F9Vnfi9lu1YsqKwOOF8xxWUJJxHJpKMoam6pg7DJ650TuiRnaXyDy+ZPWTm6OtIsEsZydDY1dYp+Mcr02zjc/nCobJaHeInQKOO6P0Rci2L3YBtJyzMWhy/L2sQvPntQtTXh20VIbjrpalxx+rrFXwwIn5pwIlky37vfu/eQQWIaSh27wnWocDVVmIqbm6J+ePCrGkxC9XmPC994EfX9QmCZz+dgwoX29Uliv6+VEtMJoJ+3xo1cp6eRk1fE2yhKhXVwd1PBcLBze+yaLJI1vCQHiNpYsg+XGYvcT//hH0HyHkiRSTDxu5Vh6UyvzhVdcS01dhZUhF1OtAoKq5IWJNkpwR65pPfE8rbimnzN+Gc43vGBfMMlARuy4rC3Sh2sC8LAq/Pzq+A9uEo90JqbV09GuXGT/DJCvETicTnbf4QeX34Agzq2zF2QJd8nMTTORVc+dAYjH7poqSWoX8/ZI48QT8RN7avyfp4mbEwOC3ogpVFqE3HveSSdPPCWHUJGplg9/Z3i/DLApfvBS3FNhAC6kR6D1jZV3MqKN1XjTEz1mGvKYDt8J/dqcr//Ym1h9SYGWsTqptfHnp7hqcQKV6prKnHGzYx+dKBdDQOvuT+0dLjK/xqPyzXYmP/NE9M5tsybf4mw4bjeg3F25LYhKliyNCRSc3faQgL2jygsQuNXU5u+r2zMtLBdCJVZIFch1zJxyQALF7jbq/fsG8BaexSRFAPOtm7TADuB6Epkrg/bi4t2Wet6dm2y1u8o2QRtJDz0/xSbNrpz1haZu9hdZ9vfCJm8OxVq5WK5Tw/yN5H2Rd1MM8sgrr6OtMR/wGKswkn54myimBDu7Rv3dQxTSps/QKhgbwjQHYIsam2kw0a0tiliCx4113xXwsvo0wj6MHJr1AHeBNYdumMgb22wS4YdJisksQVK97rz9vUjaw7ae5G4wGH299AurbjGHbrMz+npiI6UuU5myhrkuzg5oek2dhlQYdYuaEkoevDvgck2KWIVi2ahF0F1/h9KSebJ8QMZsLs9WFXIYpMKeFq+6wsGGABYOhr0+OOPWOznO+HtVvExBuJAPNW7DCccwxPnC032oHXhs9PaXnutojLjHuvj0GZ7TSU/mDHO2PD/UimpdgU0altM+dEBCEjDW3f0pXaunpUJhAC4+mP4x10Ig7LffbR/bOHZIdV8kO6mg9kAuWVybl3Tkv2DYG9FeEu5ZJgRxBpzg6P+yxqaFooGXrj/2whAmtni0TzJVKPG23ckjUet9kjkk7pvnADLxO0FJsyzHvnEUQiuJn07IK3ptNSc1AkS0+wbVdZknImCCIb6dEpXNMrEuxSxDy+wzlRmkA2EkQmkqz3toQ0EKFAwxCRqcgiGaS0/FBLJwjCHzTppQzzXskEQRB2hB2FhwQ7Ig6SGYhMJFkanmTtckEQRHYStl0uCXZEHJPmZE/YkmyFRI14GooJwdnH9gy7CgRBpDEk2BFxLF3rbtsUgkgnGoZY13Ci3zQUQZ0ggoYEOyIOWnlKf2jSk9BAbkk67ulLEIQOsrEj0o3Vki2cCCLdyZRdCBKloch1DeNpEkTwkGBHEERW0FCUmDkNRbIjiAwl7B5Kgh1BZCD1tF7eYAl70kgVm3fsC7sKBOETimNHEIRHvvqJh12FtGPrzoaxQ4TT3rXZwioyCSEIX5BgRxAZyKLVRWFXIe148YuCsKuQEpK9EjvsluOSWwBBZDlhW0uQYEcQGUhDsScj4km2jV2fA9okNX+CIJILCXYEkYGQXEckDXq5CCKjIcGOIHzQvUurUMsv2VsVavlEeJBXLEEQdpBgRxA+oLmVCIuDe7dPav6ksCOIxAh7eiDBjiB8EHbHJRouzfIaJTV/2tWEIBKEnCcIgiAIt5C2mCAIO0iwIzKesO3dCCKVkEKNINKbsL+9AtPpM8ZuBPCx5NRbnPM7dOkGA3gawAAAmwG8yjl/Q5LfvQBuB7AfgKUAHuCcTzKlaQXgBQCXA2gKYAqAOznnhQE0icgQBh/fC++PWJzSMmkjdiJbIcGRIDKbZGjszgVwvO6/F7UTjLHjAYwCMB/AYAhB8FXG2F/0GahC3TMA3gJwPoBVAMYwxg4zlfUVgAsB3AngKgD7A5jEGGsefLOIdCW/f2fb8+ed0Cs1FSEIgiCIkL/7k2GFW8A532lx7lEA8zjnN6u/pzDGegAYxhh7n3NezxjLA/AIhCbvRQBgjE0DsBjAwwCuVI8dCyH0nc85H6seWwxgDYAbAbydhLYRaYiT9oz1bIexMwtTUxmCyHBIY0f4pUv75miUG8HmooaxvV+6kjIbO1VgOx3AcNOpLyGWW49Uf58AoA2Ar7UEnPM6AN8AGMwY02bx8wCUAhivS7cBwAz1HBECRw/okvIynVdFadmUINyiUMATwidiLKbxNuw7kAyN3RLGWCcAGwB8AuBpznktgD4AmgBYZkq/VP3bH8BcAAerv5dL0rUE0A3AJjXdCs55vSTdOYk3g/BDJIRX2ilgazL2TCcTO4IgCCMRRJBDLpmhE6RgtxXAMAC/A6iDsKH7F4DeEEuj7dR0JabritW/WtTNdgCqOOcVNuk2qenMeWnpPEfwXLJkiddLCAklpSUpL3PxEnvHicLCwsDLLC83v54EkRpWrlyZ1PwXL06tIxKRPVRVVaG2NuxapAcFBQWhlR2YYMc5nwBggu7QRMZYKYDHGGNPBlVOshg4cCDy8vKSV8CXm5KXdxrRrm1bYPO2lJZ52KGHAiOtyzzwwN7AzN2BltmieXOgpDTQPAnCiUgEOOigg4DJVmbMiTNw4CDb/kQQVjRtmofGjXKA0r1hVyVUIpEI8vPzk5Z/VVWVrTIq2UrTb9S/RyKmcWtrSqNp8rSZtxhAHmOsqYt05ry0dMHO4kRa47QsSqFJiGwhAiR9zy/aeYLwiwIab9OBVK6GrwFQjZgNncYA9e8K9a9mWydLtxci9p2WjumcKfTpVoBoMDjZ2CVlnKGxiwiDBF/mRrlkAEUkF5LrwifZvfxqCCG+gHNeBWAy1HAlOq4BsA3APPX3TAhv16u0BIyxXPW68Zxz7XNyLITG7hxduu4ATlLPESEQRqd2+kJMhkNHTjI8MggiydCkSySbMBzoCCNB7jwxAUJwWwKgHsJ54m8APuScr1WTPQFgOmPsAwBfADgRwK0Abte8WznnVYyxpwA8wxgrghD4boHwqr1WK49z/htjbAyADxljQwHsUfPXvHGJBoLjZJWEcSaXZkgiBBJ969xc77QSmxMB6jNgtbZ1iybYU1YddjUaHBFSCodOkI9gOYA/QdjVjYCIWfcAgOiuEpzzWQAuAnA0hKPFLQD+wTl/V5+RGpj4IQB3ARgHEQrlfM75QlOZ1wD4ESIY8bcQmr8zOeflAbaL8ICd9uy7Zy9IeZkAhTshsodE37tIEJ0hQ17+p/5yQthVaJBkxtuRXHbtDdc1OEiv2LsB3O0i3Vi4WCpVhbsXHdLsBXCb+h+R5jRpnJuUfJ3nKlqKJRoOZx7dAz/P2SA950pj5+CdkSlvPhnxhwPdd2BHSU2o5ZPSNA0ZfHwvtGjWOOxqZAxhaOxIsCPSFTvBzNWc67DMmm7zdqPcNKtQAyfd3o+GCAl2aQjr2Q4XnXxg2NXwRTjOE07nk6Cxo9GLCIVIgh99Qby39O4T1pDzRPiQYJeG1NaZd0nLHFo0Tb2m0VFwo3GGyCL6dm+Lf/3pWMvzds4PkQhwzID9bPPPAL8IV2Rqt7//+qPCroJ/FKC2PnPnr2yBBLs0RFGAfZXhrtH75eqzWMrLJLmOaCho7/oxh9gLZ1YoCvDIn45xSGMv2qWbFUK2xVPusV8r2/OXnto3RTXxx5pN2bsjT9Mm7uzEe3dJ4i5WLiDBLk058+gentIf3Eu+Pe5RB3cJojquadw49a+Uc4Bi+/MtyZ4xaeQ1ycURBzYPuxopp1Xz5LxTbmSqehtJp6a23rE/OApKSTZDGP3SRUnN3yu3XjTQdVrWs51zIiciwBv3nmZ5Ol3se3NyIvjb5YcZBc0UVu2vlx2KT4ed45wwQE4+vJvhd67Fs7jipA6pqI4lJNilKb33bxNIPq1bNAkkH7eEISQ5Bih2GGwOOTDcTpjNRADkSYT9Vs3Fe3nWMd4+YIJk6HXJ28vxlCMPsDyXbMeo+jprySwIMw9zf/IqaNh9tJ53Qi/P9bESZP0o8i6RaMMuOOlAPHv7SfjqqfPw5n3WAhcAdOvU0kepwEmH7R/9dwTBfxhce07/QPMDgMP6dsTg43vhpiGH4NGbVdMABdivQ2o+5Nq0yEP71uadR/3hVng/oLPx+V5xxkEAgL9ceqjhePO8cEUrEuyyHNkXResWTZL2Vdy4UXJCmtjh1nmiRbPGGPXihbjstL6m8+7KCUoA7Nw+uQNf144tkpq/mTuuOMzyXCQSwWmHto473rpFE/zw/BDcccXhSayZNa1bNMGpRx6Ac47rKT3vdsnFEhupwosc9Pb9p3suWm/j1Lypc0SrCxN01BpyUnCOXieZNCJu0Mt1+mVMP3vetpF8COfkRHDIgR3Qsllj9Nwv/l3W41eZuW7LHtdp3bSrp2k5d0Bv+YpOIug/qLvpBJ5zj+sVeFnyCgSXlVkT55brzu2PkS9ciIN6tA2uMgFAgl2Wk8q9If9whL/OkSjOW4qpKAoikUhc1Hyn6x+5SdgklVXE7B4Tses5bqA/+yi3NG6U2m7dsW0zw2/9pBKJyDV27VrnoVFujqW257LT+qZEk9rfYunsDI+mEGYmWsSREyQwI+neVav7U1RcEf13n25tHbO88syDDL+dBAdz7d0IM6ccEdNg2i0VD0zwmV+XgGaqzwFtEn7ufj1CNxftw/+dJ7ZHb9e6KTq0aeZwhT3P/O0kw++ke/HrHmluBu5H3LpFE5x+VHfpuUF9Otpem5MTCUWhYUfmPYEsxEpr4IVO7eQDQa4kxpPbPj7sluM81eE+G2+uls3Eq3bCoV095ekGxx3F1ASN1QDJ5onLadBrp6r795RV4e6rj/BVRz11NktlAHD94P5o18q/8W2TFAt25nn6jisPtzzndLxX19b4+F9n44bzBqBZXmDx0y2pqpEvTd584SGJ5VtdZ3kuJ4HHo39VrQS7VRtLov92CjYsw6uJ3fkn9o5LY/640NfDTnDUPrL8yiF6odHp48u8JH7PNUeiaV5iE3Qi8tPlp/fDd89eEDVTkHH4QZ1c5WX+Xkq2XV5NrehHrVo0lq4S+f1Is4tRaNeiPw1x339vvXigrTAaXWa2oVdXe01uqiHBLg2444rDceP5AxLK44bBB0uPd2lvvSz3wUNn2uZ5aF/7LxUv/HWwcOK4/4aj0UKyPGSeSJ+87XjXeTsNpi3VgVLTKNabVHZO17dtKYSsft3bRQVoBYrtFmmP/9m6/v17tTdoMMyclt8dHdp4sx3RL7/Kvh69LMV4tQFrqbMHatsqD/17xsqqqva2tc4RrDM6tm2G3JyIr6U0r5xpYeOX6Be4+X0+UWdDVesg2Gv03j9+sjj72NhH4CkmDfkFJwkBy24s6XuAs+1uba1R2L3+XKMWTK/hHv3SRehk0tge1q8j/nHNkZb561tvXjKUlaHxxRODLfPUaK4Lt+T0/uT37xxXrya65965fXN0tOmH+mdhxklr/rfLDo07FolEHHfnefgma49mg1mD6f7ZjXFu7qsezVGvdcuYANpjv1a46syD8OAfj5EKY3q7Oy9C0N1Xx96jl/7+B9fXebLzc/yScZ9VukCCXQg8+Mejo//+6knRqS47vV/0mJ/pzGoguegPMfsXs+Hnfh1aYPRLF1l6zgYZ2LdFUzFg5eZEcLtuALrijH4Y/dJFuPgUo93b4QcZB10Zrw89FW/ee5pjPVs0bYzPhp2DP10gJluzYOeksWvcKAdv3Xca7jUZ2+sH4SNZZ3z/XEzQO8Lmy/rUIw/AvdcHa7j/xr2n4cBuYtKWaUW9LMmfeXQP9OraGh89crb0/BCTTVb/nu2jE5V2J6PLswm8Q7J+cCRzfi/MPP3XE3DxKX2k5/Ia5+ICicYJAIY/fR7+drm1/aAd5vdZb8Mj0+Y9d8dJUcFM4/iBxuf4+ePnGoy8e+zX2jCWdGrbHKNfusgwlui1wx8+fBae/uuJjnWvNmkxI5EIjuxvfd/N/e/YQ7rGPzzdb0WX/aH9OuGea+OFQJlQZuUINuyW4/DS3/+A+284Ckcc1AlfPDEYnw47x3Ycvf+Go+K174rQbGmCyYcPn4WPH7X2upRpzzTNmJOzwgCfGqymTeRa7Iv+0MdoAmFRLzOd2zf37GD31F9OwJCTD8SfLx4UKy8SwfWDD0bHts2QI1FJ33JRLK2dTa6ZZnmNog4pzfIaGYTQPNUO9o+SDxmZTD/yhQt1dTjcdR3080OmhNYhwS4E9J0sL1EjbQcikQi6dhDaHNmSif21yaiRe0NVq2XPvt3bYvRLF6H3/m3Q0+XXX7vWTaPqdvOA7GaZosd+rdFUtzRotqVp3CjHoOVJSCi2GDxusfHcymucG71uoMQmxMotX0b71k3xxr2nWS7v5/fvjC4mBxAtrprW7DvVgVOmdQKME/eb950WtW8xTOiy++DjtuZEInHPQ++1etul8doTQGh/nDSnfsItyDxTB/TugM7t7LUMbVrmeX6v9GV1bt/coNGyorrWKHial3PNH5E5ORHDx6ECJc6OTjH8O/YrAqC5ZMndbgI1a/laNmuMg3q0w8mHd0MkEkHrFk2Et6RNHicf3s1SK/vds0MwQicEWGH3KGSrEgYCFhBYj3aGvmHO3urj1esKbf+e7dCkcS7+fPEgyyVjmcauWZNc9OveFoC3sdEs4Ldu0QTfPHM+brtkUPQj73Ldh0z0Okle+nHeSzdKl/AyXiDBLgSSYchq11natRZLiVYvqNWSRdivs6Uhs8fPJvOtOdakCXF6HLLSzJOd287vOuyApFJul2dlNfFm0Ox8f51sfZxslfSPsOd+raXehjIjez/vZCRiXNZ9959n4OYLXcYmc7gVXgRmDbNXts+i49IcNyjeKafO7CkkwTx21JiWYs2PQeYgoF/6UpT4utfbCOzNPSz9f/jIWXj+zpNt6xcrxp/0lJMTcfVcZfdBG9sVAJedELwnql+sBTtv768bMw2re+dXltWqqPXhZnmNcMFJB9oLiA6Feem2+nu0dN0u9xeGCAl2KeLlu2P2Afr3MTcRS2odbt5Tq45g2QdSsB/qhm17PV8TtDZcJpRpHmpmrLze3C51XnO2y505ZEKNU2BZmzuT6o3SA9kvUtIcP5pQ80TTspncwFuGnQenLG83yJaOgPjupijOXVCzAbv2bIb9O8bHUDObHcho1byxQZtfXSNx/FCzufAPB8afg/F17dCmafzHou6n+Z568YTt3C5e62gZxy6MZTNNCKlXMKiXtQY2kapJ2xUxLnGbE1l9eHrtT27SS+e0SCRaJy9FKroyvTxPZ6Fet7zqkFJ/7+Ys2+6+EiFCgl2K6Nc9FlZB3zlSqeb1WpIsvdfwKUf274y7dF6SGprn5m9Lt3msVfCYv1pbNGuMP9g4NwDxg4yfCd4re8qqXaWTDb5BhyAwt79tyzwc3Kt91Ng5kiNPZ4XmZJGns1v0q3Exk5NjXIpNZClIlneiHJhAMHLWsz1e+vsfcKXFVn49usidE/ReipFIxBBgNW4ZVffzSNZZOjBo1xx7yH448dD94+6b0Ss2drxZXiPp8zDfk2NttlCzekZ2nsmJcPygeBvW5+44CQ/deLRBY2eHH693W6cDxeQRbDptZYMdkF7BgCwSAxCrk/6jz2kVomPbZjGNnYc6OI07siHAzbBgDkScrpBgFwJBLcUathFzkaVlsWonMAdZlKV32sfQzOO3Ho+zJN5jiUyIijxChW+0ieWmCw7Ba/eciq+fOg9d2jePDr6GicOi2vrBTLMB8rIVkRsqKr15mOppFOAIrijxk2lubg6ev/NkHKHavTg9XfP1eapRePcu+gCz8df56Trma7zk4aTwct2X1Xw6t2sWJ8g8pHo6mrVr/Xu1QycHuzsAOKhHO8sPi7uvOQK9urbGq/84xXDcSuADgOMGdsWlp/aNOXPEhQeKv+bKMw7CoX074u6rj1CXvo3njSuxsR9XmGLoaTz11xMMv+3scq0eUXebNnrJxxL1PuREhI3k8YP2j773ToJFm5Z5+N+zF3iysTZvM2YW1vTvj7l8q3vhdcXITRgR2cd/BLoxISKCYv/9qsOjbXj/QXmEhr4HtNXdU+ub+sD/GUNt1TnssCL7mHDySAaAwcf3wu2qQ1UqwjH5hQS7EAhqhbONzt1c/xVk9orU+kOOhUpbC/kwoLdxSUSq2ZD0rYF9OuCMo+XBHa3QjLplAp75i9gcly0S8Fur1aFZXm7UsxSQPyfN/qixafDSBsjXh56KZ28XwUEv/EMfTx5gg4/vBUDsFnDpqRKDYIfZwu50x7ZN8dCN9pu/a+zvYlskrSwr7zKvSzxDTu6N+284yhDkOqiltJyI/9Ap+usO72ftBelIREw+z90h7MNOPfIAHHFQJ1x6al90Vp1UzLZt+f27RDWYXiLb6+vcuFEu3rj3NPQ5wHi9zBGne5eWuOy0vmiUm4ObhhyC6889GGcc3R0XmTyKZRNgx7bN8PRfT4yGFiq3+QjR97E8i8m0VfMmeEUnjPoZM9u2yjPssHPducFsq6V5abZtmYerzjoIL98dq6dmL+hG25zXOBeKzZeDlXOCdoXZrEO/y4hM6JCZY1hp1/QxTD9THYT269DclbDc0mI7tPtvOBoXnNQbB+7fBrdePAhnHhP/wS/DzVhi9haukpkT6Ni/Uyw8lNZdrDyO9eTkRHDu8b3w2K3H4am/nOCYPizSV+TMYpzsdtyiKMDVZzGUV9YYjj/4x2MwZOjIuPTN1I5vnkBOPrwbTj68G0ZMW+1Y5p8vGYSnP/4de8vFsqDZ7d0tfQ5oC76+2BBC5Lk7TkIL07Y9r9x9Ctq1zsONT/wEQNgnnXjo/nH5aYx4fgguvn+04ZjT5KuNG27skbTlHfPkpmlMzHv86uPLmXdoMPPnSwbhijMOQsvmTXDyEd3w/OdzDeed3ptjB+6Hwq174pZ5br7wEJx/Ym80bpSL9q2bYveeyrhr+x7QBqs3lQIQGhuNa85m+OonHpdem7isbm1UuLA0bDfSuFFunFbmxEO7YvGanYZjfmz3zM/fS/fTP897rhPLzDW19aisqsVvS7cZ3oPjB3XFrMVbo7+bNMpBta6vnXRYrH2yfWpr1I+dS07ta3Cw+Pqp89BEsnuHFW7aJxPO3r7/DMPvFs0aG+KIaWzbVe6Y/wmHdsX7Ixajc9vG2FFSA0VRMPzp8zB/ZRGOH9gVn49bEXeNef7ue0Bb5PfvjIIVO+LSfjrsHPzx8QkAgEN6u7PR269DMFvtXXdufxzatyMOObBDXODd0/IPAF+/Gycd1g3rVpWgSeNcg83iFWf0M3jj2g05Hzx0Jq5+ZGzcce1DwKzN1QvvjRvlYOQLF+Ki+0ZFj33++OC4/Mwa57OO6YH8g7sYvJzbtW6KP188KOr57kT/nu3xt8sPQ5tIEf797RYA4tl27dgCt13ifynT7r3O798Z3Tq1xOaifQBEyJ03v11oW0cz2kfGgd3aYO3mUtu65PePDxF24P5tsHZLKbp1Su2WjjJIYxcC7VoJuwLN/duMl6Xa687tj1svHhQdFO1svbR4RVZBWbUtJo8ZsF80eLGmPTvq4C74/PFzcciBHQyBIv0IdUBMXd9VN9gO6N0hzjuyb/e26NCmGW66YADOOa4nLj+9n+VeqAN6t5faksnCSHymC1Oh3e9mLkJBaLEATzzMqFU8V9W2mTm0b0zL06dbG2kajUa5OYYQI6NfusjTnr7Xnt0fnz9+Ltq1bmrYTP3iU/pGJxOzkKPFaTvCIj6clW2TNshavaqa8NhetaH5dNg5+ORRXVw8F8LH+ScdiM8fP9dwTF/e3686wjkTyMOduKVbp5bRD4m8xrlo16opOrdrjh77tY5uAP72/afjnmuPjAvo7dWuUQsM3LpFE7RpGRPOWzRr7ClgctSWyaHJ157TH6fl29uSRlHzcuNlCwAd2jTD6JcuwmmDRH9WFBE+5sRD95d+aN115eF46774fXG1pTrzvdRv/u7FrMPOVk+2T6yMRrk5lv3lgM6t8NRfTozWz7yd3v+dN8CgadPGk1YSLVeLZo2lQeLPO6E3ht1yXFyAajPm+yLzaNXmi/Zq5ISuHVtIP5yHnHxgXIgju3IHH99LupWgH2IrTdbvXiQSMezg1LaVWOo2k5sTwUV/kMe01PaHlsUgffimY3C/zc5KAPDEbcfjyduOx7v/tA/8nwpIsEshD1/VDV89ORi992+N68/tj3/qgosCwH3X56Nj22bRmF53Xnk4/mWznYl56RSwF+xyc3Iw/OnzLEM9aB2nW+eW0a/bv191BG67ZBAevfnY6GQThG3BXy87FENOPhC9HYQdjUtP62cbVPLjf50d3e3BTYTydq2b4tC+HTH0unxce05/XH56v7i9bo8fJAY4fXu7d2mFb545H6cfZRSOD3TRjjtNTiSP3XpcNHq9m51HmMW+pgP7iPcgJycSfUZ/vUy+BHymLoRMs7xcnH1MD/xpyCG4+iyG14eeahS+ABzQpRW6d2mJq3XG+U0a50S1glbhD/Iai3umCbPt1f0vtWUst4o3vYCjp1O7ZoYPFP1+p41ycww7HzRpnIuObWOCQFMHuybz7gz3XHsk3nngdMsYcN27tMJp+d3jtDf6pa+uLrRFJ6gT6jED5AHD3ZITEfa3D5rGFzPXnM1wz7XuAmVfqgZc7tOtDf6qBqN20880ecy85PfYrccZTAPOOrandJlP26VDFuvODw/deIwhRp32AXTHFYdFtw4MEm3pHRABr81oO0nceaX8I0Ubq1vq+pkWN9DNx8pNFxyCoyXvk7abiGZ28e/bT0LblnmG+I5B8PwdJxs+Ms1o5bXSCdUnH94Npx55QOzjOxruxL4sTbB77FaxjCxb5h/xwoVxZghHMCHInXdib5xxdHdcdno/dO9iNEc5bmBXnOwgSLdpmecqsH5KUBSlQf83d+7cXnPnzlUqKyuVZDN37lxf123cvkeZt2K7UlZRrfy2ZKuyt7xaWbu5RKmvr4+mqa6pVW56coLy/ZRViqIoSsneSqWqulZRFEV545v5ygX3jFDWbSm1Laesolp5+csCZW9ZlWOdtu0qU2pr66TnVm0sVh59b6ZywT0josf8tt0vF9wzQrnv9enK+z8s8p1HbW2dsntPhW2aucu3KRu27bFNc9u/fzbcCz319fWO1/++dKvyn5GLFUVRlMWri5QL7hkR/a++vt7wHuhZs6lEWbu5xHCstq5e2bRjr3LL0z8pMxZuti3XjL7M6ppaZeJvhZZl19fXKzMXbVFqa+sMz762rl559/uFytad+1yXO3nuhmjZS9fuVPaWVysVVTWKoijKzEWblS/GL1dqa+uiaTRWFO5Svpu8UlEURamrq1cmzC5UyitrpGUMfXWa8ta3C5QL7hmhfDlhheu6mVm3pVTZXLRXURRF+e/YZXF1amjMnP278spXBcrOknJf1+/YXa68/8Mi6VizZM1OZcmanY55LFmzU7nt3z8re8ur485VVNYoo39ZY/keJ4L23tfV1Su1df7znzJ3g+U49Mg7M5QL7hmhlOwV89cF94xQbn16omVeKzfsVkr3ifF99uIt0X4k4+ff1ysvf1ngu95uxvy6unqlQu2T2jxlfk7T5m2UHnfDlqJ9yoVD5X2wYMV227HXbmx1ItnzXWVlpTJ37lxl7ty5vRSJXBNRQgn2kz4UFBT0ArBu4MCByMvzv/G6y7KQnx/sVlJuqK6pw9K1uyyXD1JBWG1PB8orazDr93k44w/Om0m7oaKqFne/PBWXntYX5xzXK5A83TBq+hoM7NPRlXZSTxDPvrKqFjuKy9FDEshY45f5m3FQz3aul4xkbC7ah64dWgQSxkRRFMz+fS76DxgUNb9oaDTkfh9G20v3VaFJ49y08NhMl2dfXVOH+nrFsHNQskl226uqqrBkyRIA6J2fn19oPh/+0yeSTpPGuaEKdQ2d5k0bo22L4Lpas7xGeM8iPEAyudDCNiUVNM1rZCvUAXBcKnFDNxcewW6JRCJo0iinwQp1ROqxMl1oyLgJY5JtkI0dQRAEQRBElkCCHUEQBEEQRJZAgh1BEARBEESWQIIdQRAEQRBElkCCHUEQBEEQRJZAgh1BEARBEESWQIIdQRAEQRBElkCCHUEQBEEQRJZAgh1BEARBEESWQDtPALkAUF1dnZLCqqqqUlJOOtKQ2w407PZT2xsuDbn9DbntQMNufzLbrpNXpNtq0F6xBQUnAfgl7HoQBEEQBEF44OT8/PxfzQdJYwfMAXAygK0A6kKuC0EQBEEQhB25ALpCyC9xNHiNHUEQBEEQRLZAzhMEQRAEQRBZAgl2BEEQBEEQWQIJdgRBEARBEFkCCXYEQRAEQRBZAgl2BEEQBEEQWQIJdgRBEARBEFkCCXYEQRAEQRBZAgl2BEEQBEEQWQLtPJFkGGP9ALwB4CQAFQC+BvAA57w81Iq5hDF2BYDrAOQDaA9gDYB3ALzHOa9X03wC4I+Sy6/gnP/PlN+9AG4HsB+ApRD3YpIpTSsALwC4HEBTAFMA3Mk5LwysYS5gjN0I4GPJqbc453fo0g0G8DSAAQA2A3iVc/6GJL+Mabtal6kATrE4/SDn/FnG2GMAhknO38c5f9GU3/8BeAhAL4j36AnO+XBTmsYAnoB4n9pCRFb/O+d8gd92uIEx1hfAvQCOAzAQwArO+UBJupQ/62SPIU5tZ4zlAhgK4HyIdjcCsBjA45I2FQLoKSmmE+d8py5dWrRdLcPx2Yc1xoX97NU0drsYHM85n62mmwr5eHE053yuLj9XfZwxth+A1wCcC0AB8COAu/XvUSK4mdvUdBnX50ljl0QYY20hHlwriAc5FMA1AD4KsVpeGQqgCsB9AC4AMALA6wCeM6VbC+B403+T9QnUF/8ZAG9BTBKrAIxhjB1myusrABcCuBPAVQD2BzCJMdY8qEZ55FwY2xUVWBhjxwMYBWA+gMEQguCrjLG/6DPI0Lb/DfHP9G313FhdugpJui/0GTHGLgfwKYAfIO7TzwC+UgdNPa9ADI7DAFwEoBqi/fsH1io5h0A8l9UAlskShPGsUzSGOLW9GYRAvgDATQCuhpjgJjLGLpCk/x/i34cSU5p0aTvg4tmrpHSMS5NnD8S3+XgAswHsADDXlHaGJO1yUxrHPs4YawRgPIBBAP4PwC0ATgAwijEW8dFOGY5zW6b2edLYJZfbALQDcLj2lcEYqwXwBWPsSc750lBr544hnPMi3e8pjLGWAO5gjD3COa9Sj1doX24yGGN5AB6B+Np5UT02DeLL/2EAV6rHjoXoGOdzzseqxxZDfE3diJhgkUoKbL4SHwUwj3N+s/p7CmOsB4BhjLH3Oef1mdp2znncQM8Yex3AYs75It3hertnr/IkgG855w+qv6cwxg4G8DiAcWre3QD8BcBdnPMP1GOzAawDcDeA+xNojhOjOecj1TI/AXCUJE0YzzoVY4hT2ysA9OacF2sHGGM/ATgIYtL50ZR+u8NYkE5tB9w9eyD1Y1w6PHuY26wKHkcAeJ9zXmtKXuJwj9z28csAHAZgoNZOxtgWCMFxMIwfln5xM7dlZJ8njV1yOQ/AJJNQ8B3EV4JZU5GWmF58jfkQauT2HrI6AUAbCJWylncdgG8ADNZ9hZ0HoBTia01LtwGiQ5/nqfJJRu3QpwMYbjr1JYQ6/kj1d1a0XV0eOBrA5x6v6w2gP3TtV/kSwNGMsU7q77MhNreO3k/O+V4IwSGp7dcvvcgI8VknfQxxajvnvE4v1KnHFAgNnh9Natq0XS3btv0eyLpnb8EVAPLgcRxQcdvHz4P4gFyqSzcTwHoENBY4zW2Z3OdJsEsuB8Ok3la/AtZATHSZyskAdkOo4jX6MMZKGGM1jLH5jLGrTNccrP41q+WXAmgJoJsu3QrJgLMU4d2zJYyxOsbYOsbYMHWZAAD6AGiC+CUMbTDS6pvJbddzPYB6iIFNTzPG2A7GWC1jbAVj7HbTea39VveJ6dJt55zvkqQ7iDEW5ngV1rNOyzFEfRYnIL6dAHAdY6ySMVbGGJvAGDvSdD5T257qMS7d2q9xPYCVnPPfJedOYYztU5//r4yxM0zn3fbxuLbr0iWz7fq5LWP7PAl2yaUd4m1LAKAY3rRdaQNj7CgIO5tX1K8SQHzl3AvgYgibgE0AvmbC+UCjHYAqznmFKUtNE9Bel65EUnQY92wrhB3IjRB2dj8A+BeA/6jn26l/S0zXydqUaW2XcR2AaZzzTbpjqwE8AGEDciGAWQDeZMKpQsPLfTKn0dI1hhgkwyKsZ52u78SdEAL5S6bjowDcAeAsiCWlAwD8whgboEuTiW0PY4xLp/YDANRlyJNhsqFVmQaxnHoegBsARAD8xBg7XZfGbR9Pedslc1vG9nmysSNcw4SX0ncAfofOwJRz/pop6UjG2GQI+6lPUlbBgOGcTwAwQXdoImOsFMBjjLEnQ6pWKDDGjoP4gn1Gf5xzbl6OGcsYA4AHGGMvcM7LUlRFIkUwxk4B8DyAFznnv+jPcc7v0v38hTE2DsAKAP+EMILPSLJ1jPPBtRACW9wyLOfc4B3PGBsFYCGAx2ByMkk3rOa2TIU0dsmlGMKd20w7CHVvxsAYawNh5F4O4ELOeY3DJd8C6KGznyoGkMcYa2pKp30V7dalayvJL13u2Tfq3yMR+yJra0oja1Omt/16AJUQHo9OfANhp6JpabzcJ3MaLV0NgH3uqpoUwnrWafVOMMYOBTASwoPwAaf06pLbZIiQEhoZ2XYJyR7j0rH91wGYxTlf65RQXT4cCffPXt/HU9Z2m7ktY/s8CXbJZTli6+8AokbYfSC+YjMC9YUdBaAzgHMl9hFu0OwPDjYdHwBgL0T4BC0dY/Eu7QOQfvdsDYSrvqxNQKy+Gd121abwKggPuj0+srBrPwBwXbrOjDHzcsMACJueoIzc/RDWs06bMYQx1gdCgz0PwA2qA4UfMq7tLsnaZ6+WfThErDs/ThMabvt4XNt16QJru8PclrF9ngS75DIWwBmMsQ66Y5dAeBQF4a6ddNRJ/RsAhwIYzDlf7+KaCISL93qd59FMCI+gq3TpctV043WTxFiIr5VzdOm6QwRqTId7djVEsMwC9Yt0MlR3dh3XANgGMQECmd/2cwB0hPsB/WqIEBlLAYBzvg5iQDIbm18DYI7uHfkJwjkjej/V8ANDEPKzD/FZp8UYoi5V/QTR1os559Uur+sI4AyIILQaGdV2GSka49Kt/ddBaNXMXqJSVGHkYhifvds+PhbAICZCImnpjoMIbh5I253mtkzu8xFF8fvRRTjBRLyfJQAKIeJ4dQbwMoQr89Xh1cw9jLH3APwZIr7QL6bTyyDUw59CBF5cDfHi3gLhbHCD3gaLxYI4PgjRKW6BiFd0LOd8oS7djxBxkoYC2AMRpbwdgEE8hTt2MMYmQHTsJRCD0WCIoL0fc85vVdMcD2A6hJ3NFwBOVOt7O+f8XV1eGdV2PYyxryCM4bual+AZYwUQz59DeJBdBTEBPMI5f1qX7gqICeHfACZCBCb9O0RMp3G6dG9CGF4PhQhtcC9EbK1BnPMtSWxjc8RCDtwO8XV8j/p7Dud8fRjPOhVjiFPbITwEZ6nHrwewXX89j+08cA1EoNdxEFqKXhDLtd0gdh+IahvSpe1qOU7tB0IY49Lh2WvCjuqtugEiptuFknxOhgj0+4Na3/0g+nc+gLM451N1aR37uCp0zYVwqHgQwh/gBYh378QEtMX6OtvObZzzPZna58l5IolwzktUj6DXAXyP2NYgyQy0GjTa18XzknOnAVgE8bXyCMQLWAPxYl/IOR+tT8w5f1E1rL8LQBcIjc75+hdf5RqI3R3ehvhKmQKxdU+qBZvlAP4E4dnXCCKa+AMAXtUScM5nMcYugujU/wdgC4B/6Du9mi7T2g4g+jV9IYBPLewqV0N4wnVVfy8F8CfOuWErNs75t+ok8hDEQL4GwLV6oU7lHxB2Nk9BxIaaA+DMZAp1Kp0hbKb0aL9vAvBJGM86RWOIU9unQgSLBYRtnRltaWkdRFy7lyEmq1IIT8nL9UKdSrq0HXBu/yiEMMalybP/RP33qRAC+j2QsxXiw+4ZAB0g7NVmAziVcz7DlNaxj3POaxlj50JsKfY5YluK/T0IoU7FaW6bmql9njR2BEEQBEEQWQLZ2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCSTYEQRBEARBZAkk2BEEQRAEQWQJJNgRBEEQBEFkCf8Pn/9H/koRoC8AAAAASUVORK5CYII=\n",
-      "text/plain": [
-       "<Figure size 720x360 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "df1 = df[df[\"name\"].isin([\"Latency\"])]\n",
-    "ax = df1['issue_to_done'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
-    "ax.set_title('Inference time (usec)');\n",
-    "#ax.set(xlim=(0, 25000))\n",
-    "plt.xticks(rotation=60)\n",
-    "plt.show()\n",
-    "\n",
-    "ax = df1['issue_to_done'].plot(figsize=figsize)\n",
-    "ax.set_title('Individual inference time (usec)');\n",
-    "#ax.set(ylim=(0, 200))\n",
-    "plt.show()\n",
-    "\n",
-    "\n",
-    "# df1['issue_to_done'].describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAApwAAAFKCAYAAACwxI8KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAA8y0lEQVR4nO3deZgkRZn48e8A0oBcw+24ooL4Ao6iDKyKuIsiIqDIKoegIigs7iIeC4oiIiIiyqh4gNeq4G9REF1PEA9AVJDDwUWG4wURmMVZEbkVbJCZ3x+RNZMkNdPdNZVd3T3fz/P0U12ZUZmRkUe9FRkROW3hwoVIkiRJbVlh0BmQJEnS1GbAKUmSpFYZcEqSJKlVBpySJElqlQGnJEmSWmXAKUmSpFYZcGpSi4hjI2JCje0VEQsj4nODzofUi4g4LSJuGXQ+JqqIeEp1jh8w6Lw0RcS0iLgqIj446LyMRkScHRHfGHQ+ND5WGnQGpKYxBJAHtpqRCSIiVgPeBfwsM3824Oy0KiLWA44EXglsDDwAXAF8MjPPHWTexiIitgSOAZ4LPAG4C7gRuDAzjx1g1iaMiHg58GbgH4G1gbuBy4CvZOa3B5i11kTEDOBfge9k5v+0sIp9gacBn2xh2W34MPDriNgqM68adGbULgNOTUSvb7z/V+B5wBsb0y8B/gs4cTwyNUCrAe+v/v/ZAPPRqogI4HxgPeArwBxgOrAfcE5EfCQz3z3ALI5KRDwfuBCYD5wG/AGYAcwC3g0cO6i8TQQRMQ34PHAw8Fvg05QyWh/YFfjviHhtZn5tcLlszQzKuXwL8D8tLP+dwDcz888tLLvvMvPKiPg1cASPve5rijHg1ISTmf9Vfx8RLwH+sTm95u/t50ptiojHAd+kBJj/lJmX1+Z9HDgDODIi5mTm2eOct8dn5l/H8JGjgb8C22bmnY1lbdjXzE1O76AEm58B3paZC2rzPhIRLwMeN5CcTWIR8Rzg2ZTjbzI5CzguIg7NzPsGnRm1x4BTk1pEHAu8PzOn1abdAlxPqfmcDTwDuAl4a2ZeEBF7AMcBTweuBQ7OzDmN5T4dOB7YEXg8cB3wocz85hjytg+lNmMTIIEjM/O8Rpq1qjR7AhsBt1Fq907IzEci4inAzVXy90dEp6bzdOBjlBqiPTPzW9Xyotr232XmZrX1/D9KIPfk2rRtgQ8ALwBWptQovi8zL2zk8QnAB4GXUwLC3wOfyszP1tLsQKnV2w94KvDvlJrKi4FDMvN3IxTXq4GZwDH1YBOgKodDgJ2r/J7dWOeL6k0NamV2YGaeVps+4j6t2uV9pUqzB7APsEH12RuAwzPz443yeRZwFfDvVZlsClzbDDarbbm98dndKcHXcyg1fH8EvkHZD3+rpTsNeA3lmD0V2AG4DzgxMz8VEc8APkW5E3An8N7M/H9dtuvFlLJ+DTAE/BA4rJmvbiJiP0qwOBP4G/BT4F2ZeXMtzdOAE4AXAutUefkV8JbM/L+IWBU4inI+vKMRbHbKqHmOrFctc3fKrfffUZpYfLGW5imUff6eqlyOoJxPlwBvAuZV8/6Nclz+BHhjvSawdt34KHAS5boxDzg+M786ivJZ6nlSO14BvhIRX6n+/0CnmcUyXnf2AB4BLmjk61ga18hq+gGUY+KpmXlLNW3rav3bAmsAtwM/B/41Mx+s0kwD3kK587QZpby/T7m+/bmxjp0o5b4NMI1yDn02M/+zluwnlOv0zlTntqYmOw1pqtoE+DpwDuU25trA96ovzU8BX6O0sdsEODsiVux8MCK2oLQleybly+dwyhfn2RHxulGu/wXAZynBw3uBVYDvR8T2tfWsSvkCOoDSNOAtlC+LYym3HAHuoHxJAnybctvp9dX8uZR2b/9UW+8/AQuAp1VfgB0vpHxxdNb9z8AvKEHBcZR2k0PAj6svxk66DYBLgZdRAp23Ves9NSK61aS8C3gV5Qvkw5QA6IwlltJir6heu36xZ+a9wHeBLSJi01Es71F62KefBrYGPkQJgm+kBE7d0r4OeIhSUwPldulzImKrUWTtQGCYcky+lbL/30G5Fd+0AnAu5fbzOykBzScj4kDgR8CVlP14H3BaFfw1fZJya/844AuUIOXHEbHy0jIZEe+mHKM3U8puNrA9cHFErF+leVyVj+2BUyg/Ok4FNqTcSoZyXqwLfC0zR7wzERGrUM6RA4Ezq+2+HfhClaem1wCHUWpPP0Y57s+m/FB5BWXff54SFH68y+c3Ab5FadpxJHAPcHr143Fp+RzNeXId5ZoDpew75/J/V8tY1uvOdpQfOg+OIm23bVifEvxtWq3/LZTjcCYl+O34LKXsLqu28wuUH8wXVvurs7zXU46HDavlvQu4HNitseprgQcpx4amMGs4NVVtRqnR+wVARFxHufh9GdiiUysTEfdQvoBeRKmxgfKlPB/YpnbxPiUifgycGBFnZOZIHZtmAttl5q+q9ZxG6TRyIuULGUpgsTmwdWZeX037QkTcDBwfESdlZkbENykX+d92aW5wMY8OOF9IqbXaoZp+VkQ8CXgyJXiqt6H7JbBTZ1uqnvW/odQmbVct73hKIPrMzLyjmva5iPgicFREfCYz76mtfxVgq8x8qFrm3ZSgaGZmzl1KeW0J3JuZty4lzVW1tDctJV03Y92nfwF2aARFXwU+GxFbZua1ABGxAqWjxjmZeVeV7qPATkCnfdovKEHT+fVay8prM/OB2vvPR8SNlP3/zsz839q8xwFnZeYHq3V/vdqmLwGvz8wzquk/odTUHUD326s7ZOZwlfaa6vP7A//ZJS0RsTGl5u7YzDyuNv1M4BrKcXwUZb9sAuzVqJE7vvb/ltXrb7utq4t/pZxLB2Tm6dV6T6Wcy8dGxBcbNcn/ADytc0xWPyTfQ2kH/ZzMfLiavgHwmog4pBGgbQbsl5lfr9J9gXJOnBQRZ3erka1t40jnye0R8UNKsP+rLk2ElvW6sznlLkWvtqP8AN05M39dm965q0JEbAccAryhXusbEedRjvP9KdewNSlB/5XAC+tlXF1/FsnMv0fE/7L42NAUZQ2npqobOsFm5bLq9Wf1W4C16ZsARMQ6wEsoNZOPj4j1On/AecATKbc1R/LrTrAJUH0pfg14QURMrybvTQn6/txYTyfw3WEU6/kF8Kzq1jyUIPMCSm1LJxB9YS0twFZAVPlZt7beNSk1HM+NiNWqL4Y9KbXECxt5/DGwKqUXdt1XO8FmY52bjLAdawD3j5CmM3+NEdI9So/79ItdauDOotRG1js37EAJchbdvs7MCyhl/gNKsHR49f/tVW0ktbQPVHlcISLWqvL0S8rtx627bM5/1j57D+XW9N8otfmd6UmpmetW5p/vBJuVr1ZpX94lbcerKJUTZzXK7l7gasqPNSg1qwA7R8TjuywHyjEGI+/rjt0otfyLgrPMfAQ4mRLgvaSR/luNH0Cd8/u/OsFmbfrjgCc1Pv8nFtdUUwVK/1mle1a3DPZ4njSX0Y/rzrqUOx69urd6fXlVW93N3pQfY+c18ng9pea5cyy8lLKvT2zWuC4haL6b0tRBU5gBp6aqefU31S1ZgP9tpOtM7wSBT6N82R9L+aKr/32sSrPBKNZ/Y5dpN1SvnXaUT6fUhDXXc+kY1vMLynm8fa0m8+fVXz3g/FOtFrXzxfWlLut+W7W8dSltCqdTRgdopuuMndfM47zG+84X4HSW7n5GDiQ78/80QrqmXvbpY2pQM/Nu4HvAfrVamtdRhjw6p5H2ksx8JaUpx7MpNY0LgS9HxIs76SJiZkScS/kSv6fK00XV7LV4tIcz8/8a0+4F/tCl5u1eupf5o47LKqi+GXhKl7QdnePleh5bfttQlV31Q+7jwEGUH1E/jYi3RcS6tWV1gtLR/mh4MqU98iON6ddVr818N4+/0Z73HTd1KcvOedtcV0cv50lTv64700ZOskQXUTruvR+4MyK+HxEHN348PB1YnRJcNvO5YS2PnWYvS7ur0cz3hBpPWf3nLXVNVc0vqJGmdy7UnR9hn6C0l+tmtBfRkaxAqY388BLm/34Uy/g1pf3TP1GCm/sptwDXoNxyXIcScP6ysV4obVuXdAvujmp5UGrPvryEdNc03o9UvktyLfDsiNg4M5tBQ0enhqlTLkv6glqx8b6XfbqkdnBfBfYC/ikiLqN0wPlao1Z3kapW7Srgqoj4FaVt4OuAC6pa6QspPdrfS+kM8yClNus0HlshsKTbub2W+Wh18rEL3UeEWFRWmXl4RHyZ0sHnpZRg6eiI+OeqGUInUHwm8J0+5a+u1/N+WXTKZyznyZKWsSzXnT/T/UfGqM6TquZxr4j4R0qN906U9pnviYjnZeafqnzeSWkr202vNazTWdw5UlOUAaf0aJ1g5u+Z+dOlply6zbpM69QUddop3gSsMYr1LPGXf2Y+HBGd2+drAZdUvbovpQQHr6S0jfpi7WOd2rv7l7buiLiDEsCutIxlMRrfp/Rw359Ht/nr5GVNyrZcmZmdfdT5clu7kfzJjff92qdQbm/+iXJbfUPKbcP/t9RPLNbpfd/pQPMiym3EPTOzU6vZ6dnbls0ot3k761qJMqrARUv8xOLjZV6n7erSZOY1lADrw1F68M9h8VBIv6Tst/0i4oQuNZdNt1I6YK3YSLt59XrLSPkZo00jYoVGLWfnvF3SusZynizpXO7HMXodZV823Q0QEWs3mhs0zxMAsowScTlwTETsQgmAD6a0Ab+JEohempl/WUpeOsfMTErN+BJVx+CTWHKgrSnCW+pSTfUr/kLg4Ih4YnN+1ZNzNLaJMgB453PrUgKqS6pbs1Daim0bEbt2Wc8aETFUve10KlnSbelfUHoe70TVE71qN/VrSk/badR6qFMCgN8B/xERj7m12dnG6gv+m8Ae0aXH9RjKYjS+RQlS3h0R2zTWsyKl09R0qo5PlVspNVf1TlNQekcv0sd92rkFfQalzd6bKLd7L2ks78VVZ6Kmzn7ufAF3Aqj6kF4rAP8x2vz04JDacQUlwF+bRpOAhm9R8npMs8MHLBq2iIhYswoe6q6j1ICuDYuOyw9TAsaPLWF5L43yFCIobV/Xp5w7nfkrUJp+DLO4vXO/bEAZCquzrlUpTQRuYwkdncZ4nnTGc33UudynY/RiYMsqz3Wd4G/ReVLdJn9DYx3Tu+yPK6vXtavXsyhxwzGNdETEirX26T+mNJ94dzM/XdaxJaWz4SVoSrOGU3qsf6NcvH9b9TK9ifJF9FzKxbHbcDNNc4EfRMSnKbUf/0q5zf2eWpqTKEO1fDciTqcEgqtSagX2otx2vCUzH4zSm/g1EXED5ZbWzZnZ6RDxCxYP8VQPLH/O4mFyFj02LjMXRMSbKLV111a3QG+j1Lz9MyUA6jT+fzelY8yvqrK4hvJl+WzgXyhfFMusqql9NaWJwS+rPNWfNPQcyniI/137zL0RcTZwWJTHod5EuRXYra1bP/Zpx1cpNXYvpftTgz4FrB4R36YEXCtQOgC9nrLvTq7SXVy9P706Th6mBLKrjyEvvbgwSg/3p1CGEJpLGde1q8z8fZQhiE4CnhwR36G0N30qpdb5LEo5vJjSq/qblM5M0yjB2xrUOuJQhlTanBI07lDtw/mUwHLnajmdAPOLlHPnS1EGNv89ZSinHYH3ZJexTpfRjZShjJ5DOSdeR+lg99ql9FCH0Z8nN1FqHP8tIv5CuTbMrUZwWNZj9LuU4Z9ezKN/QPyY0rb1SxFxEuXHQ6e96ca1dG8ADq2O25so16IDq/TfBMjMn0fEKcA7q9rrH1EC/6dRjt1jgNMy876IeBulicGvI+JrlGP9GZQmI6+qrXcnyo+SH42wfZrkrOGUGqpevttQOojsz+IxBVcC3jfKxVxcfWYfyjBDw8AembkoIKxqe3YAPkKpfTiZMrzMFpRhaP5YW96bKLf0PkZpK/ZvtXm/otw+/xuLb9vC4h7iFze/LKt8PI/SQenfKUOYvJHSAeYjtXR/onzh/Sfli/4zlGBrI0rv676pyn2rah07UcYzPIkSbL4hM7uV/WGUL9o3U27Fz6NRc1Nb9rLu086y/ofFtV3dnn51BKW3/86U4OqT1facQXn60C3Vcu6i9ML+X0qgcBSl1/f+Y8nPGL2N0sb3/ZThbb4HvHRJbVA7MnM2Zf8/ROkA9XFKIPUzFg/WfRVlSK5dKdv9QUrQuUd9mKTMXJiZb6IEq/OBt1PaCh5O6Ty1e2dYomoYqRdRAuLXUo7/J1AGIm/jkba/p7TL3ZEyvNV0ygMElvqYzdGeJ1Wb3tdTztVTKOfyntW8ZTpGszyL/EpKT/L69Icp++omyj55a5XPzzQWcRHl+rE35Zg9inINenHtxy2Z+RbK9Wgdyh2HEyk/vr5BbdD5LA9deDnlmnIUpTyfT2k+U7c38O1ax05NUdMWLrRjmKSJKSKeSQmcb6WMqzohvpQi4grgocycFINVx+Knyjw/My8dIflyKaonDWXmywadl15FxL6UWuEnt1D723dRnmz0a2BWZv5m0PlRu6zhlDRhZebVlJqwAL4dIzwRZzxExLMpNVFLvA0tDciZlJrMtw84H6P1HuCbBpvLB9twSprQqh7cfWkruiwiYialc9Y7KD3Vu91OlwamGtpoNI9UnRAyc69B50HjxxpOSRqdPSm3pVcFXpOPfiSlJGkpbMMpSZKkVnlLvQdz5swZArYF/o8lP8FCkiRpIliRMsLDFbNmzRoeRAYMOHuzLYuHnJEkSZoMmo86HjcGnL35P4CnP/3prLxye51m586dy8yZM1tb/mRhORSWQ2E5FJZDYTkUlsNilkVRL4eHHnqIG264Aar4ZRAMOHvzCMDKK6/M0NDQSGmXSdvLnywsh8JyKCyHwnIoLIfCcljMsii6lMPAmgHaS12SJEmtMuCUJElSqww4JUmS1CoDTkmSJLXKgFOSJEmtMuCUJElSqww4JUmS1CoDTkmSJLXKgFOSJEmt8klDE9jqa63H7Xc9MOhs9MVqQyuxxuPbewyoJEmauAw4J7BHFk7j/CvmDTobfbHjthsbcEqStJzylrokSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWrVSoNacUTsBbwWmAWsA9wEfBb4fGYuqNKcBryhy8f3ysxvNpZ3BHAosBFwDXBkZp7fSLMGcBKwJ7AKcCFwWGbe0rcNkyRJ0qMMsobzcGAYeCfwcuA7wKeAjzTS/R54fuPvgnqCKtg8ATgF2A24ETgnIrZqLOvrwO7AYcA+wAzg/IhYrV8bJUmSpEcbWA0n8IrMvKP2/sKIWB14S0QcnZnD1fQHM/PSJS0kIoaAo4GTM3N2Ne0i4GrgvcDe1bTnUoLR3TLz3Gra1ZSa1QOAU/u5cZIkSSoGVsPZCDY7fkO51b3OGBa1HbAWcGZt2Y8A3wB2iYhp1eRdgXuB82rp5gEXV/MkSZLUgkHWcHbzQuAu4E+1aZtGxD3A44G5wImZeVZt/hbV63WNZV0DrA48EbitSnd9p31oI93Ofcm9JEmSHmPCBJwRsQ1wIPCBqoYSSo3nFZSgcC3gIODMiFg1M0+r0kwHhjPzwcYi765e16EEnNOBe7qs+m7GVqO6yNy5c3v52Kituub6zJ8/v9V1jJc771yN227uVqk9OnPmzOljbiYvy6GwHArLobAcCsthMcuimEjlMCECzojYCPgWcDm1TkOZ+clG0u9GxAXAB4DTxi2DSzBz5kyGhoZaW/61N85jxowZrS1/PK277npsuNnGPX12zpw5zJo1q885mnwsh8JyKCyHwnIoLIfFLIuiXg7Dw8OtV5KNZODjcEbEWsAPgQeA3TPz4RE+cjawcUSsX72/GxiKiFUa6aZXr3fV0q3dZXnTa2kkSZLUZwMNOKsg8XvABsDLMvPOHhbTabu5RWP6lsD9wB9q6aLWiaie7voe1itJkqRRGFjAGRErUXqSPwvYJTNvHcVnplGGObq11sv9Ekrv831q6Vas0p2XmQuryedSajh3rqV7ErB9NU+SJEktGGQbzlOAVwDvAlaLiOfV5l1LudV9OmWw9t9RgsWDgB2A13cSZuZwRBwPnBARdwBXVuk2BfarpbssIs4BvhQRhwP3AccB85gA7UElSZKmqkEGnJ2axo92mfci4LeUmsujKbfcH6YEk7tn5vfriTNzdkQAvBXYkNKrfbfMvKqx3H2B2ZRB3ocoj7bcKzMf6McGSZIk6bEGFnBm5lNGkeyVY1jebEowubQ09wOHVH+SJEkaBwPvpS5JkqSpzYBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrTLglCRJUqsMOCVJktQqA05JkiS1yoBTkiRJrVppUCuOiL2A1wKzgHWAm4DPAp/PzAW1dLsAHwK2BP4AnJyZn+6yvCOAQ4GNgGuAIzPz/EaaNYCTgD2BVYALgcMy85Z+b58kSZKKQdZwHg4MA+8EXg58B/gU8JFOgoh4PvA94DfALsBXgJMj4s31BVXB5gnAKcBuwI3AORGxVWOdXwd2Bw4D9gFmAOdHxGp93jZJkiRVBlbDCbwiM++ovb8wIlYH3hIRR2fmMHAMcGVmvqmWZmPg/RHxhcxcEBFDwNGUms/ZABFxEXA18F5g72racynB6G6ZeW417WpKzeoBwKktb68kSdJyaWA1nI1gs+M3lFvd61SB5IuBsxppvka5bb519X47YC3gzNqyHwG+AewSEdOqybsC9wLn1dLNAy6u5kmSJKkFE63T0AuBu4A/AZsCKwPXNtJcU71uXr1uUb1e1yXd6sATa+mur7cPraXbHEmSJLViwgScEbENcCDwiaqGcno1655G0rur13Wq1+nAcGY+OIp0zWV10q3TZbokSZL6YJBtOBeJiI2AbwGXU+s0NNHNnTu31eWvuub6zJ8/v9V1jJc771yN227u1opidObMmdPH3ExelkNhORSWQ2E5FJbDYpZFMZHKYeABZ0SsBfwQeADYPTMfrmZ1aijXbnykU/N5Vy3dUESskpl/GyHdxl2yML2WZkxmzpzJ0NBQLx8dlWtvnMeMGTNaW/54Wnfd9dhws27FP7I5c+Ywa9asPudo8rEcCsuhsBwKy6GwHBazLIp6OQwPD7deSTaSgd5Sj4hVKMMebQC8LDPvrM2+CXiIxW00O7asXq+vXjttN7ulu58ydmcnXdQ6EdXTXY8kSZJaMbCAMyJWovQkfxawS2beWp9fDYt0AdWwRjX7An8ErqzeX0Lpfb5PbdkrVp87LzMXVpPPpdSW7lxL9yRg+2qeJEmSWjDIW+qnAK8A3gWsFhHPq827NjPvA44Dfh4RXwTOAF4AHAwc2ultnpnDEXE8cEJE3EEJRA+i9HLfr7PAzLwsIs4BvhQRhwOd5c8DTmt1SyVJkpZjgww4OzWNH+0y70XAzzLzVxHxSspThPYH5gPvyMzP1RNn5uyIAHgrsCFlqKPdMvOqxnL3BWZTBnkfojzacq/MfKA/myRJkqSmgQWcmfmUUaY7l1Hc8q6eMjR7hDT3A4dUf5IkSRoHE2YcTkmSJE1NBpySJElqlQGnJEmSWmXAKUmSpFYZcEqSJKlVBpySJElq1ZgDzojYucvjISVJkqSueqnh/CFwW0ScFBFb9TtDkiRJmlp6CTj3AC4GDgWujIjfRsQRETGjrzmTJEnSlDDmgDMzv5eZe1MeIXkwcAdwInBrRPw4Il4XEav1OZ+SJEmapHruNJSZ92fmlzNzR+DJwFHABsDpwO0R8dWI2LFP+ZQkSdIk1a9e6isCjwOGgGnAg8BLgJ9ExG8iYmaf1iNJkqRJZqVePxgRawF7A68DXgD8HTgHeHf1ugDYHfgE8BVg22XNrCRJkiafMQecEbEHJcjcFVgFuAJ4G/D1zLyrkfw7EbEecOoy5lOSJEmTVC81nP8N/AH4JHB6Zl4/QvrfAmf0sB5JkiRNAb0EnC8Fzs/MhaNJnJmXA5f3sB5JkiRNAWMOODPzp21kRJIkSVNTL4+2/ERE3LiU+TdExEnLli1JkiRNFb0Mi7QbcNZS5p8FvKK37EiSJGmq6SXgfBJwy1Lm31qlkSRJknoKOO8DnrqU+ZtQBn6XJEmSego4LwAOiYiNmzMi4inAIVUaSZIkqadhkY4BdgHmRsRXgGuq6TOBA4BHgPf1JXeSJEma9HoZFunGiHgBcApwWGP2RcBhmZn9yJwkSZImv56epZ6Z1wA7VI+t3KSafFNm3tm3nEmSJGlK6Cng7MjMPwN/7lNeJEmSNAX1FHBGxIrAzpTazenAtEaShZn5wWXMmyRJkqaAMQecEbEN8C3gH3hsoNmxEDDglCRJUk81nKcCqwJ7AL/IzHv6mSFJkiRNLb0EnM8C3puZ3+93ZiRJkjT19DLw+20s+Va6JEmS9Ci9BJwnAgdHxJr9zowkSZKmnl5uqa8D/BX4XUR8E/hfytOF6hZm5knLmjlJkiRNfr0EnCfW/n/zEtIsBAw4JUmS1FPA+dR+rTwingYcATyP8iz26zNzZiPNacAbunx8r8z8ZiPtEcChwEaUZ7wfmZnnN9KsQQmG9wRWAS6kPI7zlj5skiRJkhp6eZb6rX1c/zOA3YDLKO1Jl9Sm9PfAaxvTbqi/qYLNE4CjgCuBg4FzIuK5mXlVLenXga0pz4G/DzgOOD8inpmZDyzb5kiSJKmp50dbRsRmwA7ABsAZmXlLRKxMqV38Y2Y+NIrFfD8zv1st7zRgmyWkezAzL11KXoaAo4GTM3N2Ne0i4GrgvcDe1bTnUgLc3TLz3Gra1cBNwAGUMUYlSZLUR2PupR4RK0TEF4Drgc9Tagg3qWavTAnyDhvNsjJzwVjXvwTbAWsBZ9aW/QjwDWCXiOgM47QrcC9wXi3dPODiap4kSZL6rJdhkY4C3gi8D3g+tTE5M/MvlMdevqovuVts04i4JyIejojfRMQ+jflbVK/XNaZfA6wOPLGW7vouge41wOZ9zbEkSZKA3gLOA4EvZ+YJwO+6zL8a2GyZcvVov6F0LNqD0tHnNuDMiDiglmY6MJyZDzY+e3f1uk4t3T1d1nF3LY0kSZL6qJc2nP8AXL6U+Q8Ca/SWncfKzE82Jn03Ii4APgCc1q/19GLu3LmtLn/VNddn/vz5ra5jvNx552rcdvMdPX9+zpw5fczN5GU5FJZDYTkUlkNhOSxmWRQTqRx6CTj/CDx5KfNnAf3syd7N2cCpEbF+Zt5BqaEciohVMvNvtXTTq9e7qte7gY27LG96Lc2ozZw5k6GhobF+bNSuvXEeM2bMaG3542nddddjw826Ff3I5syZw6xZs/qco8nHcigsh8JyKCyHwnJYzLIo6uUwPDzceiXZSHq5pf4t4N+qXuodCwEiYhdgf0pnnfHUabu5RWP6lsD9wB9q6aLWiaie7vr2sidJkrT86iXgPBaYR2lbeQYl2DwqIi4FfgBcBXy4XxlsqoLFvYFbq9pNgEsovc/3qaVbsUp3XmYurCafC6wN7FxL9yRg+2qeJEmS+qyXgd/vi4jtgP8A9gL+RgnYbqIEoyc1bmsvUUSsxuLhiJ4MrBkRe1bvr6heT6cM1v47SrB4EGX8z9fX8jQcEccDJ0TEHZSB3w8CNgX2q6W7LCLOAb4UEYezeOD3eQy4PagkSdJU1dPA71VAeUL1tyw2oLTHrOu8PxD4HqXm8ugq7cOUYHL3zPx+I0+zIwLgrcCGlKGOdms8ZQhgX2A2ZZD3IcqjLffyKUOSJEnt6PlJQ/1QPb+82Z6y6ZVjWN5sSjC5tDT3A4dUf5IkSWrZmAPOiPjyKJItzMw39ZAfSZIkTTG91HC+mKpXes2KwBOq1zuAvy5jviRJkjRF9NJp6CndpkfE4yi3qd8O7LRMuZIkSdKU0cuwSF1l5sOZ+Rngx8Bn+rVcSZIkTW59CzhrrgL+qYXlSpIkaRJqI+DcCXCIIUmSJAG99VI/Zgmz1qbUbG4NnLgMeZIkSdIU0ksv9WOXMP1uytOG3gx8sdcMSZIkaWrppZd6G7fhJUmSNEUZPEqSJKlVvbTh3LiXFWXmvF4+J0mSpMmtlzact/DYJw2Nxoo9fEaSJEmTXC8B50HAW4EnAV8DbqimB7AvMA/4FLCgHxmUJEnS5NZLwPkEYAh4WmbeXZ8REe8HLgY2yswP9yF/kiRJmuR66TT0ZuALzWATIDPvpAyJ9G/LmjFJkiRNDb0EnOsCqy9l/uOrNJIkSVJPAeelwNsiYlZzRkRsA7wNuGxZMyZJkqSpoZc2nG8BfgZcHhFXADdW0zcDtgXuAg7rS+4kSZI06Y25hjMzrwWeSemJvjawZ/W3NvBJ4JmZeU3/sihJkqTJrJcaTjLzduAd1Z8kSZK0RD0FnB0RsRmwATA3M+/tT5YkSZI0lfQUcEbEfsCJwBOrSTsBF0TEesAlwNGZ+Y3+ZFFTwYIFC7n9rgd6+uyqa67f82fbsNrQSqzx+JUHnQ1JkiaNXp6l/mrgv4CfACcDszvzMvPPEXEdsD9gwKlFhh9+hEt+O7+nz86fP58ZMyZOwLnjthsbcEqSNAa9DIv0XuCnmbkzcHqX+ZcBWy1TriRJkjRl9BJwbgF8eynz/wSs31t2JEmSNNX0EnD+laU/aWhT4M+9ZUeSJElTTS8B5wXAARHxmEZsETEDOBj40bJmTJIkSVNDr204nwD8Gvh3YCGwa0ScCFwNLAA+0LccSpIkaVLr5UlDNwIvAP4IHAtMA/4DeBfwP8D2mTmvf1mUJEnSZDamYZEiYkXK2Ju3Z+ZLI2I68DRK4Pr7zLyjhTxKkiRpEhvrOJwrADcBRwIfz8y7gSv6nitJkiRNGWO6pZ6ZDwPzKe02JUmSpBH10mnoK5Re6qv0OzOSJEmaenp5lvoNwIrA9RFxOvB74MFmIp+lLkmSJOgt4Pyv2v/vW0KahfgsdUmSJDHKgDMiPgWcnplzgBdVk1en1Gw+0uvKI+JpwBHA84CZwPWZObNLul2ADwFbAn8ATs7MT3dJdwRwKLARcA1wZGae30izBnASsCewCnAhcFhm3tLrdkiSJGnJRlvD+RbgUmBOZl4UEetSnpm+U2ZetAzrfwawG3AZpT3pY9qURsTzge8BXwUOp4wBenJEPJyZn6ulOwI4ATgKuJLyxKNzIuK5mXlVbZFfB7YGDgPuA44Dzo+IZ2bmA8uwLZIkSeqil1vqHdP6sP7vZ+Z3ASLiNGCbLmmOAa7MzDdV7y+MiI2B90fEFzJzQUQMAUdTaj5nV8u7iPLko/cCe1fTnksJcHfLzHOraVdThno6ADi1D9skSZKkml56qfdNZi5Y2vwqkHwxcFZj1tcot823rt5vB6wFnFlb9iOUdqS7REQnON4VuBc4r5ZuHnBxNU+SJEl9NtCAcxQ2BVYGrm1Mv6Z63bx63aJ6va5LutUpT0fqpLu+S6B7TW1ZkiRJ6qOx3FLfJCL+sfp/rep184j4S7fEmXn5MuWsmF693tOYfnf1uk4t3XBmNodnqqe7rUrXXFYn3Tpdpi/V3Llzx/qRMVl1zfWZP39+q+sYL8OxzjJty0QqhzvvXI3bbh7MU1znzJkzkPVONJZDYTkUlkNhOSxmWRQTqRzGEnB+oPqre0xPcUrbzoWUsTqntJkzZzI0NNTa8q+9cR4zZsxobfnjaWholZ63Zf78+ROqHNZddz023GzjcV/vnDlzmDVr1rivd6KxHArLobAcCsthMcuiqJfD8PBw65VkIxltwHlgq7lYsk4N5dqN6Z2az7tq6YYiYpXM/NsI6bpFCtNraSRJktRHowo4M/P0tjOyBDcBD1HaXp5Xm75l9Xp99dppu7kF8JtGuvspY3d20u0UEdMyc2Ej3fVIkiSp7yZ0p6HMHAYuoBrWqGZf4I+U8TYBLqH0Pt+nkyAiVqw+d14tuDyXUlu6cy3dk4Dtq3mSJEnqs2UZh3OZRcRqLB6O6MnAmhGxZ/X+isy8lTIw+88j4ovAGZSB3w8GDu30Ns/M4Yg4HjghIu6gBKIHUXq579dZX2ZeFhHnAF+KiMNZPPD7POC0VjdWkiRpOTXQgBPYADi7Ma3z/kDgtMz8VUS8kvIUof2B+cA76k8ZAsjM2REB8FZgQ8pQR7s1njIEpXZ0NmWQ9yHKoy338ilDkiRJ7RhowFk9v3zEJxZVTwUa8ZZ39ZSh2SOkuR84pPqTJElSyyZ0G05JkiRNfgackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVQackiRJapUBpyRJklplwClJkqRWGXBKkiSpVSsNOgPSZLNgwUJuv+uBcV/vqmuu39f1rja0Ems8fuW+LU+SpCWZ8AFnRBwAfKXLrFMy8y21dLsAHwK2BP4AnJyZn+6yvCOAQ4GNgGuAIzPz/Bayrilq+OFHuOS388d9vfPnz2fGjP4FnDtuu7EBpyRpXEymW+ovA55f+5vdmRERzwe+B/wG2IUSoJ4cEW+uL6AKNk8ATgF2A24EzomIrcZjAyRJkpZHE76Gs2ZOZv55CfOOAa7MzDdV7y+MiI2B90fEFzJzQUQMAUdTaj5nA0TERcDVwHuBvVvOvyRJ0nJpMtVwdlUFki8GzmrM+hrltvnW1fvtgLWAMzsJMvMR4BvALhExrf3cSpIkLX8mUw3n3IhYH5gHnAZ8KDP/DmwKrAxc20h/TfW6OfBrYIvq/XVd0q0OPBG4rf/ZliRJWr5NhoDz/4D3A5cDj1DaaL4PeCpwADC9SndP43N3V6/rVK/TgeHMfHAp6cYUcM6dO3csycds1TXXZ/788e+c0obhWGeZtmUilcOybsuy6Od677xzNW67+Y6+LW88zZkzZ9BZmBAsh8JyKCyHxSyLYiKVw4QPODPzR8CPapN+EhH3AsdGxAcHlC0AZs6cydDQUGvLv/bGecyYMaO15Y+noaFVet6W0jt74pTDsmzLsuh3Oay77npsuNnGfVveeJkzZw6zZs0adDYGznIoLIfCcljMsijq5TA8PNx6JdlIJmsbzm9Ur1uzuIZy7UaaTs3nXdXr3cBQRKwyQjpJkiT10WQNOOtuAh5icRvNji2r1+ur107bzW7p7qeM3SlJkqQ+m6wB52uAhZShkoaBC3jssEb7An8ErqzeXwLcC+zTSRARK1afOy8zF7adaUmSpOXRhG/DGRE/ogSUc4EFlE5D/w58KTN/XyU7Dvh5RHwROAN4AXAwcGhmLgDIzOGIOB44ISLuoASiB1F6ue83jpskSZK0XJnwASflVvgbgX+g5PdG4Ejg5E6CzPxVRLyS8hSh/YH5wDsy83P1BWXm7IgAeCuwIWVIpN0y86r2N0OSJGn5NOEDzsx8O/D2UaQ7Fzh3FOlmU3sspiRJkto1WdtwSpIkaZIw4JQkSVKrDDglSZLUKgNOSZIktcqAU5IkSa0y4JQkSVKrDDglSZLUKgNOSZIktcqAU5IkSa0y4JQkSVKrDDglSZLUKgNOSZIktcqAU5IkSa0y4JQkSVKrDDglSZLUKgNOSZIktWqlQWdA0mAsWLCQ2+96YNDZGLNV11z/MflebWgl1nj8ygPKkSRpJAac0nJq+OFHuOS38wedjTGbP38+M2Y8OuDccduNDTglaQLzlrokSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVU+S13SpLdgwUJuv+uBkRNOAqsNreRz4SVNOQackia94Ycf4ZLfzh90Nvpix203NuCUNOV4S12SJEmtMuCUJElSq7ylLkkTyGjbo6665voTut2qbVEl1S13AWdEbAZ8GtgeeBA4EzgyMyfulVvScmO07VHnz5/PjBkT97JlW1RJdctVwBkRawMXArcCewIbAB8H1gdeM7icSZIkTV3LVcAJHAJMB56dmX8GiIi/A2dExAcz85qB5k6SpojxGqpqPJoW2DxAWnbLW8C5K3B+J9isfAv4MrALYMApSX0wXkNVjUfTApsHSMtueQs4t6AEl4tk5nBE3ARsPoblrAjw0EMP9TFrj7VgwSOstMKCVtcxXh75+8M9b8sqj5s2ocphWbZlWfS7HAa1HcuqWzlM1m3pZrTbMtHOi6bx2ifjUQ5/f/ghhodXbHUd/TA8PDzoLEwYlkXRKYdavDKwA3nawoULB7XucRcRDwPvy8wTG9N/CfwpM181muXMmTNne+AXLWRRkiSpLS+cNWvWLwex4uWthrNfrgBeCPwf8MiA8yJJkrQ0KwJPoMQvA7G8BZx3A2t3mT4duH60C5k1a9YwMJBfCJIkST24aZArX96eNHQdpR3nIhExBGzKGAJOSZIkjd7yFnCeC+wYEevWpv0LMFTNkyRJUp8tb52G1gbmArcAH2TxwO/nZ6YDv0uSJLVguarhzMx7gBcDfwH+G/gEcBbwxgFmS5IkaUpbrmo4JUmSNP6WqxpOSZIkjT8DTkmSJLXKgFOSJEmtWt4Gfp/wImIz4NPA9sCDwJnAkZn5wEAzNoKI2At4LTALWIcywOxngc9n5oJaul2ADwFbAn8ATs7MT3dZ3hHAocBGwDWUMji/kWYN4CRgT2AV4ELgsMy8pZFuYGUaEatTxnh9IrBtZv66Nm9/4CjgKZTyOi4zz2p8/nHAccAbKA8tuAJ4W2b+TyPdRsAngZcBC4EfAG/PzD830v0jZWSGWcBdwH9W623liVkR8Xrg7ZT9/QBwJbBvJ1/Lw/EQEXtQ9vMWwF+Bi4F3Z+aNjXRT5niIiKcBRwDPA2YC12fmzC7pJuz+H23elqUcImJF4HBgt2o9KwFXAx9obt9ULocu6WcBlwMPZubqjXkDOQdGc36OZAznxSrAu4HXA/8A/Bk4NzMPbqSbVMeDNZwTSDVs04XAGpSD43BgX+DLA8zWaB0ODAPvBF4OfAf4FPCRToKIeD7wPeA3wC7AV4CTI+LN9QVVJ9EJwCmUC/GNwDkRsVVjnV8HdgcOA/YBZgDnR8RqtWWtzWDL9Fi6/LCLiD2B04FvU8rip8DXq5O57hOUC8r7gVcCD1G2cUZtWSsB5wHPBPYHDgK2A74XEdNq6Tap1nMXZR+dQNlfH+rDdj5GRLyX8qPjvynb+CbKRXGomj/lj4eI2JGy/dcDr6rytjnw04hYs5Zuqh0Pz6Dsq98B13ZLMJH3/2jzNgojlcOqlCDmf4ADgddQvsR/EhEvb+RpKpdDfZ0rUK4bdywhybifA2M4P0cymvNiBcr35/5Vfl4KvIsyuk493aQ7HqzhnFgOoTxm89m1GqC/A2dExAcz85qB5m7pXpGZ9QvEhVXt3lsi4ujMHAaOAa7MzDfV0mwMvD8ivpCZC6onPx1N+dU0GyAiLqL86n8vsHc17bmUk2y3zDy3mnY15ZfnAcCp1ToGVqYRMRN4M/AfwOcbsz8InJ2Z76neXxgRWwAfAH5Yff6J1effmplfrKZdCtxMqTV8V/XZVwNbATM72xMR8yk1abuw+KEG7wTuAfaq9sf5EbEWcExEfDQz7+rjtgcl2P6XzPxBbdZ3av8vD8fDvsCtwBsyc2G1vluBy4AXUO1rpt7x8P3M/G617tOAbbqkmcj7f8S89akcHgSempl3dyZExI+Bp1O+9H9QTZvq5VB3MLAWJdh5a33GAM+BEc/PPpbDgcDzgS0z8w+16WfUymFSHg/WcE4su1IGoa9X+X+LUnM41l9S46oRbHb8hlKFv051gryYMu5p3dcotwO2rt5vR7nYnFlb9iPAN4Bdar9OdwXupfyK7aSbR7mg7Fpb/iDL9BTgM8AN9YkR8VRKLdeZjfRfA7aNiPWr9y8FVqRWZpl5P+VLqLmNV9eDpcy8hBLoNNN9p7qw1tfZ2Tf9dCBwayPYXGQ5Oh4eB9zfCTYr91Sv02BqHg8jffFM5P0/hryNaKRyyMxH6sFmNW0hpcZzRm3ylC6HjohYj1Jr9zZKzWXTuJ8DYzg/RzTKcjiYEtz+YSlpJuXxYMA5sWxBo5q9OhFuohzwk80LKbcq/kR5Xv3KPPY2QueC0Nm+zrPur+uSbnVKW8hOuuu7nMDX8OiyGkiZRmm7+DTg+C6zO9u4pLKIWrrbM/POLumeXt166aTrdntmUVlExOOBjZvpqnY8D9D/snge8NuIODoi/hgRD0fE5RHxz9X85eV4OA3YIiIOi4i1I+IpwGzK9nTaWi0Px0PTRN7/o81bK6r9uB2P3ublpRw+AvwyM89bwvxBnAOjPT+XWZT2qVsDt0TE6RHxl4j4a0R8p6pJ7JiUx4MB58QyncW1H3V3UzriTBoRsQ2llusT1S+v6dWsexpJO7/uO9s3HRjOzAdHka65rE66elmNe5lWt2ROAt6VmX/pkmQsZdFM00n3OMqFZaR0nWWtvYR1NtP1y0bATpRj4K3AK4D7gPOqoGu5OB4y80JK280PVeu4GXgqsFOtVmV5OB6aJvL+H23e2nIYJYj5WG3alC+Hqn3gvsA7lpJsEOfAeJbDupTtOJJyDX01pe37VsC5UdqmdvI06Y4HA071XZTegd+i9DL8yAjJp6LjgRsz84wRU05dK1Au/q/OzG9UNRa7U4LOdw40Z+MoIrYDvgp8iXI7ai9gAaXzwqqDzJsmnuoOwEeB2Zn5i0HnZ7xE6a1/KvDxzPz9oPMzQJ2Y7C/AHpn5o8w8k3LdeAbwLwPLWR8YcE4sd7P4V1fddMqt6Qmvqt37IeWWxO6Z+XA1q/MraO3GRzq/mu6qpRuKMizESOmay+qkq5fVuJZpRDyD0qj9fdUt1LVZ/Kt79ShDVIylLJppOukeZnGvxdFs4z1LWGczXb/cDdyZtaFKsgyvcSllOJDl4nigjNRwYWa+IzMvzMxvUhrxP4cy5EknT3TJ11Q6Hpom8v4fbd76KiKeBXyX0rHuyMbsqV4OBwNPAE6tXTdXgdKDuvbjbBDnwHiWwz2UIZwurtdeZhlO7z7KtbOTp0l3PBhwTizXsbhtBrCooe6mlGFVJrTq4P8esAHwskY7m5sojcC3aHxsy+q1s32dNind0t1PGTKkky5qjaPr6eplNd5luhll9IcLKSfo3cD3q3kXAr9g6dsIkNXrdcAGEdG8VbElcEOtXc5jtrGW7nqAzPwrMK+ZLiKeDKxG/8tiab29V2H5OR62pHQAWSQzb6OMq7dpLU8088XUOh6aJvL+H23e+iYiNgV+RBmn9vWNTmYw9cthc2BDynZ0rptHAo+v/v9wLd/jfQ6M9vxcZtWP8luWMHshVRA+Qp4m7PFgwDmxnAvsGBHr1qb9C6XH3LndPzIxVG1LvgE8C9glM2+tz6/aq11ANVxDzb7AHykXWoBLKL3q9qkte8Xqc+fVLsTnUn5p7VxL9yTKoLX1shrvMv0l8KLGX6dN0puBgzLzZsrJuU/js/sCV9R6/P+Ycvt1UZlFGWrqFTx2G59ZDdPRSfc8ygDFzXR7RMTKjXUOs7gDS7/8AFg3Ihb1XKwa6T8fmLMcHQ+3UgaUXqT6QluP6otlOTkeHmUi7/8x5K0vqiZIP66WvUdmduudPdXL4TM89rp5OvC36v/PVOnG/RwYw/nZLz8Atq83uYkyOP1awJxq0qQ8HqYtXNj8IaVBqW4jzKV8EX2QUlP4ccpwBa8ZXM5GFhGfB/6VMg5as+3RtZl5X9Uo/OeUnrtnUMYhPA44NDM/V1tWZ0Db91AO4oMojaefm5lX1dL9gHJr8nDK7YbjKNX7z6x+KU6IMo2IHSi1m4ueNBTlyUxnUX65/4QygPHbKOOl/bD22c9Qbr0eTglejqCM3fbMzJxfpVkJ+DWlsfl7KDWsJwG3Ay/IxeM/bkKpbbuA8kSJqNJ9OjPf3edtXgH4FbA+ZVy4+6tt2JYy1tvvlofjISLeQinrz1Bul65LGT9vfeAZnbsAU+14iDKodGfYlUMpNSX/Ub2/IjNvncj7f7R5W9ZyoIzg8atq+uso+2iRzLx0eSiHZgVF9ZljgSPysU8aGvdzYLTnZz/KoQoIr6Ls409QAsYTKPty604ztcl4PFjDOYFk5j2UjgV/oTyd5BOUg/yNA8zWaHV+QX2UcgGt/20NkJm/opyo21JuHx0EvKN5oGYZyPYoSu/mH1Jut+xWP4kq+1J+DZ4KnE35lfWSrD2Ka6KWaWaeTenBvSelLHYG9uty8XoH5akbx1OaK6xK2cb5tWX9nfL4trnAf1Ge/nAppQ3twlq63wMvoQQ751ACn49RAsJ+b98CSlvFn7N4/wDskJm/q9IsD8fDKZRBlV9IaZt3MuUpIy+qNzmZgsfDBpR9cDawA/Ck2vsXVeufsPt/tHnrQzlsSOmBvDrl+GheO5eXchiLcT8HxnB+jmQ058X/Vv9Pq6Z/hlKJ85Jc3CdiUh4P1nBKkiSpVdZwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVhlwSpIkqVUGnJIkSWqVAackSZJaZcApSZKkVv1/xVCuu0pwNnUAAAAASUVORK5CYII=\n",
-      "text/plain": [
-       "<Figure size 720x360 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAoIAAAFKCAYAAACJoz5RAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Z1A+gAAAACXBIWXMAAAsTAAALEwEAmpwYAAAw2ElEQVR4nO3debxtc/348deNXMksEpUy9Ea3+HWpDPVrki6l4WuIypeifJMkU5Mh5Kf4lpKiCRWRVF8iDcZKGU4D1/CmK3y5kXmI7sW9vz8+a3eWbZ9pn2Gfe9br+Xicx7p7rc9a+7M++3PPfp/PtKYtXLgQSZIkNc8zep0BSZIk9YaBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISuMgIg6NCNdm0tNExOsiYmFEvK7XeZmsIuLiiLi41/noJCL2jYibI2LxXudlKBGxdUQ8EhEr9zovmrwmfUWWJosRBHa7jmtGRiEipgHvA3YDNgCeCcwBzgC+mJmP9jB7wxYRzwb2AbYH1gQeB+4Afgd8KTNv6GH2JoWIWAPYH9gSeD7wJHA98FPg+Mx8oGeZG0cR8WHg0cw8eRyuvQzwSeCgzHxirK8/1jLz3IiYQ8nzx3udH01OBoLS8L2v7fUHgVcD72/bfxnwfeCoicjUcEXEYsBplODpN8DBwGPAa4HPAttHxJsy8x+9y+XQIuKZwCXADOB7wNeAZwHrAlsDvwcaHQhGxJbAWZTg73vA1ZTf9xtRgoL/C7y5ZxkcXx8G7gFOHodrv59S1747DtceLycCR0fEoZn5UK8zo8nHQFAapsz8fv11RLwJeGX7/prJ1mJwACUIPCYz96/t/0ZE/BD4CXASJZiaMFUr5ZKZ+dgwT3kHMBPYtb3Vp+quW25MM7iIiYgXAWdSWkjfkJl3tB3/FKVFWCP3fuC8zPxnrzMyAj8CvkL5v/+tHudFk5CBoDQOIuJQ4JDMnFbbdwulpeoo4BjgpZRu2Y9m5oUR8Q7gMOAlwHXA7pnZ13bdlwBHAG8Enk3p6vtcZv5oiPw8i9JNeCOlRegpMvPsiDgF2DUiXpmZV9TyfHFm7tJ2vYur815X2zcd+ATwXuCFlFaZHwKfrnc5V13sJwIXAZ8BAvhgROwOLJOZL++Q/z8Cj2fmq4C1qt2/6XAfTwD31s5bgxIAvwFYA5gP/Bb4ZGZeU0v3uio/7wHWobT2Lgf8EvgA8E/K5/YeSrmfBexRD17b7usQSpd1Agdm5vntee1wj0N+tlWg+wlgZ+AFlBbdm4DPZ+aPq2QHAMsAW7UHgVUZ3Vm9T/299wA+AqwNPAD8T1VG99XSXAysCmwHfBXYGLizSndGRGxOqdcbALdR6vUvaucfWpXLSymf+9bAAkod2TczHxmifKZVefwg5TN6CDiHUr73VGluoXzO9aEct2bmi6p9w6qjA7z/i4GXA8e37X8R8Dc6/2GyEPhsZh5avV4aOBT4D2C16h6uBQ7OzEtr521MaaXfDFgC6KN0R1/Udv3nVdfbGlgZ+DvwK+DjmfkwQGb+IyKuBt6JgaA6cLKINLHWBH4AnEv5QloeODsidqL81X4apct2TeDMqjsXgIhYD7gceBnwBWBfStBzZkS8d4j33RxYAThtkLFNre6ut430pqov6Z9QgpBzgb0oX7AfBn5aHa97LeUL9Szgo5QA+RTgZRHxlECwuu//U8vfLdV25w7Xbbdx9V4/AvYGvgS8Arik+hJtdwAwixL0fZvS+vgN4JvA+pQv57MogdjTAmrKF/fXq3v/NLAkcE4VJA1oBJ/tIZQ/Fi6hlNthlLJ7ZS3NNsDfMvO3g71n7b0/U+X5LsofC6dTWr4urAKnuuUon++VlLJ6FDg1It5NKePzKfV6qSrvnVpnT6fUxU9V53yQUl5D+TrwRUo57U35XLYFLoqIJas0HwNup5TJ+6qfj1X3OdI62m7TanvVMPI62D3sVeXjw8DngbspwTNVPv8v5Y+cFSmf74HAdOCX9QlGEbEqcAWlLp5VXfdkSl1Yqe19+4BNhnGPaiBbBKWJtQ7w2sz8DUBEXA/8AvgOsF5m/q3a/wClden1wK+rc78MzAU2qrVEHR8RvwSOiohTM3OgCS3rV9u/DJK31rH1B0kzkB2BtwCvz8xLWjsj4irKeMktKK1rLesCr8jMP9fS3kC5x/dSvqxb3keZDHJG9fqnlC/6gyktmBdTWvnO7dACdm57a2lEfI/S4voB2lrGKK0vr8zM+VXalYF3U1pZ3lKV79ciYh1KsHRw2/kzgE0z8/fV+SdTWuyOogTjAxnuZ/tWStfk7p0uEhHLAqtTWvSGVN3fQcAFwJaZ+WS1/8+UYQK7U1r/WlYFds7M71XpfkX5LE4DXpOZv6v2t+r1djy9FeoOSmvlwirt34GDqvGpv6aDiNgU+BDwn5n53dr+8ylB087ANzLzpxFxBHBPhyEbI62j7dattjcPkmYobwW+mZkdJ25UgdqJlPq8Ra2MTgD+BBxJf0B6FKVVcdPMvLx2mUM7BHw3U4Lv51HqmfRvtghKE+vGVhBYaf0Cv7gVBLbtXxMgIlYE3kRpwXh2RDyn9UNphVmd0qU8kGWq7cODpGkdW2aQNAPZntLtfG1b3i4BFlIC2rrL6kEgQDWL9Wxgp4h4Bvz7i3En4Oet7r/M/BclqDoGaM2CPhH434j4fhUMta5Z77pdKiJWonTHJWWcYbvvtYLAyuXVe5zUFmRfDqzWocXsqlYQWL3/vZQgabOIWKHD+430s30QeGnVjdxJ694H+5zr3kQJfr/cCgIr36O0ELaPF30MOLV2f0npSr6xFQRWnlJ/23y1rSy/Um3fOkg+twceAc5vK58bqny216+BrjGSOtpuJUpX9oPDeK+BPAi8KiJWH+D4BpShEqcBK9XyuCzlj5FXVfX4GZSu3p+3BYEAdPiD8P5q+5xR5F1TlC2C0sS6rf4iMx+MCID/bUvX+rJpBQ9rUwKSQ6ufTlahBDidDCfIax3rZtbwSyhfYHcPcHyVttdzBkh3CuUL+/WUVqrXUMZ87VdPVAVY+wP7R8TzKd2/e1PG8C2gtBBRdRkeRmllbO8Kvpenu63tdetz6PT5TKN07d9V239Th2veWG3XoP8LuW4kn+3BlBbRjIjrKK1up2Vmq7uyNSt0uMH8GtX2KfUmM5+MiJuAF7WlvyMzF7Tte5C28qnV607B701tae+JiPs7vFfdS4CleWpZ17XXr4GuMZI62sm06qfbNUL3p9Tx2yLiT5RA/3tVQN3KI5RhCQNZiTLWdVlg9jDft9VC6NqmehoDQWliPTnC/a1f4K3W+y8B5w2QdrAvheuq7cspgUQnrbF59a6vgb44FuOpeX5G9R57D5C+vTtqoBnCv6B82b+XEgi+l9LidM4A6cnM24HTIuJHlIH3746I91djIY+jdOEeR1nW5wFKoHgsnXtEuv18RmPYn21mXhoRa1HGcb6ZEvB+LCI+kZlfyMyHImIuZazheOhF+UApo3sp3fSddAqwO11jJHW03T2U+1mu7f06/h+pj+9tycwzI+I3wNspn99HgQMiYpfMPI3+uvAJyri+Tu5m5DPjWwH5PSM8Tw1gICgtGlrB2RMDjaMawu8oQdBOEfG5tm7Alp2r7Zm1ffdTWr3arcFTA8Y5lK7WCwYZpzikqiXqVGD3iNiHMhngzMycN4xz50fEXygtbM+hzGjdDvhuZn6snrbqph2PL8V1OuxrtfLcOsA5I/psM/N+ysSZ70aZDX4e8NmI+O/qcz0b2CMiNmvrru2klaegv+WSqutxHcq4tLG2Ttt7PYcSqNwyyDlzKGP4/jDU7GIG/uNltHX0+mr7Yp4aCLb+vXxb+jXooJq1fSJwYkQsD/yBMgnpNPpbyh8erC5ExHxK6++MYea9lec7h5leDeIYQWkRUC3yfBElQHra+KIY4hFS1dIYX6B84X+uw/lbA7sAZ9eXVaF8Mb06IpaopX0rZemSujOA5wL/1eHa06M8kWG4TqF0bZ5ICRCesnhvRGzQ6X6rL9VNgPvo7/57krZWqYjYkTLIfjxsFBGb1N5rJcoYx8uqAO5pRvLZVtern/sYZZzckpSFjgGOpoyn+3ZEPO0+I+K51UxhKOPO5gMfbY3LrLyH8nn+bPDb7cpH2iYzfLTanjvIOWdQvq/aJ+cQEYu1jb/8J527pEdbR1tB9Ub1ndUizfdQhifUfbhDPp/SkleNi/0b/UFkH/BX4OOd8tOqC1X3/E+AWRHxqg7p2ltiZ1KCaLuG9TS2CEqLjv+ifBldHRHfpARpqwCvosz0XXuI878AbAgcGBGvBn4MtCZevAe4hhIM1n2L0ip3fpRFp9eidNe2j/H7fpXu+CjLX/yWEoAFZczfdsDFw7nJzLy6atnbnvIl2d6qtQVweEScQ3mKyIOUwHRnSoC3V63F82zKMjMPUbpXNwR2YHQzPwczG/hZRBxHGZf5QUpQ22mpmbrhfrbXR8SllOVb7qFMLtgN+FmrpSwzb46IHSgtu9dVs6RbTxZ5BaV79bIq7T0RcThwOGV5kp9SJnh8hDKLfDzWnVsdOC8iflblf3fgl5n5q4FOqLrEj6eMCX05ZQjBPEq5bEsJEE+ukl8FfDgiDqG0PD6SmecwyjqambdVs6m3oCxdU/ct4BMR8a3q/V/L0ydvLQPcERFnUcr2IcpyQ2+hmpmdmQsi4gOUsYPXRcR3KMvhrEZ5Gsw0+ie1fLLKy8URcSKl2/u5wLsoE0luAYiIVSjDPk4Y6N7UbLYISouIakD5RlTBDWUdvg9TvuAPGsb5T1KCgF2qc46gtLq9j9Iy9Mr2VqtqQeB9KV9qx1Ja3N5K+XKqp1tA+QLanxK4HE3p7no15RFwV4/wdk+ptt/v0IpxFmXpjNUpa6ydSGlVuhl4Z2bWlzvZmzLwfgfKGLyXUr542yd/jJXfUT6THShLfcwD3lFfLLiTEXy2x1KeG3wgJXh4C6Usdmy73nmUbsPTgK0oYySPpgTCn6MEPa20R1AC0ecB/035o+Bk4I3D6ZLvwo6Ubsoj6X/axXaDnlHy+RHKkj8rUu7hKMo4ux8CF9aSHkYZU/pxyv0fV50/FnX0O8BWUZ51XXcYpZ5tS/mDazHKepR1j1I+15dRPtNjKZ/5fpTnZrfu89IqT3+g1IGvUsa53kdZd7CV7u+UPxROp5RpazzsFTx12MN/UFp9z0DqYNrChbYUS00V5bm951CevLH1YK0yEyki9qR8AUZm3jhU+skgqieLZOYevc7LZBT9TxZ5XjVObpFTddfeTHkSyNd7nZ/hqFoxL24fJyu12CIoNVhmPk5pMfgLcFZEvKLHWWrZDfj9ohIEqhmqx7YdReminvRDq6qxv2tTWl+ljiZ9RZY0vjLzn5RHsfVU1d22DWUs1IaUbjZpUsnM/6Z0oU96mXkuZf1FaUAGgpImi5UpY7oeAL6QmWf1NjuSNPU5RlCSJKmhbBHsQl9f33RKV9rfGXhFfUmSpMlgMcrKAFfOnDnzKasBGAh2Z2PgN73OhCRJ0gi8hrKG5r8ZCHbn7wAveclLWGKJJQZNOHv2bGbMGO5TgKYuy6GwHArLoZ9lUVgOheVQWA79xqIs5s+fz4033ghV/FJnINidJwGWWGIJpk+fPmTi4aRpAsuhsBwKy6GfZVFYDoXlUFgO/cawLJ42nM11BCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIayieLTGIP/3M+j857otfZGBNLL/ecXmdBkiS1mTSBYEQsDdwArA5snJlX1Y7tDHwKeBEwBzgsM89oO/+ZwGHAfwLLA1cCe2fmn9vSrQp8GXgLsBD4GfCxzLxnPO5rNB6d9wQXXHlbr7MxJjZcc6leZ0GSJLWZTF3Dh9IhMI2IbYFTgJ8As4BfAz+IiFltSb8E7AkcArwdmA9cEBGr1a61OHA+8DJgZ2A3YFPg7IiYNsb3I0mSNKlNihbBiJgB7AF8HDix7fDhwJmZ+cnq9UURsR7wWeDn1fmrV+d/NDO/We37A/A34GPAAdW5/wFsAMzIzGurdHOB31GCzPPG4/4kSZImo8nSIng88FXgxvrOiHgxsC5welv604CNI2Ll6vWbgcWAf3cXZ+bDlG7frWrnbQVc0woCq3SXAbe2pZMkSZryeh4IRsT7gLWBIzocXq/aXte2vxXIRS3dXZl5b4d0L4mIZ9TStV+rlW7dkeRbkiRpUdfTruGIWA44Gtg3Mx+JiPYkK1TbB9r2319tV6yla0/TSvdMYGngoSHSrT/8nBezZ88eVrq+vr6RXhqAZy27MnPnzu3q3MlmwzXX7rocphrLobAc+lkWheVQWA6F5dBvPMui12MEjwBuysxTe5yPrsyYMYPp06cPmqavr4+ZM2d2df277nuU1VZ7tKtzJ6Nuy2EqGU19mEosh36WRWE5FJZDYTn0G4uymDdv3oCNVz0LBCPipZQJHltExPLV7qVb24hYhv6Wv+WBO2unt1oK76u291dp2q0APA48Mox093XYL0mSNGX1cozgOpRA9CJKgHY/cE517CLgN8D11ev12s5tdeNmtb0eWCUiVuyQ7sbMXFBL136tVroburgHSZKkRVYvA8HfAq9v+9mnOrYHsFtm/o0SoO3Qdu6OwJWZeXf1+pfAAmD7VoJqgeq38dQlYc4DXlYtP9NK92rKQtUuHSNJkhqlZ13D1ZM8Lq7vq00W6as9WeRg4IyImAP8irJY9JuBrWvXuiMiTgA+HxFPUJaD2Q+YBhxbe4uzgKuBH0XEJyn3fzTwe6o1CSVJkpqi58vHDCUzzwR2BbYFfgFsCeyUme2B2z7A1ykTUM4GngW8KTPn1q71BOXRcrOB7wMnAX8AtsnMheN8K5IkSZNKr2cNP0VmXkxpxWvffwrlMXODnfs48InqZ7B0d/L0rmZJkqTGmfQtgpIkSRofBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNZSAoSZLUUAaCkiRJDWUgKEmS1FAGgpIkSQ1lIChJktRQBoKSJEkNtXiv3jgi3gV8HFgXWBq4A/gJcHhmPlhLNwv4HLB+lebYzDyuw/X2A/YEVgWuBQ7MzAva0iwDHA1sCywJXATslZm3jPX9SZIkTXa9bBFcEbgU+CDwFuDLwPuBM1sJImIT4GzgT8As4CTg2IjYo36hKgg8Ejge2Bq4CTg3IjZoe88fANsAewE7AKsBF0TEUmN9c5IkSZNdz1oEM/Nbbbsujoh/ASdGxGqZORc4GPhjZn6gSnNRRLwQOCQivpGZCyJiOvAZSkvhMQARcQlwDfBpYPtq36soQeLWmXlete8aYA6wC/C1cbxdSZKkSWeyjRG8p9ouUQV4bwDOaEtzGqX79xXV602B5YDTWwky80ngh8CsiJhW7d4KeBA4v5buNuB31TFJkqRG6VmLYEtELAY8E3gppQXw7My8JSLWB5YArms75dpquy5wFbBe9fr6DumWBlYHbq/S3ZCZCzqk23IMbkWSJGmR0vNAELiX0qIHpbVup+rfK1TbB9rS319tV6ylm5eZjw2S7vYqXfu1WulW7LB/SLNnzx5Wur6+vm4uz7OWXZm5c+d2de5ks+Gaa3ddDlON5VBYDv0si8JyKCyHwnLoN55lMRkCwdcBSwEzKGP9zomILXqao2GaMWMG06dPHzRNX18fM2fO7Or6d933KKut9mhX505G3ZbDVDKa+jCVWA79LIvCcigsh8Jy6DcWZTFv3rwBG696Hghm5p+rf14WEX2U7t530t8lvHzbKa2Wwvuq7f3A9IhYMjP/NUS6F3bIwgq1NJIkSY0x2SaL/BlYAKxNmc07n/4xgC3rV9sbqm1rbGCndA9T1h5spYva5JF6uhuQJElqmMkWCG5CydPNmTkPuJBq+ZeaHYE7gT9Wry+jzAbeoZWgmoCyPXB+Zi6sdp9HaV3cspbuBcDm1TFJkqRG6eWTRX4BXECZtfsvYENgf+Bq4KdVssOASyPim8CpwGbA7sCerdm/mTkvIo4AjoyIuykB4m7AWvRPPCEzL4+Ic4FvR8S+wEPV9W8DTh7Pe5UkSZqMejlG8ArgvcCLq9e3ACcAX8zM+QCZ+fuIeDvlqSE7A3OBfTLzhPqFMvOYiAD4KPBcSnC5dWb+pe09dwSOoSwePZ3yiLntMnPqzMiQJEkapl4+WeQg4KBhpDuPYXTdVk8VOWaINA8DH6p+JEmSGm2yjRGUJEnSBDEQlCRJaigDQUmSpIYyEJQkSWooA0FJkqSGMhCUJElqKANBSZKkhhpxIBgRW3Z4Xq8kSZIWMd20CP4cuD0ijo6IDcY6Q5IkSZoY3QSC7wB+B+wJ/DEiro6I/SJitTHNmSRJksbViAPBzDw7M7enPNN3d+Bu4Cjg1oj4ZUS8NyKWGuN8SpIkaYx1PVkkMx/OzO9k5huBNYBPAasApwB3RcR3I+KNY5RPSZIkjbGxmjW8GPBMYDowDXgMeBPwq4j4U0TMGKP3kSRJ0hhZvNsTI2I5YHvgvcBmwBPAucAnqu0CYBvgS8BJwMajzawkSZLGzogDwYh4ByX42wpYErgS2Bv4QWbe15b8pxHxHOBro8ynJEmSxlg3LYI/Bu4Avgyckpk3DJH+auDULt5HkiRJ46ibQPDNwAWZuXA4iTPzCuCKLt5HkiRJ42jEgWBm/no8MiJJkqSJ1c0j5r4UETcNcvzGiDh6dNmSJEnSeOtm+ZitgTMGOX4G8LbusiNJkqSJ0k0g+ALglkGO31qlkSRJ0iTWTSD4EPDiQY6vSVlQWpIkSZNYN4HghcCHIuKF7Qci4kXAh6o0kiRJmsS6WT7mYGAWMDsiTgKurfbPAHYBngQOGpPcSZIkadx0s3zMTRGxGXA8sFfb4UuAvTIzxyJzkiRJGj9dPWs4M68FXlc9Pm7NaveczLx3zHImSZKkcdVVINiSmfcA94xRXiRJkjSBugoEI2IxYEtKa+AKwLS2JAsz8/BR5k2SJEnjaMSBYERsBJwFPJ+nB4AtCwEDQUmSpEmsmxbBrwHPAt4B/CYzHxjLDEmSJGlidBMIvhz4dGaeM9aZkSRJ0sTpZkHp2xm4S1iSJEmLiG4CwaOA3SNi2bHOjCRJkiZON13DKwL/BP4aET8C/pfyNJG6hZl59GgzJ0mSpPHTTSB4VO3fewyQZiFgIChJkjSJdRMIvnjMcyFJkqQJ182zhm8dj4xIkiRpYnX9iLmIWAd4HbAKcGpm3hIRSwCrAndm5vyxyaIkSZLGQzdPFnkGcALwAcoyMguB3wO3AEsA1wCHAf89ZrmUJEnSmOtm+ZhPAe8HDgI2obamYGY+Qnn83LvGJHeSJEkaN90EgrsC38nMI4G/djh+DbDOqHIlSZKkcddNIPh84IpBjj8GLNNddiRJkjRRugkE7wTWGOT4TMCZxZIkSZNcN4HgWcB/VbOGWxYCRMQsYGfgh2OQN0mSJI2jbgLBQ4HbgD8Bp1KCwE9FxB+AnwF/Af7fWGVQkiRJ42PEgWBmPgRsChwJPBf4F7A5sDQlSHxtZj42hnmUJEnSOOhqQenM/BclEDxybLMjSZKkidJN17AkSZKmgG6eLPKdYSRbmJkf6CI/kiRJmiDddA2/gWqWcM1iwPOq7d3AP0eZL0mSJI2zEQeCmfmiTvsj4pnAh4CPAVsMdZ2I2A54D2XdwRWBOcDXgRMzc0Et3Szgc8D6wB3AsZl5XIfr7QfsCawKXAscmJkXtKVZBjga2BZYErgI2Cszbxkqv5IkSVPNmI0RzMzHM/OrwC+Brw7jlH2BecD+wFuBnwJfAT7fShARmwBnU5aqmQWcBBwbEXvUL1QFgUcCxwNbAzcB50bEBm3v+QNgG2AvYAdgNeCCiFhqJPcqSZI0FXQ1a3gIfwHeN4x0b8vMu2uvL4qIpYGPRMRnMnMecDDwx9p4w4si4oXAIRHxjcxcEBHTgc9QWgqPAYiISyjPPP40sH2171WUIHHrzDyv2ncNpSVyF+Bro7prSZKkRcx4zBreAnh0qERtQWDLnyhdtitWAd4bgDPa0pxG6f59RfV6U2A54PTatZ+kPN1kVkRMq3ZvBTwInF9Ldxvwu+qYJElSo3Qza/jgAQ4tD7yWEqAd1WV+XgPcB/wDCGAJ4Lq2NNdW23WBq4D1qtfXd0i3NLA6cHuV7ob6+MNaui27zK8kSdIiq5uu4UMH2H8/pZt1D+CbI71oRGwE7Ap8NjOfjIgVqkMPdHgfKBNMAFYA5nV4mkk93e1VuvZrtdKt2GG/JEnSlNbNrOEx706OiFWBs4ArqE0Wmexmz549rHR9fX1dXf9Zy67M3Llzuzp3stlwzbW7LoepxnIoLId+lkVhORSWQ2E59BvPshiPySIjEhHLAT+njCvcJjMfrw61WvSWbzul1VJ4Xy3d9IhYsnr03WDpXtghCyvU0ozIjBkzmD59+qBp+vr6mDlzZjeX5677HmW11YYcbrnI6LYcppLR1IepxHLoZ1kUlkNhORSWQ7+xKIt58+YN2HjVzRjBTsHUkKqJGe3XWpKyPMwqwKaZeW/t8BxgPmVs3/m1/etX2xuqbWts4HqUySb1dA9T1h5spdsiIqZl5sK2dDcgSZLUMN10894C/K2Ln6eIiMUpM3tfDszKzFvrx6vlYy6kWv6lZkfgTuCP1evLKLOBd6hde7HqvPNrQd95lNbFLWvpXgBsXh2TJElqlG66hncDPgq8gLKUy43V/qAEabdRFoZun53b7njgbcABwFIR8erasesy8yHgMODSiPgmcCqwGbA7sGdr9m9mzouII4AjI+JuSoC4G7AWsFPrgpl5eUScC3w7IvYFWte/DTi5i3KQJElapHUTCD4PmA6snZn31w9ExCGUdflWzcz/N8R1Wi1zX+hw7PXAxZn5+4h4O+WpITsDc4F9MvOEeuLMPCYioASoz6UsCbN1Zv6l7bo7AsdQFo+eTnnE3HaZOXUG4kmSJA1TN4HgHsAX24NAgMy8t2q92xsYNBAc6JnFHdKdxzC6bqunihwzRJqHKc9D/tBw3luSJGkq62aM4EqUhZoH8uwqjSRJkiaxbgLBPwB7R8TT5jJXi0LvDVw+2oxJkiRpfHXTNfwR4GLgioi4Erip2r8OsDFlTb69xiR3kiRJGjcjbhHMzOuAl1FmBi8PbFv9LA98GXhZZl470PmSJEmaHLp6skhm3gXsU/1IkiRpETSqR8xFxDqUp4LMzswHxyZLkiRJmgjdTBYhInaKiNsoj2a7FJhZ7X9ORNwYEe1PA5EkSdIkM+JAMCL+A/g+5dm9+wPTWscy855q/85jlUFJkiSNj25aBD8N/DoztwRO6XD8cmCDUeVKkiRJ466bQHA94CeDHP8HsHJ32ZEkSdJE6SYQ/CeDP1lkLeCe7rIjSZKkidJNIHghsEtELNF+ICJWA3YHfjHajEmSJGl8dTtG8HnAVcCHgYXAVhFxFHANsAD47JjlUJIkSeOimyeL3ARsBtwJHEqZNfxx4ADgz8DmmXnb2GVRkiRJ42FEC0pHxGLA6sBdmfnmiFgBWJsSUN6cmXePQx4lSZI0Dkb6ZJFnAHOAA4EvZub9wJVjnitJkiSNuxF1DWfm48BcyrhASZIkLcK6mSxyEmXW8JJjnRlJkiRNnJF2DQPcCCwG3BARpwA3A4+1J8rMH44yb5IkSRpH3QSC36/9+6AB0iwEDAQlSZImsWEFghHxFeCUzOwDXl/tXprSEvjkOOVNkiRJ42i4LYIfAf4A9GXmJRGxEuWZwltk5iXjljtJkiSNm24mi7RMG7NcSJIkacKNJhCUJEnSIsxAUJIkqaFGMmt4zYh4ZfXv5artuhHxSKfEmXnFqHImSZKkcTWSQPCz1U/dcR3STaMsH7NYt5mSJEnS+BtuILjruOZCkiRJE25YgWBmnjLeGZEkSdLEcrKIJElSQxkISpIkNZSBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISpIkNZSBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISpIkNZSBoCRJUkMZCEqSJDWUgaAkSVJDGQhKkiQ1lIGgJElSQxkISpIkNZSBoCRJUkMZCEqSJDXU4r1884hYG9gPeDUwA7ghM2d0SDcL+BywPnAHcGxmHtch3X7AnsCqwLXAgZl5QVuaZYCjgW2BJYGLgL0y85axuzNJkqTJr9ctgi8Ftgb+ClzXKUFEbAKcDfwJmAWcBBwbEXu0pdsPOBI4vrrmTcC5EbFB2yV/AGwD7AXsAKwGXBARS43RPUmSJC0SetoiCJyTmf8DEBEnAxt1SHMw8MfM/ED1+qKIeCFwSER8IzMXRMR04DOUlsJjqutdAlwDfBrYvtr3KkqQuHVmnlftuwaYA+wCfG1c7lKSJGkS6mmLYGYuGOx4FeC9ATij7dBplO7fV1SvNwWWA06vXftJ4IfArIiYVu3eCngQOL+W7jbgd9UxSZKkxuh11/BQ1gKW4OndxtdW23Wr7XrV9voO6ZYGVq+lu6FDAHpt7VqSJEmN0Ouu4aGsUG0faNt/f7VdsZZuXmY+Nki626t07ddqpVuxw/5BzZ49e1jp+vr6RnppAJ617MrMnTu3q3Mnmw3XXLvrcphqLIfCcuhnWRSWQ2E5FJZDv/Esi8keCE5qM2bMYPr06YOm6evrY+bMmV1d/677HmW11R7t6tzJqNtymEpGUx+mEsuhn2VRWA6F5VBYDv3GoizmzZs3YOPVZO8abrXoLd+2v9VSeF8t3fSIWHIY6dqv1Up3X4f9kiRJU9ZkDwTnAPPpHwPYsn61vaHatsYGdkr3MGXtwVa6qE0eqae7AUmSpAaZ1IFgZs4DLqRa/qVmR+BO4I/V68sos4F3aCWIiMWq887PzIXV7vMoLYJb1tK9ANi8OiZJktQYvX6yyFL0L9uyBrBsRGxbvb4yM28FDgMujYhvAqcCmwG7A3u2Zv9m5ryIOAI4MiLupgSIu1FmHe/Uer/MvDwizgW+HRH7Ag9V178NOHlcb1aSJGmS6fVkkVWAM9v2tV7vCpycmb+PiLdTnhqyMzAX2CczT6iflJnHRATAR4HnUpaE2Toz/9J2/R2BYyiLR0+nPGJuu8ycOrMyJqEll3wWd903NYp4qemLs8yzl+h1NiRJGrWeBoLV833bx+t1Sncew+i6rZ4qcswQaR4GPlT9aII8/uRCLrjytl5nY0y8ceMXGghKkqaEST1GUJIkSePHQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhFu91BiZaRKwDHAdsDjwGnA4cmJmP9jRjkiRJE6xRgWBELA9cBNwKbAusAnwRWBl4d+9yJkmSNPEaFQgCHwJWADbMzHsAIuIJ4NSIODwzr+1p7iRJkiZQ08YIbgVc0AoCK2cB84BZvcmSJElSbzStRXA94Dv1HZk5LyLmAOuO4DqLAcyfP39YiefNmzeCS/d74vH5LP6MBV2dO9ksePKJKXMvTzw+n3nzFuv6/G7rw1RjOfSzLArLobAcCsuh32jLohavPO3La9rChQtHdfFFSUQ8DhyUmUe17f8t8I/MfNdwrtPX17c58JtxyKIkSdJ4ec3MmTN/W9/RtBbBsXIl8Brg78CTPc6LJEnSYBYDnkeJX56iaYHg/cDyHfavANww3IvMnDlzHvDbIRNKkiRNDnM67WzaZJHrKeME/y0ipgNrMYJAUJIkaSpoWiB4HvDGiFiptu+dwPTqmCRJUmM0bbLI8sBs4BbgcPoXlL4gM11QWpIkNUqjWgQz8wHgDcAjwI+BLwFnAO/vYbYkSZJ6olEtgpIkSerXqBZBSZIk9TMQlCRJaigDQUmSpIZq2oLSEyIi1gGOAzYHHgNOBw7MzEd7mrEJFBG7ACd1OHR8Zn5kgrMzYSJibWA/4NXADOCGzJzRId0s4HPA+sAdwLGZedxE5nU8DaccIuJk4D87nL5dZv5o3DM5ASJiO+A9wExgRcqCrl8HTszMBbV0U70+DFkOTagPABHxLuDjlOfbL035vH8CHJ6ZD9bSTfU6MWQ5NKVO1EXE0pR1jVcHNs7Mq2rHdgY+BbyI8n/osMw8Y7TvaSA4xqolai4CbgW2pX+JmpWBJi5R8xbgwdrrO3uVkQnyUmBr4HJKi/vTWt0jYhPgbOC7wL7AZsCxEfF4Zp4wgXkdT0OWQ+VmSoBQd+M45mui7Uv5XbA/cBfweuArwJrVvqbUhyHLoTLV6wOUQPhSyvfCfcDLgUOr7ZuhMXViyHKoNKFO1B1Kh9gsIrYFTgGOAn4JvAP4QUQ8lJk/H80bOmt4jEXEgcDBwBqZeU+1byfgVGBGZl7by/xNlFqL4MqtcmiCiHhGWwvHRh1awn4OrJiZr6rt+wbwNmD1ekvRomqY5dBx/1QSEStn5t1t+74I/BewfGbOa0h9GE45nMwUrw8DiYgPAidSPu+5TagTnXQoh5NpUJ2IiBnAHygtpSdSaxGMiOuBazJz+1r6X1L+/7xyNO/rGMGxtxVlgep68HMWMA+Y1ZssaaIM9Qu6eqThGyjrV9adBqwKvGKcsjahpuoX1Ui1Bz+VPwFLAis2qD4MWg4TnJ3JqPV9sURT6sQA/l0OPc1F7xwPfJW2Fs+IeDGlC/30tvSnARtHxMqjeVO7hsfeesB36juqv3bnUD7IppldVdLbgJOBz2XmE73NUk+tRfkld13b/lZL8brAVTTHWhHxAPBsylN/jhqLMS+T3GsoXWH/AILm1od6ObQ0pj5ExGLAMynDKA4Gzs7MWyJifRpUJwYqh1qSRtSJiHgfsDZlSM1GbYfXq7YD1YkAOv2xNSy2CI69FYAHOuy/n2b95ft34BBgF8o4wZ8ABwHf6mGeJoMVqu0Dbfvvr7ZNqiN/okwoeQdlPO3twOnVsIIpKSI2AnYFvpSZT9LQ+tChHKB59eFeymTCqyi/L3eq9jetTgxUDtCQOhERywFHAwdk5iMdkoxrnbBFUOMiM38B/KK261cR8SBwaEQcnplzepQ1TRKZ+eW2Xf8TERcCn6W0Hk8pEbEqZZjIFcDne5ydnhmoHJpWH4DXAUtRZtV/BjgnIrboaY5643V0KIfMfLJBdeII4KbMPLUXb26L4Ni7H1i+w/4VKN0gTfbDajuVx7gMpfUX3PJt+1t/8TW9jpwJvHC0Y14mm+ov/p8DjwLbZObj1aFG1YdBymEgU7I+AGTmnzPzssz8BvBOykzqd9KwOjFIOQxkStWJiHgpsAdwUEQsX608snR1eOmIWIZxrhMGgmPvevr784F/TxBYi7I2kJptDjCftjpCWSsMrCNTTkQsSVkKZBXgLZl5b+1wY+rDEOXQdH8GFlDGiDWmTnTwZ/rLoSnWofTOXkQJ+O4HzqmOXQT8hhJXwMB1IkeTAQPBsXce8MaIWKm2753A9OpYk70bWAj09TojvZKZ84ALge3bDu1IWWPxjxOeqUkiIqZRyuXWAWaZLnIiYnFKS/jLgVmZeWv9eFPqw1DlMMA5U64+DGITyvfxzU2pEwP4dzl0OjhF68RvKa2g9Z99qmN7ALtl5t8ofwDs0HbujsCVoy0LxwiOvROBvShjGQ6nf0HpMzKzfcbPlBURv6D8MptN+QtvFvBh4NuZ2fE/+VQQEUtRlhACWANYtloIFMp/2FuBw4BLI+KblPUlNwN2B/acKsuuDFUO1fYU4AfAXyldHrtRxgu9b8IyOv6Op6z9dgCwVES8unbsusx8iAbUB4YoB0oXVxPqQ+t34wWUGZ//AjakLKp9NfDTKtmUrxNDlUNErEED6kS11NzF9X0R0fpnX+3JIgcDZ1QrkPwKeDtl4e2tR5sHA8ExlpkPRMQbKKvm/5j+R8wd0NOMTbzrgfcDz6fUs5uAA4Fje5inibAKZQxLXev1rsDJmfn7iHg7cCSwMzAX2GcKPTEAhi6HsylPnPlMlfZxSkvHNpl5DlPHltX2Cx2OvR64uCH1YahyuJpm1Acok2TeC7y4en0LcALwxcycD9CQOjFoOUTEwzSnTgwpM8+s/sD+FGUm9Rxgp9E+VQR8sogkSVJjOUZQkiSpoQwEJUmSGspAUJIkqaEMBCVJkhrKQFCSJKmhDAQlSZIaykBQkiSpoQwEJUmSGur/A0w6fGYym9kTAAAAAElFTkSuQmCC\n",
-      "text/plain": [
-       "<Figure size 720x360 with 1 Axes>"
-      ]
-     },
-     "metadata": {
-      "needs_background": "light"
-     },
-     "output_type": "display_data"
-    }
-   ],
-   "source": [
-    "df1 = df[df[\"name\"].isin([\"QuerySamplesComplete\"])]\n",
-    "df1['delta'] = df1['ts'].diff()\n",
-    "ax = df1['delta'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
-    "ax.set_title('Time between QuerySamplesComplete (usec)');\n",
-    "plt.show()\n",
-    "\n",
-    "ax = df1['dur'].plot.hist(bins=BINS, alpha=0.5, figsize=figsize)\n",
-    "ax.set_title('Time QuerySamplesComplete (usec)');"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.3"
-  },
-  "varInspector": {
-   "cols": {
-    "lenName": 16,
-    "lenType": 16,
-    "lenVar": 40
-   },
-   "kernels_config": {
-    "python": {
-     "delete_cmd_postfix": "",
-     "delete_cmd_prefix": "del ",
-     "library": "var_list.py",
-     "varRefreshCmd": "print(var_dic_list())"
-    },
-    "r": {
-     "delete_cmd_postfix": ") ",
-     "delete_cmd_prefix": "rm(",
-     "library": "var_list.r",
-     "varRefreshCmd": "cat(var_dic_list()) "
-    }
-   },
-   "types_to_exclude": [
-    "module",
-    "function",
-    "builtin_function_or_method",
-    "instance",
-    "_Feature"
-   ],
-   "window_display": false
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc
deleted file mode 100644
index de74eb820..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "utils.h"
-
-#include <chrono>
-#include <ctime>
-#include <fstream>
-#include <iomanip>
-#include <sstream>
-
-#include "logging.h"
-
-namespace mlperf {
-
-std::string DoubleToString(double value, int precision) {
-  std::stringstream ss;
-  ss.precision(precision);
-  ss << std::fixed << value;
-  return ss.str();
-}
-
-bool FileExists(const std::string filename) {
-  std::ifstream file_object(filename);
-  return file_object.good();
-}
-
-namespace {
-
-std::string DateTimeString(const char* format,
-                           std::chrono::system_clock::time_point tp,
-                           bool append_ms, bool utc) {
-  std::time_t tp_time_t = std::chrono::system_clock::to_time_t(tp);
-  std::tm date_time =
-      utc ? *std::gmtime(&tp_time_t) : *std::localtime(&tp_time_t);
-  constexpr size_t kDateTimeMaxSize = 256;
-  char date_time_cstring[kDateTimeMaxSize];
-  std::strftime(date_time_cstring, kDateTimeMaxSize, format, &date_time);
-  std::string date_time_string(date_time_cstring);
-  if (!append_ms) {
-    return date_time_string;
-  }
-
-  auto tp_time_t_part = std::chrono::system_clock::from_time_t(tp_time_t);
-  auto tp_remainder = tp - tp_time_t_part;
-  auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(tp_remainder)
-                .count();
-  if (ms < 0 || ms >= 1000) {
-    LogDetail([ms](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-      std::stringstream ss;
-      ss << "WARNING: Unexpected milliseconds getting date and time."
-         << " ms: " << ms;
-      MLPERF_LOG_WARNING(detail, "warning_generic_message", ss.str());
-#else
-      detail("WARNING: Unexpected milliseconds getting date and time.", "ms",
-             ms);
-#endif
-    });
-  }
-  std::string ms_string = std::to_string(ms);
-  // Prefix with zeros so length is always 3.
-  ms_string.insert(0, std::min<size_t>(2, 3 - ms_string.length()), '0');
-  return date_time_string + "." + ms_string;
-}
-
-}  // namespace
-
-std::string CurrentDateTimeISO8601() {
-  return DateTimeString("%FT%TZ", std::chrono::system_clock::now(), false,
-                        false);
-}
-
-std::string DateTimeStringForPower(std::chrono::system_clock::time_point tp) {
-  return DateTimeString("%m-%d-%Y %T", tp, true, true);
-}
-
-std::string EscapeStringJson(const std::string& in) {
-  std::stringstream ss;
-  for (auto c = in.cbegin(); c != in.cend(); c++) {
-    int c_val = static_cast<int>(*c);
-    switch (*c) {
-      case '"':
-        ss << "\\\"";
-        break;
-      case '\\':
-        ss << "\\\\";
-        break;
-      case '\b':
-        ss << "\\b";
-        break;
-      case '\f':
-        ss << "\\f";
-        break;
-      case '\n':
-        ss << "\\n";
-        break;
-      case '\r':
-        ss << "\\r";
-        break;
-      case '\t':
-        ss << "\\t";
-        break;
-      default:
-        if (c_val >= 0x00 && c_val < 0x20) {
-          ss << "\\u" << std::hex << std::setw(4) << std::setfill('0') << c_val;
-        } else {
-          ss << *c;
-        }
-    }
-  }
-  return ss.str();
-}
-
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h
deleted file mode 100644
index c587e0cbe..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/utils.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Various shared utility functions.
-
-#ifndef MLPERF_LOADGEN_UTILS_H
-#define MLPERF_LOADGEN_UTILS_H
-
-#include <algorithm>
-#include <chrono>
-#include <string>
-
-#include "query_sample.h"
-
-namespace mlperf {
-
-template <typename T>
-void RemoveValue(T* container, const typename T::value_type& value_to_remove) {
-  container->erase(std::remove_if(container->begin(), container->end(),
-                                  [&](typename T::value_type v) {
-                                    return v == value_to_remove;
-                                  }),
-                   container->end());
-}
-
-template <typename CountT, typename RatioT>
-double DurationToSeconds(
-    const std::chrono::duration<CountT, RatioT>& chrono_duration) {
-  return std::chrono::duration_cast<std::chrono::duration<double>>(
-             chrono_duration)
-      .count();
-}
-
-inline double QuerySampleLatencyToSeconds(QuerySampleLatency qsl) {
-  return static_cast<double>(qsl) / std::nano::den;
-}
-
-template <typename DurationT>
-inline DurationT SecondsToDuration(double seconds) {
-  return std::chrono::duration_cast<DurationT>(
-      std::chrono::duration<double>(seconds));
-}
-
-std::string CurrentDateTimeISO8601();
-
-/// \brief Uses a format that matches the one used by SPEC power
-/// measurement logging.
-std::string DateTimeStringForPower(std::chrono::system_clock::time_point tp);
-
-std::string DoubleToString(double value, int precision = 2);
-
-bool FileExists(const std::string filename);
-
-// \brief Escape special characters in a string for JSON.
-// Don't use this in performance critical path.
-std::string EscapeStringJson(const std::string& in);
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_UTILS_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc
deleted file mode 100644
index 3216c9d72..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.cc
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Non-generated version logic.
-
-#include "version.h"
-
-#include "logging.h"
-#include "utils.h"
-
-namespace mlperf {
-
-/// Helper function to split a string based on a delimiting character.
-std::vector<std::string> splitString(const std::string& input,
-                                     const std::string& delimiter) {
-  std::vector<std::string> result;
-  size_t start = 0;
-  size_t next = 0;
-  while (next != std::string::npos) {
-    next = input.find(delimiter, start);
-    result.emplace_back(input, start, next - start);
-    start = next + 1;
-  }
-  return result;
-}
-
-/// Converts the hash-filename pairs to a dict.
-std::map<std::string, std::string> LoadgenSha1OfFilesToDict(
-    const std::string& in) {
-  std::map<std::string, std::string> result;
-  auto files = splitString(in, "\n");
-  for (const auto& file : files) {
-    auto hash_and_name = splitString(file, " ");
-    assert(hash_and_name.size() > 1);
-    result[hash_and_name[1]] = hash_and_name[0];
-  }
-  return result;
-}
-
-void LogLoadgenVersion() {
-  LogDetail([](AsyncDetail& detail) {
-#if USE_NEW_LOGGING_FORMAT
-    MLPERF_LOG(detail, "loadgen_version",
-               LoadgenVersion() + " @ " + LoadgenGitRevision());
-    MLPERF_LOG(detail, "loadgen_build_date_local", LoadgenBuildDateLocal());
-    MLPERF_LOG(detail, "loadgen_build_date_utc", LoadgenBuildDateUtc());
-    MLPERF_LOG(detail, "loadgen_git_commit_date", LoadgenGitCommitDate());
-    MLPERF_LOG(detail, "loadgen_git_log_message",
-               EscapeStringJson(LoadgenGitLog()));
-    MLPERF_LOG(detail, "loadgen_git_status_message",
-               EscapeStringJson(LoadgenGitStatus()));
-    if (!LoadgenGitStatus().empty() && LoadgenGitStatus() != "NA") {
-      MLPERF_LOG_ERROR(detail, "error_uncommitted_loadgen_changes",
-                       "Loadgen built with uncommitted changes!");
-      ;
-    }
-    MLPERF_LOG(detail, "loadgen_file_sha1",
-               LoadgenSha1OfFilesToDict(LoadgenSha1OfFiles()));
-#else
-    detail("LoadgenVersionInfo:");
-    detail("version : " + LoadgenVersion() + " @ " + LoadgenGitRevision());
-    detail("build_date_local : " + LoadgenBuildDateLocal());
-    detail("build_date_utc   : " + LoadgenBuildDateUtc());
-    detail("git_commit_date  : " + LoadgenGitCommitDate());
-    detail("git_log :\n\n" + LoadgenGitLog() + "\n");
-    detail("git_status :\n\n" + LoadgenGitStatus() + "\n");
-    if (!LoadgenGitStatus().empty() && LoadgenGitStatus() != "NA") {
-      detail.Error("Loadgen built with uncommitted changes!");
-    }
-    detail("SHA1 of files :\n\n" + LoadgenSha1OfFiles() + "\n");
-#endif
-  });
-}
-
-}  // namespace mlperf
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h
deleted file mode 100644
index 87c3409aa..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/* Copyright 2019 The MLPerf Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-/// \file
-/// \brief Declares the version-related strings that will be defined in
-/// a version_generated.cc as created by version_generator.py.
-
-#ifndef MLPERF_LOADGEN_VERSION_H
-#define MLPERF_LOADGEN_VERSION_H
-
-#include <string>
-
-namespace mlperf {
-
-// Non-generated.
-void LogLoadgenVersion();
-
-// Definitions generated at compile time.
-const std::string& LoadgenVersion();
-const std::string& LoadgenGitRevision();
-const std::string& LoadgenBuildDateLocal();
-const std::string& LoadgenBuildDateUtc();
-const std::string& LoadgenGitCommitDate();
-const std::string& LoadgenGitStatus();
-const std::string& LoadgenGitLog();
-const std::string& LoadgenSha1OfFiles();
-
-}  // namespace mlperf
-
-#endif  // MLPERF_LOADGEN_VERSION_H
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py
deleted file mode 100644
index 2e7524330..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/thirdparty/loadgen/version_generator.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright 2019 The MLPerf Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# =============================================================================
-
-# \file
-#  \brief A script run by the build to generate the version definitions
-#  expected at link time.
-
-import datetime
-import errno
-import hashlib
-import os
-import sys
-import subprocess
-
-
-# Creates a C++ raw string literal using a delimiter that is very
-# unlikely to show up in a git stats.
-def make_raw_string(str):
-    delimeter = "LGVG_RSLD"
-    return 'R"' + delimeter + "(" + str + ")" + delimeter + '"'
-
-
-def func_def(name, string):
-    return (
-        "const std::string& Loadgen"
-        + name
-        + "() {\n"
-        + "  static const std::string str = "
-        + string
-        + ";\n"
-        + "  return str;\n"
-        + "}\n\n"
-    )
-
-
-# For clients that build the loadgen from the git respository without
-# any modifications.
-def generate_loadgen_version_definitions_git(ofile, git_command):
-    git_rev = os.popen(git_command + "rev-parse --short=10 HEAD").read()
-    git_commit_date = os.popen(
-        git_command +
-        "log --format=\"%cI\" -n 1").read()
-    git_status = os.popen(git_command + "status -s -uno .").read()
-    git_log = subprocess.Popen(
-        git_command + "log --pretty=oneline -n 16 --no-decorate", stdout=subprocess.PIPE, shell=True, encoding='ascii', errors="ignore").stdout.read()
-    ofile.write(func_def("GitRevision", "\"" + git_rev[0:-1] + "\""))
-    ofile.write(func_def("GitCommitDate", "\"" + git_commit_date[0:-1] + "\""))
-    ofile.write(func_def("GitStatus", make_raw_string(git_status[0:-1])))
-    ofile.write(func_def("GitLog", make_raw_string(git_log[0:-1])))
-
-
-# For clients that might not import the loadgen code as the original git
-# repository.
-def generate_loadgen_verstion_definitions_git_stubs(ofile):
-    na = '"NA"'
-    ofile.write(func_def("GitRevision", na))
-    ofile.write(func_def("GitCommitDate", na))
-    ofile.write(func_def("GitStatus", na))
-    ofile.write(func_def("GitLog", na))
-
-
-# Always log the sha1 of the loadgen files, regardless of whether we are
-# in the original git repository or not.
-def generate_loadgen_version_definitions_sha1(ofile, loadgen_root):
-    """Writes definition for Sha1OfFiles."""
-    sha1s = ""
-    loadgen_files = [
-        "/bindings/" + s for s in os.listdir(loadgen_root + "/bindings")
-    ] + ["/" + s for s in os.listdir(loadgen_root)]
-    for fn in sorted(loadgen_files):
-        full_fn = loadgen_root + fn
-        if not os.path.isfile(full_fn):
-            continue
-        file_data = open(full_fn, "rb").read()
-        sha1s += hashlib.sha1(file_data).hexdigest() + " " + fn + "\n"
-
-    ofile.write(func_def("Sha1OfFiles", make_raw_string(sha1s[0:-1])))
-
-
-# Outputs version function definitions to cc_filename.
-# Includes SHA1's of the relevant dirs in the loadgen_root directory.
-def generate_loadgen_version_definitions(cc_filename, loadgen_root):
-    """Generates the C++ source file with the loadgen version info."""
-    try:
-        os.makedirs(os.path.dirname(cc_filename))
-    except OSError as exc:
-        if exc.errno != errno.EEXIST:
-            raise
-    ofile = open(cc_filename, "w")
-    ofile.write("// DO NOT EDIT: Autogenerated by version_generator.py.\n\n")
-    ofile.write("#include <string>\n\n")
-    ofile.write("namespace mlperf {\n\n")
-    # Open and read the VERSION.txt file
-    with open(os.path.join(loadgen_root, "VERSION.txt"), "r") as version_file:
-        # Read and strip any extra whitespace/newlines
-        version_contents = version_file.read().strip()
-
-    # Write the version into the function definition
-    ofile.write(func_def("Version", f"\"{version_contents}\""))
-
-    date_time_now_local = datetime.datetime.now().isoformat()
-    date_time_now_utc = datetime.datetime.utcnow().isoformat()
-    ofile.write(func_def("BuildDateLocal", '"' + date_time_now_local + '"'))
-    ofile.write(func_def("BuildDateUtc", '"' + date_time_now_utc + '"'))
-
-    git_dir = '--git-dir="' + loadgen_root + '/../.git" '
-    git_work_tree = '--work-tree="' + loadgen_root + '/.." '
-    git_command = "git " + git_dir + git_work_tree
-    git_status = os.popen(git_command + "status")
-    git_status.read()
-    is_git_repo = git_status.close() is None
-    if is_git_repo:
-        generate_loadgen_version_definitions_git(ofile, git_command)
-    else:
-        generate_loadgen_verstion_definitions_git_stubs(ofile)
-    generate_loadgen_version_definitions_sha1(ofile, loadgen_root)
-
-    ofile.write("}  // namespace mlperf\n")
-    ofile.close()
-
-
-def main():
-    if len(sys.argv) != 3:
-        raise ValueError("Incorrect command-line arguments.")
-    generate_loadgen_version_definitions(sys.argv[1], sys.argv[2])
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py b/recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py
deleted file mode 100644
index cb558726b..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/ts_types.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-
-"""
-TorchScript-friendly boundary types for the HSTU sparse <-> dense interface.
-
-The eager path uses ``Dict[str, SequenceEmbedding]`` (a NamedTuple of
-``lengths`` and ``embedding`` tensors). TorchScript supports ``NamedTuple`` but
-does not script cleanly through ``Dict[str, NamedTuple]`` once the dict crosses
-device boundaries. The packaged sparse / dense modules instead exchange two
-parallel ``Dict[str, Tensor]`` dicts -- one of jagged values, one of lengths.
-
-These helpers convert between the two representations so we can keep the
-existing eager code unchanged while the scripted modules use only TS-friendly
-types at their boundaries.
-"""
-
-from typing import Dict, Tuple
-
-import torch
-from generative_recommenders.modules.dlrm_hstu import SequenceEmbedding
-
-
-# Per-feature jagged values (concatenated across batch, [L_total, table_dim]).
-SeqEmbValues = Dict[str, torch.Tensor]
-# Per-feature per-batch lengths ([B]).
-SeqEmbLengths = Dict[str, torch.Tensor]
-
-
-def flatten_seq_embeddings(
-    seq_embeddings: Dict[str, SequenceEmbedding],
-) -> Tuple[SeqEmbValues, SeqEmbLengths]:
-    """Split ``Dict[str, SequenceEmbedding]`` into parallel value/length dicts.
-
-    Lossless and zero-copy -- the returned tensors alias the inputs.
-    """
-    values: Dict[str, torch.Tensor] = {}
-    lengths: Dict[str, torch.Tensor] = {}
-    for k, v in seq_embeddings.items():
-        values[k] = v.embedding
-        lengths[k] = v.lengths
-    return values, lengths
-
-
-def unflatten_seq_embeddings(
-    values: SeqEmbValues,
-    lengths: SeqEmbLengths,
-) -> Dict[str, SequenceEmbedding]:
-    """Inverse of :func:`flatten_seq_embeddings`.
-
-    Reconstructs ``Dict[str, SequenceEmbedding]`` for code paths (e.g.
-    ``DlrmHSTU.main_forward``) that still consume the NamedTuple form.
-    """
-    out: Dict[str, SequenceEmbedding] = {}
-    for k, val in values.items():
-        out[k] = SequenceEmbedding(lengths=lengths[k], embedding=val)
-    return out
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf b/recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf
deleted file mode 100644
index c6ca854f9..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/inference/user.conf
+++ /dev/null
@@ -1,5 +0,0 @@
-# Please set these fields depending on the performance of your system to
-# override default LoadGen settings.
-*.Server.target_latency = 80
-# *.Server.min_duration = 20000
-# *.Offline.min_duration = 20000
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin
deleted file mode 100644
index 46d8e1272..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/kuairand_1k.gin
+++ /dev/null
@@ -1,41 +0,0 @@
-batch_size = 16
-dataset = "kuairand-1k"
-
-# model parameters
-make_model.dataset = %dataset
-
-# dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
-dense_optimizer_factory_and_class.optimizer_name = "Adam"
-dense_optimizer_factory_and_class.momentum = 0
-dense_optimizer_factory_and_class.weight_decay = 0
-dense_optimizer_factory_and_class.eps = 1e-8
-dense_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
-sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
-sparse_optimizer_factory_and_class.momentum = 0
-sparse_optimizer_factory_and_class.weight_decay = 0
-sparse_optimizer_factory_and_class.eps = 1e-8
-sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# dataloader configs
-make_train_test_dataloaders.batch_size = %batch_size
-make_train_test_dataloaders.dataset_type = %dataset
-make_train_test_dataloaders.train_split_percentage = 0.75
-make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
-make_train_test_dataloaders.num_workers = 2
-make_train_test_dataloaders.prefetch_factor = 4
-
-# train loop variables
-train_loop.num_epochs = 5
-train_loop.output_trace = True
-train_loop.metric_log_frequency = 10
-
-# logger variables
-MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
-
-# checkpoint
-# save_dmp_checkpoint.path = "/home/linjianma/ckpts/kuairand_1k"
-# load_dmp_checkpoint.path = "/home/linjianma/ckpts/kuairand_1k/2025_01_12_17_56_43/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin
deleted file mode 100644
index e2f371de4..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_13b.gin
+++ /dev/null
@@ -1,41 +0,0 @@
-batch_size = 128
-dataset = "movielens-13b"
-
-# model parameters
-make_model.dataset = %dataset
-
-# dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
-dense_optimizer_factory_and_class.optimizer_name = "Adam"
-dense_optimizer_factory_and_class.momentum = 0
-dense_optimizer_factory_and_class.weight_decay = 0
-dense_optimizer_factory_and_class.eps = 1e-8
-dense_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
-sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
-sparse_optimizer_factory_and_class.momentum = 0
-sparse_optimizer_factory_and_class.weight_decay = 0
-sparse_optimizer_factory_and_class.eps = 1e-8
-sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# dataloader configs
-make_train_test_dataloaders.batch_size = %batch_size
-make_train_test_dataloaders.dataset_type = %dataset
-make_train_test_dataloaders.train_split_percentage = 0.75
-make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
-make_train_test_dataloaders.num_workers = 2
-make_train_test_dataloaders.prefetch_factor = 4
-
-# train loop variables
-train_loop.num_epochs = 1
-train_loop.output_trace = True
-train_loop.metric_log_frequency = 10
-train_eval_loop.num_epochs = 1
-train_eval_loop.output_trace = True
-train_eval_loop.metric_log_frequency = 10
-
-# logger variables
-MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
-save_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_13b"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin
deleted file mode 100644
index 094271b57..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_18b.gin
+++ /dev/null
@@ -1,56 +0,0 @@
-batch_size = 64
-dataset = "movielens-18b"
-
-# model parameters
-make_model.dataset = %dataset
-
-# dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
-dense_optimizer_factory_and_class.optimizer_name = "Adam"
-dense_optimizer_factory_and_class.momentum = 0
-dense_optimizer_factory_and_class.weight_decay = 0
-dense_optimizer_factory_and_class.eps = 1e-8
-dense_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
-sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
-sparse_optimizer_factory_and_class.momentum = 0
-sparse_optimizer_factory_and_class.weight_decay = 0
-sparse_optimizer_factory_and_class.eps = 1e-8
-sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# dataloader configs
-make_train_test_dataloaders.batch_size = %batch_size
-make_train_test_dataloaders.dataset_type = %dataset
-make_train_test_dataloaders.train_split_percentage = 0.80
-make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
-make_train_test_dataloaders.num_workers = 2
-make_train_test_dataloaders.prefetch_factor = 4
-make_train_test_dataloaders.num_blocks = 20
-
-# train loop variables
-train_loop.num_epochs = 200
-train_loop.output_trace = False
-train_loop.metric_log_frequency = 40
-train_loop.checkpoint_frequency = 4000
-train_loop.start_batch_idx = 0
-
-# eval loop variables
-eval_loop.metric_log_frequency = 40
-
-# train eval loop variables
-train_eval_loop.num_epochs = 20
-train_eval_loop.output_trace = False
-train_eval_loop.start_train_batch_idx = 0
-train_eval_loop.start_eval_batch_idx = 0
-train_eval_loop.num_eval_batches = 200
-train_eval_loop.metric_log_frequency = 40
-train_eval_loop.checkpoint_frequency = 2000
-train_eval_loop.eval_frequency = 500
-
-
-# logger variables
-MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/movielens_18b/"
-save_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/"
-# load_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin
deleted file mode 100644
index 2b6cd6b64..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_1m.gin
+++ /dev/null
@@ -1,38 +0,0 @@
-batch_size = 128
-dataset = "movielens-1m"
-
-# model parameters
-make_model.dataset = %dataset
-
-# dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
-dense_optimizer_factory_and_class.optimizer_name = "Adam"
-dense_optimizer_factory_and_class.momentum = 0
-dense_optimizer_factory_and_class.weight_decay = 0
-dense_optimizer_factory_and_class.eps = 1e-8
-dense_optimizer_factory_and_class.betas = (0.9, 0.98)
-
-# sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
-sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
-sparse_optimizer_factory_and_class.momentum = 0
-sparse_optimizer_factory_and_class.weight_decay = 0
-sparse_optimizer_factory_and_class.eps = 1e-8
-sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# dataloader configs
-make_train_test_dataloaders.batch_size = %batch_size
-make_train_test_dataloaders.dataset_type = %dataset
-make_train_test_dataloaders.train_split_percentage = 0.75
-make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
-make_train_test_dataloaders.num_workers = 2
-make_train_test_dataloaders.prefetch_factor = 4
-
-# train-eval loop variables
-train_eval_loop.num_epochs = 101
-train_eval_loop.output_trace = True
-train_eval_loop.metric_log_frequency = 10
-train_eval_loop.eval_frequency = 1
-
-# logger variables
-MetricsLogger.tensorboard_log_path = "/tmp/tensorboard_log_path.log"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin
deleted file mode 100644
index c01fab5af..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/movielens_20m.gin
+++ /dev/null
@@ -1,56 +0,0 @@
-batch_size = 64
-dataset = "movielens-20m"
-
-# model parameters
-make_model.dataset = %dataset
-
-# dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
-dense_optimizer_factory_and_class.optimizer_name = "Adam"
-dense_optimizer_factory_and_class.momentum = 0
-dense_optimizer_factory_and_class.weight_decay = 0
-dense_optimizer_factory_and_class.eps = 1e-8
-dense_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
-sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
-sparse_optimizer_factory_and_class.momentum = 0
-sparse_optimizer_factory_and_class.weight_decay = 0
-sparse_optimizer_factory_and_class.eps = 1e-8
-sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# dataloader configs
-make_train_test_dataloaders.batch_size = %batch_size
-make_train_test_dataloaders.dataset_type = %dataset
-make_train_test_dataloaders.train_split_percentage = 0.80
-make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
-make_train_test_dataloaders.num_workers = 2
-make_train_test_dataloaders.prefetch_factor = 4
-make_train_test_dataloaders.num_blocks = 1
-
-# train loop variables
-train_loop.num_epochs = 200
-train_loop.output_trace = False
-train_loop.metric_log_frequency = 40
-train_loop.checkpoint_frequency = 4000
-train_loop.start_batch_idx = 0
-
-# eval loop variables
-eval_loop.metric_log_frequency = 10
-
-# train eval loop variables
-train_eval_loop.num_epochs = 20
-train_eval_loop.output_trace = False
-train_eval_loop.start_train_batch_idx = 0
-train_eval_loop.start_eval_batch_idx = 0
-train_eval_loop.num_eval_batches = 100
-train_eval_loop.metric_log_frequency = 40
-train_eval_loop.checkpoint_frequency = 2000
-train_eval_loop.eval_frequency = 200
-
-
-# logger variables
-MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/movielens_20m/"
-# save_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/0.5T"
-# load_dmp_checkpoint.path = "/home/linjianma/ckpts/movielens_18b/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin
deleted file mode 100644
index 7d1df4bce..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_100b.gin
+++ /dev/null
@@ -1,52 +0,0 @@
-batch_size = 64
-num_workers = 2
-prefetch_factor = 4
-dataset = "streaming-100b"
-
-# model parameters
-make_model.dataset = %dataset
-
-# dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
-dense_optimizer_factory_and_class.optimizer_name = "Adam"
-dense_optimizer_factory_and_class.momentum = 0
-dense_optimizer_factory_and_class.weight_decay = 0
-dense_optimizer_factory_and_class.eps = 1e-8
-dense_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
-sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
-sparse_optimizer_factory_and_class.momentum = 0
-sparse_optimizer_factory_and_class.weight_decay = 0
-sparse_optimizer_factory_and_class.eps = 1e-8
-sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# dataloader configs
-make_train_test_dataloaders.batch_size = %batch_size
-make_train_test_dataloaders.dataset_type = %dataset
-make_train_test_dataloaders.train_split_percentage = 0.80
-make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
-make_train_test_dataloaders.num_workers = %num_workers
-make_train_test_dataloaders.prefetch_factor = %prefetch_factor
-make_train_test_dataloaders.num_blocks = 20
-
-get_dataset.name = %dataset
-get_dataset.new_path_prefix = "/home/linjianma"
-
-make_streaming_dataloader.batch_size = %batch_size
-make_streaming_dataloader.num_workers = %num_workers
-make_streaming_dataloader.prefetch_factor = %prefetch_factor
-
-# train eval loop variables
-streaming_train_eval_loop.num_train_ts = 90
-streaming_train_eval_loop.output_trace = False
-streaming_train_eval_loop.num_eval_batches = 500
-streaming_train_eval_loop.metric_log_frequency = 40
-streaming_train_eval_loop.checkpoint_frequency = 3
-
-
-# logger variables
-MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/streaming_100b/run4/"
-save_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_100b/"
-# load_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_100b/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin
deleted file mode 100644
index 872019962..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_200b.gin
+++ /dev/null
@@ -1,63 +0,0 @@
-batch_size = 64
-num_workers = 2
-prefetch_factor = 4
-dataset = "streaming-200b"
-
-# model parameters
-make_model.dataset = %dataset
-
-# dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
-dense_optimizer_factory_and_class.optimizer_name = "Adam"
-dense_optimizer_factory_and_class.momentum = 0
-dense_optimizer_factory_and_class.weight_decay = 0
-dense_optimizer_factory_and_class.eps = 1e-8
-dense_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
-sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
-sparse_optimizer_factory_and_class.momentum = 0
-sparse_optimizer_factory_and_class.weight_decay = 0
-sparse_optimizer_factory_and_class.eps = 1e-8
-sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# dataloader configs
-make_train_test_dataloaders.batch_size = %batch_size
-make_train_test_dataloaders.dataset_type = %dataset
-make_train_test_dataloaders.train_split_percentage = 0.80
-make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
-make_train_test_dataloaders.num_workers = %num_workers
-make_train_test_dataloaders.prefetch_factor = %prefetch_factor
-make_train_test_dataloaders.num_blocks = 20
-
-get_dataset.name = %dataset
-get_dataset.new_path_prefix = "/home/linjianma"
-
-make_streaming_dataloader.batch_size = %batch_size
-make_streaming_dataloader.num_workers = %num_workers
-make_streaming_dataloader.prefetch_factor = %prefetch_factor
-
-# train loop variables
-train_loop.num_epochs = 200
-train_loop.output_trace = False
-train_loop.metric_log_frequency = 40
-train_loop.checkpoint_frequency = 4000
-train_loop.start_batch_idx = 0
-
-# eval loop variables
-eval_loop.metric_log_frequency = 40
-
-# train eval loop variables
-streaming_train_eval_loop.num_train_ts = 90
-streaming_train_eval_loop.output_trace = False
-streaming_train_eval_loop.num_train_batches = 5000
-streaming_train_eval_loop.num_eval_batches = 200
-streaming_train_eval_loop.metric_log_frequency = 40
-streaming_train_eval_loop.checkpoint_frequency = 2000
-
-
-# logger variables
-MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/streaming_200b/"
-# save_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_200b/"
-# load_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_400m/20000/"
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin
deleted file mode 100644
index eba17bc23..000000000
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/streaming_400m.gin
+++ /dev/null
@@ -1,61 +0,0 @@
-batch_size = 64
-num_workers = 2
-prefetch_factor = 4
-dataset = "streaming-400m"
-
-# model parameters
-make_model.dataset = %dataset
-
-# dense model optimizer
-dense_optimizer_factory_and_class.learning_rate = 0.001
-dense_optimizer_factory_and_class.optimizer_name = "Adam"
-dense_optimizer_factory_and_class.momentum = 0
-dense_optimizer_factory_and_class.weight_decay = 0
-dense_optimizer_factory_and_class.eps = 1e-8
-dense_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# sparse model optimizer
-sparse_optimizer_factory_and_class.learning_rate = 0.001
-sparse_optimizer_factory_and_class.optimizer_name = "RowWiseAdagrad"
-sparse_optimizer_factory_and_class.momentum = 0
-sparse_optimizer_factory_and_class.weight_decay = 0
-sparse_optimizer_factory_and_class.eps = 1e-8
-sparse_optimizer_factory_and_class.betas = (0.95, 0.999)
-
-# dataloader configs
-make_train_test_dataloaders.batch_size = %batch_size
-make_train_test_dataloaders.dataset_type = %dataset
-make_train_test_dataloaders.train_split_percentage = 0.80
-make_train_test_dataloaders.new_path_prefix = "/home/linjianma"
-make_train_test_dataloaders.num_workers = %num_workers
-make_train_test_dataloaders.prefetch_factor = %prefetch_factor
-make_train_test_dataloaders.num_blocks = 20
-
-get_dataset.name = %dataset
-get_dataset.new_path_prefix = "/home/linjianma"
-
-make_streaming_dataloader.batch_size = %batch_size
-make_streaming_dataloader.num_workers = %num_workers
-make_streaming_dataloader.prefetch_factor = %prefetch_factor
-
-# train loop variables
-train_loop.num_epochs = 200
-train_loop.output_trace = False
-train_loop.metric_log_frequency = 40
-train_loop.checkpoint_frequency = 4000
-train_loop.start_batch_idx = 0
-
-# eval loop variables
-eval_loop.metric_log_frequency = 40
-
-# train eval loop variables
-streaming_train_eval_loop.num_train_ts = 8
-streaming_train_eval_loop.output_trace = False
-streaming_train_eval_loop.metric_log_frequency = 40
-streaming_train_eval_loop.checkpoint_frequency = 2000
-
-
-# logger variables
-MetricsLogger.tensorboard_log_path = "/home/linjianma/tensorboard/streaming_400m/"
-# save_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_400m/"
-# load_dmp_checkpoint.path = "/home/linjianma/ckpts/streaming_400m/20000/"
diff --git a/recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py b/recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py
deleted file mode 100644
index cc7fbede7..000000000
--- a/recommendation_v4/generative_recommenders/ops/benchmarks/hstu_attention_bench.py
+++ /dev/null
@@ -1,406 +0,0 @@
-# pyre-strict
-import os
-from typing import List, Optional, Tuple
-
-import click
-import pandas as pd
-import torch
-
-# @manual=//triton:triton
-import triton
-from generative_recommenders.common import (
-    apply_sampling,
-    blackwell_tlx_unavailable,
-    generate_sparse_seq_len,
-    HammerKernel,
-)
-from generative_recommenders.ops.cpp.cuda_hstu_attention import cuda_hstu_mha
-from generative_recommenders.ops.hstu_attention import delta_hstu_mha, hstu_mha
-
-try:
-    from hammer.ops.ragged_hstu_attention import ragged_hstu_mha
-    from hammer.utils import HammerKernel as HammerKernel2
-except ImportError:
-    pass
-
-
-def _get_kernel(provider: str) -> HammerKernel:
-    if provider == "triton":
-        return HammerKernel.TRITON
-    elif provider == "tlx":
-        return HammerKernel.TLX
-    elif provider == "pytorch":
-        return HammerKernel.PYTORCH
-    else:
-        raise ValueError(f"Unknown provider {provider}")
-
-
-def _flops(
-    batch_size: int,
-    max_seqlen: int,
-    attn_dim: int,
-    hidden_dim: int,
-    nheads: int,
-    seq_offsets: torch.Tensor,
-    mode: str = "fwd",
-) -> float:
-    assert mode in ["fwd", "bwd", "fwd_bwd"]
-    ratio = 2.0  # triangular masking
-    f1 = 0.0
-    f2 = 0.0
-    for i in range(batch_size):
-        seq_len = int((seq_offsets[i + 1] - seq_offsets[i]).item())
-        # (QK^T), dQ = d(QK^T)K, dK^T = Q^Td(QK^T)
-        f1 += 2 * nheads * attn_dim * seq_len**2 // ratio
-        # (QK^T)V, d(QK^T) = dOV^T, dV = (QK^T)^TdO,
-        f2 += 2 * nheads * hidden_dim * seq_len**2 // ratio
-    if mode == "fwd":
-        return f1 + f2  # computes (QK^T) and (QK^T)V
-    elif mode == "bwd":
-        return 3 * f1 + 2 * f2  # computes (QK^T), dQ, dK, dV, d(QK^T)
-    else:
-        return 4 * f1 + 3 * f2
-
-
-@click.command()
-@click.option(
-    "--batch-size",
-    type=int,
-    default=512,
-)
-@click.option("--heads", type=int, default=4)
-@click.option("--attn-dim", type=int, default=128)
-@click.option("--hidden-dim", type=int, default=128)
-@click.option("--max-seq-len-log2", type=int, default=13)
-@click.option("--data-type", type=str, default="bf16")
-@click.option("--seq-sparsity", type=float, default=0.95)
-@click.option("--has-delta-q", type=bool, default=False)
-@click.option("--delta-size", type=int, default=256)
-@click.option("--target-size", type=int, default=20)
-@click.option("--bench-backward", type=bool, default=True)
-@click.option("--bench-forward", type=bool, default=True)
-@click.option("--bench-tlx", type=bool, default=False)
-@click.option("--bench-pytorch", type=bool, default=False)
-@click.option("--bench-ragged", type=bool, default=True)
-@click.option("--report-flops", type=bool, default=False)
-@click.option("--return-result", type=bool, default=False)
-@click.option("--max-attn-len", type=int, default=0)
-@click.option("--min-full-attn-seq-len", type=int, default=0)
-@click.option("--contextual-seq-len", type=int, default=0)
-@click.option("--sampling-alpha", type=float, default=2.0)
-@click.option("--triton-enable-tma", type=bool, default=False)
-@click.option("--dynamic-attn-scale", type=bool, default=False)
-@click.option("--num-softmax-heads", type=int, default=0)
-def main(  # noqa: C901
-    batch_size: int,
-    heads: int,
-    attn_dim: int,
-    hidden_dim: int,
-    max_seq_len_log2: int,
-    data_type: str,
-    seq_sparsity: float,
-    has_delta_q: bool,
-    delta_size: int,
-    target_size: int,
-    bench_backward: bool,
-    bench_forward: bool,
-    bench_tlx: bool,
-    bench_pytorch: bool,
-    bench_ragged: bool,
-    report_flops: bool,
-    return_result: bool,
-    max_attn_len: int,
-    min_full_attn_seq_len: int,
-    contextual_seq_len: int,
-    sampling_alpha: float,
-    triton_enable_tma: bool,
-    dynamic_attn_scale: bool,
-    num_softmax_heads: int,
-) -> Optional[Tuple[List[triton.testing.Benchmark], List[pd.DataFrame]]]:
-    torch.backends.cudnn.allow_tf32 = True
-    torch.backends.cuda.matmul.allow_tf32 = True
-    if data_type == "fp32":
-        dtype = torch.float32
-    elif data_type == "fp16":
-        dtype = torch.float16
-    elif data_type == "bf16":
-        dtype = torch.bfloat16
-    else:
-        raise ValueError(f"Unsupported data type: {data_type}.")
-
-    line_vals = ["triton", "flash_cuda_jagged"]
-    line_names = ["triton", "flash_cuda_jagged"]
-    styles = [("blue", "-"), ("green", "-")]
-    if bench_pytorch:
-        line_vals.append("pytorch")
-        line_names.append("PyTorch")
-        styles.append(("green", "-"))
-    if bench_ragged:
-        line_vals.append("ragged")
-        line_names.append("ragged")
-        styles.append(("red", "-"))
-    if bench_tlx and not blackwell_tlx_unavailable[0]:
-        line_vals.append("tlx")
-        line_names.append("tlx")
-        styles.append(("cyan", "-"))
-
-    bench_backward = False if has_delta_q else bench_backward
-    modes = []
-    if bench_forward:
-        modes.append("fwd")
-    if bench_backward:
-        modes.append("bwd")
-    assert len(modes) > 0
-
-    configs: List[triton.testing.Benchmark] = [
-        triton.testing.Benchmark(
-            x_names=["seq_len"],
-            x_vals=[2**i for i in range(8, max_seq_len_log2)],
-            line_arg="provider",
-            line_vals=line_vals,
-            line_names=line_names,
-            styles=styles,
-            ylabel="ms",
-            plot_name=f"hstu-attn-b{batch_size}-h{heads}-d{attn_dim}-v{hidden_dim}--sparsity{seq_sparsity}-{mode}-{dtype}-target{target_size}-mattn{max_attn_len}-full{min_full_attn_seq_len}-c{contextual_seq_len}-sl_alpha{sampling_alpha}-triton_tma{triton_enable_tma}-dynamic_scale{dynamic_attn_scale}-num_softmax_heads{num_softmax_heads}",
-            args={
-                "batch_size": batch_size,
-                "heads": heads,
-                "attn_dim": attn_dim,
-                "hidden_dim": hidden_dim,
-                "dtype": dtype,
-                "mode": mode,
-                "seq_sparsity": seq_sparsity,
-                "has_delta_q": has_delta_q,
-                "delta_size": delta_size,
-                "target_size": target_size,
-                "bench_backward": bench_backward,
-                "report_flops": report_flops,
-                "max_attn_len": max_attn_len,
-                "min_full_attn_seq_len": min_full_attn_seq_len,
-                "contextual_seq_len": contextual_seq_len,
-                "sampling_alpha": sampling_alpha,
-                "triton_enable_tma": triton_enable_tma,
-                "dynamic_attn_scale": dynamic_attn_scale,
-                "num_softmax_heads": num_softmax_heads,
-            },
-        )
-        for mode in modes
-    ]
-
-    @triton.testing.perf_report(configs)
-    def _bench_hstu_attention(
-        batch_size: int,
-        heads: int,
-        seq_len: int,
-        attn_dim: int,
-        hidden_dim: int,
-        mode: str,
-        provider: str,
-        dtype: torch.dtype,
-        seq_sparsity: float,
-        has_delta_q: bool,
-        delta_size: int,
-        target_size: int,
-        bench_backward: bool,
-        report_flops: bool,
-        max_attn_len: int,
-        min_full_attn_seq_len: int,
-        contextual_seq_len: int,
-        sampling_alpha: float,
-        triton_enable_tma: bool,
-        dynamic_attn_scale: bool,
-        num_softmax_heads: int,
-    ) -> float:
-        assert mode in ["fwd", "bwd"]
-        warmup = 25
-        rep = 1000
-        torch.manual_seed(1001)  # for reproducibility
-        alpha = 1.0 / attn_dim
-        causal = True
-        lengths = generate_sparse_seq_len(
-            size=batch_size,
-            max_seq_len=seq_len,
-            sparsity=seq_sparsity,
-            device=torch.device("cuda"),
-        )
-        lengths = apply_sampling(lengths, sampling_alpha, max_seq_len=seq_len)
-        if has_delta_q:
-            lengths = lengths + delta_size
-            num_targets = torch.ones_like(lengths) * delta_size
-            seq_len = seq_len + delta_size
-        else:
-            delta_size = 0
-            num_targets = None
-            if target_size != 0:
-                num_targets = torch.randint(
-                    1,
-                    target_size + 1,
-                    (batch_size,),
-                    device=lengths.device,
-                    dtype=lengths.dtype,
-                )
-                num_targets = torch.where(
-                    num_targets > lengths, lengths, num_targets
-                ).to(torch.int32)
-        max_attn_len = max_attn_len if max_attn_len < seq_len else seq_len
-        seq_offsets = torch.zeros(
-            (batch_size + 1,), dtype=torch.int64, device=torch.device("cuda")
-        )
-        seq_offsets[1:] = torch.cumsum(lengths, dim=0)
-        L = int(seq_offsets[-1].item())
-        x = torch.empty(
-            (L, heads, attn_dim * 2 + hidden_dim),
-            dtype=dtype,
-            device=torch.device("cuda"),
-        ).uniform_(-0.01, 0.01)
-        q, k, v = torch.split(x, [attn_dim, attn_dim, hidden_dim], dim=-1)
-        delta_q = torch.empty(
-            (batch_size * delta_size, heads, attn_dim),
-            dtype=dtype,
-            device=torch.device("cuda"),
-        ).uniform_(-0.1, 0.1)
-        delta_x_offsets = torch.arange(0, delta_size, device=torch.device("cuda"))
-        delta_x_offsets = (seq_offsets[1:] - delta_size).view(
-            batch_size, 1
-        ) + delta_x_offsets.view(1, delta_size)
-        delta_x_offsets = delta_x_offsets.view(-1)
-        attn_scale = torch.empty(
-            (L,),
-            dtype=torch.float32,
-            device=torch.device("cuda"),
-        ).uniform_(0.5, 1.0)
-
-        if bench_backward:
-            q = q.requires_grad_(True)
-            k = k.requires_grad_(True)
-            v = v.requires_grad_(True)
-        assert provider in [
-            "triton",
-            "pytorch",
-            "flash_cuda_jagged",
-            "flash_cuda",
-            "tlx",
-            "ragged",
-        ]
-        if has_delta_q:
-            fn = lambda: delta_hstu_mha(  # noqa E731
-                max_seq_len=seq_len,
-                alpha=alpha,
-                delta_q=delta_q,
-                k=k,
-                v=v,
-                seq_offsets=seq_offsets,
-                num_targets=num_targets,
-                kernel=_get_kernel(provider),
-            )
-        else:
-            if provider == "flash_cuda_jagged":
-                fn = lambda: cuda_hstu_mha(  # noqa E731
-                    q=q,
-                    k=k,
-                    v=v,
-                    alpha=alpha,
-                    causal=True,
-                    seq_offsets=seq_offsets.to(torch.int32),
-                    attn_scale=attn_scale if dynamic_attn_scale else None,
-                    max_seq_len=seq_len,
-                    max_attn_len=max_attn_len,
-                    min_full_attn_seq_len=min_full_attn_seq_len,
-                    contextual_seq_len=contextual_seq_len,
-                    num_targets=num_targets,
-                    sort_by_length=False,
-                    num_softmax_heads=num_softmax_heads,
-                )
-            elif provider == "flash_cuda":
-                q, k, v = [
-                    torch.randn(
-                        batch_size,
-                        seq_len,
-                        heads,
-                        attn_dim,
-                        device="cuda",
-                        dtype=dtype,
-                        requires_grad=True,
-                    )
-                    for _ in range(3)
-                ]
-                fn = lambda: cuda_hstu_mha(  # noqa E731
-                    q=q,
-                    k=k,
-                    v=v,
-                    alpha=alpha,
-                    causal=True,
-                    max_seq_len=seq_len,
-                    max_attn_len=max_attn_len,
-                    min_full_attn_seq_len=min_full_attn_seq_len,
-                    contextual_seq_len=contextual_seq_len,
-                    num_targets=num_targets,
-                    sort_by_length=False,
-                    num_softmax_heads=num_softmax_heads,
-                )
-            elif provider == "ragged":
-                fn = lambda: ragged_hstu_mha(  # noqa E731
-                    max_seq_len=seq_len,
-                    alpha=alpha,
-                    q=q,
-                    k=k,
-                    v=v,
-                    seq_offsets=seq_offsets,
-                    dropout_pr=0.0,
-                    training=True,
-                    invalid_attn_mask_type="lower_triangular",
-                    num_targets=num_targets,
-                    attn_scale=attn_scale if dynamic_attn_scale else None,
-                    max_attn_len=max_attn_len,
-                    contextual_seq_len=contextual_seq_len,
-                    full_attn_size=min_full_attn_seq_len,
-                    sort_by_length=True,
-                    kernel=HammerKernel2.TRITON,
-                    num_softmax_heads=num_softmax_heads,
-                )
-            else:
-                fn = lambda: hstu_mha(  # noqa E731
-                    max_seq_len=seq_len,
-                    alpha=alpha,
-                    q=q,
-                    k=k,
-                    v=v,
-                    seq_offsets=seq_offsets,
-                    causal=causal,
-                    dropout_pr=0.0,
-                    training=True,
-                    num_targets=num_targets,
-                    max_attn_len=max_attn_len,
-                    contextual_seq_len=contextual_seq_len,
-                    sort_by_length=True,
-                    kernel=_get_kernel(provider),
-                    enable_tma=triton_enable_tma,
-                )
-        if mode == "bwd":
-            o = fn()
-            do = torch.randn_like(o)
-            fn = lambda: o.backward(do, retain_graph=True)  # noqa E731
-        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
-        all_flops = _flops(
-            batch_size, seq_len, attn_dim, hidden_dim, heads, seq_offsets, mode
-        )
-        if has_delta_q:
-            all_flops = all_flops / seq_len * delta_size
-        if report_flops:
-            return all_flops / ms / 1e9
-        else:
-            return ms
-
-    df = _bench_hstu_attention.run(
-        print_data=True,
-        show_plots=False,
-        save_path="/tmp/" + os.environ["USER"],
-        return_df=return_result,
-    )
-
-    if return_result:
-        return configs, df
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py
deleted file mode 100644
index 95c43853f..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/concat_1d_jagged_jagged_bench.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# pyre-strict
-from typing import List
-
-import click
-import torch
-
-# @manual=//triton:triton
-import triton
-from hammer.ops.jagged import concat_1D_jagged_jagged
-
-# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:concat_1d_jagged_jagged_bench
-
-torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-
-
-@click.command()
-@click.option("--data-type", type=str, default="float32")
-@click.option("--batch-size", type=int, default=512)
-@click.option("--max-seq-len-log2", type=int, default=20)
-@click.option("--seq-sparsity", type=float, default=0.8)
-def main(
-    data_type: str,
-    batch_size: int,
-    max_seq_len_log2: int,
-    seq_sparsity: float,
-) -> None:
-    if data_type == "float32":
-        dtype = torch.float32
-    elif data_type == "float16":
-        dtype = torch.float16
-    elif data_type == "bfloat16":
-        dtype = torch.bfloat16
-    else:
-        raise ValueError(f"Unsupported data type: {data_type}.")
-
-    configs: List[triton.testing.Benchmark] = [
-        triton.testing.Benchmark(
-            x_names=["max_seq_len"],
-            x_vals=[2**i for i in range(6, max_seq_len_log2)],
-            line_arg="method",
-            line_vals=[
-                "custom_cuda",
-                "hammer_pytorch",
-            ],
-            line_names=["Custom CUDA", "Hammer PyTorch"],
-            styles=[("green", "-"), ("orange", "--")],
-            ylabel="ms",
-            plot_name=f"concat_1d_jagged_jagged_batch{batch_size}_sparsity{seq_sparsity}_{data_type}",
-            args={
-                "dtype": dtype,
-                "batch_size": batch_size,
-                "seq_sparsity": seq_sparsity,
-            },
-        )
-    ]
-
-    @triton.testing.perf_report(configs)
-    def bench_concat_1d_jagged_jagged(
-        max_seq_len: int,
-        batch_size: int,
-        method: str,
-        dtype: torch.dtype,
-        seq_sparsity: float,
-    ) -> float:
-        warmup = 50
-        rep = 500
-        torch.manual_seed(1001)
-
-        lengths_left = torch.randint(
-            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
-        )
-        lengths_right = torch.randint(
-            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
-        )
-
-        total_left = int(lengths_left.sum().item())
-        total_right = int(lengths_right.sum().item())
-
-        values_left = torch.randn(total_left, dtype=dtype)
-        values_right = torch.randn(total_right, dtype=dtype)
-
-        offsets_left = torch.zeros(
-            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
-        )
-        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
-        offsets_right = torch.zeros(
-            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
-        )
-        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
-        max_seq_len_left = int(lengths_left.max().item())
-        max_seq_len_right = int(lengths_right.max().item())
-
-        lengths_left = lengths_left.cuda()
-        lengths_right = lengths_right.cuda()
-        values_left = values_left.cuda()
-        values_right = values_right.cuda()
-        offsets_left = offsets_left.cuda()
-        offsets_right = offsets_right.cuda()
-
-        if method == "custom_cuda":
-            fn = lambda: torch.ops.hstu.concat_1d_jagged_jagged(  # noqa E731
-                lengths_left, values_left, lengths_right, values_right
-            )
-        elif method == "hammer_pytorch":
-            fn = lambda: concat_1D_jagged_jagged(  # noqa E731
-                max_seq_len_left,
-                offsets_left,
-                values_left,
-                max_seq_len_right,
-                offsets_right,
-                values_right,
-            )
-        else:
-            raise ValueError(f"unknown method: {method}")
-
-        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
-        return ms
-
-    bench_concat_1d_jagged_jagged.run(print_data=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py
deleted file mode 100644
index 7806d6970..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/jagged_transpose_1d_bench.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# pyre-strict
-from typing import List
-
-import click
-import torch
-
-# @manual=//triton:triton
-import triton
-from hammer.ops.jagged import jagged_transpose_1D
-
-# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:jagged_transpose_1d_bench
-
-torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-
-
-@click.command()
-@click.option("--data-type", type=str, default="float32")
-@click.option("--size1", type=int, default=32)
-@click.option("--size2", type=int, default=16)
-@click.option("--max-len-log2", type=int, default=19)
-@click.option("--seq-sparsity", type=float, default=0.8)
-def main(
-    data_type: str,
-    size1: int,
-    size2: int,
-    max_len_log2: int,
-    seq_sparsity: float,
-) -> None:
-    if data_type == "float32":
-        dtype = torch.float32
-    elif data_type == "float16":
-        dtype = torch.float16
-    elif data_type == "bfloat16":
-        dtype = torch.bfloat16
-    else:
-        raise ValueError(f"Unsupported data type: {data_type}.")
-
-    configs: List[triton.testing.Benchmark] = [
-        triton.testing.Benchmark(
-            x_names=["max_len"],
-            x_vals=[2**i for i in range(4, max_len_log2)],
-            line_arg="method",
-            line_vals=[
-                "custom_cuda",
-                "hammer_pytorch",
-            ],
-            line_names=["Custom CUDA", "Hammer PyTorch"],
-            styles=[("green", "-"), ("orange", "--")],
-            ylabel="ms",
-            plot_name=f"jagged_transpose_1d_size1_{size1}_size2_{size2}_sparsity{seq_sparsity}_{data_type}",
-            args={
-                "dtype": dtype,
-                "size1": size1,
-                "size2": size2,
-                "seq_sparsity": seq_sparsity,
-            },
-        )
-    ]
-
-    @triton.testing.perf_report(configs)
-    def bench_jagged_transpose_1d(
-        max_len: int,
-        size1: int,
-        size2: int,
-        method: str,
-        dtype: torch.dtype,
-        seq_sparsity: float,
-    ) -> float:
-        warmup = 50
-        rep = 500
-        torch.manual_seed(1001)
-
-        lengths = torch.randint(
-            1, int(max_len * seq_sparsity) + 1, (size1 * size2,), dtype=torch.int32
-        )
-        offsets = torch.zeros(
-            (size1 * size2 + 1,), dtype=lengths.dtype, device=lengths.device
-        )
-        offsets[1:] = torch.cumsum(lengths.view(-1), dim=0)
-
-        values = torch.randn(int(offsets[-1].item()), dtype=dtype)
-
-        lengths = lengths.cuda()
-        offsets = offsets.cuda()
-        values = values.cuda()
-
-        if method == "custom_cuda":
-            fn = lambda: torch.ops.hstu.jagged_transpose_1d(  # noqa E731
-                values=values,
-                offsets=offsets,
-                lengths=lengths,
-                max_len=max_len,
-                size1=size1,
-                size2=size2,
-            )
-        elif method == "hammer_pytorch":
-            fn = lambda: jagged_transpose_1D(  # noqa E731
-                values=values,
-                offsets=offsets,
-                lengths=lengths,
-                max_len=max_len,
-                size1=size1,
-                size2=size2,
-            )
-        else:
-            raise ValueError(f"unknown method: {method}")
-
-        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
-        return ms
-
-    bench_jagged_transpose_1d.run(print_data=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py
deleted file mode 100644
index a3f2483fa..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/replace_last_n_with_jagged_bench.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# pyre-strict
-from typing import List
-
-import click
-import torch
-
-# @manual=//triton:triton
-import triton
-from hammer.ops.jagged import replace_last_n_with_jagged
-from hammer.utils import HammerKernel
-
-# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:replace_last_n_with_jagged_bench
-
-torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-
-
-@click.command()
-@click.option("--data-type", type=str, default="float32")
-@click.option("--batch-size", type=int, default=512)
-@click.option("--embedding-dim", type=int, default=64)
-@click.option("--max-seq-len-log2", type=int, default=16)
-@click.option("--seq-sparsity", type=float, default=0.8)
-def main(
-    data_type: str,
-    batch_size: int,
-    embedding_dim: int,
-    max_seq_len_log2: int,
-    seq_sparsity: float,
-) -> None:
-    if data_type == "float32":
-        dtype = torch.float32
-    elif data_type == "float16":
-        dtype = torch.float16
-    elif data_type == "bfloat16":
-        dtype = torch.bfloat16
-    else:
-        raise ValueError(f"Unsupported data type: {data_type}.")
-
-    configs: List[triton.testing.Benchmark] = [
-        triton.testing.Benchmark(
-            x_names=["max_seq_len"],
-            x_vals=[2**i for i in range(6, max_seq_len_log2)],
-            line_arg="method",
-            line_vals=[
-                "custom_cuda",
-                "hammer_pytorch",
-                "hammer_triton",
-            ],
-            line_names=[
-                "Custom CUDA",
-                "Hammer PyTorch",
-                "Hammer Triton",
-            ],
-            styles=[
-                ("green", "-"),
-                ("orange", "--"),
-                ("purple", "-."),
-            ],
-            ylabel="ms",
-            plot_name=f"replace_last_n_with_jagged_batch{batch_size}_dim{embedding_dim}_sparsity{seq_sparsity}_{data_type}",
-            args={
-                "dtype": dtype,
-                "batch_size": batch_size,
-                "embedding_dim": embedding_dim,
-                "seq_sparsity": seq_sparsity,
-            },
-        )
-    ]
-
-    @triton.testing.perf_report(configs)
-    def bench_replace_last_n_with_jagged(
-        max_seq_len: int,
-        batch_size: int,
-        method: str,
-        dtype: torch.dtype,
-        embedding_dim: int,
-        seq_sparsity: float,
-    ) -> float:
-        warmup = 50
-        rep = 500
-        torch.manual_seed(1001)
-
-        min_left_len = max(1, int(max_seq_len * seq_sparsity * 0.3))
-        max_left_len = int(max_seq_len * seq_sparsity)
-
-        lengths_left = torch.randint(
-            min_left_len, max_left_len + 1, (batch_size,), dtype=torch.int32
-        )
-        lengths_right = torch.randint(
-            1, min_left_len + 1, (batch_size,), dtype=torch.int32
-        )
-
-        lengths_right = torch.min(lengths_right, lengths_left)
-
-        total_left = int(lengths_left.sum().item())
-        total_right = int(lengths_right.sum().item())
-
-        values_left = torch.randn(total_left, embedding_dim, dtype=dtype)
-        values_right = torch.randn(total_right, embedding_dim, dtype=dtype)
-
-        offsets_left = torch.zeros(
-            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
-        )
-        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
-        offsets_right = torch.zeros(
-            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
-        )
-        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
-
-        lengths_left = lengths_left.cuda()
-        lengths_right = lengths_right.cuda()
-        values_left = values_left.cuda()
-        values_right = values_right.cuda()
-        offsets_left = offsets_left.cuda()
-        offsets_right = offsets_right.cuda()
-
-        if method == "custom_cuda":
-            fn = lambda: torch.ops.hstu.replace_last_n_with_jagged(  # noqa E731
-                lengths_left, values_left, lengths_right, values_right
-            )
-        elif method == "hammer_pytorch":
-            fn = lambda: replace_last_n_with_jagged(  # noqa E731
-                max_seq_len_left=max_seq_len,
-                offsets_left=offsets_left,
-                values_left=values_left,
-                offsets_right=offsets_right,
-                values_right=values_right,
-            )
-        elif method == "hammer_triton":
-            fn = lambda: replace_last_n_with_jagged(  # noqa E731
-                max_seq_len_left=max_seq_len,
-                offsets_left=offsets_left,
-                values_left=values_left,
-                offsets_right=offsets_right,
-                values_right=values_right,
-                kernel=HammerKernel.TRITON,
-            )
-        else:
-            raise ValueError(f"unknown method: {method}")
-
-        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
-        return ms
-
-    bench_replace_last_n_with_jagged.run(print_data=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py b/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py
deleted file mode 100644
index 4aaa9d77c..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/benchmarks/split_1d_jagged_jagged_bench.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# pyre-strict
-from typing import List
-
-import click
-import torch
-
-# @manual=//triton:triton
-import triton
-from hammer.ops.jagged import split_1D_jagged_jagged
-
-# buck2 run @//mode/opt -c fbcode.nvcc_arch=h100 //generative_recommenders/ops/cpp/benchmarks:split_1d_jagged_jagged_bench
-
-torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-
-
-@click.command()
-@click.option("--data-type", type=str, default="float32")
-@click.option("--batch-size", type=int, default=512)
-@click.option("--max-seq-len-log2", type=int, default=20)
-@click.option("--seq-sparsity", type=float, default=0.8)
-def main(
-    data_type: str,
-    batch_size: int,
-    max_seq_len_log2: int,
-    seq_sparsity: float,
-) -> None:
-    if data_type == "float32":
-        dtype = torch.float32
-    elif data_type == "float16":
-        dtype = torch.float16
-    elif data_type == "bfloat16":
-        dtype = torch.bfloat16
-    else:
-        raise ValueError(f"Unsupported data type: {data_type}.")
-
-    configs: List[triton.testing.Benchmark] = [
-        triton.testing.Benchmark(
-            x_names=["max_seq_len"],
-            x_vals=[2**i for i in range(6, max_seq_len_log2)],
-            line_arg="method",
-            line_vals=[
-                "custom_cuda",
-                "hammer_pytorch",
-            ],
-            line_names=["Custom CUDA", "Hammer PyTorch"],
-            styles=[("green", "-"), ("orange", "--")],
-            ylabel="ms",
-            plot_name=f"split_1d_jagged_jagged_batch{batch_size}_sparsity{seq_sparsity}_{data_type}",
-            args={
-                "dtype": dtype,
-                "batch_size": batch_size,
-                "seq_sparsity": seq_sparsity,
-            },
-        )
-    ]
-
-    @triton.testing.perf_report(configs)
-    def bench_split_1d_jagged_jagged(
-        max_seq_len: int,
-        batch_size: int,
-        method: str,
-        dtype: torch.dtype,
-        seq_sparsity: float,
-    ) -> float:
-        warmup = 50
-        rep = 500
-        torch.manual_seed(1001)
-
-        lengths_left = torch.randint(
-            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
-        )
-        lengths_right = torch.randint(
-            1, int(max_seq_len * seq_sparsity) + 1, (batch_size,), dtype=torch.int32
-        )
-
-        offsets_left = torch.zeros(
-            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
-        )
-        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
-        offsets_right = torch.zeros(
-            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
-        )
-        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
-
-        combined_offsets = offsets_left + offsets_right
-        combined_values = torch.randn(int(combined_offsets[-1].item()), dtype=dtype)
-
-        max_seq_len_combined = int((lengths_left + lengths_right).max().item())
-
-        lengths_left = lengths_left.cuda()
-        lengths_right = lengths_right.cuda()
-        combined_values = combined_values.cuda()
-        offsets_left = offsets_left.cuda()
-        offsets_right = offsets_right.cuda()
-
-        if method == "custom_cuda":
-            fn = lambda: torch.ops.hstu.split_1d_jagged_jagged(  # noqa E731
-                lengths_left, lengths_right, combined_values
-            )
-        elif method == "hammer_pytorch":
-            fn = lambda: split_1D_jagged_jagged(  # noqa E731
-                max_seq_len_combined, combined_values, offsets_left, offsets_right
-            )
-        else:
-            raise ValueError(f"unknown method: {method}")
-
-        ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
-        return ms
-
-    bench_split_1d_jagged_jagged.run(print_data=True)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/common.h b/recommendation_v4/generative_recommenders/ops/cpp/common.h
deleted file mode 100644
index 1c4b43768..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/common.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <ATen/ATen.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#define AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                \
-    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, ...) \
-  AT_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)               \
-  AT_DISPATCH_CASE(SCALARTYPE1, __VA_ARGS__)                 \
-  AT_DISPATCH_CASE(SCALARTYPE2, __VA_ARGS__)                 \
-  AT_DISPATCH_CASE(SCALARTYPE3, __VA_ARGS__)                 \
-  AT_DISPATCH_CASE(SCALARTYPE4, __VA_ARGS__)
-
-#define AT_DISPATCH_FLOATING_TYPES_AND4(                                 \
-    SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, TYPE, NAME, ...) \
-  AT_DISPATCH_SWITCH(                                                    \
-      TYPE,                                                              \
-      NAME,                                                              \
-      AT_DISPATCH_CASE_FLOATING_TYPES_AND4(                              \
-          SCALARTYPE1, SCALARTYPE2, SCALARTYPE3, SCALARTYPE4, __VA_ARGS__))
-
-inline __attribute__((always_inline)) uint32_t
-div_round_up(uint32_t a, uint32_t b) {
-  return (a + b - 1) / b;
-};
-
-inline __attribute__((always_inline)) uint32_t next_power_of_2(uint32_t n) {
-  n--;
-  n |= n >> 1;
-  n |= n >> 2;
-  n |= n >> 4;
-  n |= n >> 8;
-  n |= n >> 16;
-  n++;
-  return n;
-}
-
-/*
- * Because different .SO may include the same CUDA CUB kernels, this results in
- * confusion, where libA may end up calling libB's cub kernel and causing
- * failures when we static link libcudart_static.a. To avoid this, we annotate
- * only the public functions and hide the rest.
- */
-#define DLL_PUBLIC __attribute__((visibility("default")))
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp
deleted file mode 100644
index 4ebd426d7..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/extension.h>
-
-#include "fbgemm_gpu/sparse_ops.h" // @manual
-
-namespace hstu {
-
-at::Tensor complete_cumsum_cpu(const at::Tensor& values) {
-  TORCH_CHECK(values.dim() == 1);
-  auto len = values.size(0);
-  const torch::Tensor index = at::range(0, len, at::kLong).cpu();
-  auto output = fbgemm_gpu::asynchronous_complete_cumsum_cpu(values);
-  return output;
-}
-
-at::Tensor complete_cumsum_meta(const at::Tensor& values) {
-  auto len = values.sym_size(0);
-  auto output = at::native::empty_meta_symint(
-      {len + 1},
-      /*dtype=*/::std::make_optional(values.scalar_type()),
-      /*layout=*/::std::make_optional(values.layout()),
-      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
-      /*pin_memory=*/::std::nullopt);
-  return output;
-}
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu b/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu
deleted file mode 100644
index 06d1abdd6..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/complete_cumsum.cu
+++ /dev/null
@@ -1,51 +0,0 @@
-#include "common.h"
-
-#include <cub/device/device_scan.cuh>
-
-namespace hstu {
-
-DLL_PUBLIC at::Tensor complete_cumsum_cuda(const at::Tensor& values) {
-  at::cuda::OptionalCUDAGuard device_guard;
-  device_guard.set_index(values.get_device());
-
-  TORCH_CHECK(values.numel() < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(values.dim() == 1);
-  const auto values_contig = values.contiguous();
-
-  auto cumsum = at::empty({values_contig.numel() + 1}, values_contig.options());
-  cumsum[0].zero_();
-
-  AT_DISPATCH_FLOATING_TYPES_AND4(
-      at::ScalarType::Int,
-      at::ScalarType::Long,
-      at::ScalarType::Half,
-      at::ScalarType::BFloat16,
-      values_contig.scalar_type(),
-      "complete_cumsum_cuda",
-      [&] {
-        size_t temp_storage_bytes = 0;
-        AT_CUDA_CHECK(
-            cub::DeviceScan::InclusiveSum(
-                nullptr,
-                temp_storage_bytes,
-                values_contig.data_ptr<scalar_t>(),
-                cumsum.data_ptr<scalar_t>() + 1,
-                values_contig.numel(),
-                at::cuda::getCurrentCUDAStream()));
-        auto temp_storage = at::empty(
-            {static_cast<int64_t>(temp_storage_bytes)},
-            values_contig.options().dtype(at::kByte));
-        AT_CUDA_CHECK(
-            cub::DeviceScan::InclusiveSum(
-                temp_storage.data_ptr(),
-                temp_storage_bytes,
-                values_contig.data_ptr<scalar_t>(),
-                cumsum.data_ptr<scalar_t>() + 1,
-                values_contig.numel(),
-                at::cuda::getCurrentCUDAStream()));
-      });
-
-  return cumsum;
-}
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp
deleted file mode 100644
index 51b313443..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/extension.h>
-#include <torch/library.h>
-
-#include "fbgemm_gpu/sparse_ops.h" // @manual
-
-namespace hstu {
-
-template <typename index_t, typename val_t>
-void _concat_1d_jagged_jagged_cpu_kernel(
-    int32_t B,
-    const at::TensorAccessor<index_t, 1>& offsets_left,
-    const at::TensorAccessor<val_t, 1>& values_left,
-    const at::TensorAccessor<index_t, 1>& offsets_right,
-    const at::TensorAccessor<val_t, 1>& values_right,
-    at::TensorAccessor<val_t, 1> combined_values) {
-  for (auto b : c10::irange(B)) {
-    auto left_start = offsets_left[b];
-    auto left_len = offsets_left[b + 1] - left_start;
-    auto right_start = offsets_right[b];
-    auto right_len = offsets_right[b + 1] - right_start;
-    auto combined_start = left_start + right_start;
-    for (auto i = 0; i < left_len; ++i) {
-      combined_values[combined_start + i] = values_left[left_start + i];
-    }
-    for (auto i = 0; i < right_len; ++i) {
-      combined_values[left_len + combined_start + i] =
-          values_right[right_start + i];
-    }
-  }
-}
-
-at::Tensor concat_1d_jagged_jagged_cpu(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right) {
-  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CPU);
-  auto L = values_left.numel() + values_right.numel();
-  TORCH_CHECK(L < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
-  auto B = lengths_left.size(0);
-  auto combined_values = at::empty({L}, values_left.options());
-  if (L == 0) {
-    return combined_values;
-  }
-  const auto offsets_left =
-      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_left.view({-1}));
-  const auto offsets_right =
-      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_right.view({-1}));
-  AT_DISPATCH_INTEGRAL_TYPES(
-      lengths_left.scalar_type(),
-      "concat_1d_jagged_jagged_values_cpu_kernel_input1",
-      [&] {
-        using index_t = scalar_t;
-        AT_DISPATCH_ALL_TYPES_AND2(
-            at::ScalarType::BFloat16,
-            at::ScalarType::Half,
-            values_left.scalar_type(),
-            "concat_1d_jagged_jagged_values_cpu_kernel_input2",
-            [&] {
-              using val_t = scalar_t;
-              _concat_1d_jagged_jagged_cpu_kernel<index_t, val_t>(
-                  B,
-                  offsets_left.accessor<index_t, 1>(),
-                  values_left.accessor<val_t, 1>(),
-                  offsets_right.accessor<index_t, 1>(),
-                  values_right.accessor<val_t, 1>(),
-                  combined_values.accessor<val_t, 1>());
-            });
-      });
-  return combined_values;
-}
-
-at::Tensor concat_1d_jagged_jagged_meta(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right) {
-  auto L = values_left.numel() + values_right.numel();
-  return at::native::empty_meta_symint(
-      {L},
-      /*dtype=*/::std::make_optional(values_left.scalar_type()),
-      /*layout=*/::std::make_optional(values_left.layout()),
-      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
-      /*pin_memory=*/::std::nullopt);
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu b/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu
deleted file mode 100644
index 8eeae9d59..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/concat_1d_jagged_jagged.cu
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/library.h>
-
-#include "common.h"
-#include "fbgemm_gpu/sparse_ops.h" // @manual
-#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
-
-namespace hstu {
-
-static constexpr int32_t kMaxThreads = 1024;
-
-template <typename index_t, typename val_t>
-__global__
-__launch_bounds__(kMaxThreads) void _concat_1d_jagged_jagged_cuda_kernel(
-    int32_t B,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        offsets_left,
-    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
-        values_left,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        offsets_right,
-    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
-        values_right,
-    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
-        combined_values) {
-  for (auto b = blockIdx.x * blockDim.y + threadIdx.y;
-       b < static_cast<uint32_t>(B);
-       b += gridDim.x * blockDim.y) {
-    auto left_start = offsets_left[b];
-    auto left_len = offsets_left[b + 1] - left_start;
-    auto right_start = offsets_right[b];
-    auto right_len = offsets_right[b + 1] - right_start;
-    auto combined_start = left_start + right_start;
-    for (auto i = threadIdx.x; i < static_cast<uint32_t>(left_len + right_len);
-         i += blockDim.x) {
-      if (i < static_cast<uint32_t>(left_len)) {
-        combined_values[combined_start + i] = values_left[left_start + i];
-      } else {
-        combined_values[combined_start + i] =
-            values_right[right_start + i - left_len];
-      }
-    }
-  }
-}
-
-at::Tensor concat_1d_jagged_jagged_cuda(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right) {
-  at::cuda::OptionalCUDAGuard device_guard;
-  device_guard.set_index(values_left.get_device());
-  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CUDA);
-  auto L = values_left.numel() + values_right.numel();
-  TORCH_CHECK(L < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(values_left.get_device() == lengths_left.get_device());
-  TORCH_CHECK(values_left.get_device() == lengths_right.get_device());
-  TORCH_CHECK(values_left.get_device() == values_right.get_device());
-  auto B = lengths_left.size(0);
-  auto combined_values = at::empty({L}, values_left.options());
-  if (L == 0) {
-    return combined_values;
-  }
-  const auto offsets_left =
-      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_left.view({-1}));
-  const auto offsets_right =
-      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_right.view({-1}));
-  // Optimized thread block configuration based on benchmark results
-  uint32_t B_blocks = 4;
-  dim3 threads(256, B_blocks);
-  auto blocks = div_round_up(B, B_blocks);
-  AT_DISPATCH_INTEGRAL_TYPES(
-      lengths_left.scalar_type(),
-      "concat_1d_jagged_jagged_values_cuda_kernel_input1",
-      [&] {
-        using index_t = scalar_t;
-        AT_DISPATCH_ALL_TYPES_AND2(
-            at::ScalarType::BFloat16,
-            at::ScalarType::Half,
-            values_left.scalar_type(),
-            "concat_1d_jagged_jagged_values_cuda_kernel_input2",
-            [&] {
-              using val_t = scalar_t;
-              _concat_1d_jagged_jagged_cuda_kernel<index_t, val_t>
-                  <<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
-                      B,
-                      offsets_left.packed_accessor32<
-                          index_t,
-                          1,
-                          at::RestrictPtrTraits>(),
-                      values_left
-                          .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
-                      offsets_right.packed_accessor32<
-                          index_t,
-                          1,
-                          at::RestrictPtrTraits>(),
-                      values_right
-                          .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
-                      combined_values.packed_accessor32<
-                          val_t,
-                          1,
-                          at::RestrictPtrTraits>());
-            });
-      });
-  return combined_values;
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp b/recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp
deleted file mode 100644
index 155cc7572..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/cpp_ops.cpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/extension.h>
-#include <torch/library.h>
-
-/*
- * Because different .SO may include the same CUDA CUB kernels, this results in
- * confusion, where libA may end up calling libB's cub kernel and causing
- * failures when we static link libcudart_static.a. To avoid this, we annotate
- * only the public functions and hide the rest.
- */
-#define DLL_PUBLIC __attribute__((visibility("default")))
-
-namespace hstu {
-at::Tensor expand_1d_jagged_to_dense_cpu(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const int64_t max_len);
-
-at::Tensor expand_1d_jagged_to_dense_meta(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const c10::SymInt max_len);
-
-at::Tensor expand_1d_jagged_to_dense_cuda(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const int64_t max_len);
-
-at::Tensor complete_cumsum_cpu(const at::Tensor& values);
-
-at::Tensor complete_cumsum_cuda(const at::Tensor& values);
-
-at::Tensor complete_cumsum_meta(const at::Tensor& values);
-
-at::Tensor concat_1d_jagged_jagged_cpu(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right);
-
-at::Tensor concat_1d_jagged_jagged_cuda(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right);
-
-at::Tensor concat_1d_jagged_jagged_meta(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right);
-
-std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cpu(
-    const at::Tensor& lengths_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& combined_values);
-
-std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cuda(
-    const at::Tensor& lengths_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& combined_values);
-
-std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_meta(
-    const at::Tensor& lengths_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& combined_values);
-
-at::Tensor replace_last_n_with_jagged_cpu(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right);
-
-at::Tensor replace_last_n_with_jagged_cuda(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right);
-
-at::Tensor replace_last_n_with_jagged_meta(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right);
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cpu(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const at::Tensor& lengths,
-    const int64_t max_len,
-    const int64_t size1,
-    const int64_t size2);
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cuda(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const at::Tensor& lengths,
-    const int64_t max_len,
-    const int64_t size1,
-    const int64_t size2);
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_meta(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const at::Tensor& lengths,
-    const int64_t max_len,
-    const int64_t size1,
-    const int64_t size2);
-
-DLL_PUBLIC std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_meta(
-    const at::Tensor& keys,
-    const at::Tensor& values,
-    const std::optional<int64_t>& end_bit,
-    const bool descending = false) {
-  TORCH_CHECK(
-      keys.dtype() == at::kInt || keys.dtype() == at::kLong ||
-      keys.dtype() == at::kByte || keys.dtype() == at::kShort);
-  TORCH_CHECK(keys.dim() == 1);
-  TORCH_CHECK(values.dim() == 1);
-  return {at::empty_like(keys), at::empty_like(values)};
-}
-
-std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_cuda(
-    const at::Tensor& keys,
-    const at::Tensor& values,
-    const std::optional<int64_t>& end_bit,
-    const bool descending = false);
-
-} // namespace hstu
-
-TORCH_LIBRARY_FRAGMENT(hstu, m) {
-  m.def(
-      "expand_1d_jagged_to_dense(Tensor values, Tensor offsets, SymInt max_len) -> Tensor");
-  m.def(
-      "concat_1d_jagged_jagged(Tensor lengths_left, Tensor values_left, Tensor lengths_right, Tensor values_right) -> Tensor");
-  m.def(
-      "split_1d_jagged_jagged(Tensor lengths_left, Tensor lengths_right, Tensor combined_values) -> (Tensor, Tensor)");
-  m.def(
-      "replace_last_n_with_jagged(Tensor lengths_left, Tensor values_left, Tensor lengths_right, Tensor values_right) -> Tensor");
-  m.def(
-      "jagged_transpose_1d(Tensor values, Tensor offsets, Tensor lengths, int max_len, int size1, int size2) -> (Tensor, Tensor, Tensor)");
-  m.def("complete_cumsum(Tensor values) -> Tensor");
-  m.def(
-      "sort_kv_pairs(Tensor keys, Tensor values, int? end_bit=None, bool descending=False) -> (Tensor, Tensor)");
-}
-
-TORCH_LIBRARY_IMPL(hstu, CPU, m) {
-  m.impl("expand_1d_jagged_to_dense", hstu::expand_1d_jagged_to_dense_cpu);
-  m.impl("concat_1d_jagged_jagged", hstu::concat_1d_jagged_jagged_cpu);
-  m.impl("split_1d_jagged_jagged", hstu::split_1d_jagged_jagged_cpu);
-  m.impl("replace_last_n_with_jagged", hstu::replace_last_n_with_jagged_cpu);
-  m.impl("jagged_transpose_1d", hstu::jagged_transpose_1d_cpu);
-  m.impl("complete_cumsum", hstu::complete_cumsum_cpu);
-}
-
-TORCH_LIBRARY_IMPL(hstu, CUDA, m) {
-  m.impl("expand_1d_jagged_to_dense", hstu::expand_1d_jagged_to_dense_cuda);
-  m.impl("concat_1d_jagged_jagged", hstu::concat_1d_jagged_jagged_cuda);
-  m.impl("split_1d_jagged_jagged", hstu::split_1d_jagged_jagged_cuda);
-  m.impl("replace_last_n_with_jagged", hstu::replace_last_n_with_jagged_cuda);
-  m.impl("jagged_transpose_1d", hstu::jagged_transpose_1d_cuda);
-  m.impl("complete_cumsum", hstu::complete_cumsum_cuda);
-  m.impl(
-      "sort_kv_pairs",
-      torch::dispatch(
-          c10::DispatchKey::CUDA, TORCH_FN(hstu::sort_kv_pairs_cuda)));
-}
-
-TORCH_LIBRARY_IMPL(hstu, Meta, m) {
-  m.impl("expand_1d_jagged_to_dense", hstu::expand_1d_jagged_to_dense_meta);
-  m.impl("concat_1d_jagged_jagged", hstu::concat_1d_jagged_jagged_meta);
-  m.impl("split_1d_jagged_jagged", hstu::split_1d_jagged_jagged_meta);
-  m.impl("replace_last_n_with_jagged", hstu::replace_last_n_with_jagged_meta);
-  m.impl("jagged_transpose_1d", hstu::jagged_transpose_1d_meta);
-  m.impl("complete_cumsum", hstu::complete_cumsum_meta);
-  m.impl(
-      "sort_kv_pairs",
-      torch::dispatch(
-          c10::DispatchKey::Meta, TORCH_FN(hstu::sort_kv_pairs_meta)));
-}
-
-TORCH_LIBRARY_IMPL(hstu, Autograd, m) {
-  m.impl(
-      "expand_1d_jagged_to_dense",
-      torch::autograd::autogradNotImplementedFallback());
-  m.impl("complete_cumsum", torch::autograd::autogradNotImplementedFallback());
-}
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py
deleted file mode 100644
index 0f9458c8b..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_attention.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-strict
-
-from typing import Optional
-
-import torch
-from generative_recommenders.ops.utils import is_sm100_plus
-
-try:
-    # We need to import the CUDA kernels after importing torch
-    import hstu._C  # pyre-ignore [21]
-except:
-    pass
-try:
-    torch.ops.load_library(
-        "//generative_recommenders/fb/ultra/ops/blackwell/hstu_attention:hstu_flash_attention"
-    )
-    torch.ops.load_library(
-        "//generative_recommenders/ops/cpp/hstu_attention:hstu_flash_attention"
-    )
-except:
-    pass
-
-
-def cuda_hstu_mha(
-    max_seq_len: int,
-    alpha: float,
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    seq_offsets: Optional[torch.Tensor] = None,
-    causal: bool = False,
-    num_targets: Optional[torch.Tensor] = None,
-    attn_scale: Optional[torch.Tensor] = None,
-    max_attn_len: int = 0,
-    min_full_attn_seq_len: int = 0,
-    contextual_seq_len: int = 0,
-    q_descale: Optional[torch.Tensor] = None,
-    k_descale: Optional[torch.Tensor] = None,
-    v_descale: Optional[torch.Tensor] = None,
-    sort_by_length: bool = False,
-    deterministic: bool = False,
-    sm_margin: int = 0,
-    max_q_len: int = 0,
-    seq_offsets_q: Optional[torch.Tensor] = None,
-    num_softmax_heads: int = 0,
-    training: bool = True,
-    max_seq_len_tensor: Optional[torch.Tensor] = None,
-    contextual_seq_len_tensor: Optional[torch.Tensor] = None,
-    max_attn_len_tensor: Optional[torch.Tensor] = None,
-    min_full_attn_seq_len_tensor: Optional[torch.Tensor] = None,
-    num_groups: int = 1,
-    is_inference: bool = False,
-) -> torch.Tensor:
-    """
-    Arguments:
-        q, k, v: (batch_size, seqlen, nheads, headdim) or (total_seqlen, nheads, headdim)
-    """
-    if is_sm100_plus() and not is_inference:
-        return torch.ops.bw_hstu.bw_hstu_mha(
-            max_seq_len,
-            alpha,
-            q,
-            k,
-            v,
-            seq_offsets,
-            causal,
-            num_targets,
-            attn_scale,
-            max_attn_len,
-            min_full_attn_seq_len,
-            contextual_seq_len,
-            q_descale,
-            k_descale,
-            v_descale,
-            sort_by_length,
-            deterministic,
-            sm_margin,
-            max_q_len,
-            seq_offsets_q,
-            max_seq_len_tensor,
-            contextual_seq_len_tensor,
-            max_attn_len_tensor,
-            min_full_attn_seq_len_tensor,
-            num_groups,
-            num_softmax_heads,
-        )
-    else:
-        return cuda_hstu_mha_inference_wrapper(
-            max_seq_len,
-            alpha,
-            q,
-            k,
-            v,
-            seq_offsets,
-            causal,
-            num_targets,
-            attn_scale,
-            max_attn_len,
-            min_full_attn_seq_len,
-            contextual_seq_len,
-            q_descale,
-            k_descale,
-            v_descale,
-            sort_by_length,
-            deterministic,
-            sm_margin,
-            max_q_len,
-            seq_offsets_q,
-            num_softmax_heads,
-            training,
-            max_seq_len_tensor,
-            contextual_seq_len_tensor,
-            max_attn_len_tensor,
-            min_full_attn_seq_len_tensor,
-            num_groups,
-        )
-
-
-@torch.fx.wrap
-def cuda_hstu_mha_inference_wrapper(
-    max_seq_len: int,
-    alpha: float,
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    seq_offsets: Optional[torch.Tensor] = None,
-    causal: bool = False,
-    num_targets: Optional[torch.Tensor] = None,
-    attn_scale: Optional[torch.Tensor] = None,
-    max_attn_len: int = 0,
-    min_full_attn_seq_len: int = 0,
-    contextual_seq_len: int = 0,
-    q_descale: Optional[torch.Tensor] = None,
-    k_descale: Optional[torch.Tensor] = None,
-    v_descale: Optional[torch.Tensor] = None,
-    sort_by_length: bool = False,
-    deterministic: bool = False,
-    sm_margin: int = 0,
-    max_q_len: int = 0,
-    seq_offsets_q: Optional[torch.Tensor] = None,
-    num_softmax_heads: int = 0,
-    training: bool = True,
-    max_seq_len_tensor: Optional[torch.Tensor] = None,
-    contextual_seq_len_tensor: Optional[torch.Tensor] = None,
-    max_attn_len_tensor: Optional[torch.Tensor] = None,
-    min_full_attn_seq_len_tensor: Optional[torch.Tensor] = None,
-    num_groups: int = 1,
-) -> torch.Tensor:
-    attn_scale = attn_scale.to(torch.float32) if attn_scale is not None else attn_scale
-
-    return torch.ops.hstu.hstu_mha(
-        max_seq_len,
-        alpha,
-        q,
-        k,
-        v,
-        seq_offsets,
-        causal,
-        num_targets,
-        attn_scale,
-        max_attn_len,
-        min_full_attn_seq_len,
-        contextual_seq_len,
-        q_descale,
-        k_descale,
-        v_descale,
-        sort_by_length,
-        deterministic,
-        sm_margin,
-        max_q_len,
-        seq_offsets_q,
-        num_softmax_heads,
-        training,
-        max_seq_len_tensor,
-        contextual_seq_len_tensor,
-        max_attn_len_tensor,
-        min_full_attn_seq_len_tensor,
-        num_groups,
-    )
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py b/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py
deleted file mode 100644
index 2184ef2a5..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/cuda_hstu_preprocess_and_attention.py
+++ /dev/null
@@ -1,668 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/usr/bin/env python3
-
-# pyre-strict
-
-from typing import Optional, Tuple
-
-import torch
-from generative_recommenders.ops.triton.triton_addmm import (
-    maybe_triton_addmm_fwd,
-    triton_addmm_bwd,
-)
-from generative_recommenders.ops.triton.triton_layer_norm import (
-    triton_weighted_layer_norm_bwd,
-)
-from generative_recommenders.ops.utils import copy_if_different_ptr, is_sm100_plus
-from torch.nn import functional as F
-
-try:
-    from generative_recommenders.fb.ultra.ops.fp8.fp8_addmm import (
-        fp8_rowwise_quantize_addmm,
-    )
-    from generative_recommenders.fb.ultra.ops.fp8.layer_norm_quantization import (
-        triton_weighted_layer_norm_quantization_fwd,
-    )
-    from hammer.ops.triton.triton_apply_rope import (
-        triton_apply_rope_bwd,
-        triton_apply_rope_fwd,
-    )
-
-    if is_sm100_plus():
-        print("is sm100_plus architecture, loading hstu flash attention for blackwell")
-        torch.ops.load_library(
-            "//generative_recommenders/fb/ultra/ops/blackwell/hstu_attention:hstu_flash_attention"
-        )
-    print("loading hstu flash attention for general architecture")
-    torch.ops.load_library(
-        "//generative_recommenders/ops/cpp/hstu_attention:hstu_flash_attention"
-    )
-except Exception as ex:
-    print(f"Library importing error when importing library: {ex}")
-
-
-class _HSTUPreprocessAndAttentionFunction(torch.autograd.Function):
-    @staticmethod
-    # pyre-ignore [14]
-    def forward(
-        ctx,  # pyre-ignore [2]
-        x: torch.Tensor,
-        norm_weight: torch.Tensor,
-        norm_bias: torch.Tensor,
-        norm_eps: float,
-        num_heads: int,
-        attn_dim: int,
-        hidden_dim: int,
-        uvqk_weight: torch.Tensor,
-        uvqk_bias: Optional[torch.Tensor],
-        max_seq_len: int,
-        seq_offsets: torch.Tensor,
-        alpha: float,
-        invalid_attn_mask_type: str,
-        num_targets: Optional[torch.Tensor],
-        rotary_weights: Optional[
-            Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
-        ] = None,
-        attn_scale: Optional[torch.Tensor] = None,
-        recompute_uvqk_in_backward: bool = False,
-        recompute_normed_x_in_backward: bool = False,
-        contextual_seq_len: int = 0,
-        sort_by_length: bool = False,
-        max_attn_len: Optional[int] = None,
-        full_attn_size: Optional[int] = None,
-        silu_u: bool = True,
-        fp8_in_addmm_fwd: bool = False,
-        num_softmax_heads: int = 0,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        max_attn_len = max_attn_len or 0
-        full_attn_size = full_attn_size or 0
-        normed_x, x_mean, x_rstd, BLOCK_D, x_scale, normed_x_fp8 = (
-            triton_weighted_layer_norm_quantization_fwd(
-                x=x,
-                weight=norm_weight,
-                bias=norm_bias,
-                eps=norm_eps,
-                quantize_output=fp8_in_addmm_fwd,
-            )
-        )
-        # When silu_u is False and we want to recompute in backward, we split the weight
-        # for u and vqk separately during training to compute them independently.
-        # This avoids needing to clone u (which would otherwise keep the whole uvqk alive).
-        if not silu_u and recompute_uvqk_in_backward:
-            # Split the weights/biases to compute u and vqk separately
-            u_weight, vqk_weight = uvqk_weight.split(
-                [
-                    hidden_dim * num_heads,
-                    hidden_dim * num_heads
-                    + attn_dim * num_heads
-                    + attn_dim * num_heads,
-                ],
-                dim=1,
-            )
-            if uvqk_bias is not None:
-                u_bias, vqk_bias = uvqk_bias.split(
-                    [
-                        hidden_dim * num_heads,
-                        hidden_dim * num_heads
-                        + attn_dim * num_heads
-                        + attn_dim * num_heads,
-                    ],
-                    dim=0,
-                )
-            else:
-                u_bias, vqk_bias = None, None
-            if fp8_in_addmm_fwd:
-                assert x_scale is not None and normed_x_fp8 is not None
-                u = fp8_rowwise_quantize_addmm(
-                    x=normed_x,
-                    x_fp8=normed_x_fp8,
-                    w=u_weight,
-                    y=u_bias,
-                    x_scale=x_scale,
-                    custom_kernel=False,
-                    is_inference=False,
-                ).contiguous()
-                vqk = fp8_rowwise_quantize_addmm(
-                    x=normed_x,
-                    x_fp8=normed_x_fp8,
-                    w=vqk_weight,
-                    y=vqk_bias,
-                    x_scale=x_scale,
-                    custom_kernel=False,
-                    is_inference=False,
-                ).contiguous()
-            else:
-                u = maybe_triton_addmm_fwd(normed_x, u_weight, u_bias).contiguous()
-                vqk = maybe_triton_addmm_fwd(
-                    normed_x, vqk_weight, vqk_bias
-                ).contiguous()
-            v, q, k = vqk.split(
-                [
-                    hidden_dim * num_heads,
-                    attn_dim * num_heads,
-                    attn_dim * num_heads,
-                ],
-                dim=1,
-            )
-            # uvqk is not used since we split the computation, but we need it
-            # for saving in case recompute_uvqk_in_backward is False in a
-            # different code path. Set to None to satisfy type checker.
-            uvqk = None
-        else:
-            if fp8_in_addmm_fwd:
-                assert (
-                    x_scale is not None
-                    and normed_x_fp8 is not None
-                    and uvqk_bias is not None
-                )
-                uvqk = fp8_rowwise_quantize_addmm(
-                    x=normed_x,
-                    x_fp8=normed_x_fp8,
-                    w=uvqk_weight,
-                    y=uvqk_bias,
-                    x_scale=x_scale,
-                    custom_kernel=False,
-                    is_inference=False,
-                ).contiguous()
-            else:
-                uvqk = maybe_triton_addmm_fwd(
-                    normed_x, uvqk_weight, uvqk_bias
-                ).contiguous()
-            u, v, q, k = uvqk.split(
-                [
-                    hidden_dim * num_heads,
-                    hidden_dim * num_heads,
-                    attn_dim * num_heads,
-                    attn_dim * num_heads,
-                ],
-                dim=1,
-            )
-            if silu_u:
-                u = F.silu(u)
-        if rotary_weights is not None:
-            q_cos_weights = rotary_weights[0]
-            q_sin_weights = rotary_weights[1]
-            k_cos_weights = rotary_weights[2]
-            k_sin_weights = rotary_weights[3]
-            _q = triton_apply_rope_fwd(
-                x=q.view(-1, num_heads, attn_dim),
-                N=max_seq_len,
-                seq_offsets=seq_offsets,
-                cos_rope=q_cos_weights,
-                sin_rope=q_sin_weights,
-            ).view(-1, num_heads * attn_dim)
-            _k = triton_apply_rope_fwd(
-                x=k.view(-1, num_heads, attn_dim),
-                N=max_seq_len,
-                seq_offsets=seq_offsets,
-                cos_rope=k_cos_weights,
-                sin_rope=k_sin_weights,
-            ).view(-1, num_heads * attn_dim)
-            copy_if_different_ptr(q, _q)
-            copy_if_different_ptr(k, _k)
-        q = q.view(-1, num_heads, attn_dim)
-        k = k.view(-1, num_heads, attn_dim)
-        v = v.view(-1, num_heads, hidden_dim)
-        if is_sm100_plus():
-            out, softmax_lse = torch.ops.bw_hstu.bw_hstu_mha_fwd(
-                max_seq_len,
-                alpha,
-                q,
-                k,
-                v,
-                seq_offsets,
-                True,  # causal
-                num_targets,
-                attn_scale,
-                max_attn_len,
-                full_attn_size,
-                contextual_seq_len,
-                None,  # q_descale
-                None,  # k_descale
-                None,  # v_descale
-                0,  # sm_margin
-                max_seq_len,  # max_q_len,
-                None,  # seq_offsets_q,
-                None,  # max_seq_len_tensor,
-                None,  # contextual_seq_len_tensor,
-                None,  # max_attn_len_tensor,
-                None,  # min_full_attn_seq_len_tensor,
-                1,  # num_groups
-                num_softmax_heads,  # num_softmax_heads
-            )
-        else:
-            out, softmax_lse = torch.ops.hstu.hstu_mha_fwd(
-                max_seq_len,
-                alpha,
-                q,
-                k,
-                v,
-                seq_offsets,
-                True,  # causal
-                num_targets,
-                attn_scale,
-                max_attn_len,
-                full_attn_size,
-                contextual_seq_len,
-                None,  # q_descale
-                None,  # k_descale
-                None,  # v_descale
-                0,  # sm_margin
-                0,  # max_q_len,
-                None,  # seq_offsets_q,
-                num_softmax_heads,  # num_softmax_heads,
-            )
-        # update ctx
-        saved_tensors = [
-            x,
-            norm_weight,
-            norm_bias,
-            x_mean,
-            x_rstd,
-            uvqk_weight,
-            seq_offsets,
-            out,
-        ]
-        if num_softmax_heads > 0:
-            saved_tensors.append(softmax_lse)
-        if num_targets is not None:
-            saved_tensors.append(num_targets)
-        if attn_scale is not None:
-            saved_tensors.append(attn_scale)
-        if not recompute_normed_x_in_backward:
-            saved_tensors.append(normed_x)
-        if recompute_uvqk_in_backward:
-            if uvqk_bias is not None:
-                saved_tensors.append(uvqk_bias)
-            if fp8_in_addmm_fwd:
-                saved_tensors.append(x_scale)  # pyre-ignore
-                saved_tensors.append(normed_x_fp8)  # pyre-ignore
-        else:
-            saved_tensors.append(uvqk)
-        if rotary_weights is not None:
-            saved_tensors.append(rotary_weights[0])
-            saved_tensors.append(rotary_weights[1])
-            saved_tensors.append(rotary_weights[2])
-            saved_tensors.append(rotary_weights[3])
-        ctx.save_for_backward(*saved_tensors)
-        ctx.alpha = alpha
-        ctx.invalid_attn_mask_type = invalid_attn_mask_type
-        ctx.has_multiple_targets = num_targets is not None
-        ctx.has_rotary_weights = rotary_weights is not None
-        ctx.has_attn_scale = attn_scale is not None
-        ctx.max_seq_len = max_seq_len
-        ctx.max_attn_len = max_attn_len
-        ctx.full_attn_size = full_attn_size
-        ctx.recompute_normed_x_in_backward = recompute_normed_x_in_backward
-        ctx.recompute_uvqk_in_backward = recompute_uvqk_in_backward
-        ctx.hidden_dim = hidden_dim
-        ctx.attn_dim = attn_dim
-        ctx.num_heads = num_heads
-        ctx.has_uvqk_bias = uvqk_bias is not None
-        ctx.uvqk_bias_1d = uvqk_bias.dim() == 1 if uvqk_bias is not None else False
-        ctx.norm_eps = norm_eps
-        ctx.norm_BLOCK_D = BLOCK_D
-        ctx.contextual_seq_len = contextual_seq_len
-        ctx.sort_by_length = sort_by_length
-        ctx.silu_u = silu_u
-        ctx.fp8_in_addmm_fwd = fp8_in_addmm_fwd
-        ctx.num_softmax_heads = num_softmax_heads
-        return u, out
-
-    @staticmethod
-    # pyre-ignore[14]
-    def backward(
-        ctx,  # pyre-ignore[2]
-        _du: torch.Tensor,
-        dout: torch.Tensor,
-    ) -> Tuple[
-        torch.Tensor,  # d_x
-        torch.Tensor,  # d_norm_weight
-        torch.Tensor,  # d_norm_bias
-        None,
-        None,
-        None,
-        None,
-        torch.Tensor,  # d_uvqk_weight
-        torch.Tensor,  # d_uvqk_bias
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-        None,
-    ]:
-        x, norm_weight, norm_bias, x_mean, x_rstd, uvqk_weight, seq_offsets, out = (
-            ctx.saved_tensors[:8]
-        )
-        idx = 8
-        if ctx.num_softmax_heads > 0:
-            softmax_lse = ctx.saved_tensors[idx]
-            idx += 1
-        else:
-            softmax_lse = None
-        if ctx.has_multiple_targets:
-            num_targets = ctx.saved_tensors[idx]
-            idx += 1
-        else:
-            num_targets = None
-        if ctx.has_attn_scale:
-            attn_scale = ctx.saved_tensors[idx]
-            idx += 1
-        else:
-            attn_scale = None
-        if ctx.recompute_normed_x_in_backward:
-            normed_x, _, _, _, _, _ = triton_weighted_layer_norm_quantization_fwd(
-                x=x,
-                weight=norm_weight,
-                bias=norm_bias,
-                eps=ctx.norm_eps,
-                mean=x_mean,
-                rstd=x_rstd,
-                quantize_output=ctx.fp8_in_addmm_fwd,
-            )
-        else:
-            normed_x = ctx.saved_tensors[idx]
-            idx += 1
-        if ctx.recompute_uvqk_in_backward:
-            if ctx.has_uvqk_bias:
-                uvqk_bias = ctx.saved_tensors[idx]
-                idx += 1
-            else:
-                uvqk_bias = None
-            if not ctx.silu_u:
-                # When silu_u is False, we only recompute vqk (not u)
-                # Split the weights/biases to extract vqk portion
-                _, vqk_weight = uvqk_weight.split(
-                    [
-                        ctx.hidden_dim * ctx.num_heads,
-                        ctx.hidden_dim * ctx.num_heads
-                        + ctx.attn_dim * ctx.num_heads
-                        + ctx.attn_dim * ctx.num_heads,
-                    ],
-                    dim=1,
-                )
-                vqk_bias = None
-                if ctx.has_uvqk_bias:
-                    _, vqk_bias = uvqk_bias.split(
-                        [
-                            ctx.hidden_dim * ctx.num_heads,
-                            ctx.hidden_dim * ctx.num_heads
-                            + ctx.attn_dim * ctx.num_heads
-                            + ctx.attn_dim * ctx.num_heads,
-                        ],
-                        dim=0,
-                    )
-                if ctx.fp8_in_addmm_fwd:
-                    x_scale, normed_x_fp8 = ctx.saved_tensors[idx : idx + 2]
-                    vqk = fp8_rowwise_quantize_addmm(
-                        x=normed_x,
-                        x_fp8=normed_x_fp8,
-                        w=vqk_weight,
-                        y=vqk_bias,
-                        x_scale=x_scale,
-                        custom_kernel=False,
-                        is_inference=False,
-                    )
-                    idx += 2
-                else:
-                    vqk = maybe_triton_addmm_fwd(
-                        normed_x, vqk_weight, vqk_bias
-                    ).contiguous()
-                # Split vqk into v, q, k components
-                v, q, k = vqk.split(
-                    [
-                        ctx.hidden_dim * ctx.num_heads,
-                        ctx.attn_dim * ctx.num_heads,
-                        ctx.attn_dim * ctx.num_heads,
-                    ],
-                    dim=1,
-                )
-                u = None
-            else:
-                # When silu_u is True, we recompute uvqk (all components)
-                if ctx.fp8_in_addmm_fwd:
-                    x_scale, normed_x_fp8 = ctx.saved_tensors[idx : idx + 2]
-                    uvqk = fp8_rowwise_quantize_addmm(
-                        x=normed_x,
-                        x_fp8=normed_x_fp8,
-                        w=uvqk_weight,
-                        y=uvqk_bias,
-                        x_scale=x_scale,
-                        custom_kernel=False,
-                        is_inference=False,
-                    )
-                    idx += 2
-                else:
-                    uvqk = maybe_triton_addmm_fwd(
-                        normed_x, uvqk_weight, uvqk_bias
-                    ).contiguous()
-                # Split uvqk into u, v, q, k components
-                u, v, q, k = uvqk.split(
-                    [
-                        ctx.hidden_dim * ctx.num_heads,
-                        ctx.hidden_dim * ctx.num_heads,
-                        ctx.attn_dim * ctx.num_heads,
-                        ctx.attn_dim * ctx.num_heads,
-                    ],
-                    dim=1,
-                )
-        else:
-            uvqk = ctx.saved_tensors[idx]
-            idx += 1
-            # Split saved uvqk into u, v, q, k components
-            u, v, q, k = uvqk.split(
-                [
-                    ctx.hidden_dim * ctx.num_heads,
-                    ctx.hidden_dim * ctx.num_heads,
-                    ctx.attn_dim * ctx.num_heads,
-                    ctx.attn_dim * ctx.num_heads,
-                ],
-                dim=1,
-            )
-        if ctx.has_rotary_weights:
-            q_cos_weights, q_sin_weights, k_cos_weights, k_sin_weights = (
-                ctx.saved_tensors[idx : idx + 4]
-            )
-            idx += 4
-        else:
-            q_cos_weights, q_sin_weights, k_cos_weights, k_sin_weights = (
-                None,
-                None,
-                None,
-                None,
-            )
-
-        duvqk = torch.empty(
-            [
-                x.size(0),
-                ctx.hidden_dim * ctx.num_heads * 2 + ctx.attn_dim * ctx.num_heads * 2,
-            ],
-            device=x.device,
-            dtype=x.dtype,
-        )
-        du, dv, dq, dk = duvqk.split(
-            [
-                ctx.hidden_dim * ctx.num_heads,
-                ctx.hidden_dim * ctx.num_heads,
-                ctx.attn_dim * ctx.num_heads,
-                ctx.attn_dim * ctx.num_heads,
-            ],
-            dim=1,
-        )
-        q = q.view(-1, ctx.num_heads, ctx.attn_dim)
-        k = k.view(-1, ctx.num_heads, ctx.attn_dim)
-        v = v.view(-1, ctx.num_heads, ctx.hidden_dim)
-        dq = dq.view(-1, ctx.num_heads, ctx.attn_dim)
-        dk = dk.view(-1, ctx.num_heads, ctx.attn_dim)
-        dv = dv.view(-1, ctx.num_heads, ctx.hidden_dim)
-        if (
-            ctx.recompute_uvqk_in_backward and ctx.has_rotary_weights
-        ):  # recompute ROPE on qk
-            q = triton_apply_rope_fwd(
-                x=q,
-                N=ctx.max_seq_len,
-                seq_offsets=seq_offsets,
-                cos_rope=q_cos_weights,
-                sin_rope=q_sin_weights,
-            )
-            k = triton_apply_rope_fwd(
-                x=k,
-                N=ctx.max_seq_len,
-                seq_offsets=seq_offsets,
-                cos_rope=k_cos_weights,
-                sin_rope=k_sin_weights,
-            )
-        dq = dq.view(-1, ctx.num_heads, ctx.attn_dim)
-        dk = dk.view(-1, ctx.num_heads, ctx.attn_dim)
-        dv = dv.view(-1, ctx.num_heads, ctx.hidden_dim)
-        # Note: the two operations below update duvqk in place
-        if is_sm100_plus():
-            _dq, _dk, _dv = torch.ops.bw_hstu.bw_hstu_mha_bwd(
-                ctx.max_seq_len,
-                ctx.alpha,
-                dout,
-                q,
-                k,
-                v,
-                dq,
-                dk,
-                dv,
-                seq_offsets,
-                True,  # causal
-                num_targets,
-                attn_scale,
-                ctx.max_attn_len,
-                ctx.full_attn_size,
-                ctx.contextual_seq_len,
-                ctx.sort_by_length,
-                False,  # deterministic
-                0,  # sm_margin
-                ctx.max_seq_len,  # max_q_len,
-                None,  # seq_offsets_q,
-                None,  # max_seq_len_tensor,
-                None,  # contextual_seq_len_tensor,
-                None,  # max_attn_len_tensor,
-                None,  # min_full_attn_seq_len_tensor,
-                1,  # num_groups
-                ctx.num_softmax_heads,  # num_softmax_heads
-                out,  # out
-                softmax_lse,  # lse
-            )
-        else:
-            _dq, _dk, _dv = torch.ops.hstu.hstu_mha_bwd(
-                ctx.max_seq_len,
-                ctx.alpha,
-                dout,
-                q,
-                k,
-                v,
-                dq,
-                dk,
-                dv,
-                out,
-                seq_offsets,
-                True,  # causal
-                num_targets,
-                attn_scale,
-                ctx.max_attn_len,
-                ctx.full_attn_size,
-                ctx.contextual_seq_len,
-                ctx.sort_by_length,
-                False,  # deterministic
-                0,  # sm_margin
-                0,  # max_q_len,
-                None,  # seq_offsets_q,
-                ctx.num_softmax_heads,  # num_softmax_heads,
-                softmax_lse,
-            )
-        if ctx.has_rotary_weights:
-            _dq = triton_apply_rope_bwd(
-                grad=_dq,
-                N=ctx.max_seq_len,
-                seq_offsets=seq_offsets,
-                cos_rope=q_cos_weights,
-                sin_rope=q_sin_weights,
-            )
-            _dk = triton_apply_rope_bwd(
-                grad=_dk,
-                N=ctx.max_seq_len,
-                seq_offsets=seq_offsets,
-                cos_rope=k_cos_weights,
-                sin_rope=k_sin_weights,
-            )
-        copy_if_different_ptr(dq, _dq)
-        copy_if_different_ptr(dk, _dk)
-        copy_if_different_ptr(dv, _dv)
-        if ctx.silu_u:
-            torch.ops.aten.silu_backward(_du, u, grad_input=du)
-        else:
-            copy_if_different_ptr(du, _du)
-        d_normed_x, d_uvqk_weight, d_uvqk_bias = triton_addmm_bwd(
-            x=normed_x,
-            w=uvqk_weight,
-            dz=duvqk,
-            is_y_1d=ctx.uvqk_bias_1d and ctx.has_uvqk_bias,
-        )
-        d_x, d_norm_weight, d_norm_bias = triton_weighted_layer_norm_bwd(
-            dy=d_normed_x,
-            x=x,
-            weight=norm_weight,
-            bias=norm_bias,
-            mean=x_mean,
-            rstd=x_rstd,
-            learnable=True,
-            eps=ctx.norm_eps,
-            BLOCK_D=ctx.norm_BLOCK_D,
-        )
-        # pyre-ignore[7]
-        return (
-            d_x,
-            d_norm_weight,
-            d_norm_bias,
-            None,
-            None,
-            None,
-            None,
-            d_uvqk_weight,
-            d_uvqk_bias if ctx.has_uvqk_bias else None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-            None,
-        )
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp
deleted file mode 100644
index 4730078e6..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cpp
+++ /dev/null
@@ -1,97 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/extension.h>
-#include <torch/library.h>
-
-namespace hstu {
-
-template <typename index_t, typename val_t>
-void expand_1d_jagged_to_dense_cpu_kernel_(
-    int64_t B,
-    int64_t max_len,
-    const at::TensorAccessor<val_t, 1>& values,
-    const at::TensorAccessor<index_t, 1>& offsets,
-    at::TensorAccessor<val_t, 2> output) {
-  for (auto i : c10::irange(B)) {
-    int64_t begin = offsets[i];
-    int64_t end = offsets[i + 1];
-    if (end - begin == 0) {
-      for (int64_t j : c10::irange(max_len)) {
-        output[i][j] = 0;
-        continue;
-      }
-    } else {
-      int64_t j = 0;
-      for (; j < std::min(end - begin, max_len); ++j) {
-        output[i][j] = values[begin + j];
-      }
-      for (; j < max_len; ++j) {
-        output[i][j] = values[end - 1];
-      }
-    }
-  } // for each i
-}
-
-at::Tensor expand_1d_jagged_to_dense_cpu(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const int64_t max_len) {
-  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CPU);
-  TORCH_CHECK(values.numel() < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(max_len >= 0);
-  auto B = offsets.size(0) - 1;
-  auto output = at::empty({B, max_len}, values.options());
-  if (values.numel() == 0 || max_len == 0) {
-    return output;
-  }
-  AT_DISPATCH_ALL_TYPES_AND2(
-      at::ScalarType::BFloat16,
-      at::ScalarType::Half,
-      values.scalar_type(),
-      "expand_1d_jagged_to_dense_cpu_input1",
-      [&] {
-        using val_t = scalar_t;
-        AT_DISPATCH_INTEGRAL_TYPES(
-            offsets.scalar_type(), "expand_1d_jagged_to_dense_cpu_input2", [&] {
-              using index_t = scalar_t;
-              expand_1d_jagged_to_dense_cpu_kernel_<index_t, val_t>(
-                  B,
-                  max_len,
-                  values.accessor<val_t, 1>(),
-                  offsets.accessor<index_t, 1>(),
-                  output.accessor<val_t, 2>());
-            });
-      });
-  return output;
-}
-
-at::Tensor expand_1d_jagged_to_dense_meta(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const c10::SymInt max_len) {
-  auto B = offsets.sym_size(0) - 1;
-  auto output = at::empty_symint({B, max_len}, values.options());
-  return output;
-}
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu b/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu
deleted file mode 100644
index aa3678d2b..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/expand_1d_jagged_to_dense.cu
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/library.h>
-
-#include "common.h"
-
-static constexpr int32_t kMaxThreads = 1024;
-
-namespace hstu {
-
-template <typename index_t, typename val_t>
-__global__
-__launch_bounds__(kMaxThreads) void expand_1d_jagged_to_dense_cuda_kernel_(
-    int64_t B,
-    int64_t max_len,
-    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> offsets,
-    at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits> output) {
-  int64_t b = blockIdx.y;
-  int64_t begin = offsets[b];
-  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
-  int64_t end = offsets[b + 1];
-  if (end - begin == 0) {
-    if (i < max_len) {
-      output[b][i] = 0;
-    }
-  } else {
-    if (i < std::min(end - begin, max_len)) {
-      output[b][i] = values[i + begin];
-    } else if (i < max_len) {
-      output[b][i] = values[end - 1];
-    }
-  }
-}
-
-at::Tensor expand_1d_jagged_to_dense_cuda(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const int64_t max_len) {
-  at::cuda::OptionalCUDAGuard device_guard;
-  device_guard.set_index(values.get_device());
-  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CUDA);
-  TORCH_CHECK(values.numel() < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(values.get_device() == offsets.get_device());
-  TORCH_CHECK(max_len >= 0);
-  auto B = offsets.size(0) - 1;
-  auto output = at::empty({B, max_len}, values.options());
-  if (values.numel() == 0 || max_len == 0) {
-    return output;
-  }
-  uint32_t nthreads_per_block = max_len > 64 ? 64 : max_len;
-  dim3 grid_size = dim3(div_round_up(max_len, nthreads_per_block), B);
-  AT_DISPATCH_ALL_TYPES_AND2(
-      at::ScalarType::BFloat16,
-      at::ScalarType::Half,
-      values.scalar_type(),
-      "expand_1d_jagged_to_dense_cuda_input1",
-      [&] {
-        using val_t = scalar_t;
-        AT_DISPATCH_INTEGRAL_TYPES(
-            offsets.scalar_type(),
-            "expand_1d_jagged_to_dense_cuda_input2",
-            [&] {
-              using index_t = scalar_t;
-              expand_1d_jagged_to_dense_cuda_kernel_<index_t, val_t><<<
-                  grid_size,
-                  nthreads_per_block,
-                  0,
-                  at::cuda::getCurrentCUDAStream()>>>(
-                  B,
-                  max_len,
-                  values.packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
-                  offsets
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  output.packed_accessor32<val_t, 2, at::RestrictPtrTraits>());
-              C10_CUDA_KERNEL_LAUNCH_CHECK();
-            });
-      });
-
-  return output;
-}
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h
deleted file mode 100644
index a22ae7745..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/copy_sm90_bulk_reduce.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cute/arch/copy_sm90_tma.hpp>
-
-namespace cute {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct SM90_BULK_REDUCE_ADD {
-  CUTE_HOST_DEVICE static void
-  copy(float const* smem_ptr, float* gmem_ptr, int32_t store_bytes) {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile(
-        "cp.reduce.async.bulk.global.shared::cta.bulk_group.add.f32 [%0], [%1], %2;\n"
-        :
-        : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(store_bytes)
-        : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH(
-        "Trying to use BULK_REDUCE_ADD without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-
-  CUTE_HOST_DEVICE static void copy(
-      float const* smem_ptr,
-      float* gmem_ptr,
-      int32_t store_bytes,
-      uint64_t cache_hint) {
-#if defined(CUTE_ARCH_TMA_SM90_ENABLED)
-    uint32_t smem_int_ptr = cast_smem_ptr_to_uint(smem_ptr);
-    asm volatile(
-        "cp.reduce.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.add.f32 [%0], [%1], %2, %3;\n"
-        :
-        : "l"(gmem_ptr), "r"(smem_int_ptr), "r"(store_bytes), "l"(cache_hint)
-        : "memory");
-#else
-    CUTE_INVALID_CONTROL_PATH(
-        "Trying to use BULK_REDUCE_ADD without CUTE_ARCH_TMA_SM90_ENABLED.");
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cute
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h
deleted file mode 100644
index 833f3ae28..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_bwd.h
+++ /dev/null
@@ -1,481 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cutlass/barrier.h>
-#include <cutlass/cutlass.h>
-
-#include "cutlass/gemm/collective/builders/sm90_common.inl"
-
-#include "copy_sm90_bulk_reduce.h"
-#include "named_barrier.h"
-#include "seqlen.h"
-#include "utils.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    class TileShape_MNK_,
-    class Element_,
-    class ArchTag_,
-    int NumEpilogueThreads_,
-    bool Jagged,
-    bool dKV_swapAB_,
-    int AtomLayoutKdKV = 1>
-struct CollectiveEpilogueBwd {
-  using TileShape_MNK = TileShape_MNK_;
-  using Element = Element_;
-  using ArchTag = ArchTag_;
-  static constexpr int NumEpilogueThreads = NumEpilogueThreads_;
-  static constexpr bool dKV_swapAB = dKV_swapAB_;
-  static constexpr bool Use_TMA =
-      !Jagged && ArchTag::kMinComputeCapability >= 90;
-
-  static_assert(ArchTag::kMinComputeCapability >= 80);
-
-  using GmemTiledCopydKVTMA = cute::SM90_TMA_STORE;
-
-  // These are for storing the output tensor without TMA (e.g., for setting
-  // output to zero)
-  static constexpr int kGmemElemsPerLoad =
-      sizeof(cute::uint128_t) / sizeof(Element);
-  static_assert(
-      get<2>(TileShape_MNK{}) % kGmemElemsPerLoad == 0,
-      "Headdim must be a multiple of kGmemElemsPerLoad");
-  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
-  static constexpr int kGmemThreadsPerRow =
-      cutlass::gcd(kHeadDim / kGmemElemsPerLoad, NumEpilogueThreads);
-  static_assert(
-      NumEpilogueThreads % kGmemThreadsPerRow == 0,
-      "NumEpilogueThreads must be a multiple of kGmemThreadsPerRow");
-  using GmemLayoutAtom = Layout<
-      Shape<
-          Int<NumEpilogueThreads / kGmemThreadsPerRow>,
-          Int<kGmemThreadsPerRow>>,
-      Stride<Int<kGmemThreadsPerRow>, _1>>;
-  using GmemTiledCopydKV = decltype(make_tiled_copy(
-      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
-      GmemLayoutAtom{},
-      Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{})); // Val layout, 8 or 16 vals
-                                                     // per store
-
-  using SmemLayoutAtomdKVTMA =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               GMMA::Major::K,
-               Element,
-               // TODO: do we have to change this if dKV_swapAB is true?
-               decltype(cute::get<1>(TileShape_MNK{})),
-               Int<CUTE_STATIC_V(cute::get<2>(TileShape_MNK{})) /
-                   AtomLayoutKdKV>>());
-  using SmemLayoutdKVTMA = decltype(tile_to_shape(
-      SmemLayoutAtomdKVTMA{},
-      select<1, 2>(TileShape_MNK{})));
-  using SmemLayoutdKVtTMA = decltype(cute::composition(
-      SmemLayoutdKVTMA{},
-      make_layout(
-          make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})),
-          make_stride(decltype(get<1>(TileShape_MNK{})){}, _1{}))));
-
-  // If we don't use TMA
-  static constexpr int kBlockKSmem =
-      kHeadDim % 64 == 0 ? 64 : (kHeadDim % 32 == 0 ? 32 : 16);
-  static constexpr int kSwizzle =
-      kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 2 : 1);
-  using SmemLayoutAtomdKVSTG = decltype(composition(
-      Swizzle<kSwizzle, 3, 3>{},
-      Layout<Shape<Int<8>, Int<kBlockKSmem>>, Stride<Int<kBlockKSmem>, _1>>{}));
-
-  using SmemLayoutAtomdKV =
-      std::conditional_t<Use_TMA, SmemLayoutAtomdKVTMA, SmemLayoutAtomdKVSTG>;
-  using SmemLayoutdKV = decltype(tile_to_shape(
-      SmemLayoutAtomdKV{},
-      select<1, 2>(TileShape_MNK{})));
-  using SmemLayoutdKVt = decltype(cute::composition(
-      SmemLayoutdKV{},
-      make_layout(
-          make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})),
-          make_stride(decltype(get<1>(TileShape_MNK{})){}, _1{}))));
-
-  using SmemCopyAtomdKV = Copy_Atom<
-      std::conditional_t<
-          ArchTag::kMinComputeCapability >= 90,
-          std::conditional_t<
-              !dKV_swapAB,
-              cute::SM90_U32x4_STSM_N,
-              cute::SM90_U16x8_STSM_T>,
-          AutoVectorizingCopyWithAssumedAlignment<128>>,
-      Element>;
-
-  static constexpr size_t SmemAlignmentdKV =
-      ArchTag::kMinComputeCapability >= 90
-      ? cutlass::detail::alignment_for_swizzle(SmemLayoutdKV{})
-      : 128;
-  static_assert(SmemAlignmentdKV >= 128, "Require at least 128B alignment");
-
-  struct TensorStorage : cute::aligned_struct<SmemAlignmentdKV> {
-    cute::
-        array_aligned<Element, cute::cosize_v<SmemLayoutdKV>, SmemAlignmentdKV>
-            smem_dk;
-    cute::
-        array_aligned<Element, cute::cosize_v<SmemLayoutdKV>, SmemAlignmentdKV>
-            smem_dv;
-  };
-
-  using ShapedKV =
-      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen_k, d, head,
-                                                       // batch)
-  using StridedKV = cute::Stride<int64_t, _1, int64_t, int64_t>;
-
-  using TMA_dKV = std::conditional_t<
-      Use_TMA,
-      decltype(make_tma_copy(
-          GmemTiledCopydKVTMA{},
-          make_tensor(
-              make_gmem_ptr(static_cast<Element*>(nullptr)),
-              ShapedKV{},
-              StridedKV{}),
-          SmemLayoutdKVTMA{},
-          select<1, 2>(TileShape_MNK{}),
-          _1{})), // no mcast for dKV
-      std::nullptr_t>;
-
-  // Host side kernel arguments
-  struct Arguments {
-    Element* ptr_dK;
-    ShapedKV const shape_dK;
-    StridedKV const stride_dK;
-    Element* ptr_dV;
-    StridedKV const stride_dV;
-    int const num_heads_q;
-    int const* seq_offsets;
-  };
-
-  // Device side kernel params
-  struct Params {
-    Element* ptr_dK;
-    ShapedKV const shape_dK;
-    StridedKV const stride_dK;
-    Element* ptr_dV;
-    StridedKV const stride_dV;
-    TMA_dKV tma_store_dK, tma_store_dV;
-    int const* seq_offsets = nullptr;
-  };
-
-  static Params to_underlying_arguments(Arguments const& args) {
-    Tensor mdK =
-        make_tensor(make_gmem_ptr(args.ptr_dK), args.shape_dK, args.stride_dK);
-    Tensor mdV =
-        make_tensor(make_gmem_ptr(args.ptr_dV), args.shape_dK, args.stride_dV);
-    TMA_dKV tma_store_dK = [&] {
-      if constexpr (Use_TMA) {
-        return make_tma_copy(
-            GmemTiledCopydKVTMA{},
-            mdK,
-            SmemLayoutdKVTMA{},
-            select<1, 2>(TileShape_MNK{}),
-            _1{}); // no mcast for dKV
-      } else {
-        return nullptr;
-      }
-    }();
-    TMA_dKV tma_store_dV = [&] {
-      if constexpr (Use_TMA) {
-        return make_tma_copy(
-            GmemTiledCopydKVTMA{},
-            mdV,
-            SmemLayoutdKVTMA{},
-            select<1, 2>(TileShape_MNK{}),
-            _1{}); // no mcast for dKV
-      } else {
-        return nullptr;
-      }
-    }();
-    return {
-        args.ptr_dK,
-        args.shape_dK,
-        args.stride_dK,
-        args.ptr_dV,
-        args.stride_dV,
-        tma_store_dK,
-        tma_store_dV,
-        args.seq_offsets};
-  }
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
-  /// performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& params) {
-    if constexpr (Use_TMA) {
-      cute::prefetch_tma_descriptor(params.tma_store_dK.get_tma_descriptor());
-      cute::prefetch_tma_descriptor(params.tma_store_dV.get_tma_descriptor());
-    }
-  }
-
-  template <typename SharedStorage, typename FrgTensorO, typename TiledMma>
-  CUTLASS_DEVICE void store(
-      Params const& params,
-      FrgTensorO const& tdKrdK,
-      FrgTensorO const& tdVrdV,
-      SharedStorage& shared_storage,
-      TiledMma tiled_mma,
-      int thread_idx,
-      cute::tuple<int32_t, int32_t, int32_t> const& block_coord) {
-    auto [n_block, bidh, bidb] = block_coord;
-    Tensor sdK = cute::as_position_independent_swizzle_tensor(make_tensor(
-        make_smem_ptr(shared_storage.tensors.epilogue.smem_dk.data()),
-        SmemLayoutdKV{}));
-    Tensor sdV = cute::as_position_independent_swizzle_tensor(make_tensor(
-        make_smem_ptr(shared_storage.tensors.epilogue.smem_dv.data()),
-        SmemLayoutdKV{}));
-    Tensor sdKt = cute::as_position_independent_swizzle_tensor(make_tensor(
-        make_smem_ptr(shared_storage.tensors.epilogue.smem_dk.data()),
-        SmemLayoutdKVt{}));
-    Tensor sdVt = cute::as_position_independent_swizzle_tensor(make_tensor(
-        make_smem_ptr(shared_storage.tensors.epilogue.smem_dv.data()),
-        SmemLayoutdKVt{}));
-    auto smem_tiled_copy_dKV = make_tiled_copy_C(SmemCopyAtomdKV{}, tiled_mma);
-    auto smem_thr_copy_dKV = smem_tiled_copy_dKV.get_thread_slice(thread_idx);
-
-    Tensor tdVrdV_out = make_tensor_like<Element>(tdVrdV);
-    hstu::convert_type_out(tdVrdV, tdVrdV_out);
-    Tensor tdKrdK_out = make_tensor_like<Element>(tdKrdK);
-    hstu::convert_type_out(tdKrdK, tdKrdK_out);
-    Tensor taccdKrdK = smem_thr_copy_dKV.retile_S(
-        tdKrdK_out); // ((Atom,AtomNum), MMA_M, MMA_N)
-    Tensor taccdVrdV = smem_thr_copy_dKV.retile_S(
-        tdVrdV_out); // ((Atom,AtomNum), MMA_M, MMA_N)
-    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_dKV);
-    // print(sdK); printf("\n"); print(sdKt); printf("\n"); }
-    Tensor taccdKsdK = smem_thr_copy_dKV.partition_D(
-        cute::conditional_return<!dKV_swapAB>(
-            sdK, sdKt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
-    Tensor taccdVsdV = smem_thr_copy_dKV.partition_D(
-        cute::conditional_return<!dKV_swapAB>(
-            sdV, sdVt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
-
-    // Make sure all WGs have finished reading K and V
-    hstu::named_barrier_sync(
-        NumEpilogueThreads,
-        cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
-    cute::copy(smem_tiled_copy_dKV, taccdVrdV, taccdVsdV);
-    cute::copy(smem_tiled_copy_dKV, taccdKrdK, taccdKsdK);
-    if constexpr (Use_TMA) {
-      cutlass::arch::fence_view_async_shared(); // ensure smem writes are
-                                                // visible to TMA
-      cutlass::arch::NamedBarrier::arrive(
-          NumEpilogueThreads + cutlass::NumThreadsPerWarp,
-          cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
-
-      Tensor mdK = params.tma_store_dK.get_tma_tensor(params.shape_dK);
-      Tensor mdV = params.tma_store_dV.get_tma_tensor(params.shape_dK);
-      Tensor gdK = local_tile(
-          mdK(_, _, bidh, bidb),
-          select<1, 2>(TileShape_MNK{}),
-          make_coord(n_block, _0{})); // (M, K)
-      Tensor gdV = local_tile(
-          mdV(_, _, bidh, bidb),
-          select<1, 2>(TileShape_MNK{}),
-          make_coord(n_block, _0{})); // (M, K)
-      auto block_tma_dK = params.tma_store_dK.get_slice(_0{});
-      auto block_tma_dV = params.tma_store_dV.get_slice(_0{});
-      Tensor tdKgdK = block_tma_dK.partition_D(gdK); // (TMA, TMA_M, TMA_K)
-      Tensor tdKsdK = block_tma_dK.partition_S(sdK); // (TMA, TMA_M, TMA_K)
-      Tensor tdVgdV = block_tma_dV.partition_D(gdV); // (TMA, TMA_M, TMA_K)
-      Tensor tdVsdV = block_tma_dV.partition_S(sdV); // (TMA, TMA_M, TMA_K)
-      int warp_idx_sync =
-          __shfl_sync(0xffffffff, thread_idx / cutlass::NumThreadsPerWarp, 0);
-      if (warp_idx_sync ==
-          NumEpilogueThreads / cutlass::NumThreadsPerWarp - 1) {
-        cutlass::arch::NamedBarrier::sync(
-            NumEpilogueThreads + cutlass::NumThreadsPerWarp,
-            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
-        if (cute::elect_one_sync()) {
-          cute::copy(params.tma_store_dV, tdVsdV, tdVgdV);
-          cute::copy(params.tma_store_dK, tdKsdK, tdKgdK);
-          tma_store_arrive();
-        }
-      }
-      tma_store_wait<0>();
-      // // Tell warp 0 that smem_k and smem_v are ready
-      // cutlass::arch::NamedBarrier::arrive(NumEpilogueThreads +
-      // cutlass::NumThreadsPerWarp,
-      // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/);
-
-    } else {
-      hstu::named_barrier_sync(
-          NumEpilogueThreads,
-          cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
-      static constexpr int kBlockN = get<1>(TileShape_MNK{});
-      hstu::SeqlenInfo<Jagged, kBlockN> seqlen_info{
-          bidb, size<0>(params.shape_dK), params.seq_offsets};
-      Tensor mdK = make_tensor(
-          make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(
-          _, _, bidh, !Jagged ? bidb : 0);
-      Tensor gdK = local_tile(
-          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdK),
-          select<1, 2>(TileShape_MNK{}),
-          make_coord(n_block, _0{})); // (M, K)
-      Tensor mdV = make_tensor(
-          make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(
-          _, _, bidh, !Jagged ? bidb : 0);
-      Tensor gdV = local_tile(
-          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdV),
-          select<1, 2>(TileShape_MNK{}),
-          make_coord(n_block, _0{})); // (M, K)
-
-      GmemTiledCopydKV gmem_tiled_copy_dKV;
-      auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx);
-      Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV);
-      Tensor tdKVsdV =
-          gmem_thr_copy_dKV.partition_S(sdV); // (TMA, TMA_M, TMA_K)
-      Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK);
-      Tensor tdKVsdK =
-          gmem_thr_copy_dKV.partition_S(sdK); // (TMA, TMA_M, TMA_K)
-      Tensor tdKVrdV = make_fragment_like(tdKVgdV);
-      Tensor tdKVrdK = make_fragment_like(tdKVgdK);
-      Tensor cdKV = cute::make_identity_tensor(
-          select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k)
-      // Repeat the partitioning with identity layouts
-      Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV);
-      Tensor tdKVpdKV = make_tensor<bool>(make_shape(size<2>(tdKVgdV)));
-#pragma unroll
-      for (int k = 0; k < size(tdKVpdKV); ++k) {
-        tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK);
-      }
-      // Need to check OOB when reading from smem if kBlockN isn't evenly tiled
-      static constexpr bool EvenN =
-          kBlockN % CUTE_STATIC_V(size<0>(GmemLayoutAtom{})) == 0;
-      hstu::copy<
-          /*Is_even_MN=*/EvenN,
-          /*Is_even_K=*/true,
-          /*Clear_OOB_MN=*/false>(
-          gmem_tiled_copy_dKV, tdKVsdV, tdKVrdV, tdKVcdKV, tdKVpdKV, kBlockN);
-      hstu::copy<
-          /*Is_even_MN=*/EvenN,
-          /*Is_even_K=*/true,
-          /*Clear_OOB_MN=*/false>(
-          gmem_tiled_copy_dKV, tdKVsdK, tdKVrdK, tdKVcdKV, tdKVpdKV, kBlockN);
-      // // Tell warp 0 that smem_k and smem_v are ready
-      // cutlass::arch::fence_view_async_shared(); // ensure smem reads are done
-      // before next TMA to smem_k/v
-      // hstu::named_barrier_arrive(NumEpilogueThreads +
-      // cutlass::NumThreadsPerWarp,
-      // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/); Construct
-      // identity layout for gdKV Clear_OOB_K must be false since we don't want
-      // to write zeros to gmem
-      hstu::copy<
-          /*Is_even_MN=*/false,
-          /*Is_even_K=*/false,
-          /*Clear_OOB_MN=*/false,
-          /*Clear_OOB_K=*/false>(
-          gmem_tiled_copy_dKV,
-          tdKVrdV,
-          tdKVgdV,
-          tdKVcdKV,
-          tdKVpdKV,
-          std::min(seqlen_info.seqlen - n_block * kBlockN, kBlockN));
-      hstu::copy<
-          /*Is_even_MN=*/false,
-          /*Is_even_K=*/false,
-          /*Clear_OOB_MN=*/false,
-          /*Clear_OOB_K=*/false>(
-          gmem_tiled_copy_dKV,
-          tdKVrdK,
-          tdKVgdK,
-          tdKVcdKV,
-          tdKVpdKV,
-          std::min(seqlen_info.seqlen - n_block * kBlockN, kBlockN));
-    }
-  }
-
-  CUTLASS_DEVICE void store_tail() {
-    // if constexpr (Use_TMA) { tma_store_wait<0>(); }
-  }
-
-  // Write 0 to dK and dV
-  CUTLASS_DEVICE void store_zero(
-      Params const& params,
-      int thread_idx,
-      cute::tuple<int32_t, int32_t, int32_t> const& block_coord) {
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-    auto [n_block, bidh, bidb] = block_coord;
-    hstu::SeqlenInfo<Jagged, kBlockN> seqlen_info{
-        bidb, size<0>(params.shape_dK), params.seq_offsets};
-    Tensor mdK = make_tensor(
-        make_gmem_ptr(params.ptr_dK), params.shape_dK, params.stride_dK)(
-        _, _, bidh, !Jagged ? bidb : 0);
-    Tensor gdK = local_tile(
-        cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdK),
-        select<1, 2>(TileShape_MNK{}),
-        make_coord(n_block, _0{})); // (M, K)
-    Tensor mdV = make_tensor(
-        make_gmem_ptr(params.ptr_dV), params.shape_dK, params.stride_dV)(
-        _, _, bidh, !Jagged ? bidb : 0);
-    Tensor gdV = local_tile(
-        cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdV),
-        select<1, 2>(TileShape_MNK{}),
-        make_coord(n_block, _0{})); // (M, K)
-
-    GmemTiledCopydKV gmem_tiled_copy_dKV;
-    auto gmem_thr_copy_dKV = gmem_tiled_copy_dKV.get_thread_slice(thread_idx);
-    Tensor tdKVgdK = gmem_thr_copy_dKV.partition_D(gdK);
-    Tensor tdKVgdV = gmem_thr_copy_dKV.partition_D(gdV);
-    Tensor tdKVrdKV = make_fragment_like(tdKVgdK);
-    clear(tdKVrdKV);
-    // Construct identity layout for gdKV
-    Tensor cdKV = cute::make_identity_tensor(
-        select<1, 2>(TileShape_MNK{})); // (BLK_M,BLK_K) -> (blk_m,blk_k)
-    // Repeat the partitioning with identity layouts
-    Tensor tdKVcdKV = gmem_thr_copy_dKV.partition_D(cdKV);
-    Tensor tdKVpdKV = make_tensor<bool>(make_shape(size<2>(tdKVgdK)));
-#pragma unroll
-    for (int k = 0; k < size(tdKVpdKV); ++k) {
-      tdKVpdKV(k) = get<1>(tdKVcdKV(_0{}, _0{}, k)) < get<1>(params.shape_dK);
-    }
-    // Clear_OOB_K must be false since we don't want to write zeros to gmem
-    hstu::copy<
-        /*Is_even_MN=*/false,
-        /*Is_even_K=*/false,
-        /*Clear_OOB_MN=*/false,
-        /*Clear_OOB_K=*/false>(
-        gmem_tiled_copy_dKV,
-        tdKVrdKV,
-        tdKVgdK,
-        tdKVcdKV,
-        tdKVpdKV,
-        seqlen_info.seqlen - n_block * kBlockN);
-    hstu::copy<
-        /*Is_even_MN=*/false,
-        /*Is_even_K=*/false,
-        /*Clear_OOB_MN=*/false,
-        /*Clear_OOB_K=*/false>(
-        gmem_tiled_copy_dKV,
-        tdKVrdKV,
-        tdKVgdV,
-        tdKVcdKV,
-        tdKVpdKV,
-        seqlen_info.seqlen - n_block * kBlockN);
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h
deleted file mode 100644
index c794a114c..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/epilogue_fwd.h
+++ /dev/null
@@ -1,550 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cutlass/cutlass.h>
-#include <cutlass/fast_math.h> // For FastDivMod
-#include "cute/tensor.hpp"
-
-#include "cutlass/epilogue/collective/builders/sm90_common.inl"
-#include "cutlass/gemm/collective/builders/sm90_common.inl"
-
-#include "named_barrier.h"
-#include "seqlen.h"
-#include "utils.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    class TileShape_MNK_,
-    class ClusterShape_,
-    class Element_,
-    class ArchTag_,
-    int NumEpilogueThreads_,
-    bool Jagged,
-    bool FP8PermuteCol = false>
-struct CollectiveEpilogueFwd {
-  using TileShape_MNK = TileShape_MNK_;
-  using ClusterShape = ClusterShape_;
-  using Element = Element_;
-  using ArchTag = ArchTag_;
-  static constexpr int NumEpilogueThreads = NumEpilogueThreads_;
-  static constexpr bool Use_smem = sizeof(Element) <= 2;
-  static constexpr bool Use_TMA_O =
-      ArchTag::kMinComputeCapability >= 90 && !Jagged && Use_smem;
-
-  static_assert(ArchTag::kMinComputeCapability >= 80);
-  static_assert(
-      ArchTag::kMinComputeCapability >= 90 ||
-      CUTE_STATIC_V(size(ClusterShape{})) == 1);
-
-  static constexpr int kBlockM = get<0>(TileShape_MNK{});
-  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
-
-  using GmemTiledCopyOTMA = cute::SM90_TMA_STORE;
-
-  // These are for storing the output tensor without TMA (e.g., for setting
-  // output to zero)
-  static constexpr int kGmemElemsPerStore =
-      sizeof(cute::uint128_t) / sizeof(Element);
-  static_assert(
-      kHeadDim % kGmemElemsPerStore == 0,
-      "Headdim must be a multiple of kGmemElemsPerStore");
-  // We want each "row" to have 64 elements (128 bytes, i.e. 1 cache line). We
-  // want each thread to have 4 elements in the M direction and 2 elements in
-  // the K direction. In the case of PackGQA, this reduces the number of times
-  // we need to call divmod.
-  static constexpr int kBytePerRow = kHeadDim * sizeof(Element);
-  static constexpr int kBlockKGmem =
-      (kBytePerRow % 128 == 0 ? 128 : (kBytePerRow % 64 == 0 ? 64 : 32)) /
-      sizeof(Element);
-  // static constexpr int kBlockKGmem = kHeadDim % 128 == 0 ? 128 : (kHeadDim %
-  // 64 == 0 ? 64 : 32); static constexpr int kGmemThreadsPerRow =
-  // cutlass::gcd(kHeadDim / kGmemElemsPerStore, NumEpilogueThreads);
-  static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerStore;
-  // If PackGQA, we split the work of compute O_ptr among threads in the same
-  // row, so we need this to within a warp
-  static_assert(cutlass::NumThreadsPerWarp % kGmemThreadsPerRow == 0);
-  static_assert(
-      NumEpilogueThreads % kGmemThreadsPerRow == 0,
-      "NumEpilogueThreads must be a multiple of kGmemThreadsPerRow");
-  using GmemLayoutAtom = Layout<
-      Shape<
-          Int<NumEpilogueThreads / kGmemThreadsPerRow>,
-          Int<kGmemThreadsPerRow>>,
-      Stride<Int<kGmemThreadsPerRow>, _1>>;
-  static_assert(
-      kBlockM % CUTE_STATIC_V(shape<0>(GmemLayoutAtom{})) == 0,
-      "kBlockM must be a multiple of NumEpilogueThreads / kGmemThreadsPerRow");
-  using GmemTiledCopyO = decltype(make_tiled_copy(
-      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
-      GmemLayoutAtom{},
-      Layout<Shape<_1, Int<kGmemElemsPerStore>>>{})); // Val layout, 8 or 16
-                                                      // vals per store
-
-  using SmemLayoutAtomOTMA =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               GMMA::Major::K,
-               Element,
-               decltype(cute::get<0>(TileShape_MNK{})),
-               decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutOTMA = decltype(tile_to_shape(
-      SmemLayoutAtomOTMA{},
-      select<0, 2>(TileShape_MNK{})));
-  static constexpr int kSwizzle = kBlockKGmem == 128
-      ? 4
-      : (kBlockKGmem == 64 ? 3 : (kBlockKGmem == 32 ? 2 : 1));
-  static constexpr int kSwizzleBase =
-      sizeof(Element) == 4 ? 2 : (sizeof(Element) == 2 ? 3 : 4);
-  using SmemLayoutAtomO = decltype(composition(
-      Swizzle<kSwizzle, kSwizzleBase, kSwizzleBase>{},
-      Layout<Shape<_8, Int<kBlockKGmem>>, Stride<Int<kBlockKGmem>, _1>>{}));
-  using SmemLayoutOSTS =
-      decltype(tile_to_shape(SmemLayoutAtomO{}, select<0, 2>(TileShape_MNK{})));
-  using SmemLayoutO = std::conditional_t<
-      ArchTag::kMinComputeCapability >= 90,
-      SmemLayoutOTMA,
-      SmemLayoutOSTS>;
-
-  using ShapeO =
-      cute::Shape<int32_t, int32_t, int32_t, int32_t, int32_t>; // (seqlen_q, d,
-                                                                // head, batch,
-                                                                // num_splits)
-  using StrideO = cute::Stride<int64_t, _1, int64_t, int64_t, int64_t>;
-  // ((qhead_per_khead, seqlen_q), d, nheads, batch, num_splits)
-  using ShapeOPacked = ShapeO;
-  using StrideOPacked = StrideO;
-  // ((qhead_per_khead, seqlen_q), nheads, batch, num_splits)
-  using StrideLSE =
-      cute::Stride<_1, int64_t, int64_t, int64_t>; // (seqlen_q, head, batch,
-  // num_splits)
-  using ShapeLSEPacked = cute::Shape<int32_t, int32_t, int32_t, int32_t>;
-  using StrideLSEPacked = StrideLSE;
-  using EpilogueTile_MN = decltype(select<0, 1>(TileShape_MNK{}));
-  using CopyOpR2S = std::conditional_t<
-      ArchTag::kMinComputeCapability >= 90,
-      // cute::SM90_U32x4_STSM_N if Element size is 2 bytes (fp16, bf16)
-      decltype(cutlass::epilogue::collective::detail::
-                   sm90_get_smem_store_op_for_accumulator<
-                       StrideO,
-                       Element,
-                       EpilogueTile_MN>()),
-      AutoVectorizingCopyWithAssumedAlignment<128>>;
-  using SmemCopyAtomO = Copy_Atom<CopyOpR2S, Element>;
-
-  // static constexpr size_t SmemAlignmentO =
-  // cutlass::detail::alignment_for_swizzle(SmemLayoutO{});
-  // static_assert(SmemAlignmentO >= 128, "Require at least 128B alignment");
-  // struct TensorStorage : cute::aligned_struct<SmemAlignmentO> {
-  //     cute::array_aligned<Element, Use_smem ? cute::cosize_v<SmemLayoutO> :
-  //     0, SmemAlignmentO> smem_o;
-  // };
-  struct TensorStorage : cute::aligned_struct<128> {
-    cute::array_aligned<Element, Use_smem ? cute::cosize_v<SmemLayoutO> : 0>
-        smem_o;
-  };
-
-  using TMA_O = std::conditional_t<
-      Use_TMA_O,
-      decltype(make_tma_copy(
-          GmemTiledCopyOTMA{},
-          make_tensor(
-              make_gmem_ptr(static_cast<Element*>(nullptr)),
-              ShapeO{},
-              StrideO{}),
-          SmemLayoutOTMA{},
-          select<0, 2>(TileShape_MNK{}),
-          _1{})), // no mcast for O
-      std::nullptr_t>;
-
-  // Host side kernel arguments
-  struct Arguments {
-    Element* ptr_O;
-    ShapeO const shape_O;
-    StrideO const stride_O;
-    int32_t const nheads;
-    int32_t const num_softmax_heads;
-    StrideLSE const stride_lse;
-    float* ptr_lse = nullptr;
-    int const* seq_offsets = nullptr;
-  };
-
-  // Device side kernel params
-  struct Params {
-    Element* ptr_O;
-    ShapeO const shape_O;
-    StrideO const stride_O;
-    ShapeOPacked const shape_O_packed;
-    StrideOPacked const stride_O_packed;
-    float* ptr_lse;
-    StrideLSE const stride_lse;
-    ShapeLSEPacked const shape_lse_packed;
-    StrideLSEPacked const stride_lse_packed;
-    TMA_O tma_store_O;
-    int const* seq_offsets = nullptr;
-  };
-
-  static Params to_underlying_arguments(Arguments const& args) {
-    Tensor mO =
-        make_tensor(make_gmem_ptr(args.ptr_O), args.shape_O, args.stride_O);
-    TMA_O tma_store_O = [&] {
-      if constexpr (Use_TMA_O) {
-        return make_tma_copy(
-            GmemTiledCopyOTMA{},
-            mO,
-            SmemLayoutO{},
-            select<0, 2>(TileShape_MNK{}),
-            _1{}); // no mcast
-      } else {
-        return nullptr;
-      }
-    }();
-    // If PackGQA, reshape O to be ((qhead_per_khead, seqlen_q), head_size,
-    // nhead_k, batch_size, num_splits)
-    int const qhead_per_khead = 1;
-    auto const shape_O_packed = cute::conditional_return<true>(
-        args.shape_O,
-        make_shape(
-            make_shape(qhead_per_khead, get<0>(args.shape_O)),
-            get<1>(args.shape_O),
-            args.nheads,
-            get<3>(args.shape_O),
-            get<4>(args.shape_O)));
-    auto const stride_O_packed = cute::conditional_return<true>(
-        args.stride_O,
-        make_stride(
-            make_stride(get<2>(args.stride_O), get<0>(args.stride_O)),
-            get<1>(args.stride_O),
-            get<2>(args.stride_O) * qhead_per_khead,
-            get<3>(args.stride_O),
-            get<4>(args.stride_O)));
-    auto const shape_lse_packed = select<0, 2, 3, 4>(args.shape_O);
-    auto const stride_lse_packed = args.stride_lse;
-    return {
-        args.ptr_O,
-        args.shape_O,
-        args.stride_O,
-        shape_O_packed,
-        stride_O_packed,
-        args.ptr_lse,
-        args.stride_lse,
-        shape_lse_packed,
-        stride_lse_packed,
-        tma_store_O,
-        args.seq_offsets};
-  }
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
-  /// performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& params) {
-    if constexpr (Use_TMA_O) {
-      cute::prefetch_tma_descriptor(params.tma_store_O.get_tma_descriptor());
-    }
-  }
-
-  template <typename SharedStorage, typename FrgTensorO, typename TiledMma>
-  CUTLASS_DEVICE void store(
-      Params const& params,
-      FrgTensorO const& tOrO,
-      SharedStorage& shared_storage,
-      TiledMma tiled_mma,
-      int thread_idx,
-      cute::tuple<int32_t, int32_t, int32_t, int32_t> const& block_coord) {
-    auto [m_block, bidh, bidb, split_idx] = block_coord;
-    Tensor sO = make_tensor(
-        make_smem_ptr(shared_storage.tensors.epilogue.smem_o.data()),
-        SmemLayoutO{});
-    // Tensor sO_pi = cute::as_position_independent_swizzle_tensor(sO);
-
-    Tensor tOrO_out = make_tensor_like<Element>(tOrO);
-    hstu::convert_type_out(tOrO, tOrO_out);
-    if constexpr (
-        FP8PermuteCol && (sizeof(Element) == 2 || sizeof(Element) == 4)) {
-      hstu::permute_output_fp8_Vcolmajor(tOrO_out);
-    }
-
-    // Make sure all WGs have finished reading V
-    // Technically we don't need this if we're not using smem, but the mainloop
-    // makes the assumption that all epilogue threads sync at least once during
-    // the epilogue (so that we can start loading Q with cp.async if we need).
-    hstu::named_barrier_sync(
-        NumEpilogueThreads,
-        cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
-
-    // Step 1: Write O from rmem -> smem
-    if constexpr (Use_smem) {
-      auto smem_tiled_copy_O = make_tiled_copy_C(SmemCopyAtomO{}, tiled_mma);
-      auto smem_thr_copy_O = smem_tiled_copy_O.get_thread_slice(thread_idx);
-      Tensor taccOrO =
-          smem_thr_copy_O.retile_S(tOrO_out); // ((Atom,AtomNum), MMA_M, MMA_N)
-      Tensor taccOsO =
-          smem_thr_copy_O.partition_D(sO); // ((Atom,AtomNum),PIPE_M,PIPE_N)
-      // Tensor taccOsO = smem_thr_copy_O.partition_D(sO_pi);     //
-      // ((Atom,AtomNum),PIPE_M,PIPE_N)
-      cute::copy(smem_tiled_copy_O, taccOrO, taccOsO);
-      if constexpr (Use_TMA_O) {
-        cutlass::arch::fence_view_async_shared(); // ensure smem writes are
-                                                  // visible to TMA
-        cutlass::arch::NamedBarrier::arrive(
-            NumEpilogueThreads + cutlass::NumThreadsPerWarp,
-            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
-      } else {
-        hstu::named_barrier_sync(
-            NumEpilogueThreads,
-            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
-      }
-    } else {
-      if constexpr (ArchTag::kMinComputeCapability >= 90) {
-#pragma unroll
-        for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) {
-          shared_storage.pipelines.barrier_O.arrive(cta_id);
-        }
-      }
-    }
-
-    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info{
-        bidb, size<0>(params.shape_O), params.seq_offsets};
-    int offset_o = seqlen_info.offset;
-    int seqlen_o = seqlen_info.seqlen;
-
-    // Step 2: Write LSE from rmem -> gmem
-    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
-    // (MMA,MMA_M,MMA_K)
-    Tensor taccOcO = thread_mma.partition_C(
-        cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
-    static_assert(decltype(size<0, 0>(taccOcO))::value == 2);
-    static_assert(decltype(size<0, 1>(taccOcO))::value == 2);
-    Tensor taccOcO_rowcol = make_tensor(
-        taccOcO.data(), hstu::convert_layout_acc_rowcol(taccOcO.layout()));
-    Tensor taccOcO_row = taccOcO_rowcol(_, _0{});
-    // Step 3: Write O from smem -> gmem
-    if constexpr (Use_TMA_O) {
-      Tensor mO = params.tma_store_O.get_tma_tensor(params.shape_O)(
-          _, _, bidh, bidb, split_idx);
-      Tensor gO = local_tile(
-          mO,
-          select<0, 2>(TileShape_MNK{}),
-          make_coord(m_block, _0{})); // (M, K)
-      auto block_tma_O = params.tma_store_O.get_slice(_0{});
-      Tensor tOgO = block_tma_O.partition_D(gO); // (TMA, TMA_M, TMA_K)
-      Tensor tOsO = block_tma_O.partition_S(sO); // (TMA, TMA_M, TMA_K)
-      int warp_idx_sync =
-          __shfl_sync(0xffffffff, thread_idx / cutlass::NumThreadsPerWarp, 0);
-      if (warp_idx_sync ==
-          NumEpilogueThreads / cutlass::NumThreadsPerWarp - 1) {
-        cutlass::arch::NamedBarrier::sync(
-            NumEpilogueThreads + cutlass::NumThreadsPerWarp,
-            cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
-        if (cute::elect_one_sync()) {
-          cute::copy(params.tma_store_O, tOsO, tOgO);
-          tma_store_arrive();
-          tma_store_wait<0>();
-#pragma unroll
-          for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) {
-            shared_storage.pipelines.barrier_O.arrive(cta_id);
-          }
-        }
-      }
-    } else { // Don't use TMA in Jagged case since we don't want to overwrite
-             // the output of another sequence
-      Tensor mO = make_tensor(
-          make_gmem_ptr(params.ptr_O + offset_o * get<0>(params.stride_O)),
-          params.shape_O_packed,
-          params.stride_O_packed)(_, _, bidh, !Jagged ? bidb : 0, split_idx);
-      Tensor gO = local_tile(
-          mO,
-          select<0, 2>(TileShape_MNK{}),
-          make_coord(m_block, _0{})); // (M, K)
-      // if (thread_idx == 0) { printf("Before O write, m_block: %d, bidh: %d,
-      // bidb: %d, split_idx: %d, offset_o: %d, seqlen_o: %d, mO_addr = %p, addr
-      // diff = %d\n", m_block, bidh, bidb, split_idx, offset_o, seqlen_o,
-      // mO.data(), reinterpret_cast<int>(&mO(0)) -
-      // reinterpret_cast<int>(params.ptr_O)); }
-      if constexpr (Use_smem) {
-        GmemTiledCopyO gmem_tiled_copy_O;
-        auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
-        Tensor tOsO =
-            gmem_thr_copy_O.partition_S(sO); // ((Atom,AtomNum),ATOM_M,ATOM_N)
-        // Tensor tOsO = gmem_thr_copy_O.partition_S(sO_pi);        //
-        // ((Atom,AtomNum),ATOM_M,ATOM_N)
-        Tensor tOrO = make_fragment_like(tOsO);
-        cute::copy(gmem_tiled_copy_O, tOsO, tOrO);
-        if constexpr (ArchTag::kMinComputeCapability >= 90) {
-          cutlass::arch::fence_view_async_shared(); // ensure smem reads are
-                                                    // done before next TMA to
-                                                    // smem_v
-#pragma unroll
-          for (uint32_t cta_id = 0; cta_id < size(ClusterShape{}); ++cta_id) {
-            shared_storage.pipelines.barrier_O.arrive(cta_id);
-          }
-        }
-        // (BLK_M,BLK_K) -> (blk_m,blk_k)
-        Tensor tOcO = gmem_thr_copy_O.partition_D(
-            cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
-        Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOsO)));
-#pragma unroll
-        for (int k = 0; k < size(tOpO); ++k) {
-          tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O);
-        }
-        Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
-        // Clear_OOB_K must be false since we don't want to write zeros to
-        // gmem
-        hstu::copy<
-            /*Is_even_MN=*/false,
-            /*Is_even_K=*/false,
-            /*Clear_OOB_MN=*/false,
-            /*Clear_OOB_K=*/false>(
-            gmem_tiled_copy_O,
-            tOrO,
-            tOgO,
-            tOcO,
-            tOpO,
-            seqlen_o - m_block * kBlockM);
-      } else {
-        // We already arrived on barrier_O earlier
-        static constexpr int kGmemElemsPerStoreDirect = 2;
-        cute::Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>
-            gmem_copy_direct;
-        // Reshape acc from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M),
-        // ncol=(2, V, MMA_N))
-        Tensor tOrO_rowcol = make_tensor(
-            tOrO_out.data(), hstu::convert_layout_acc_rowcol(tOrO.layout()));
-        Tensor tOrO_copy = cute::tiled_divide(
-            tOrO_rowcol, Shape<_1, Int<kGmemElemsPerStoreDirect>>{});
-        Tensor tOgO = thread_mma.partition_C(gO);
-        Tensor tOgO_rowcol = make_tensor(
-            tOgO.data(), hstu::convert_layout_acc_rowcol(tOgO.layout()));
-        Tensor tOgO_copy = cute::tiled_divide(
-            tOgO_rowcol, Shape<_1, Int<kGmemElemsPerStoreDirect>>{});
-        Tensor taccOcO_col = taccOcO_rowcol(_0{}, _);
-#pragma unroll
-        for (int m = 0; m < size(taccOcO_row); ++m) {
-          if (get<0>(taccOcO_row(m)) < seqlen_o - m_block * kBlockM) {
-#pragma unroll
-            for (int k = 0; k < size(taccOcO_col) / kGmemElemsPerStoreDirect;
-                 ++k) {
-              if (get<1>(taccOcO_col(k * kGmemElemsPerStoreDirect)) <
-                  get<1>(params.shape_O)) {
-                cute::copy(
-                    gmem_copy_direct, tOrO_copy(_, m, k), tOgO_copy(_, m, k));
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-
-  template <typename FrgTensorLSE, typename TiledMma>
-  CUTLASS_DEVICE void store_softmax(
-      Params const& params,
-      FrgTensorLSE const& lse,
-      TiledMma tiled_mma,
-      int thread_idx,
-      cute::tuple<int32_t, int32_t, int32_t, int32_t> const& block_coord) {
-    auto [m_block, bidh, bidb, split_idx] = block_coord;
-    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info{
-        bidb, size<0>(params.shape_O), params.seq_offsets};
-    int offset_o = seqlen_info.offset;
-    int seqlen_o = seqlen_info.seqlen;
-    // Step 2: Write LSE from rmem -> gmem
-    auto thread_mma = tiled_mma.get_thread_slice(thread_idx);
-    // (MMA,MMA_M,MMA_K)
-    Tensor taccOcO = thread_mma.partition_C(
-        cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
-    static_assert(decltype(size<0, 0>(taccOcO))::value == 2);
-    static_assert(decltype(size<0, 1>(taccOcO))::value == 2);
-    Tensor taccOcO_rowcol = make_tensor(
-        taccOcO.data(), hstu::convert_layout_acc_rowcol(taccOcO.layout()));
-    Tensor taccOcO_row = taccOcO_rowcol(_, _0{});
-    CUTE_STATIC_ASSERT_V(size(lse) == size(taccOcO_row)); // MMA_M
-    Tensor mLSE = make_tensor(
-        make_gmem_ptr(params.ptr_lse + offset_o * get<0>(params.stride_lse)),
-        params.shape_lse_packed,
-        params.stride_lse_packed)(_, bidh, !Jagged ? bidb : 0, 0);
-#pragma unroll
-    for (int mi = 0; mi < size(lse); ++mi) {
-      int const row = m_block * kBlockM + get<0>(taccOcO_row(mi));
-      if (get<1>(taccOcO_row(_0{})) == 0 && row < seqlen_o) {
-        mLSE(row) = lse(mi);
-      }
-    }
-  }
-
-  CUTLASS_DEVICE void store_tail() {
-    // Don't need to do tma_store_wait<0>() here since we already did in @store
-  }
-
-  // Write 0 to output and -inf to LSE
-  template <bool Clear_O = true>
-  CUTLASS_DEVICE void store_zero(
-      Params const& params,
-      int thread_idx,
-      cute::tuple<int32_t, int32_t, int32_t, int32_t> const& block_coord) {
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    auto [m_block, bidh, bidb, split_idx] = block_coord;
-    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info{
-        bidb, size<0>(params.shape_O), params.seq_offsets};
-    int offset_o = seqlen_info.offset;
-    int seqlen_o = seqlen_info.seqlen;
-    Tensor mO = make_tensor(
-        make_gmem_ptr(params.ptr_O + offset_o * get<0>(params.stride_O)),
-        params.shape_O_packed,
-        params.stride_O_packed)(_, _, bidh, !Jagged ? bidb : 0, split_idx);
-
-    static_assert(kBlockM <= NumEpilogueThreads);
-    if constexpr (!Clear_O) {
-      return;
-    }
-
-    GmemTiledCopyO gmem_tiled_copy_O;
-    auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
-    Tensor tOcO = gmem_thr_copy_O.partition_D(
-        cute::make_identity_tensor(select<0, 2>(TileShape_MNK{})));
-    Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOcO)));
-#pragma unroll
-    for (int k = 0; k < size(tOpO); ++k) {
-      tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O);
-    }
-    Tensor gO = local_tile(
-        mO, select<0, 2>(TileShape_MNK{}), make_coord(m_block, _0{})); // (M, K)
-    Tensor tOgO = gmem_thr_copy_O.partition_D(gO);
-    Tensor tOrO = make_fragment_like(tOgO);
-    cute::clear(tOrO);
-    // Clear_OOB_K must be false since we don't want to write zeros to gmem
-    hstu::copy<
-        /*Is_even_MN=*/false,
-        /*Is_even_K=*/false,
-        /*Clear_OOB_MN=*/false,
-        /*Clear_OOB_K=*/false>(
-        gmem_tiled_copy_O,
-        tOrO,
-        tOgO,
-        tOcO,
-        tOpO,
-        seqlen_o - m_block * kBlockM);
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h
deleted file mode 100644
index ef37e3408..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cuda.h>
-#include <vector>
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-namespace hstu {
-
-struct Qkv_params {
-  using index_t = int64_t;
-  // The QKV matrices.
-  void* __restrict__ q_ptr;
-  void* __restrict__ k_ptr;
-  void* __restrict__ v_ptr;
-
-  // The stride between rows of the Q, K and V matrices.
-  index_t q_batch_stride;
-  index_t k_batch_stride;
-  index_t v_batch_stride;
-  index_t q_row_stride;
-  index_t k_row_stride;
-  index_t v_row_stride;
-  index_t q_head_stride;
-  index_t k_head_stride;
-  index_t v_head_stride;
-  index_t v_dim_stride;
-
-  // The number of heads.
-  int h;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct Flash_fwd_params : public Qkv_params {
-  using index_t = int64_t;
-
-  // The O matrix (output).
-  void* __restrict__ o_ptr;
-
-  // The stride between rows of O.
-  index_t o_batch_stride;
-  index_t o_row_stride;
-  index_t o_head_stride;
-
-  // For FP8 scaling
-  float* __restrict__ q_descale_ptr;
-  float* __restrict__ k_descale_ptr;
-  float* __restrict__ v_descale_ptr;
-  index_t q_descale_batch_stride;
-  index_t q_descale_head_stride;
-  index_t k_descale_batch_stride;
-  index_t k_descale_head_stride;
-  index_t v_descale_batch_stride;
-  index_t v_descale_head_stride;
-
-  // The dimensions.
-  int b, max_kv_len, max_q_len, qk_d, v_d, total_seq_len_q, total_seq_len_kv;
-
-  // groups
-  int num_groups, batch_size_per_group;
-  int* __restrict__ max_seq_len_tensor;
-  int* __restrict__ contextual_seq_len_tensor;
-  int* __restrict__ max_attn_len_tensor;
-  int* __restrict__ min_full_attn_seq_len_tensor;
-
-  // The scaling factors for the kernel.
-  float alpha;
-
-  int* __restrict__ seq_offsets;
-  int* __restrict__ seq_offsets_q;
-  float* __restrict__ softmax_lse;
-  int* __restrict__ num_targets;
-  float* __restrict__ attn_scale;
-
-  // Local window size
-  int max_attn_len, contextual_seq_len, min_full_attn_seq_len,
-      num_softmax_heads;
-
-  // Pointer to the RNG seed (idx 0) and offset (idx 1).
-  uint64_t* rng_state;
-
-  bool is_bf16;
-  bool is_fp32;
-  bool is_e4m3;
-  bool is_causal;
-  bool is_local;
-  bool has_contexual_mask;
-  bool scalar_scale;
-  bool training;
-
-  int* __restrict__ tile_count_semaphore;
-
-  int arch;
-  int num_sm;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-struct Flash_bwd_params : public Flash_fwd_params {
-  using index_t = int64_t;
-
-  // The dO and dQKV matrices.
-  void* __restrict__ do_ptr;
-  void* __restrict__ dq_ptr;
-  void* __restrict__ dk_ptr;
-  void* __restrict__ dv_ptr;
-  float* __restrict__ softmax_lse_log2;
-  float* __restrict__ softmax_d;
-
-  // To accumulate dQ
-  void* __restrict__ dq_accum_ptr;
-  int* __restrict__ dq_semaphore;
-
-  // The stride between rows of the dO, dQ, dK and dV matrices.
-  index_t do_batch_stride;
-  index_t do_row_stride;
-  index_t do_head_stride;
-  index_t dq_batch_stride;
-  index_t dk_batch_stride;
-  index_t dv_batch_stride;
-  index_t dq_row_stride;
-  index_t dk_row_stride;
-  index_t dv_row_stride;
-  index_t dq_head_stride;
-  index_t dk_head_stride;
-  index_t dv_head_stride;
-
-  int* __restrict__ sort_by_length_indices;
-
-  int max_q_len_rounded, qk_d_rounded, v_d_rounded;
-
-  bool deterministic;
-  index_t dq_accum_split_stride;
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <int Arch, typename T, int Headdim, bool Softmax>
-void run_mha_fwd_(Flash_fwd_params& params, cudaStream_t stream);
-template <int Arch, typename T, int Headdim, bool Softmax>
-void run_mha_bwd_(Flash_bwd_params& params, cudaStream_t stream);
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp
deleted file mode 100644
index 389e6620f..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api.cpp
+++ /dev/null
@@ -1,322 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#include <ATen/cuda/CUDAContext.h>
-#include <Python.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <cutlass/numeric_types.h>
-#include <torch/csrc/autograd/custom_function.h>
-#include <torch/csrc/autograd/function.h>
-#include <torch/library.h> // @manual
-#include <torch/nn/functional.h>
-#include "flash_common.h"
-
-extern "C" {
-/* Creates a dummy empty _C module that can be imported from Python.
-    The import from Python will load the .so consisting of this file
-    in this extension, so that the TORCH_LIBRARY static initializers
-    below are run. */
-PyObject* PyInit__C(void) {
-  static struct PyModuleDef module_def = {
-      PyModuleDef_HEAD_INIT,
-      "_C", /* name of module */
-      NULL, /* module documentation, may be NULL */
-      -1, /* size of per-interpreter state of the module,
-              or -1 if the module keeps state in global variables. */
-      NULL, /* methods */
-  };
-  return PyModule_Create(&module_def);
-}
-}
-
-namespace hstu {
-
-class HSTUFlashAttentionFunctionGPU
-    : public torch::autograd::Function<HSTUFlashAttentionFunctionGPU> {
- public:
-  static at::Tensor forward(
-      torch::autograd::AutogradContext* ctx,
-      int64_t max_seq_len,
-      double alpha,
-      at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-      at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-      at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-      const std::optional<at::Tensor>& seq_offsets,
-      bool causal,
-      const std::optional<at::Tensor>& num_targets,
-      const std::optional<at::Tensor>& attn_scale,
-      int64_t max_attn_len,
-      int64_t min_full_attn_seq_len,
-      int64_t contextual_seq_len,
-      const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-      const std::optional<at::Tensor>& k_descale, // (b, h_k)
-      const std::optional<at::Tensor>& v_descale, // (b, h_k)
-      bool sort_by_length,
-      bool deterministic,
-      const int64_t sm_margin,
-      int64_t max_q_len,
-      const std::optional<at::Tensor>& seq_offsets_q,
-      int64_t num_softmax_heads,
-      bool training,
-      const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
-      const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
-      const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
-      const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
-          std::nullopt,
-      int64_t num_groups = 1) {
-    ctx->saved_data["max_seq_len"] = max_seq_len;
-    ctx->saved_data["alpha"] = alpha;
-    ctx->saved_data["causal"] = causal;
-    ctx->saved_data["max_attn_len"] = max_attn_len;
-    ctx->saved_data["min_full_attn_seq_len"] = min_full_attn_seq_len;
-    ctx->saved_data["contextual_seq_len"] = contextual_seq_len;
-    ctx->saved_data["deterministic"] = deterministic;
-    ctx->saved_data["sort_by_length"] = sort_by_length;
-    ctx->saved_data["sm_margin"] = sm_margin;
-    ctx->saved_data["max_q_len"] = max_q_len;
-    ctx->saved_data["num_softmax_heads"] = num_softmax_heads;
-    ctx->saved_data["num_groups"] = num_groups;
-    auto fwd_out = hstu::hstu_mha_fwd(
-        max_seq_len, // max_seq_len
-        alpha, // alpha
-        q, // q
-        k, // k
-        v, // v
-        seq_offsets, // seq_offsets
-        causal, // causal
-        num_targets, // num_targets
-        attn_scale, // attn_scale
-        max_attn_len, // max_attn_len
-        min_full_attn_seq_len, // min_full_attn_seq_len
-        contextual_seq_len, // contextual_seq_len
-        q_descale, // q_descale
-        k_descale, // k_descale
-        v_descale, // v_descale
-        sm_margin, // sm_margin
-        max_q_len, // max_q_len
-        seq_offsets_q, // seq_offsets_q
-        num_softmax_heads, // num_softmax_heads
-        training,
-        max_seq_len_tensor,
-        contextual_seq_len_tensor,
-        max_attn_len_tensor,
-        min_full_attn_seq_len_tensor,
-        num_groups);
-    auto out = get<0>(fwd_out);
-    auto softmax_lse = get<1>(fwd_out);
-    ctx->save_for_backward(
-        {q,
-         k,
-         v,
-         out,
-         seq_offsets.value_or(at::Tensor()),
-         num_targets.value_or(at::Tensor()),
-         attn_scale.value_or(at::Tensor()),
-         seq_offsets_q.value_or(at::Tensor()),
-         softmax_lse.value_or(at::Tensor()),
-         max_seq_len_tensor.value_or(at::Tensor()),
-         contextual_seq_len_tensor.value_or(at::Tensor()),
-         max_attn_len_tensor.value_or(at::Tensor()),
-         min_full_attn_seq_len_tensor.value_or(at::Tensor())});
-    return out;
-  }
-
-  static torch::autograd::variable_list backward(
-      torch::autograd::AutogradContext* ctx,
-      torch::autograd::variable_list grad_outputs) {
-    auto saved_tensors = ctx->get_saved_variables();
-    auto saved_data = ctx->saved_data;
-    auto q = saved_tensors[0];
-    auto k = saved_tensors[1];
-    auto v = saved_tensors[2];
-    auto out = saved_tensors[3];
-    auto seq_offsets = saved_tensors[4];
-    auto num_targets = saved_tensors[5];
-    auto attn_scale = saved_tensors[6];
-    auto seq_offsets_q = saved_tensors[7];
-    auto softmax_lse = saved_tensors[8];
-    auto max_seq_len_tensor = saved_tensors[9];
-    auto contextual_seq_len_tensor = saved_tensors[10];
-    auto max_attn_len_tensor = saved_tensors[11];
-    auto min_full_attn_seq_len_tensor = saved_tensors[12];
-    auto seq_offsets_opt =
-        seq_offsets.defined() ? std::optional(seq_offsets) : std::nullopt;
-    auto num_targets_opt =
-        num_targets.defined() ? std::optional(num_targets) : std::nullopt;
-    auto attn_scale_opt =
-        attn_scale.defined() ? std::optional(attn_scale) : std::nullopt;
-    auto seq_offsets_q_opt =
-        seq_offsets_q.defined() ? std::optional(seq_offsets_q) : std::nullopt;
-    auto softmax_lse_opt =
-        softmax_lse.defined() ? std::optional(softmax_lse) : std::nullopt;
-    auto max_seq_len_tensor_opt = max_seq_len_tensor.defined()
-        ? std::optional(max_seq_len_tensor)
-        : std::nullopt;
-    auto contextual_seq_len_tensor_opt = contextual_seq_len_tensor.defined()
-        ? std::optional(contextual_seq_len_tensor)
-        : std::nullopt;
-    auto max_attn_len_tensor_opt = max_attn_len_tensor.defined()
-        ? std::optional(max_attn_len_tensor)
-        : std::nullopt;
-    auto min_full_attn_seq_len_tensor_opt =
-        min_full_attn_seq_len_tensor.defined()
-        ? std::optional(min_full_attn_seq_len_tensor)
-        : std::nullopt;
-
-    auto dq = at::empty_like(q);
-    auto dk = at::empty_like(k);
-    auto dv = at::empty_like(v);
-
-    auto bwd_res = hstu::hstu_mha_bwd(
-        saved_data["max_seq_len"].toInt(), // max_seq_len
-        saved_data["alpha"].toDouble(), // alpha
-        grad_outputs[0], // dout
-        q, // q
-        k, // k
-        v, // v
-        dq, // dq
-        dk, // dk
-        dv, // dv
-        out, // out
-        seq_offsets_opt, // seq_offsets
-        saved_data["causal"].toBool(), // causal
-        num_targets_opt, // num_targets
-        attn_scale_opt, // attn_scale
-        saved_data["max_attn_len"].toInt(), // max_attn_len
-        saved_data["min_full_attn_seq_len"].toInt(), // min_full_attn_seq_len
-        saved_data["contextual_seq_len"].toInt(), // contextual_seq_len
-        saved_data["sort_by_length"].toBool(), // sort_by_length
-        saved_data["deterministic"].toBool(), // deterministic
-        saved_data["sm_margin"].toInt(), // sm_margin
-        saved_data["max_q_len"].toInt(), // max_q_len
-        seq_offsets_q_opt, // seq_offsets_q
-        saved_data["num_softmax_heads"].toInt(), // num_softmax_heads
-        softmax_lse_opt,
-        max_seq_len_tensor_opt,
-        contextual_seq_len_tensor_opt,
-        max_attn_len_tensor_opt,
-        min_full_attn_seq_len_tensor_opt,
-        saved_data["num_groups"].toInt());
-
-    return {
-        torch::autograd::Variable(), // max_seq_len
-        torch::autograd::Variable(), // alpha
-        bwd_res[0], // dq
-        bwd_res[1], // dk
-        bwd_res[2], // dv
-        torch::autograd::Variable(), // seq_offsets
-        torch::autograd::Variable(), // causal
-        torch::autograd::Variable(), // num_targets
-        torch::autograd::Variable(), // attn_scale
-        torch::autograd::Variable(), // max_attn_len
-        torch::autograd::Variable(), // min_full_attn_seq_len
-        torch::autograd::Variable(), // contextual_seq_len
-        torch::autograd::Variable(), // q_descale
-        torch::autograd::Variable(), // k_descale
-        torch::autograd::Variable(), // v_descale
-        torch::autograd::Variable(), // sort_by_length
-        torch::autograd::Variable(), // deterministic
-        torch::autograd::Variable(), // sm_margin
-        torch::autograd::Variable(), // max_q_len
-        torch::autograd::Variable(), // seq_offsets_q
-        torch::autograd::Variable(), // num_softmax_heads
-        torch::autograd::Variable(), // training
-        torch::autograd::Variable(), // max_seq_len_tensor
-        torch::autograd::Variable(), // contextual_seq_len_tensor
-        torch::autograd::Variable(), // max_attn_len_tensor
-        torch::autograd::Variable(), // min_full_attn_seq_len_tensor
-        torch::autograd::Variable(), // num_groups
-    };
-  }
-};
-
-at::Tensor cuda_hstu_mha(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-    const std::optional<at::Tensor>& k_descale, // (b, h_k)
-    const std::optional<at::Tensor>& v_descale, // (b, h_k)
-    bool sort_by_length,
-    bool deterministic,
-    const int64_t sm_margin = 0,
-    int64_t max_q_len = 0,
-    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
-    int64_t num_softmax_heads = 0,
-    bool training = true,
-    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
-        std::nullopt,
-    int64_t num_groups = 1) {
-  return hstu::HSTUFlashAttentionFunctionGPU::apply(
-      max_seq_len,
-      alpha,
-      q,
-      k,
-      v,
-      seq_offsets,
-      causal,
-      num_targets,
-      attn_scale,
-      max_attn_len,
-      min_full_attn_seq_len,
-      contextual_seq_len,
-      q_descale,
-      k_descale,
-      v_descale,
-      sort_by_length,
-      deterministic,
-      sm_margin,
-      max_q_len,
-      seq_offsets_q,
-      num_softmax_heads,
-      training,
-      max_seq_len_tensor,
-      contextual_seq_len_tensor,
-      max_attn_len_tensor,
-      min_full_attn_seq_len_tensor,
-      num_groups);
-}
-
-TORCH_LIBRARY_FRAGMENT(hstu, m) {
-  m.impl(
-      "hstu_mha",
-      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(cuda_hstu_mha)));
-
-  m.impl(
-      "hstu_mha_fwd",
-      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(hstu::hstu_mha_fwd)));
-
-  m.impl(
-      "hstu_mha_bwd",
-      torch::dispatch(c10::DispatchKey::CUDA, TORCH_FN(hstu::hstu_mha_bwd)));
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp
deleted file mode 100644
index c02424efe..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_api_cpu.cpp
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#include <torch/library.h> // @manual
-#include <torch/nn/functional.h>
-#include "flash_common_cpu.h"
-
-namespace hstu {
-
-at::Tensor hstu_mha_cpu(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-    const std::optional<at::Tensor>& k_descale, // (b, h_k)
-    const std::optional<at::Tensor>& v_descale, // (b, h_k)
-    bool sort_by_length,
-    bool deterministic,
-    const int64_t sm_margin = 0,
-    int64_t max_q_len = 0,
-    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
-    int64_t num_softmax_heads = 0,
-    bool training = true,
-    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
-        std::nullopt,
-    int64_t num_groups = 1) {
-  auto fwd_out = hstu::hstu_mha_fwd_dummy(
-      max_seq_len,
-      alpha,
-      q,
-      k,
-      v,
-      seq_offsets,
-      causal,
-      num_targets,
-      attn_scale,
-      max_attn_len,
-      min_full_attn_seq_len,
-      contextual_seq_len,
-      q_descale,
-      k_descale,
-      v_descale,
-      sm_margin,
-      max_q_len,
-      seq_offsets_q,
-      num_softmax_heads,
-      training);
-  return get<0>(fwd_out);
-}
-
-at::Tensor hstu_mha_meta(
-    const at::SymInt max_seq_len,
-    double alpha,
-    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-    const std::optional<at::Tensor>& k_descale, // (b, h_k)
-    const std::optional<at::Tensor>& v_descale, // (b, h_k)
-    bool sort_by_length,
-    bool deterministic,
-    const int64_t sm_margin = 0,
-    int64_t max_q_len = 0,
-    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
-    int64_t num_softmax_heads = 0,
-    bool training = true,
-    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
-        std::nullopt,
-    int64_t num_groups = 1) {
-  auto fwd_out = hstu::hstu_mha_fwd_meta(
-      max_seq_len,
-      alpha,
-      q,
-      k,
-      v,
-      seq_offsets,
-      causal,
-      num_targets,
-      attn_scale,
-      max_attn_len,
-      min_full_attn_seq_len,
-      contextual_seq_len,
-      q_descale,
-      k_descale,
-      v_descale,
-      sm_margin,
-      max_q_len,
-      seq_offsets_q,
-      num_softmax_heads,
-      training);
-  return get<0>(fwd_out);
-}
-
-// CPU-only implementation that registers under main hstu namespace
-// This provides fallback implementations when GPU code is not compiled
-TORCH_LIBRARY_FRAGMENT(hstu, m) {
-  // Only register operators if they haven't been registered by GPU code
-  // This allows CPU-only builds to work while GPU builds use GPU
-  // implementations
-
-  m.def(
-      "hstu_mha_fwd("
-      "SymInt max_seq_len, "
-      "float alpha, "
-      "Tensor q, "
-      "Tensor k, "
-      "Tensor v, "
-      "Tensor? seq_offsets, "
-      "bool causal, "
-      "Tensor? num_targets, "
-      "Tensor? attn_scale, "
-      "int max_attn_len, "
-      "int min_full_attn_seq_len, "
-      "int contextual_seq_len, "
-      "Tensor? q_descale, "
-      "Tensor? k_descale, "
-      "Tensor? v_descale, "
-      "int sm_margin = 0,"
-      "int max_q_len = 0,"
-      "Tensor? seq_offsets_q = None,"
-      "int num_softmax_heads = 0,"
-      "bool training = True,"
-      "Tensor? max_seq_len_tensor = None,"
-      "Tensor? contextual_seq_len_tensor = None,"
-      "Tensor? max_attn_len_tensor = None,"
-      "Tensor? min_full_attn_seq_len_tensor = None,"
-      "int num_groups = 1"
-      ") -> (Tensor, Tensor?)");
-
-  m.def(
-      "hstu_mha_bwd("
-      "int max_seq_len, "
-      "float alpha, "
-      "Tensor dout, "
-      "Tensor q, "
-      "Tensor k, "
-      "Tensor v, "
-      "Tensor dq, "
-      "Tensor dk, "
-      "Tensor dv, "
-      "Tensor out, "
-      "Tensor? seq_offsets, "
-      "bool causal, "
-      "Tensor? num_targets, "
-      "Tensor? attn_scale, "
-      "int max_attn_len, "
-      "int min_full_attn_seq_len, "
-      "int contextual_seq_len, "
-      "bool sort_by_length,"
-      "bool deterministic,"
-      "int sm_margin = 0,"
-      "int max_q_len = 0,"
-      "Tensor? seq_offsets_q = None,"
-      "int num_softmax_heads = 0,"
-      "Tensor? softmax_lse = None,"
-      "Tensor? max_seq_len_tensor = None,"
-      "Tensor? contextual_seq_len_tensor = None,"
-      "Tensor? max_attn_len_tensor = None,"
-      "Tensor? min_full_attn_seq_len_tensor = None,"
-      "int num_groups = 1"
-      ") -> Tensor[]");
-
-  m.def(
-      "hstu_mha("
-      "SymInt max_seq_len, "
-      "float alpha, "
-      "Tensor q, "
-      "Tensor k, "
-      "Tensor v, "
-      "Tensor? seq_offsets, "
-      "bool causal, "
-      "Tensor? num_targets, "
-      "Tensor? attn_scale, "
-      "int max_attn_len, "
-      "int min_full_attn_seq_len, "
-      "int contextual_seq_len, "
-      "Tensor? q_descale, "
-      "Tensor? k_descale, "
-      "Tensor? v_descale, "
-      "bool sort_by_length, "
-      "bool deterministic, "
-      "int sm_margin = 0,"
-      "int max_q_len = 0,"
-      "Tensor? seq_offsets_q = None,"
-      "int num_softmax_heads = 0,"
-      "bool training = True,"
-      "Tensor? max_seq_len_tensor = None,"
-      "Tensor? contextual_seq_len_tensor = None,"
-      "Tensor? max_attn_len_tensor = None,"
-      "Tensor? min_full_attn_seq_len_tensor = None,"
-      "int num_groups = 1"
-      ") -> Tensor");
-
-  // Register CPU implementations
-  m.impl(
-      "hstu_mha",
-      torch::dispatch(c10::DispatchKey::CPU, TORCH_FN(hstu_mha_cpu)));
-  m.impl(
-      "hstu_mha",
-      torch::dispatch(c10::DispatchKey::Meta, TORCH_FN(hstu_mha_meta)));
-
-  m.impl(
-      "hstu_mha_fwd",
-      torch::dispatch(
-          c10::DispatchKey::CPU, TORCH_FN(hstu::hstu_mha_fwd_dummy)));
-  m.impl(
-      "hstu_mha_fwd",
-      torch::dispatch(
-          c10::DispatchKey::Meta, TORCH_FN(hstu::hstu_mha_fwd_meta)));
-
-  m.impl(
-      "hstu_mha_bwd",
-      torch::dispatch(
-          c10::DispatchKey::CPU, TORCH_FN(hstu::hstu_mha_bwd_dummy)));
-}
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h
deleted file mode 100644
index 051d5141b..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_kernel_sm90.h
+++ /dev/null
@@ -1,402 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include "cute/tensor.hpp"
-
-#include <cutlass/arch/reg_reconfig.h>
-#include <cutlass/array.h>
-#include <cutlass/cutlass.h>
-#include <cutlass/kernel_hardware_info.h>
-#include <cutlass/numeric_conversion.h>
-#include <cutlass/numeric_types.h>
-#include "cutlass/pipeline/pipeline.hpp"
-
-#include "tile_scheduler.h"
-#include "utils.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    bool Softmax,
-    class CollectiveMainloop_,
-    class CollectiveEpilogue_,
-    class TileScheduler_>
-class FlashAttnBwdSm90 {
- public:
-  // Mainloop derived types
-  using CollectiveMainloop = CollectiveMainloop_;
-  using TileShape_MNK = typename CollectiveMainloop::TileShape_MNK;
-  using TiledMmaSdP = typename CollectiveMainloop::TiledMmaSdP;
-  using TiledMmadKV = typename CollectiveMainloop::TiledMmadKV;
-  using ArchTag = typename CollectiveMainloop::ArchTag;
-  using ClusterShape = typename CollectiveMainloop::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  static constexpr bool dKV_swapAB = CollectiveMainloop::dKV_swapAB;
-
-  // Epilogue derived types
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  using TileScheduler = TileScheduler_;
-  using TileSchedulerArguments = typename hstu::TileSchedulerArguments;
-  using TileSchedulerParams = typename TileScheduler::Params;
-
-  static constexpr uint32_t NumLoadWarpGroups = 1;
-  static constexpr uint32_t NumMmaWarpGroups =
-      CUTE_STATIC_V(size(TiledMmaSdP{})) / cutlass::NumThreadsPerWarpGroup;
-  static constexpr uint32_t MaxThreadsPerBlock =
-      CUTE_STATIC_V(size(TiledMmaSdP{})) +
-      (NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup);
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-  static_assert(NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
-
-  /// Register requirement for Load and Math WGs
-  static constexpr uint32_t LoadRegisterRequirement =
-      NumMmaWarpGroups == 2 ? 24 : 32;
-  static constexpr uint32_t MmaRegisterRequirement =
-      NumMmaWarpGroups == 2 ? 240 : 160;
-  // If you want to print from the producer warp, you'd need to increase the
-  // number of registers Otherwise you'll get CUDA error. static constexpr
-  // uint32_t LoadRegisterRequirement = 40; static constexpr uint32_t
-  // MmaRegisterRequirement = NumMmaWarpGroups == 2 ? 232 : 152;
-
-  // Kernel level shared memory storage
-  struct SharedStorage {
-    struct TensorStorage : cute::aligned_struct<128> {
-      union {
-        typename CollectiveMainloop::TensorStorage mainloop;
-        typename CollectiveEpilogue::TensorStorage epilogue;
-      };
-    } tensors;
-
-    struct PipelineStorage : cute::aligned_struct<16> {
-      alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_KV;
-      alignas(16) typename CollectiveMainloop::MainloopPipeline::SharedStorage
-          pipeline_q;
-      alignas(16)
-          typename CollectiveMainloop::MainloopPipeline_dO::SharedStorage
-          pipeline_do;
-      alignas(16) typename TileScheduler::SharedStorage smem_scheduler;
-    } pipelines;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  // Device side arguments
-  struct Arguments {
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    cutlass::KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-    cutlass::KernelHardwareInfo hw_info{};
-    TileSchedulerParams scheduler{};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the
-  // aliased type.
-  static Params to_underlying_arguments(Arguments const& args) {
-    CUTLASS_TRACE_HOST("to_underlying_arguments():");
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST(
-          "  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
-          args.hw_info.device_id);
-    }
-
-    CUTLASS_TRACE_HOST(
-        "to_underlying_arguments(): Setting persistent grid SM count to "
-        << sm_count);
-
-    cutlass::KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
-    return {
-        CollectiveMainloop::to_underlying_arguments(args.mainloop),
-        CollectiveEpilogue::to_underlying_arguments(args.epilogue),
-        hw_info,
-        TileScheduler::to_underlying_arguments(args.scheduler)};
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3 get_grid_shape(Params const& params) {
-    return TileScheduler::get_grid_shape(
-        params.scheduler, params.hw_info.sm_count);
-  }
-
-  static dim3 get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void operator()(Params const& params, char* smem_buf) {
-    static constexpr int NumMmaThreads =
-        NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup;
-    static constexpr int NumCopyThreads =
-        NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup;
-
-    using MainloopPipeline = typename CollectiveMainloop::MainloopPipeline;
-    using PipelineParams = typename MainloopPipeline::Params;
-    using PipelineState = typename MainloopPipeline::PipelineState;
-    using MainloopPipeline_dO =
-        typename CollectiveMainloop::MainloopPipeline_dO;
-    using PipelineParams_dO = typename MainloopPipeline_dO::Params;
-    using PipelineState_dO = typename MainloopPipeline_dO::PipelineState;
-    static constexpr bool Q_dO_same_stages =
-        std::is_same_v<MainloopPipeline, MainloopPipeline_dO>;
-
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int const lane_predicate = cute::elect_one_sync();
-    int const warp_idx = cutlass::canonical_warp_idx_sync();
-
-    // Issue Tma Descriptor Prefetch from a single thread
-    if (warp_idx == 0 && lane_predicate) {
-      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
-      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
-    }
-
-    // Obtain warp index
-    int const warp_group_thread_idx =
-        threadIdx.x % cutlass::NumThreadsPerWarpGroup;
-
-    PipelineParams pipeline_params;
-    if constexpr (Softmax) {
-      pipeline_params.transaction_bytes =
-          CollectiveMainloop::TmaTransactionBytesQ +
-          CollectiveMainloop::TmaTransactionBytesLSE;
-    } else {
-      pipeline_params.transaction_bytes =
-          CollectiveMainloop::TmaTransactionBytesQ;
-    }
-    int warp_group_idx = cutlass::canonical_warp_group_idx();
-    pipeline_params.role = warp_group_idx == 0
-        ? MainloopPipeline::ThreadCategory::Producer
-        : MainloopPipeline::ThreadCategory::Consumer;
-    pipeline_params.is_leader = warp_group_thread_idx == 0;
-    pipeline_params.num_consumers = NumMmaThreads;
-
-    if (warp_idx == 0 && lane_predicate) {
-      shared_storage.pipelines.barrier_KV.init(1 /*numThreads*/);
-    }
-    // We're counting on pipeline_q to call cutlass::arch::fence_barrier_init();
-    MainloopPipeline pipeline_q(
-        shared_storage.pipelines.pipeline_q, pipeline_params, ClusterShape{});
-    auto role_dO = warp_group_idx == 0
-        ? MainloopPipeline_dO::ThreadCategory::Producer
-        : MainloopPipeline_dO::ThreadCategory::Consumer;
-    PipelineParams_dO pipeline_params_dO{
-        pipeline_params.transaction_bytes,
-        role_dO,
-        pipeline_params.is_leader,
-        pipeline_params.num_consumers};
-    MainloopPipeline_dO pipeline_do(
-        shared_storage.pipelines.pipeline_do,
-        cute::conditional_return<Q_dO_same_stages>(
-            pipeline_params, pipeline_params_dO),
-        ClusterShape{});
-
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue;
-
-    // We need this to guarantee that the Pipeline init is visible to all
-    // producers and consumer blocks in the Cluster
-    if constexpr (size(ClusterShape{}) > 1) {
-      cute::cluster_arrive_relaxed();
-      cute::cluster_wait();
-    } else {
-      __syncthreads();
-    }
-
-    if (warp_group_idx == 0) { // Producer
-      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
-
-      int warp_idx_in_warpgroup =
-          __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
-      if (warp_idx_in_warpgroup == 0) { // Load K, V, and do TMA on Q and dO
-        PipelineState smem_pipe_write =
-            cutlass::make_producer_start_state<MainloopPipeline>();
-        PipelineState_dO smem_pipe_write_do =
-            cutlass::make_producer_start_state<MainloopPipeline_dO>();
-
-        TileScheduler scheduler(
-            reinterpret_cast<typename TileScheduler::SharedStorage*>(
-                &shared_storage.pipelines.smem_scheduler));
-        for (auto work_tile_info =
-                 scheduler.template get_initial_work</*IsProducerWarp=*/true>(
-                     params.scheduler);
-             work_tile_info.is_valid(params.scheduler);
-             work_tile_info =
-                 scheduler.template get_next_work</*IsProducerWarp=*/true>(
-                     params.scheduler, work_tile_info)) {
-          auto block_coord_ = work_tile_info.get_block_coord(params.scheduler);
-          auto [n_block, bidh, bidb, _ /*split_idx*/] = block_coord_;
-          cute::tuple<int32_t, int32_t, int32_t> block_coord = {
-              n_block, bidh, bidb};
-          auto scheduler_prefetch = [&scheduler, &params, &work_tile_info]() {
-            scheduler.prefetch_next_work(params.scheduler, work_tile_info);
-          };
-          collective_mainloop.load(
-              params.mainloop,
-              pipeline_q,
-              pipeline_do,
-              smem_pipe_write,
-              smem_pipe_write_do,
-              shared_storage,
-              scheduler_prefetch,
-              block_coord);
-        }
-        collective_mainloop.load_tail(
-            pipeline_q, pipeline_do, smem_pipe_write, smem_pipe_write_do);
-      } else if (warp_idx_in_warpgroup == 1) {
-        TileScheduler scheduler(
-            reinterpret_cast<typename TileScheduler::SharedStorage*>(
-                &shared_storage.pipelines.smem_scheduler));
-        for (auto work_tile_info =
-                 scheduler.template get_initial_work</*IsProducerWarp=*/false>(
-                     params.scheduler);
-             work_tile_info.is_valid(params.scheduler);
-             work_tile_info =
-                 scheduler.template get_next_work</*IsProducerWarp=*/false>(
-                     params.scheduler, work_tile_info)) {
-          auto block_coord_ = work_tile_info.get_block_coord(params.scheduler);
-          auto [n_block, bidh, bidb, _ /*split_idx*/] = block_coord_;
-          cute::tuple<int32_t, int32_t, int32_t> block_coord = {
-              n_block, bidh, bidb};
-          collective_mainloop.store_dq(
-              params.mainloop, shared_storage, block_coord);
-        }
-      }
-    } else { // Consumer
-      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
-
-      TileScheduler scheduler(
-          reinterpret_cast<typename TileScheduler::SharedStorage*>(
-              &shared_storage.pipelines.smem_scheduler));
-      // Initialize matmul objects.
-      TiledMmadKV tiled_mma_dKV;
-
-      PipelineState smem_pipe_read;
-      PipelineState_dO smem_pipe_read_do;
-
-      collective_mainloop.mma_init();
-      scheduler.init_consumer();
-
-      int work_idx = 0;
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (auto work_tile_info =
-               scheduler.template get_initial_work</*IsProducerWarp=*/false>(
-                   params.scheduler);
-           work_tile_info.is_valid(params.scheduler);
-           work_tile_info =
-               scheduler.template get_next_work</*IsProducerWarp=*/false>(
-                   params.scheduler, work_tile_info)) {
-        auto block_coord_ = work_tile_info.get_block_coord(params.scheduler);
-        auto [n_block, bidh, bidb, _ /*split_idx*/] = block_coord_;
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-        if (threadIdx.x == 0 || threadIdx.x == 128) {
-          std::printf(
-              "n_block: (%d), bidh: (%d), bidb: (%d), blockIdx.x: (%d), blockIdx.y: (%d), blockIdx.z: (%d)\n",
-              n_block,
-              bidh,
-              bidb,
-              blockIdx.x,
-              blockIdx.y,
-              blockIdx.z);
-        }
-#endif
-        cute::tuple<int32_t, int32_t, int32_t> block_coord = {
-            n_block, bidh, bidb};
-
-        // dK and dV output accumulator.
-        Tensor tdKrdK = partition_fragment_C(
-            tiled_mma_dKV,
-            select<!dKV_swapAB ? 1 : 2, !dKV_swapAB ? 2 : 1>(TileShape_MNK{}));
-        Tensor tdVrdV = partition_fragment_C(
-            tiled_mma_dKV,
-            select<!dKV_swapAB ? 1 : 2, !dKV_swapAB ? 2 : 1>(TileShape_MNK{}));
-
-        bool tile_valid;
-        if constexpr (Softmax) {
-          tile_valid = collective_mainloop.mma_softmax(
-              params.mainloop,
-              pipeline_q,
-              pipeline_do,
-              smem_pipe_read,
-              smem_pipe_read_do,
-              tdKrdK,
-              tdVrdV,
-              threadIdx.x - NumCopyThreads,
-              work_idx,
-              block_coord,
-              shared_storage);
-        } else {
-          tile_valid = collective_mainloop.mma(
-              params.mainloop,
-              pipeline_q,
-              pipeline_do,
-              smem_pipe_read,
-              smem_pipe_read_do,
-              tdKrdK,
-              tdVrdV,
-              threadIdx.x - NumCopyThreads,
-              work_idx,
-              block_coord,
-              shared_storage);
-        }
-        if (tile_valid) {
-          collective_epilogue.store(
-              params.epilogue,
-              tdKrdK,
-              tdVrdV,
-              shared_storage,
-              tiled_mma_dKV,
-              threadIdx.x - NumCopyThreads,
-              block_coord);
-        } else {
-          collective_epilogue.store_zero(
-              params.epilogue, threadIdx.x - NumCopyThreads, block_coord);
-        }
-      }
-      collective_epilogue.store_tail();
-    }
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h
deleted file mode 100644
index 6900852df..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_launch_template.h
+++ /dev/null
@@ -1,492 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/cluster_launch.hpp" // For ClusterLauncher
-#include "cutlass/device_kernel.h" // For device_kernel
-#include "cutlass/kernel_launch.h" // For kernel_launch
-
-#include "epilogue_bwd.h"
-#include "flash.h"
-#include "flash_bwd_kernel_sm90.h"
-#include "flash_bwd_postprocess_kernel.h"
-#include "flash_bwd_preprocess_kernel.h"
-#include "mainloop_bwd_sm90_tma_gmma_ws.h"
-#include "static_switch.h"
-#include "tile_scheduler.h"
-#include "tile_size.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    int Arch,
-    int kHeadDim,
-    int kBlockM,
-    int kBlockN,
-    typename Element,
-    bool Causal,
-    bool Local,
-    bool Contexual_mask,
-    bool Jagged,
-    bool Has_targets,
-    bool Deterministic,
-    int Stages_dO = 2,
-    int Stages_dS_or_QSm80 = 2,
-    bool SdP_swapAB = true,
-    bool dKV_swapAB = false,
-    bool dQ_swapAB = false,
-    int NumMmaWarpGroups = 2,
-    int AtomLayoutMSdP = 1,
-    int AtomLayoutNdKV = 2,
-    int AtomLayoutMdQ = 1,
-    bool V_in_regs = false,
-    bool Cross = false,
-    bool Softmax = false>
-void run_flash_bwd(hstu::Flash_bwd_params& params, cudaStream_t stream) {
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-  std::printf(
-      "[flash_bwd_launch_template] Local: (%d), Jagged: (%d), Has_targets: (%d), Causal: (%d), max_kv_len: (%d), kHeadDim: (%d), kBlockM: (%d), kBlockN: (%d)\n",
-      Local,
-      Jagged,
-      Has_targets,
-      Causal,
-      params.max_kv_len,
-      kHeadDim,
-      kBlockM,
-      kBlockN);
-#endif
-  static_assert(
-      !(Causal && Local), "Causal and Local cannot be true at the same time.");
-  using ElementAccum = float;
-  using ArchTag =
-      std::conditional_t<Arch >= 90, cutlass::arch::Sm90, cutlass::arch::Sm80>;
-
-  int const total_q_padded_rounded =
-      cute::round_up(params.total_seq_len_q + params.b * kBlockM, kBlockM);
-  int seqlen_q = !Jagged ? params.max_q_len : params.total_seq_len_q;
-  int seqlen_kv = !Jagged ? params.max_kv_len : params.total_seq_len_kv;
-  int seqlen_q_rounded =
-      !Jagged ? params.max_q_len_rounded : total_q_padded_rounded;
-  int batch = !Jagged ? params.b : 1;
-
-  using TileShape_MK = cute::Shape<Int<kBlockM>, Int<kHeadDim>>;
-  using PreprocessKernel = hstu::FlashAttnBwdPreprocess<
-      TileShape_MK,
-      Element,
-      ElementAccum,
-      ArchTag,
-      /*Clear_dQaccum=*/true,
-      Jagged,
-      Softmax>;
-  typename PreprocessKernel::Arguments preprocess_args{
-      static_cast<Element const*>(params.o_ptr),
-      {seqlen_q, params.v_d, params.h, batch}, // shape_O
-      {params.o_row_stride,
-       _1{},
-       params.o_head_stride,
-       !Jagged ? params.o_batch_stride : 0}, // stride_O
-      static_cast<Element const*>(params.do_ptr),
-      {params.do_row_stride,
-       _1{},
-       params.do_head_stride,
-       !Jagged ? params.do_batch_stride : 0}, // stride_dO
-      static_cast<float*>(params.softmax_d),
-      {seqlen_q_rounded, params.num_softmax_heads, batch}, // shape_dPsum
-      {_1{},
-       seqlen_q_rounded,
-       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
-               : 0}, // stride_dPsum
-      static_cast<float*>(params.softmax_lse),
-      {_1{},
-       seqlen_q,
-       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
-               : 0}, // stride_LSE
-      static_cast<float*>(params.softmax_lse_log2),
-      {_1{},
-       seqlen_q_rounded,
-       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
-               : 0}, // stride_LSE_log2
-      static_cast<ElementAccum*>(params.dq_accum_ptr),
-      {seqlen_q_rounded * params.qk_d_rounded,
-       params.h,
-       batch}, // shape_dQaccum
-      {_1{},
-       seqlen_q_rounded * params.qk_d_rounded,
-       !Jagged ? params.qk_d_rounded * params.max_q_len_rounded * params.h
-               : 0}, // stride_dQaccum
-      params.b,
-      params.h,
-      params.num_softmax_heads,
-      params.max_q_len,
-      params.dq_semaphore,
-      Cross ? params.seq_offsets_q : params.seq_offsets};
-  typename PreprocessKernel::Params preprocess_params =
-      PreprocessKernel::to_underlying_arguments(preprocess_args);
-  int num_m_block = cute::ceil_div(params.max_q_len, kBlockM);
-  dim3 grid_m(num_m_block, params.h, params.b);
-  cutlass::kernel_launch<PreprocessKernel>(
-      grid_m,
-      PreprocessKernel::MaxThreadsPerBlock,
-      PreprocessKernel::SharedStorageSize,
-      stream,
-      preprocess_params,
-      false /*launch_with_pdl*/);
-  CHECK_CUDA_KERNEL_LAUNCH();
-
-  using TileShape_MNK = cute::Shape<Int<kBlockM>, Int<kBlockN>, Int<kHeadDim>>;
-  using ClusterShape =
-      cute::Shape<_1, Int<1>, _1>; // Currently doesn't not support cluster
-  // Stages_dS_or_QSm80 is Stages_dS if Sm90 and Stages if Sm80
-  static constexpr int Stages = Arch >= 90 ? 2 : Stages_dS_or_QSm80;
-  static constexpr int Stages_dS = Arch >= 90 ? Stages_dS_or_QSm80 : 1;
-  using CollectiveMainloop = hstu::CollectiveMainloopBwdSm90<
-      Stages,
-      Stages_dO,
-      Stages_dS,
-      ClusterShape,
-      TileShape_MNK,
-      Element,
-      ElementAccum,
-      cutlass::arch::Sm90,
-      Causal,
-      Local,
-      Contexual_mask,
-      Jagged,
-      Has_targets,
-      Deterministic,
-      SdP_swapAB,
-      dKV_swapAB,
-      dQ_swapAB,
-      NumMmaWarpGroups,
-      AtomLayoutMSdP,
-      AtomLayoutNdKV,
-      AtomLayoutMdQ,
-      V_in_regs,
-      Cross,
-      Softmax>;
-  using CollectiveEpilogue = hstu::CollectiveEpilogueBwd<
-      TileShape_MNK,
-      Element,
-      ArchTag,
-      CollectiveMainloop::NumMmaThreads,
-      Jagged,
-      dKV_swapAB,
-      NumMmaWarpGroups*(Arch >= 90 ? 1 : cutlass::NumWarpsPerWarpGroup) /
-          AtomLayoutNdKV>;
-  using Scheduler =
-      hstu::SingleTileScheduler<Jagged, kBlockN, false /*Sort_by_length*/>;
-  using AttnKernel = hstu::enable_sm90_or_later<hstu::FlashAttnBwdSm90<
-      Softmax,
-      CollectiveMainloop,
-      CollectiveEpilogue,
-      Scheduler>>;
-
-  typename CollectiveMainloop::Arguments mainloop_args{
-      static_cast<Element const*>(params.q_ptr),
-      {seqlen_q, params.qk_d, params.h, batch}, // shape_Q
-      {params.q_row_stride,
-       _1{},
-       params.q_head_stride,
-       !Jagged ? params.q_batch_stride : 0}, // stride_Q
-      static_cast<Element const*>(params.k_ptr),
-      {seqlen_kv, params.qk_d, params.h, batch}, // shape_K
-      {params.k_row_stride,
-       _1{},
-       params.k_head_stride,
-       !Jagged ? params.k_batch_stride : 0}, // stride_K
-      static_cast<Element const*>(params.v_ptr),
-      {seqlen_kv, params.v_d, params.h, batch}, // shape_V
-      {params.v_row_stride,
-       _1{},
-       params.v_head_stride,
-       !Jagged ? params.v_batch_stride : 0}, // stride_V
-      static_cast<Element const*>(params.do_ptr),
-      {seqlen_q, params.v_d, params.h, batch}, // shape_dO
-      {params.do_row_stride,
-       _1{},
-       params.do_head_stride,
-       !Jagged ? params.do_batch_stride : 0}, // stride_dO
-      static_cast<ElementAccum*>(params.dq_accum_ptr),
-      {seqlen_q_rounded * params.qk_d_rounded,
-       params.h,
-       batch}, // shape_dQaccum
-      {_1{},
-       seqlen_q_rounded * params.qk_d_rounded,
-       !Jagged ? params.qk_d_rounded * params.max_q_len_rounded * params.h
-               : 0}, // stride_dQaccum
-      static_cast<float*>(params.softmax_lse_log2),
-      {seqlen_q_rounded, params.num_softmax_heads, batch}, // shape_LSE
-      {_1{},
-       seqlen_q_rounded,
-       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
-               : 0}, // stride_LSE_log2
-      static_cast<float*>(params.softmax_d),
-      {_1{},
-       seqlen_q_rounded,
-       !Jagged ? params.num_softmax_heads * params.max_q_len_rounded
-               : 0}, // stride_dPsum
-      params.max_attn_len,
-      params.min_full_attn_seq_len,
-      params.contextual_seq_len,
-      1.0f / params.max_kv_len,
-      params.alpha,
-      params.b,
-      params.num_softmax_heads,
-      params.num_groups,
-      params.batch_size_per_group,
-      params.dq_semaphore,
-      params.seq_offsets,
-      params.seq_offsets_q,
-      params.num_targets,
-      params.max_seq_len_tensor,
-      params.contextual_seq_len_tensor,
-      params.max_attn_len_tensor,
-      params.min_full_attn_seq_len_tensor,
-      params.attn_scale,
-      params.scalar_scale};
-  typename CollectiveEpilogue::Arguments epilogue_args{
-      static_cast<typename CollectiveEpilogue::Element*>(params.dk_ptr),
-      [&] {
-        return typename CollectiveEpilogue::ShapedKV{
-            seqlen_kv, params.qk_d, params.h, batch}; // shape_dK
-      }(),
-      [&] {
-        return typename CollectiveEpilogue::StridedKV{
-            params.dk_row_stride,
-            _1{},
-            params.dk_head_stride,
-            !Jagged ? params.dk_batch_stride : 0}; // stride_dK
-      }(),
-      static_cast<typename CollectiveEpilogue::Element*>(params.dv_ptr),
-      [&] {
-        return typename CollectiveEpilogue::StridedKV{
-            params.dv_row_stride,
-            _1{},
-            params.dv_head_stride,
-            !Jagged ? params.dv_batch_stride : 0}; // stride_dV
-      }(),
-      params.h,
-      params.seq_offsets};
-
-  int num_blocks_n =
-      cutlass::ceil_div(params.max_kv_len, get<1>(TileShape_MNK{}));
-  num_blocks_n = cutlass::round_up(num_blocks_n, size<1>(ClusterShape{}));
-  typename hstu::TileSchedulerArguments scheduler_args{
-      num_blocks_n,
-      params.h,
-      params.b,
-      params.max_kv_len,
-      params.qk_d,
-      sizeof(Element),
-      params.tile_count_semaphore,
-      params.seq_offsets,
-      params.sort_by_length_indices};
-
-  int device;
-  cudaGetDevice(&device);
-  typename AttnKernel::Params kernel_params =
-      AttnKernel::to_underlying_arguments(
-          {mainloop_args,
-           epilogue_args,
-           {device, params.num_sm},
-           scheduler_args});
-
-  dim3 grid_dims = AttnKernel::get_grid_shape(kernel_params);
-  dim3 block_dims = AttnKernel::get_block_shape();
-  int smem_size = AttnKernel::SharedStorageSize;
-  if constexpr (size(ClusterShape{}) > 1) {
-    void const* kernel = (void const*)cutlass::device_kernel<AttnKernel>;
-    if (smem_size >= 48 * 1024) {
-      CHECK_CUDA(cudaFuncSetAttribute(
-          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-    }
-    dim3 cluster_dims(
-        size<0>(ClusterShape{}),
-        size<1>(ClusterShape{}),
-        size<2>(ClusterShape{}));
-    cutlass::ClusterLauncher::launch(
-        grid_dims,
-        cluster_dims,
-        block_dims,
-        smem_size,
-        stream,
-        kernel,
-        kernel_params,
-        false /*launch_with_pdl*/);
-  } else {
-    if (smem_size >= 48 * 1024) {
-      CHECK_CUDA(cudaFuncSetAttribute(
-          cutlass::device_kernel<AttnKernel>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize,
-          smem_size));
-    }
-    cutlass::kernel_launch<AttnKernel>(
-        grid_dims,
-        block_dims,
-        smem_size,
-        stream,
-        kernel_params,
-        false /*launch_with_pdl*/);
-  }
-  CHECK_CUDA_KERNEL_LAUNCH();
-
-  using PostprocessKernel = hstu::FlashAttnBwdPostprocessConvertdQ<
-      TileShape_MK,
-      Element,
-      ElementAccum,
-      ArchTag,
-      AttnKernel::CollectiveMainloop::NumMmaThreads,
-      typename AttnKernel::CollectiveMainloop::TiledMmadQ,
-      AttnKernel::CollectiveMainloop::dQ_swapAB,
-      Jagged,
-      Softmax>;
-  typename PostprocessKernel::Arguments postprocess_args{
-      static_cast<ElementAccum const*>(params.dq_accum_ptr),
-      {seqlen_q_rounded * params.qk_d_rounded,
-       params.h,
-       batch}, // shape_dQaccum
-      {_1{},
-       seqlen_q_rounded * params.qk_d_rounded,
-       !Jagged ? params.qk_d_rounded * params.max_q_len_rounded * params.h
-               : 0}, // stride_dQaccum
-      static_cast<Element*>(params.dq_ptr),
-      {seqlen_q, params.qk_d, params.h, batch}, // shape_dQ
-      {params.dq_row_stride,
-       _1{},
-       params.dq_head_stride,
-       params.dq_batch_stride}, // stride_dQ
-      Cross ? params.seq_offsets_q : params.seq_offsets};
-  typename PostprocessKernel::Params postprocess_params =
-      PostprocessKernel::to_underlying_arguments(postprocess_args);
-  int num_m_block_postprocess =
-      cute::ceil_div(params.max_q_len, get<0>(TileShape_MK{}));
-  dim3 grid_m_postprocess(num_m_block_postprocess, params.h, params.b);
-  int smem_size_postprocess = PostprocessKernel::SharedStorageSize;
-  if (smem_size_postprocess >= 48 * 1024) {
-    CHECK_CUDA(cudaFuncSetAttribute(
-        cutlass::device_kernel<PostprocessKernel>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize,
-        smem_size_postprocess));
-  }
-  cutlass::kernel_launch<PostprocessKernel>(
-      grid_m_postprocess,
-      PostprocessKernel::MaxThreadsPerBlock,
-      smem_size_postprocess,
-      stream,
-      postprocess_params,
-      false /*launch_with_pdl*/);
-  CHECK_CUDA_KERNEL_LAUNCH();
-}
-
-template <
-    int Arch,
-    typename T,
-    int kBlockM,
-    int kBlockN,
-    int kHeadDim,
-    bool Causal,
-    bool Local,
-    int Stages_dO = 2,
-    int Stages_dS_or_QSm80 = 2,
-    bool SdP_swapAB = true,
-    bool dKV_swapAB = false,
-    bool dQ_swapAB = false,
-    int NumMmaWarpGroups = 2,
-    int AtomLayoutMSdP = 1,
-    int AtomLayoutNdKV = 2,
-    int AtomLayoutMdQ = 1,
-    bool V_in_regs = false,
-    bool Softmax = false>
-void run_mha_bwd_dispatch(hstu::Flash_bwd_params& params, cudaStream_t stream) {
-  BOOL_SWITCH(params.seq_offsets != nullptr, Jagged, [&] {
-    BOOL_SWITCH(params.num_targets != nullptr, Has_targets, [&] {
-      BOOL_SWITCH(params.has_contexual_mask, Contexual_mask, [&] {
-        BOOL_SWITCH(params.seq_offsets_q, Cross, [&] {
-          run_flash_bwd<
-              Arch,
-              kHeadDim,
-              kBlockM,
-              kBlockN,
-              T,
-              Causal,
-              Local,
-              Contexual_mask,
-              Jagged,
-              Has_targets,
-              false /*Deterministic*/,
-              Stages_dO,
-              Stages_dS_or_QSm80,
-              SdP_swapAB,
-              dKV_swapAB,
-              dQ_swapAB,
-              NumMmaWarpGroups,
-              AtomLayoutMSdP,
-              AtomLayoutNdKV,
-              AtomLayoutMdQ,
-              V_in_regs,
-              Cross,
-              Softmax>(params, stream);
-        });
-      });
-    });
-  });
-}
-
-template <int Arch, typename T, int kHeadDim, bool Softmax>
-void run_mha_bwd_(hstu::Flash_bwd_params& params, cudaStream_t stream) {
-  CAUSAL_LOCAL_SWITCH(params.is_causal, params.is_local, Causal, Local, [&] {
-    int const kBlockM = hstu::kBlockM_bwd(Arch, kHeadDim, Causal, Local);
-    int const kBlockN = hstu::kBlockN_bwd(Arch, kHeadDim);
-    bool const V_in_regs = hstu::V_in_regs_bwd(Arch, kHeadDim);
-    static constexpr std::tuple<int, int> Stages =
-        hstu::Stages_bwd(Arch, kHeadDim);
-    static constexpr std::tuple<bool, bool, bool> swapAB =
-        hstu::swapAB_bwd(Arch, kHeadDim, Causal, Local);
-    int const NumMmaWarpGroups = hstu::NumMmaWarpGroups_bwd(Arch, kHeadDim);
-    static constexpr std::tuple<int, int, int> AtomLayout =
-        hstu::AtomLayout_bwd(Arch, kHeadDim);
-    run_mha_bwd_dispatch<
-        Arch,
-        T,
-        kBlockM,
-        kBlockN,
-        kHeadDim,
-        Causal,
-        Local,
-        std::get<0>(Stages), /*Stages_dO*/
-        std::get<1>(Stages), /*Stages_dS_or_QSm80*/
-        std::get<0>(swapAB), /*SdP_swapAB*/
-        std::get<1>(swapAB), /*dKV_swapAB*/
-        std::get<2>(swapAB), /*dQ_swapAB*/
-        NumMmaWarpGroups,
-        std::get<0>(AtomLayout), /*AtomLayoutMSdP*/
-        std::get<1>(AtomLayout), /*AtomLayoutNdKV*/
-        std::get<2>(AtomLayout), /*AtomLayoutMdQ*/
-        V_in_regs,
-        Softmax>(params, stream);
-  });
-}
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h
deleted file mode 100644
index ca04a1456..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_postprocess_kernel.h
+++ /dev/null
@@ -1,348 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include "cute/tensor.hpp"
-
-#include <cutlass/array.h>
-#include <cutlass/cutlass.h>
-#include <cutlass/numeric_conversion.h>
-#include <cutlass/numeric_types.h>
-#include "cutlass/arch/barrier.h"
-
-#include "seqlen.h"
-#include "utils.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    class TileShape_MK_,
-    class Element,
-    class ElementAccum,
-    class ArchTag_,
-    int kNThreads,
-    class TiledMma,
-    bool dQ_swapAB,
-    bool Jagged,
-    bool Softmax>
-class FlashAttnBwdPostprocessConvertdQ {
- public:
-  // Type Aliases
-  using TileShape_MK = TileShape_MK_;
-  using ArchTag = ArchTag_;
-
-  static_assert(ArchTag::kMinComputeCapability >= 75);
-  static constexpr bool IsSm90 = ArchTag::kMinComputeCapability >= 90;
-
-  static constexpr uint32_t MaxThreadsPerBlock = kNThreads;
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 2;
-
-  static constexpr int kBlockM = get<0>(TileShape_MK{});
-  static constexpr int kHeadDim = get<1>(TileShape_MK{});
-  static_assert(
-      !IsSm90 || kNThreads % cutlass::NumThreadsPerWarpGroup == 0,
-      "kNThreads must be a multiple of NumThreadsPerWarpGroup");
-  static constexpr int NumdQWarpGgroups =
-      kNThreads / cutlass::NumThreadsPerWarpGroup;
-  using R2SLayoutAtomdQaccum = std::conditional_t<
-      IsSm90,
-      Layout<
-          Shape<Int<cutlass::NumThreadsPerWarpGroup>, Int<NumdQWarpGgroups>>>,
-      Layout<Shape<Int<kNThreads>>>>;
-  using R2STiledCopydQaccum = decltype(make_tiled_copy(
-      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, ElementAccum>{},
-      R2SLayoutAtomdQaccum{},
-      Layout<Shape<Int<IsSm90 ? 4 : 1>>>{})); // Val layout, 1 or 4 vals per
-                                              // read
-  using G2SLayoutAtomdQaccum = Layout<Shape<Int<kNThreads>>>;
-  // UniversalCopy instead of AutoVectorizingCopyWithAssumedAlignment as the
-  // latter generates cp.async instructions
-  using G2STiledCopydQaccum = decltype(make_tiled_copy(
-      Copy_Atom<UniversalCopy<uint128_t>, ElementAccum>{},
-      G2SLayoutAtomdQaccum{},
-      Layout<Shape<_4>>{})); // Val layout, 4 vals per read
-  // We don't do bound checking for the gmem -> smem load so we just assert
-  // here.
-  static_assert(IsSm90 || (kBlockM * kHeadDim) % (kNThreads * 4) == 0);
-  static constexpr int SmemdQaccumSize = size(TileShape_MK{});
-  using SmemLayoutdQaccumFlat = Layout<Shape<Int<SmemdQaccumSize>>>;
-  using SmemLayoutdQaccum = std::conditional_t<
-      IsSm90,
-      Layout<Shape<
-          Int<kBlockM * kHeadDim / NumdQWarpGgroups>,
-          Int<NumdQWarpGgroups>>>,
-      Layout<Shape<Int<kBlockM * kHeadDim>>>>;
-
-  // We can't just use kHeadDim here. E.g. if MMA shape is 64 x 96 but split
-  // across 2 WGs, then setting kBlockKSmem to 32 will cause "Static shape_div
-  // failure". We want to treat it as 64 x 48, so kBlockKSmem should be 16.
-  static constexpr int MmaShapeN = get<1>(typename TiledMma::AtomShape_MNK{});
-  static constexpr int kBlockKSmem =
-      MmaShapeN % 64 == 0 ? 64 : (MmaShapeN % 32 == 0 ? 32 : 16);
-  static constexpr int kSwizzle =
-      kBlockKSmem == 64 ? 3 : (kBlockKSmem == 32 ? 2 : 1);
-  using SmemLayoutAtomdQ = decltype(composition(
-      Swizzle<kSwizzle, 3, 3>{},
-      Layout<Shape<Int<8>, Int<kBlockKSmem>>, Stride<Int<kBlockKSmem>, _1>>{}));
-  using SmemLayoutdQ =
-      decltype(tile_to_shape(SmemLayoutAtomdQ{}, TileShape_MK{}));
-  using SmemLayoutdQt = decltype(cute::composition(
-      SmemLayoutdQ{},
-      make_layout(
-          make_shape(get<1>(TileShape_MK{}), get<0>(TileShape_MK{})),
-          make_stride(Int<get<0>(TileShape_MK{})>{}, _1{}))));
-
-  using SmemCopyAtomdQ = Copy_Atom<
-      std::conditional_t<
-          IsSm90,
-          std::conditional_t<
-              !dQ_swapAB,
-              cute::SM90_U32x4_STSM_N,
-              cute::SM90_U16x8_STSM_T>,
-          AutoVectorizingCopyWithAssumedAlignment<128>>,
-      Element>;
-
-  static constexpr int kGmemElemsPerLoad =
-      sizeof(cute::uint128_t) / sizeof(Element);
-  static_assert(
-      kHeadDim % kGmemElemsPerLoad == 0,
-      "Headdim must be a multiple of kGmemElemsPerLoad");
-  static constexpr int kGmemThreadsPerRow =
-      cutlass::gcd(kHeadDim / kGmemElemsPerLoad, int(MaxThreadsPerBlock));
-  static_assert(
-      MaxThreadsPerBlock % kGmemThreadsPerRow == 0,
-      "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow");
-  using GmemLayoutAtom = Layout<
-      Shape<
-          Int<MaxThreadsPerBlock / kGmemThreadsPerRow>,
-          Int<kGmemThreadsPerRow>>,
-      Stride<Int<kGmemThreadsPerRow>, _1>>;
-  using GmemTiledCopy = decltype(make_tiled_copy(
-      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
-      GmemLayoutAtom{},
-      Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{})); // Val layout, 8 or 16 vals
-                                                     // per load
-
-  struct SharedStorage : cute::aligned_struct<128> {
-    cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutdQaccum>>
-        smem_dqacc;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutdQ>> smem_dq;
-    alignas(16) cutlass::arch::ClusterTransactionBarrier barrier_dQaccum;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  using ShapedQ =
-      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen_q, d, head,
-                                                       // batch)
-  using StridedQ = cute::Stride<int64_t, _1, int64_t, int64_t>;
-  using ShapedQaccum =
-      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen_q * d, head, batch)
-  using StridedQaccum = cute::Stride<_1, int64_t, int64_t>;
-
-  // Device side arguments
-  struct Arguments {
-    ElementAccum const* ptr_dQaccum;
-    ShapedQaccum const shape_dQaccum;
-    StridedQaccum const stride_dQaccum;
-    Element* ptr_dQ;
-    ShapedQ const shape_dQ;
-    StridedQ const stride_dQ;
-    int const* seq_offsets = nullptr;
-  };
-
-  // Kernel entry point API
-  struct Params {
-    ElementAccum const* ptr_dQaccum;
-    ShapedQaccum const shape_dQaccum;
-    StridedQaccum const stride_dQaccum;
-    Element* ptr_dQ;
-    ShapedQ const shape_dQ;
-    StridedQ const stride_dQ;
-    int const* seq_offsets = nullptr;
-  };
-
-  // Convert to underlying arguments. In this case, a simple copy for the
-  // aliased type.
-  static Params to_underlying_arguments(Arguments const& args) {
-    return {
-        args.ptr_dQaccum,
-        args.shape_dQaccum,
-        args.stride_dQaccum,
-        args.ptr_dQ,
-        args.shape_dQ,
-        args.stride_dQ,
-        args.seq_offsets};
-  }
-
-  CUTLASS_DEVICE
-  void operator()(Params const& params, char* smem_buf) {
-    static constexpr int kBlockM = get<0>(TileShape_MK{});
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    Tensor sdQaccum = make_tensor(
-        make_smem_ptr(shared_storage.smem_dqacc.data()), SmemLayoutdQaccum{});
-    Tensor sdQaccum_flat = make_tensor(
-        make_smem_ptr(shared_storage.smem_dqacc.data()),
-        SmemLayoutdQaccumFlat{});
-    Tensor sdQ = make_tensor(
-        make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQ{});
-    Tensor sdQt = make_tensor(
-        make_smem_ptr(shared_storage.smem_dq.data()), SmemLayoutdQt{});
-
-    int const thread_idx = threadIdx.x;
-    int const m_block = blockIdx.x;
-    int const bidh = blockIdx.y;
-    int const bidb = blockIdx.z;
-
-    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info(
-        bidb, size<0>(params.shape_dQ), params.seq_offsets);
-    if (Jagged && m_block * kBlockM >= seqlen_info.seqlen) {
-      return;
-    }
-
-    // Step 1: load dQaccum from gmem to smem
-    Tensor mdQaccum = make_tensor(
-        make_gmem_ptr(
-            reinterpret_cast<ElementAccum const*>(params.ptr_dQaccum)),
-        params.shape_dQaccum,
-        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
-    Tensor gdQaccum = local_tile(
-        domain_offset(
-            make_coord(seqlen_info.offset_padded * kHeadDim), mdQaccum),
-        Shape<Int<kBlockM * kHeadDim>>{},
-        make_coord(m_block)); // (M * K)
-    if constexpr (IsSm90) { // Use BulkCopy
-      static constexpr uint32_t TmaTransactionBytesdQaccum =
-          static_cast<uint32_t>(
-              size(SmemLayoutdQaccumFlat{}) *
-              cute::sizeof_bits_v<ElementAccum> / 8);
-      auto bulk_copy = Copy_Traits<SM90_BULK_COPY_AUTO>{};
-      // if (thread0()) { print(gdQaccum); printf("\n"); print(sdQaccum_flat);
-      // printf("\n"); }
-      if (thread_idx == 0) {
-        shared_storage.barrier_dQaccum.init(1 /*numThreads*/);
-        shared_storage.barrier_dQaccum.arrive_and_expect_tx(
-            TmaTransactionBytesdQaccum);
-        copy(
-            bulk_copy.with(
-                *reinterpret_cast<uint64_t*>(&shared_storage.barrier_dQaccum)),
-            gdQaccum,
-            sdQaccum_flat);
-      }
-      __syncthreads();
-      shared_storage.barrier_dQaccum.wait(0);
-    } else {
-      G2STiledCopydQaccum g2s_tiled_copy_dQaccum;
-      auto g2s_thr_copy_dQaccum =
-          g2s_tiled_copy_dQaccum.get_thread_slice(thread_idx);
-      Tensor tdQgdQaccumg2s = g2s_thr_copy_dQaccum.partition_S(gdQaccum);
-      Tensor tdQsdQaccumg2s = g2s_thr_copy_dQaccum.partition_D(sdQaccum);
-      cute::copy(g2s_tiled_copy_dQaccum, tdQgdQaccumg2s, tdQsdQaccumg2s);
-      __syncthreads();
-    }
-
-    // __syncthreads(); if (cute::thread0()) { print_tensor(sdQaccum); }
-
-    // Step 2: Load dQaccum from smem to register, then convert fp32 ->
-    // fp16/bf16
-    R2STiledCopydQaccum s2r_tiled_copy_dQaccum;
-    auto s2r_thr_copy_dQaccum =
-        s2r_tiled_copy_dQaccum.get_thread_slice(thread_idx);
-    Tensor tdQsdQaccum = s2r_thr_copy_dQaccum.partition_S(sdQaccum);
-    TiledMma tiled_mma_dQ;
-    Tensor taccdQrdQaccum = partition_fragment_C(
-        tiled_mma_dQ,
-        select<!dQ_swapAB ? 0 : 1, !dQ_swapAB ? 1 : 0>(TileShape_MK{}));
-    // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 1) {
-    // print(tiled_mma_dQ); printf("\n"); } if (blockIdx.x == 0 && blockIdx.y ==
-    // 0 && threadIdx.x == 1) { print(tdQsdQaccum); } if (blockIdx.x == 0 &&
-    // blockIdx.y == 0 && threadIdx.x == 1) { print(taccdQrdQaccum); }
-    CUTE_STATIC_ASSERT_V(size(taccdQrdQaccum) == size(tdQsdQaccum));
-    Tensor tdQrdQaccum = s2r_thr_copy_dQaccum.retile_D(taccdQrdQaccum);
-    cute::copy(s2r_tiled_copy_dQaccum, tdQsdQaccum, tdQrdQaccum);
-    // Convert tdQrdQ from fp32 to fp16
-    Tensor rdQ = make_tensor_like<Element>(taccdQrdQaccum);
-    hstu::convert_type_out(taccdQrdQaccum, rdQ);
-
-    // Step 3: Copy dQ from register to smem
-    auto smem_tiled_copy_dQ = make_tiled_copy_C(SmemCopyAtomdQ{}, tiled_mma_dQ);
-    auto smem_thr_copy_dQ = smem_tiled_copy_dQ.get_thread_slice(thread_idx);
-    Tensor taccdQrdQ =
-        smem_thr_copy_dQ.retile_S(rdQ); // ((Atom,AtomNum), MMA_N, MMA_N)
-    // if (cute::thread0()) { print(smem_tiled_copy_dQ); }
-    // if (cute::thread0()) { print(smem_thr_copy_dQ); }
-    // if (cute::thread0()) { print(sdQ); }
-    Tensor taccdQsdQ = smem_thr_copy_dQ.partition_D(
-        cute::conditional_return<!dQ_swapAB>(
-            sdQ, sdQt)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
-    cute::copy(smem_tiled_copy_dQ, taccdQrdQ, taccdQsdQ);
-    __syncthreads();
-
-    // Step 4: Copy dQ from smem to register to prepare for coalesced write to
-    // gmem
-    Tensor mdQ = make_tensor(
-        make_gmem_ptr(params.ptr_dQ), params.shape_dQ, params.stride_dQ)(
-        _, _, bidh, !Jagged ? bidb : 0);
-    Tensor gdQ = local_tile(
-        domain_offset(make_coord(seqlen_info.offset, _0{}), mdQ),
-        TileShape_MK{},
-        make_coord(m_block, _0{})); // (M, K)
-    GmemTiledCopy gmem_tiled_copy_dQ;
-    auto gmem_thr_copy_dQ = gmem_tiled_copy_dQ.get_thread_slice(thread_idx);
-    Tensor tdQsdQ =
-        gmem_thr_copy_dQ.partition_S(sdQ); // ((Atom,AtomNum),ATOM_M,ATOM_N)
-    Tensor tdQgdQ = gmem_thr_copy_dQ.partition_D(gdQ);
-
-    Tensor tdQrdQ = make_fragment_like(tdQsdQ);
-    Tensor tdQcdQ = gmem_thr_copy_dQ.partition_D(
-        cute::make_identity_tensor(TileShape_MK{}));
-    Tensor tdQpdQ = make_tensor<bool>(make_shape(size<2>(tdQgdQ)));
-#pragma unroll
-    for (int k = 0; k < size(tdQpdQ); ++k) {
-      tdQpdQ(k) = get<1>(tdQcdQ(_0{}, _0{}, k)) < get<1>(params.shape_dQ);
-    }
-    // Need to check OOB when reading from smem if kBlockM isn't evenly tiled
-    static constexpr bool EvenM =
-        kBlockM % CUTE_STATIC_V(size<0>(GmemLayoutAtom{})) == 0;
-    hstu::
-        copy</*Is_even_MN=*/EvenM, /*Is_even_K=*/true, /*Clear_OOB_MN=*/false>(
-            gmem_tiled_copy_dQ, tdQsdQ, tdQrdQ, tdQcdQ, tdQpdQ, kBlockM);
-
-    // Step 5: Copy dQ from register to gmem
-    // Clear_OOB_K must be false since we don't want to write zeros to gmem
-    hstu::copy<
-        /*Is_even_MN=*/false,
-        /*Is_even_K=*/false,
-        /*Clear_OOB_MN=*/false,
-        /*Clear_OOB_K=*/false>(
-        gmem_tiled_copy_dQ,
-        tdQrdQ,
-        tdQgdQ,
-        tdQcdQ,
-        tdQpdQ,
-        std::min(seqlen_info.seqlen - m_block * kBlockM, kBlockM));
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h
deleted file mode 100644
index 8d29778af..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_bwd_preprocess_kernel.h
+++ /dev/null
@@ -1,349 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include "cute/tensor.hpp"
-
-#include <cutlass/array.h>
-#include <cutlass/cutlass.h>
-#include <cutlass/numeric_conversion.h>
-#include <cutlass/numeric_types.h>
-
-#include "seqlen.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    class TileShape_MK_,
-    class Element,
-    class ElementAccum,
-    class ArchTag_,
-    bool Clear_dQaccum,
-    bool Jagged,
-    bool Softmax>
-class FlashAttnBwdPreprocess {
- public:
-  // Type Aliases
-  using TileShape_MK = TileShape_MK_;
-  using ArchTag = ArchTag_;
-
-  static_assert(
-      std::is_same_v<Element, cutlass::half_t> &&
-          ArchTag::kMinComputeCapability >= 75 ||
-      std::is_same_v<Element, cutlass::bfloat16_t> &&
-          ArchTag::kMinComputeCapability >= 80 ||
-      std::is_same_v<Element, cutlass::float_e4m3_t> &&
-          ArchTag::kMinComputeCapability >= 89);
-
-  static constexpr uint32_t MaxThreadsPerBlock = 256;
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 2;
-  static constexpr int SharedStorageSize = 0;
-
-  static constexpr int kGmemElemsPerLoad =
-      sizeof(cute::uint128_t) / sizeof(Element);
-  static_assert(
-      get<1>(TileShape_MK{}) % kGmemElemsPerLoad == 0,
-      "Headdim must be a multiple of kGmemElemsPerLoad");
-  static constexpr int kBlockM = get<0>(TileShape_MK{});
-  static constexpr int kHeadDim = get<1>(TileShape_MK{});
-  // We want kBlockKGmem to be a power of 2 so that when we do the summing,
-  // it's just between threads in the same warp
-  static constexpr int kBlockKGmem =
-      kHeadDim % 128 == 0 ? 128 : (kHeadDim % 64 == 0 ? 64 : 32);
-  static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerLoad;
-  static_assert(
-      MaxThreadsPerBlock % kGmemThreadsPerRow == 0,
-      "MaxThreadsPerBlock must be a multiple of kGmemThreadsPerRow");
-  using GmemLayoutAtom = Layout<
-      Shape<
-          Int<MaxThreadsPerBlock / kGmemThreadsPerRow>,
-          Int<kGmemThreadsPerRow>>,
-      Stride<Int<kGmemThreadsPerRow>, _1>>;
-  using GmemTiledCopy = decltype(make_tiled_copy(
-      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, Element>{},
-      GmemLayoutAtom{},
-      Layout<Shape<_1, Int<kGmemElemsPerLoad>>>{})); // Val layout, 8 or 16 vals
-                                                     // per load
-
-  static constexpr int kGmemElemsPerLoadAccum =
-      sizeof(cute::uint128_t) / sizeof(ElementAccum);
-  static_assert(
-      (kBlockM * kHeadDim / kGmemElemsPerLoadAccum) % MaxThreadsPerBlock == 0,
-      "MaxThreadsPerBlock must divide kBlockM * kHeadDim / kGmemElemsPerLoadAccum");
-  using GmemLayoutAtomAccum = Layout<Shape<Int<MaxThreadsPerBlock>>>;
-  using GmemTiledCopyAccum = decltype(make_tiled_copy(
-      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, ElementAccum>{},
-      GmemLayoutAtomAccum{},
-      Layout<Shape<Int<kGmemElemsPerLoadAccum>>>{})); // Val layout, 4 vals per
-                                                      // store
-
-  using ShapeO =
-      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen_q, d, head,
-                                                       // batch)
-  using StrideO = cute::Stride<int64_t, _1, int64_t, int64_t>;
-  using ShapedPsum =
-      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen_q, head, batch)
-  using StridedPsum = cute::Stride<_1, int64_t, int64_t>;
-  using ShapedQaccum =
-      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen_q * d, head, batch)
-  using StridedQaccum = cute::Stride<_1, int64_t, int64_t>;
-
-  // Device side arguments
-  struct Arguments {
-    Element const* ptr_O;
-    ShapeO const shape_O;
-    StrideO const stride_O;
-    Element const* ptr_dO;
-    StrideO const stride_dO;
-    float* ptr_dPsum;
-    ShapedPsum const shape_dPsum;
-    StridedPsum const stride_dPsum;
-    float const* ptr_LSE;
-    StridedPsum const stride_LSE;
-    float* ptr_LSE_log2;
-    StridedPsum const stride_LSE_log2;
-    ElementAccum* ptr_dQaccum;
-    ShapedQaccum const shape_dQaccum;
-    StridedQaccum const stride_dQaccum;
-    int num_batch; // We need this to know the size of dq_semaphore in case of
-                   // jagged
-    int num_heads;
-    int num_softmax_heads;
-    int max_seq_len;
-    int* dq_semaphore;
-    int const* seq_offsets = nullptr;
-  };
-
-  // Kernel entry point API
-  struct Params {
-    Element const* ptr_O;
-    ShapeO const shape_O;
-    StrideO const stride_O;
-    Element const* ptr_dO;
-    StrideO const stride_dO;
-    float* ptr_dPsum;
-    ShapedPsum const shape_dPsum;
-    StridedPsum const stride_dPsum;
-    float const* ptr_LSE;
-    StridedPsum const stride_LSE;
-    float* ptr_LSE_log2;
-    StridedPsum const stride_LSE_log2;
-    ElementAccum* ptr_dQaccum;
-    ShapedQaccum const shape_dQaccum;
-    StridedQaccum const stride_dQaccum;
-    int num_batch;
-    int num_heads;
-    int num_softmax_heads;
-    int max_seq_len;
-    int* dq_semaphore;
-    int const* seq_offsets = nullptr;
-  };
-
-  // Convert to underlying arguments. In this case, a simple copy for the
-  // aliased type.
-  static Params to_underlying_arguments(Arguments const& args) {
-    return {args.ptr_O,       args.shape_O,       args.stride_O,
-            args.ptr_dO,      args.stride_dO,     args.ptr_dPsum,
-            args.shape_dPsum, args.stride_dPsum,  args.ptr_LSE,
-            args.stride_LSE,  args.ptr_LSE_log2,  args.stride_LSE_log2,
-            args.ptr_dQaccum, args.shape_dQaccum, args.stride_dQaccum,
-            args.num_batch,   args.num_heads,     args.num_softmax_heads,
-            args.max_seq_len, args.dq_semaphore,  args.seq_offsets};
-  }
-
-  CUTLASS_DEVICE
-  void operator()(Params const& params, [[maybe_unused]] char* smem_buf) {
-    static constexpr int kBlockM = get<0>(TileShape_MK{});
-
-    int const thread_idx = threadIdx.x;
-    int const m_block = blockIdx.x;
-    int const bidh = blockIdx.y;
-    int const bidb = blockIdx.z;
-
-    hstu::SeqlenInfo<Jagged, kBlockM> seqlen_info(
-        bidb, params.max_seq_len, params.seq_offsets);
-    int const seqlen_o = seqlen_info.seqlen;
-    if (Jagged && m_block * kBlockM >= seqlen_o) {
-      return;
-    }
-
-    if constexpr (Softmax) {
-      Tensor mO = make_tensor(
-          make_gmem_ptr(params.ptr_O), params.shape_O, params.stride_O)(
-          _, _, bidh, !Jagged ? bidb : 0);
-      Tensor gO = local_tile(
-          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mO),
-          TileShape_MK{},
-          make_coord(m_block, _0{})); // (M, K)
-      Tensor mdO = make_tensor(
-          make_gmem_ptr(params.ptr_dO), params.shape_O, params.stride_dO)(
-          _, _, bidh, !Jagged ? bidb : 0);
-      Tensor gdO = local_tile(
-          cute::domain_offset(make_coord(seqlen_info.offset, _0{}), mdO),
-          TileShape_MK{},
-          make_coord(m_block, _0{})); // (M, K)
-
-      auto shape_LSE = select<0, 2, 3>(params.shape_O);
-      Tensor mLSE = make_tensor(
-          make_gmem_ptr(params.ptr_LSE), shape_LSE, params.stride_LSE)(
-          _, bidh, !Jagged ? bidb : 0);
-      Tensor gLSE = local_tile(
-          cute::domain_offset(make_coord(seqlen_info.offset), mLSE),
-          Shape<Int<kBlockM>>{},
-          make_coord(m_block));
-      static_assert(kBlockM <= MaxThreadsPerBlock);
-      float lse =
-          thread_idx < seqlen_o - m_block * kBlockM && thread_idx < kBlockM
-          ? gLSE(thread_idx)
-          : 0.0f;
-
-      GmemTiledCopy gmem_tiled_copy_O;
-      auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
-
-      Tensor tOgO = gmem_thr_copy_O.partition_S(gO);
-      Tensor tOgdO = gmem_thr_copy_O.partition_S(gdO);
-      // Construct identity layout for gO
-      Tensor cO = cute::make_identity_tensor(
-          TileShape_MK{}); // (BLK_M,BLK_K) -> (blk_m,blk_k)
-      // Repeat the partitioning with identity layouts
-      Tensor tOcO = gmem_thr_copy_O.partition_D(cO);
-      Tensor tOpO = make_tensor<bool>(make_shape(size<2>(tOgO)));
-#pragma unroll
-      for (int k = 0; k < size(tOpO); ++k) {
-        tOpO(k) = get<1>(tOcO(_0{}, _0{}, k)) < get<1>(params.shape_O);
-      }
-
-      // (8, kBlockM / 32, kHeadDim / 64) or (8, kBlockM / 16, kHeadDim / 128)
-      Tensor tOrO = make_fragment_like(tOgO);
-      Tensor tOrdO = make_fragment_like(tOgdO);
-      hstu::copy<
-          /*Is_even_MN=*/false,
-          /*Is_even_K=*/false,
-          /*Clear_OOB_MN=*/true,
-          /*Clearn_OOB_K=*/true>(
-          gmem_tiled_copy_O,
-          tOgO,
-          tOrO,
-          tOcO,
-          tOpO,
-          seqlen_o - m_block * kBlockM);
-      hstu::copy<
-          /*Is_even_MN=*/false,
-          /*Is_even_K=*/false,
-          /*Clear_OOB_MN=*/true,
-          /*Clearn_OOB_K=*/true>(
-          gmem_tiled_copy_O,
-          tOgdO,
-          tOrdO,
-          tOcO,
-          tOpO,
-          seqlen_o - m_block * kBlockM);
-      // Reshape from e.g. (8, kBlockM / 32, kHeadDim / 64) to (kBlockM / 32,
-      // (8, kHeadDim / 64))
-      Layout l = make_layout(
-          get<1>(tOrO.layout()),
-          make_layout(get<0>(tOrO.layout()), get<2>(tOrO.layout())));
-      Tensor tOrO_l = make_tensor(tOrO.data(), l);
-      Tensor o_fp32 = make_tensor_like<float>(tOrO_l);
-      hstu::convert_type_out(tOrO_l, o_fp32);
-      Tensor tOrdO_l = make_tensor(tOrdO.data(), l);
-      Tensor do_fp32 = make_tensor_like<float>(tOrdO_l);
-      hstu::convert_type_out(tOrdO_l, do_fp32);
-      // Sum across the last dimension
-      Tensor dP_sum = make_tensor<float>(make_shape(size<0>(o_fp32)));
-#pragma unroll
-      for (int mi = 0; mi < size<0>(o_fp32); ++mi) {
-        float dP_sum_cur = do_fp32(mi, 0) * o_fp32(mi, 0);
-#pragma unroll
-        for (int ni = 1; ni < size<1>(o_fp32); ni++) {
-          dP_sum_cur += do_fp32(mi, ni) * o_fp32(mi, ni);
-        }
-        hstu::SumOp<float> sum_op;
-        dP_sum(mi) =
-            hstu::Allreduce<kGmemThreadsPerRow>::run(dP_sum_cur, sum_op);
-      }
-
-      Tensor mdPsum = make_tensor(
-          make_gmem_ptr(params.ptr_dPsum),
-          params.shape_dPsum,
-          params.stride_dPsum)(_, bidh, !Jagged ? bidb : 0);
-      Tensor gdPsum = local_tile(
-          cute::domain_offset(make_coord(seqlen_info.offset_padded), mdPsum),
-          Shape<Int<kBlockM>>{},
-          make_coord(m_block));
-      if (get<1>(tOcO(_0{}, _0{}, _0{})) == 0) {
-#pragma unroll
-        for (int mi = 0; mi < size(dP_sum); ++mi) {
-          int const row = get<0>(tOcO(_0{}, mi, _0{}));
-          gdPsum(row) = row < seqlen_o - m_block * kBlockM ? dP_sum(mi) : 0;
-        }
-      }
-
-      int const seqlen_rounded = cute::round_up(seqlen_o, kBlockM);
-      Tensor mLSElog2 = make_tensor(
-          make_gmem_ptr(params.ptr_LSE_log2),
-          params.shape_dPsum,
-          params.stride_LSE_log2)(_, bidh, !Jagged ? bidb : 0);
-      Tensor gLSElog2 = local_tile(
-          cute::domain_offset(make_coord(seqlen_info.offset_padded), mLSElog2),
-          Shape<Int<kBlockM>>{},
-          make_coord(m_block));
-      if (thread_idx < seqlen_rounded - m_block * kBlockM &&
-          thread_idx < kBlockM) {
-        gLSElog2(thread_idx) = lse * float(M_LOG2E);
-      }
-    }
-    if constexpr (Clear_dQaccum) {
-      Tensor mdQaccum = make_tensor(
-          make_gmem_ptr(params.ptr_dQaccum),
-          params.shape_dQaccum,
-          params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
-      Tensor gdQaccum = local_tile(
-          cute::domain_offset(
-              make_coord(seqlen_info.offset_padded * kHeadDim), mdQaccum),
-          Shape<Int<kBlockM * kHeadDim>>{},
-          make_coord(m_block));
-      GmemTiledCopyAccum gmem_tiled_copy_dQaccum;
-      auto gmem_thr_copy_dQaccum =
-          gmem_tiled_copy_dQaccum.get_thread_slice(thread_idx);
-      Tensor tdQgdQaccum = gmem_thr_copy_dQaccum.partition_D(gdQaccum);
-      Tensor zero = make_fragment_like(tdQgdQaccum);
-      clear(zero);
-      cute::copy(
-          Copy_Atom<
-              AutoVectorizingCopyWithAssumedAlignment<128>,
-              ElementAccum>{},
-          zero,
-          tdQgdQaccum);
-    }
-
-    if (params.dq_semaphore != nullptr && thread_idx == 0) {
-      int const num_batch = params.num_batch;
-      int const num_head = params.num_heads;
-      params.dq_semaphore
-          [bidh + bidb * num_head + m_block * num_head * num_batch] = 0;
-    }
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp
deleted file mode 100644
index 66ec445be..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.cpp
+++ /dev/null
@@ -1,1165 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-// Include these 2 headers instead of torch/extension.h since we don't need all
-// of the torch headers.
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/util/Optional.h>
-#include <torch/nn/functional.h>
-#include <torch/version.h> // For TORCH_VERSION* macros
-
-#include <cutlass/numeric_types.h>
-
-#include "flash.h"
-#include "flash_common.h"
-#include "static_switch.h"
-#include "tile_size.h"
-
-#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
-#define CHECK_SHAPE(x, ...)                           \
-  TORCH_CHECK(                                        \
-      x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \
-      #x " must have shape (" #__VA_ARGS__ ")")
-#define CHECK_CONTIGUOUS(x) \
-  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
-
-at::Tensor switch_to_contiguous_if_needed(const at::Tensor& x) {
-  if (x.stride(x.dim() - 1) == 1) {
-    return x;
-  }
-  return x.contiguous();
-}
-
-namespace hstu {
-
-void set_params_fprop(
-    hstu::Flash_fwd_params& params,
-    // sizes
-    const size_t b,
-    const size_t total_seq_len_kv,
-    const size_t total_seq_len_q,
-    const size_t max_seq_len,
-    const size_t max_q_len,
-    const size_t h,
-    const size_t qk_d,
-    const size_t v_d,
-    // device pointers
-    const at::Tensor& q,
-    const at::Tensor& k,
-    const at::Tensor& v,
-    const at::Tensor& out,
-    void* seq_offsets,
-    void* num_targets,
-    void* attn_scale,
-    void* seq_offsets_q,
-    void* softmax_lse,
-    void* max_seq_len_tensor,
-    void* contextual_seq_len_tensor,
-    void* max_attn_len_tensor,
-    void* min_full_attn_seq_len_tensor,
-    const int num_groups,
-    bool causal,
-    float alpha,
-    const bool scalar_scale,
-    const int max_attn_len,
-    const int min_full_attn_seq_len,
-    const int contextual_seq_len,
-    const int num_softmax_heads,
-    const bool training,
-    const int sm_margin = 0) {
-  // Reset the parameters
-  params = {};
-
-  params.is_bf16 = q.dtype() == torch::kBFloat16;
-  params.is_e4m3 = q.dtype() == torch::kFloat8_e4m3fn;
-
-  // Set the pointers and strides.
-  params.q_ptr = q.data_ptr();
-  params.k_ptr = k.data_ptr();
-  params.v_ptr = v.data_ptr();
-  params.o_ptr = out.data_ptr();
-  // All stride are in elements, not bytes.
-  params.q_row_stride = q.stride(-3);
-  params.k_row_stride = k.stride(-3);
-  params.v_row_stride = v.stride(-3);
-  params.o_row_stride = out.stride(-3);
-  params.q_head_stride = q.stride(-2);
-  params.k_head_stride = k.stride(-2);
-  params.v_head_stride = v.stride(-2);
-  params.o_head_stride = out.stride(-2);
-  params.v_dim_stride = v.stride(-1);
-
-  if (seq_offsets == nullptr) {
-    params.q_batch_stride = q.stride(0);
-    params.k_batch_stride = k.stride(0);
-    params.v_batch_stride = v.stride(0);
-    params.o_batch_stride = out.stride(0);
-  }
-
-  params.seq_offsets = static_cast<int*>(seq_offsets);
-  params.seq_offsets_q = static_cast<int*>(seq_offsets_q);
-  params.num_targets = static_cast<int*>(num_targets);
-  params.attn_scale = static_cast<float*>(attn_scale);
-  params.softmax_lse = static_cast<float*>(softmax_lse);
-  params.max_seq_len_tensor = static_cast<int*>(max_seq_len_tensor);
-  params.contextual_seq_len_tensor =
-      static_cast<int*>(contextual_seq_len_tensor);
-  params.max_attn_len_tensor = static_cast<int*>(max_attn_len_tensor);
-  params.min_full_attn_seq_len_tensor =
-      static_cast<int*>(min_full_attn_seq_len_tensor);
-  params.num_groups = num_groups;
-  params.batch_size_per_group = b / num_groups;
-
-  // Set the dimensions.
-  params.b = b;
-  params.h = h;
-  params.total_seq_len_q = total_seq_len_q;
-  params.total_seq_len_kv = total_seq_len_kv;
-  params.max_kv_len = max_seq_len;
-  params.max_q_len = max_q_len;
-  params.qk_d = qk_d;
-  params.v_d = v_d;
-
-  params.alpha = alpha;
-
-  // Note: when num_groups > 1, max_attn_len, contextual_seq_len,
-  // min_full_attn_seq_len represent the max value in the tensor.
-  params.is_local = max_attn_len > 0;
-  params.is_causal = causal && (!params.is_local);
-  params.has_contexual_mask = contextual_seq_len > 0;
-  params.scalar_scale = scalar_scale;
-  params.num_softmax_heads = num_softmax_heads;
-  params.training = training;
-
-  params.max_attn_len = max_attn_len;
-  params.min_full_attn_seq_len = min_full_attn_seq_len;
-  params.contextual_seq_len = contextual_seq_len;
-
-  params.arch = at::cuda::getCurrentDeviceProperties()->major * 10 +
-      at::cuda::getCurrentDeviceProperties()->minor;
-  params.num_sm =
-      at::cuda::getCurrentDeviceProperties()->multiProcessorCount - sm_margin;
-
-#ifdef FLASHATTENTION_DISABLE_LOCAL
-  TORCH_CHECK(
-      !params.is_local,
-      "This flash attention build does not support local attention.");
-#endif
-}
-
-void set_params_dgrad(
-    hstu::Flash_bwd_params& params,
-    // sizes
-    const size_t b,
-    const size_t total_seq_len_kv,
-    const size_t total_seq_len_q,
-    const size_t max_seq_len,
-    const size_t max_q_len,
-    const size_t max_q_len_rounded,
-    const size_t h,
-    const size_t qk_d,
-    const size_t v_d,
-    const size_t qk_d_rounded,
-    const size_t v_d_rounded,
-    // device pointers
-    const at::Tensor& q,
-    const at::Tensor& k,
-    const at::Tensor& v,
-    const at::Tensor& out,
-    const at::Tensor& dout,
-    const at::Tensor& dq,
-    const at::Tensor& dk,
-    const at::Tensor& dv,
-    void* dq_accum_d,
-    void* seq_offsets,
-    void* num_targets,
-    void* attn_scale,
-    void* sort_by_length_indices,
-    void* seq_offsets_q,
-    void* softmax_lse,
-    void* softmax_d,
-    void* softmax_lse_log2,
-    void* max_seq_len_tensor,
-    void* contextual_seq_len_tensor,
-    void* max_attn_len_tensor,
-    void* min_full_attn_seq_len_tensor,
-    const int num_groups,
-    const bool scalar_scale,
-    const bool causal,
-    const float alpha,
-    const int max_attn_len,
-    const int min_full_attn_seq_len,
-    const int contextual_seq_len,
-    const int num_softmax_heads,
-    bool deterministic = false,
-    int const sm_margin = 0) {
-  hstu::set_params_fprop(
-      params,
-      b,
-      total_seq_len_kv,
-      total_seq_len_q,
-      max_seq_len,
-      max_q_len,
-      h,
-      qk_d,
-      v_d,
-      q,
-      k,
-      v,
-      out,
-      seq_offsets,
-      num_targets,
-      attn_scale,
-      seq_offsets_q,
-      softmax_lse,
-      max_seq_len_tensor,
-      contextual_seq_len_tensor,
-      max_attn_len_tensor,
-      min_full_attn_seq_len_tensor,
-      num_groups,
-      causal,
-      alpha,
-      scalar_scale,
-      max_attn_len,
-      min_full_attn_seq_len,
-      contextual_seq_len,
-      num_softmax_heads,
-      false /* training */,
-      sm_margin);
-
-  // Set the pointers and strides.
-  params.do_ptr = dout.data_ptr();
-  params.do_row_stride = dout.stride(-3);
-  params.do_head_stride = dout.stride(-2);
-  params.dq_ptr = dq.data_ptr();
-  params.dk_ptr = dk.data_ptr();
-  params.dv_ptr = dv.data_ptr();
-  params.dq_row_stride = dq.stride(-3);
-  params.dk_row_stride = dk.stride(-3);
-  params.dv_row_stride = dv.stride(-3);
-  params.dq_head_stride = dq.stride(-2);
-  params.dk_head_stride = dk.stride(-2);
-  params.dv_head_stride = dv.stride(-2);
-
-  params.qk_d_rounded = qk_d_rounded;
-  params.v_d_rounded = v_d_rounded;
-  params.max_q_len_rounded = max_q_len_rounded;
-
-  params.sort_by_length_indices = static_cast<int*>(sort_by_length_indices);
-
-  if (seq_offsets == nullptr) {
-    params.do_batch_stride = dout.stride(0);
-    params.dq_batch_stride = dq.stride(0);
-    params.dk_batch_stride = dk.stride(0);
-    params.dv_batch_stride = dv.stride(0);
-  }
-  params.dq_accum_ptr = dq_accum_d;
-  params.softmax_lse_log2 = static_cast<float*>(softmax_lse_log2);
-  params.softmax_d = static_cast<float*>(softmax_d);
-  params.deterministic = deterministic;
-}
-
-void run_mha_fwd(hstu::Flash_fwd_params& params, cudaStream_t stream) {
-  // HEADDIM_SWITCH(params.d, [&] {
-  //     hstu::run_mha_fwd_<cutlass::half_t, kHeadSize>(params, stream);
-  // });
-  ARCH_SWITCH(params.arch, Arch, [&] {
-    BOOL_SWITCH(params.num_softmax_heads == params.h, Softmax, [&] {
-      if (!params.is_e4m3) {
-        if (params.is_bf16) {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-          if (params.qk_d <= 64) {
-            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 64, Softmax>(
-                params, stream);
-          }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-          if (params.qk_d <= 96) {
-            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 96, Softmax>(
-                params, stream);
-          }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-          if (params.qk_d <= 128) {
-            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 128, Softmax>(
-                params, stream);
-          }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-          if (params.qk_d <= 192) {
-            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 192, Softmax>(
-                params, stream);
-          }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-          if (params.qk_d <= 256) {
-            return hstu::run_mha_fwd_<Arch, cutlass::bfloat16_t, 256, Softmax>(
-                params, stream);
-          }
-#endif
-        } else {
-#ifndef FLASHATTENTION_DISABLE_FP16
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-          if (params.qk_d <= 64) {
-            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 64, Softmax>(
-                params, stream);
-          }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-          if (params.qk_d <= 96) {
-            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 96, Softmax>(
-                params, stream);
-          }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-          if (params.qk_d <= 128) {
-            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 128, Softmax>(
-                params, stream);
-          }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-          if (params.qk_d <= 192) {
-            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 192, Softmax>(
-                params, stream);
-          }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-          if (params.qk_d <= 256) {
-            return hstu::run_mha_fwd_<Arch, cutlass::half_t, 256, Softmax>(
-                params, stream);
-          }
-#endif
-#else
-                                TORCH_CHECK(false, "This flash attention build does not support FP16.");
-#endif
-        }
-      } else {
-#ifndef FLASHATTENTION_DISABLE_FP8
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-        if (params.qk_d <= 64) {
-          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 64, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-        if (params.qk_d <= 96) {
-          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 96, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-        if (params.qk_d <= 128) {
-          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 128, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-        if (params.qk_d <= 192) {
-          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 192, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-        if (params.qk_d <= 256) {
-          return hstu::run_mha_fwd_<90, cutlass::float_e4m3_t, 256, Softmax>(
-              params, stream);
-        }
-#endif
-#else
-                            TORCH_CHECK(false, "This flash attention build does not support FP8.");
-#endif
-      }
-    });
-  });
-}
-
-std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-    const std::optional<at::Tensor>& k_descale, // (b, h_k)
-    const std::optional<at::Tensor>& v_descale, // (b, h_k)
-    const int64_t sm_margin,
-    int64_t max_q_len,
-    const std::optional<at::Tensor>& seq_offsets_q,
-    int64_t num_softmax_heads,
-    bool training,
-    const std::optional<at::Tensor>& max_seq_len_tensor,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor,
-    const std::optional<at::Tensor>& max_attn_len_tensor,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
-    int64_t num_groups) {
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  bool is_sm9x = dprops->major >= 9;
-  TORCH_CHECK(is_sm9x, "HSTU Attention only supports Hopper GPUs or newer.");
-
-  q = switch_to_contiguous_if_needed(q);
-  k = switch_to_contiguous_if_needed(k);
-  v = switch_to_contiguous_if_needed(v);
-
-  auto q_type = q.scalar_type();
-  TORCH_CHECK(
-      q_type == at::ScalarType::Half || q_type == at::ScalarType::BFloat16 ||
-          q_type == at::ScalarType::Float8_e4m3fn,
-      "FlashAttention only supports fp16, bf16, and fp8_e4m3 data type");
-  if (dprops->major < 9) {
-    TORCH_CHECK(
-        q_type == at::ScalarType::Half || q_type == at::ScalarType::BFloat16,
-        "FlashAttention on Ampere/Ada cards only supports fp16 and bf16 data type");
-  }
-  TORCH_CHECK(
-      k.scalar_type() == q_type, "query and key must have the same dtype");
-  TORCH_CHECK(
-      v.scalar_type() == q_type, "query and value must have the same dtype");
-
-  CHECK_DEVICE(q);
-  CHECK_DEVICE(k);
-  CHECK_DEVICE(v);
-
-  TORCH_CHECK(
-      q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
-  TORCH_CHECK(
-      k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
-  TORCH_CHECK(
-      v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
-
-  at::Tensor seq_offsets_;
-  bool const is_jagged = seq_offsets.has_value();
-  if (is_jagged) {
-    seq_offsets_ = seq_offsets.value();
-    CHECK_DEVICE(seq_offsets_);
-    CHECK_CONTIGUOUS(seq_offsets_);
-    TORCH_CHECK(
-        seq_offsets_.dtype() == torch::kInt32,
-        "seq_offsets_ must have dtype torch.int32");
-  }
-  at::Tensor num_targets_;
-  bool const has_multiple_targets = num_targets.has_value();
-  if (has_multiple_targets) {
-    num_targets_ = num_targets.value();
-    CHECK_DEVICE(num_targets_);
-    CHECK_CONTIGUOUS(num_targets_);
-    TORCH_CHECK(
-        num_targets_.dtype() == torch::kInt32,
-        "num_targets_ must have dtype torch.int32");
-  }
-  at::Tensor seq_offsets_q_;
-  bool const is_cross_attn = seq_offsets_q.has_value();
-  if (is_cross_attn) {
-    seq_offsets_q_ = seq_offsets_q.value();
-    CHECK_DEVICE(seq_offsets_q_);
-    CHECK_CONTIGUOUS(seq_offsets_q_);
-    TORCH_CHECK(
-        seq_offsets_q_.dtype() == torch::kInt32,
-        "seq_offsets_q_ must have dtype torch.int32");
-  } else {
-    max_q_len = max_seq_len;
-  }
-  at::Tensor attn_scale_;
-  bool scalar_scale = true;
-  bool const has_attn_scale = attn_scale.has_value();
-  if (has_attn_scale) {
-    attn_scale_ = attn_scale.value();
-    scalar_scale = attn_scale_.numel() == num_groups;
-    CHECK_DEVICE(attn_scale_);
-    TORCH_CHECK(
-        attn_scale_.dtype() == torch::kFloat32,
-        "attn_scale_ must have dtype torch.float32");
-  }
-  at::Tensor max_seq_len_tensor_;
-  at::Tensor contextual_seq_len_tensor_;
-  at::Tensor max_attn_len_tensor_;
-  at::Tensor min_full_attn_seq_len_tensor_;
-  if (num_groups > 1) {
-    TORCH_CHECK(
-        max_seq_len_tensor.has_value(),
-        "max_seq_len_tensor cannot be empty for num_groups > 1.");
-    max_seq_len_tensor_ = max_seq_len_tensor.value();
-    CHECK_DEVICE(max_seq_len_tensor_);
-    TORCH_CHECK(max_seq_len_tensor_.dtype() == torch::kInt32);
-    if (!is_cross_attn) {
-      TORCH_CHECK(
-          contextual_seq_len_tensor.has_value(),
-          "contextual_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
-      TORCH_CHECK(
-          max_attn_len_tensor.has_value(),
-          "max_attn_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
-      TORCH_CHECK(
-          min_full_attn_seq_len_tensor.has_value(),
-          "min_full_attn_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
-      contextual_seq_len_tensor_ = contextual_seq_len_tensor.value();
-      max_attn_len_tensor_ = max_attn_len_tensor.value();
-      min_full_attn_seq_len_tensor_ = min_full_attn_seq_len_tensor.value();
-      CHECK_DEVICE(contextual_seq_len_tensor_);
-      CHECK_DEVICE(max_attn_len_tensor_);
-      CHECK_DEVICE(min_full_attn_seq_len_tensor_);
-      TORCH_CHECK(contextual_seq_len_tensor_.dtype() == torch::kInt32);
-      TORCH_CHECK(max_attn_len_tensor_.dtype() == torch::kInt32);
-      TORCH_CHECK(min_full_attn_seq_len_tensor_.dtype() == torch::kInt32);
-    }
-  }
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-  if (is_jagged && has_multiple_targets) {
-    auto uih_lengths = seq_offsets_.slice(0, 1)
-                           .sub(seq_offsets_.slice(0, 0, -1))
-                           .sub(num_targets_);
-    TORCH_CHECK(
-        (uih_lengths.gt(0)).sum().item<int64_t>() == num_targets_.size(0),
-        "some uih seqlen is 0");
-    TORCH_CHECK(
-        (uih_lengths.greater_equal(contextual_seq_len)).sum().item<int64_t>() ==
-            num_targets_.size(0),
-        "some uih seqlen is less than contextual_seq_len");
-  }
-#endif
-  TORCH_CHECK(
-      q.size(-1) == k.size(-1) && k.size(-1) == v.size(-1),
-      "only attndim == hidden_dim is supported");
-
-  auto const sizes_q = q.sizes();
-  auto const sizes_k = k.sizes();
-  const int batch_size = !is_jagged ? sizes_q[0] : seq_offsets_.size(0) - 1;
-  TORCH_CHECK(
-      batch_size % num_groups == 0, "batch_size not divisible by num_groups");
-  int total_seq_len_q = !is_jagged ? batch_size * max_q_len : sizes_q[0];
-  int total_seq_len_kv = !is_jagged ? batch_size * max_seq_len : sizes_k[0];
-  int num_heads = q.size(-2);
-  int const qk_head_size = q.size(-1);
-  int const v_head_size = v.size(-1);
-  int const max_headdim = get_max_headdim();
-  TORCH_CHECK(
-      qk_head_size <= max_headdim && v_head_size <= max_headdim,
-      "FlashAttention forward only supports head dimension at most " +
-          std::to_string(max_headdim));
-  TORCH_CHECK(max_attn_len >= 0, "max_attn_len must be at least 0");
-  TORCH_CHECK(
-      min_full_attn_seq_len >= 0, "min_full_attn_seq_len must be at least 0");
-  TORCH_CHECK(contextual_seq_len >= 0, "contextual_seq_len must be at least 0");
-  if (max_attn_len > 0) {
-    TORCH_CHECK(
-        min_full_attn_seq_len > 0,
-        "min_full_attn_seq_len=0 not supported when max_attn_len > 0");
-  }
-  TORCH_CHECK(
-      0 == num_softmax_heads || num_softmax_heads == num_heads,
-      "num_softmax_heads must be either 0 or num_heads");
-  if (!is_jagged) {
-    CHECK_SHAPE(q, batch_size, max_q_len, num_heads, qk_head_size);
-    CHECK_SHAPE(k, batch_size, max_seq_len, num_heads, qk_head_size);
-    CHECK_SHAPE(v, batch_size, max_seq_len, num_heads, v_head_size);
-  } else {
-    CHECK_SHAPE(q, total_seq_len_q, num_heads, qk_head_size);
-    CHECK_SHAPE(k, total_seq_len_kv, num_heads, qk_head_size);
-    CHECK_SHAPE(v, total_seq_len_kv, num_heads, v_head_size);
-    CHECK_SHAPE(seq_offsets_, batch_size + 1);
-  }
-  if (has_multiple_targets) {
-    CHECK_SHAPE(num_targets_, batch_size);
-  }
-  if (is_cross_attn) {
-    CHECK_SHAPE(seq_offsets_q_, batch_size + 1);
-  }
-
-  int const alignment = q_type == torch::kFloat8_e4m3fn ? 16 : 8;
-  TORCH_CHECK(
-      qk_head_size % alignment == 0 && v_head_size % alignment == 0,
-      "head_size should be a multiple of " + std::to_string(alignment));
-
-  auto opts = q.options();
-  auto out_type = q_type == at::ScalarType::Float8_e4m3fn
-      ? at::ScalarType::BFloat16
-      : q_type;
-  at::Tensor out;
-  if (!is_jagged) {
-    out = torch::empty(
-        {batch_size, max_q_len, num_heads, v_head_size}, opts.dtype(out_type));
-  } else {
-    out = torch::empty(
-        {total_seq_len_q, num_heads, v_head_size}, opts.dtype(out_type));
-  }
-  std::optional<at::Tensor> softmax_lse = std::nullopt;
-
-  // Early return for empty sequences to avoid TMA descriptor
-  // initialization failure
-  if (total_seq_len_kv == 0 || total_seq_len_q == 0) {
-    return {out, std::nullopt};
-  }
-
-  if (num_softmax_heads > 0) {
-    if (!is_jagged) {
-      softmax_lse = torch::empty(
-          {batch_size, num_softmax_heads, max_q_len}, opts.dtype(at::kFloat));
-    } else {
-      softmax_lse = torch::empty(
-          {num_softmax_heads, total_seq_len_q}, opts.dtype(at::kFloat));
-    }
-  }
-
-  // Otherwise the kernel will be launched from cuda:0 device
-  // Cast to char to avoid compiler warning about narrowing
-  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
-  hstu::Flash_fwd_params params;
-  hstu::set_params_fprop(
-      params,
-      batch_size,
-      total_seq_len_kv,
-      total_seq_len_q,
-      max_seq_len,
-      max_q_len,
-      num_heads,
-      qk_head_size,
-      v_head_size,
-      q,
-      k,
-      v,
-      out,
-      !is_jagged ? nullptr : seq_offsets_.data_ptr(),
-      !has_multiple_targets ? nullptr : num_targets_.data_ptr(),
-      !has_attn_scale ? nullptr : attn_scale_.data_ptr(),
-      !is_cross_attn ? nullptr : seq_offsets_q_.data_ptr(),
-      (num_softmax_heads == 0) ? nullptr : softmax_lse.value().data_ptr(),
-      num_groups > 1 ? max_seq_len_tensor_.data_ptr() : nullptr,
-      ((num_groups > 1) && (!is_cross_attn))
-          ? contextual_seq_len_tensor_.data_ptr()
-          : nullptr,
-      ((num_groups > 1) && (!is_cross_attn)) ? max_attn_len_tensor_.data_ptr()
-                                             : nullptr,
-      ((num_groups > 1) && (!is_cross_attn))
-          ? min_full_attn_seq_len_tensor_.data_ptr()
-          : nullptr,
-      num_groups,
-      causal,
-      alpha,
-      scalar_scale,
-      max_attn_len,
-      min_full_attn_seq_len,
-      contextual_seq_len,
-      num_softmax_heads,
-      training,
-      sm_margin);
-  at::Tensor tile_count_semaphore;
-  // We don't use the persistent scheduler if not jagged
-  bool const persistent_scheduler = params.arch >= 90
-      ? (params.is_causal || params.is_local || is_jagged)
-      : (params.is_causal || is_jagged);
-  if (persistent_scheduler) {
-    tile_count_semaphore = torch::zeros({1}, opts.dtype(torch::kInt32));
-    params.tile_count_semaphore = tile_count_semaphore.data_ptr<int>();
-  } else {
-    params.tile_count_semaphore = nullptr;
-  }
-
-  if (q_type == at::ScalarType::Float8_e4m3fn) {
-    if (q_descale.has_value()) {
-      auto q_descale_ = q_descale.value();
-      CHECK_DEVICE(q_descale_);
-      CHECK_SHAPE(q_descale_, batch_size, num_heads);
-      params.q_descale_ptr = q_descale_.data_ptr<float>();
-      params.q_descale_batch_stride = q_descale_.stride(0);
-      params.q_descale_head_stride = q_descale_.stride(1);
-    } else {
-      params.q_descale_ptr = nullptr;
-    }
-    if (k_descale.has_value()) {
-      auto k_descale_ = k_descale.value();
-      CHECK_DEVICE(k_descale_);
-      CHECK_SHAPE(k_descale_, batch_size, num_heads);
-      params.k_descale_ptr = k_descale_.data_ptr<float>();
-      params.k_descale_batch_stride = k_descale_.stride(0);
-      params.k_descale_head_stride = k_descale_.stride(1);
-    } else {
-      params.k_descale_ptr = nullptr;
-    }
-    if (v_descale.has_value()) {
-      auto v_descale_ = v_descale.value();
-      CHECK_DEVICE(v_descale_);
-      CHECK_SHAPE(v_descale_, batch_size, num_heads);
-      params.v_descale_ptr = v_descale_.data_ptr<float>();
-      params.v_descale_batch_stride = v_descale_.stride(0);
-      params.v_descale_head_stride = v_descale_.stride(1);
-    } else {
-      params.v_descale_ptr = nullptr;
-    }
-  }
-
-#ifdef FLASHATTENTION_DISABLE_LOCAL
-  TORCH_CHECK(
-      !params.is_local,
-      "This flash attention build does not support local attention.");
-#endif
-
-  if (total_seq_len_q > 0 && num_heads > 0) {
-    auto stream = at::cuda::getCurrentCUDAStream().stream();
-    run_mha_fwd(params, stream);
-  }
-  return {out, softmax_lse};
-}
-
-void run_mha_bwd(hstu::Flash_bwd_params& params, cudaStream_t stream) {
-#ifndef FLASHATTENTION_DISABLE_BACKWARD
-  // FP16_SWITCH(!params.is_bf16, [&] {
-  //     HEADDIM_SWITCH(params.d, [&] {
-  //         hstu::run_mha_bwd_<elem_type, kHeadDim>(params, stream);
-  //     });
-  // });
-  ARCH_SWITCH(params.arch, Arch, [&] {
-    BOOL_SWITCH(params.num_softmax_heads == params.h, Softmax, [&] {
-      if (!params.is_bf16) {
-#ifndef FLASHATTENTION_DISABLE_FP16
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-        if (params.qk_d <= 64) {
-          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 64, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-        if (params.qk_d <= 96) {
-          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 96, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-        if (params.qk_d <= 128) {
-          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 128, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-        if (params.qk_d <= 192) {
-          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 192, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-        if (params.qk_d <= 256) {
-          return hstu::run_mha_bwd_<Arch, cutlass::half_t, 256, Softmax>(
-              params, stream);
-        }
-#endif
-#else
-                TORCH_CHECK(false, "This flash attention build does not support FP16.");
-#endif
-      } else {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-        if (params.qk_d <= 64) {
-          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 64, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-        if (params.qk_d <= 96) {
-          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 96, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-        if (params.qk_d <= 128) {
-          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 128, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-        if (params.qk_d <= 192) {
-          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 192, Softmax>(
-              params, stream);
-        }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-        if (params.qk_d <= 256) {
-          return hstu::run_mha_bwd_<Arch, cutlass::bfloat16_t, 256, Softmax>(
-              params, stream);
-        }
-#endif
-      }
-    });
-  });
-#endif
-}
-
-std::vector<at::Tensor> hstu_mha_bwd(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& dout,
-    at::Tensor& q,
-    at::Tensor& k,
-    at::Tensor& v,
-    at::Tensor& dq,
-    at::Tensor& dk,
-    at::Tensor& dv,
-    at::Tensor& out,
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    bool sort_by_length,
-    bool const deterministic,
-    const int64_t sm_margin,
-    int64_t max_q_len,
-    const std::optional<at::Tensor>& seq_offsets_q,
-    int64_t num_softmax_heads,
-    const std::optional<at::Tensor>& softmax_lse,
-    const std::optional<at::Tensor>& max_seq_len_tensor,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor,
-    const std::optional<at::Tensor>& max_attn_len_tensor,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
-    int64_t num_groups) {
-#ifdef FLASHATTENTION_DISABLE_BACKWARD
-  TORCH_CHECK(false, "This flash attention build does not support backward.");
-#endif
-
-  auto dprops = at::cuda::getCurrentDeviceProperties();
-  bool is_sm9x = dprops->major >= 9;
-  TORCH_CHECK(is_sm9x, "HSTU Attention only supports Hopper GPUs or newer.");
-
-  q = switch_to_contiguous_if_needed(q);
-  k = switch_to_contiguous_if_needed(k);
-  v = switch_to_contiguous_if_needed(v);
-  out = switch_to_contiguous_if_needed(out);
-  dout = switch_to_contiguous_if_needed(dout);
-
-  auto q_type = q.dtype();
-  TORCH_CHECK(
-      q_type == torch::kFloat16 || q_type == torch::kBFloat16,
-      "FlashAttention only support fp16 and bf16 data type");
-  TORCH_CHECK(k.dtype() == q_type, "query and key must have the same dtype");
-  TORCH_CHECK(v.dtype() == q_type, "query and value must have the same dtype");
-  TORCH_CHECK(
-      dout.dtype() == q_type, "query and dout must have the same dtype");
-
-  CHECK_DEVICE(q);
-  CHECK_DEVICE(k);
-  CHECK_DEVICE(v);
-  CHECK_DEVICE(dout);
-
-  TORCH_CHECK(
-      q.stride(-1) == 1, "Input tensor must have contiguous last dimension");
-  TORCH_CHECK(
-      k.stride(-1) == 1, "Input tensor must have contiguous last dimension");
-  TORCH_CHECK(
-      v.stride(-1) == 1, "Input tensor must have contiguous last dimension");
-  TORCH_CHECK(
-      dout.stride(-1) == 1, "dout tensor must have contiguous last dimension");
-
-  at::Tensor seq_offsets_;
-  bool const is_jagged = seq_offsets.has_value();
-  if (is_jagged) {
-    seq_offsets_ = seq_offsets.value();
-    CHECK_DEVICE(seq_offsets_);
-    CHECK_CONTIGUOUS(seq_offsets_);
-    TORCH_CHECK(
-        seq_offsets_.dtype() == torch::kInt32,
-        "seq_offsets_ must have dtype torch.int32");
-  }
-  at::Tensor sort_by_length_indices_;
-  if (sort_by_length && is_jagged) {
-    auto seq_lengths =
-        seq_offsets_.slice(0, 1).sub(seq_offsets_.slice(0, 0, -1));
-    std::tuple<torch::Tensor, torch::Tensor> sort_result = torch::sort(
-        seq_lengths, false /*stable*/, 0 /*dim*/, true /*descending*/);
-    sort_by_length_indices_ = std::get<1>(sort_result).to(torch::kInt32);
-    CHECK_DEVICE(sort_by_length_indices_);
-    CHECK_CONTIGUOUS(sort_by_length_indices_);
-    TORCH_CHECK(
-        sort_by_length_indices_.dtype() == torch::kInt32,
-        "sort_by_length_indices_ must have dtype torch.int32");
-  }
-  at::Tensor num_targets_;
-  bool const has_multiple_targets = num_targets.has_value();
-  if (has_multiple_targets) {
-    num_targets_ = num_targets.value();
-    CHECK_DEVICE(num_targets_);
-    CHECK_CONTIGUOUS(num_targets_);
-    TORCH_CHECK(
-        num_targets_.dtype() == torch::kInt32,
-        "num_targets_ must have dtype torch.int32");
-  }
-  at::Tensor attn_scale_;
-  bool scalar_scale = true;
-  bool const has_attn_scale = attn_scale.has_value();
-  if (has_attn_scale) {
-    attn_scale_ = attn_scale.value();
-    scalar_scale = attn_scale_.numel() == num_groups;
-    CHECK_DEVICE(attn_scale_);
-    TORCH_CHECK(
-        attn_scale_.dtype() == torch::kFloat32,
-        "attn_scale_ must have dtype torch.float32");
-  }
-  at::Tensor seq_offsets_q_;
-  bool const is_cross_attn = seq_offsets_q.has_value();
-  if (is_cross_attn) {
-    seq_offsets_q_ = seq_offsets_q.value();
-    CHECK_DEVICE(seq_offsets_q_);
-    CHECK_CONTIGUOUS(seq_offsets_q_);
-    TORCH_CHECK(
-        seq_offsets_q_.dtype() == torch::kInt32,
-        "seq_offsets_q_ must have dtype torch.int32");
-  } else {
-    max_q_len = max_seq_len;
-  }
-  at::Tensor max_seq_len_tensor_;
-  at::Tensor contextual_seq_len_tensor_;
-  at::Tensor max_attn_len_tensor_;
-  at::Tensor min_full_attn_seq_len_tensor_;
-  if (num_groups > 1) {
-    TORCH_CHECK(
-        max_seq_len_tensor.has_value(),
-        "max_seq_len_tensor cannot be empty for num_groups > 1.");
-    max_seq_len_tensor_ = max_seq_len_tensor.value();
-    CHECK_DEVICE(max_seq_len_tensor_);
-    TORCH_CHECK(max_seq_len_tensor_.dtype() == torch::kInt32);
-    if (!is_cross_attn) {
-      TORCH_CHECK(
-          contextual_seq_len_tensor.has_value(),
-          "contextual_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
-      TORCH_CHECK(
-          max_attn_len_tensor.has_value(),
-          "max_attn_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
-      TORCH_CHECK(
-          min_full_attn_seq_len_tensor.has_value(),
-          "min_full_attn_seq_len_tensor cannot be empty for num_groups > 1 and not cross_attn.");
-      contextual_seq_len_tensor_ = contextual_seq_len_tensor.value();
-      max_attn_len_tensor_ = max_attn_len_tensor.value();
-      min_full_attn_seq_len_tensor_ = min_full_attn_seq_len_tensor.value();
-      CHECK_DEVICE(contextual_seq_len_tensor_);
-      CHECK_DEVICE(max_attn_len_tensor_);
-      CHECK_DEVICE(min_full_attn_seq_len_tensor_);
-      TORCH_CHECK(contextual_seq_len_tensor_.dtype() == torch::kInt32);
-      TORCH_CHECK(max_attn_len_tensor_.dtype() == torch::kInt32);
-      TORCH_CHECK(min_full_attn_seq_len_tensor_.dtype() == torch::kInt32);
-    }
-  }
-  auto const sizes_q = q.sizes();
-  auto const sizes_kv = k.sizes();
-  int const batch_size = !is_jagged ? sizes_q[0] : seq_offsets_.size(0) - 1;
-  TORCH_CHECK(
-      batch_size % num_groups == 0, "batch_size not divisible by num_groups");
-  if (!is_jagged) {
-    max_seq_len = sizes_kv[1];
-  }
-  int const total_seq_len_q = !is_jagged ? batch_size * sizes_q[1] : sizes_q[0];
-  int const total_seq_len_kv =
-      !is_jagged ? batch_size * sizes_kv[1] : sizes_kv[0];
-  int const num_heads = q.size(-2);
-  int const qk_head_size = q.size(-1);
-  int const v_head_size = v.size(-1);
-  TORCH_CHECK(
-      qk_head_size % 8 == 0 && v_head_size % 8 == 0,
-      "head_size should be a multiple of 8");
-  int const max_headdim = get_max_headdim();
-  TORCH_CHECK(
-      qk_head_size <= max_headdim && v_head_size <= max_headdim,
-      "FlashAttention backward only supports head dimension at most " +
-          std::to_string(max_headdim));
-  TORCH_CHECK(max_attn_len >= 0, "max_attn_len must be at least 0");
-  TORCH_CHECK(
-      min_full_attn_seq_len >= 0, "min_full_attn_seq_len must be at least 0");
-  TORCH_CHECK(contextual_seq_len >= 0, "contextual_seq_len must be at least 0");
-  if (!is_jagged) {
-    CHECK_SHAPE(q, batch_size, max_q_len, num_heads, qk_head_size);
-    CHECK_SHAPE(k, batch_size, max_seq_len, num_heads, qk_head_size);
-    CHECK_SHAPE(v, batch_size, max_seq_len, num_heads, v_head_size);
-    CHECK_SHAPE(dout, batch_size, max_q_len, num_heads, v_head_size);
-    CHECK_SHAPE(dq, batch_size, max_q_len, num_heads, qk_head_size);
-    CHECK_SHAPE(dk, batch_size, max_seq_len, num_heads, qk_head_size);
-    CHECK_SHAPE(dv, batch_size, max_seq_len, num_heads, v_head_size);
-  } else {
-    CHECK_SHAPE(q, total_seq_len_q, num_heads, qk_head_size);
-    CHECK_SHAPE(k, total_seq_len_kv, num_heads, qk_head_size);
-    CHECK_SHAPE(v, total_seq_len_kv, num_heads, v_head_size);
-    CHECK_SHAPE(dout, total_seq_len_q, num_heads, v_head_size);
-    CHECK_SHAPE(dq, total_seq_len_q, num_heads, qk_head_size);
-    CHECK_SHAPE(dk, total_seq_len_kv, num_heads, qk_head_size);
-    CHECK_SHAPE(dv, total_seq_len_kv, num_heads, v_head_size);
-    CHECK_SHAPE(seq_offsets_, batch_size + 1);
-  }
-  if (has_multiple_targets) {
-    CHECK_SHAPE(num_targets_, batch_size);
-  }
-  if (is_cross_attn) {
-    CHECK_SHAPE(seq_offsets_q_, batch_size + 1);
-  }
-  int const arch = at::cuda::getCurrentDeviceProperties()->major * 10 +
-      at::cuda::getCurrentDeviceProperties()->minor;
-  int const qk_head_size_rounded = round_up_headdim(qk_head_size);
-  int const v_head_size_rounded = round_up_headdim(v_head_size);
-  // Very important that these match the kernel configs
-  bool const is_local = max_attn_len > 0;
-  int const kBlockM =
-      hstu::kBlockM_bwd(arch, qk_head_size_rounded, causal, is_local);
-  auto round_multiple = [](int x, int m) { return (x + m - 1) / m * m; };
-  int const max_q_len_rounded = round_multiple(max_q_len, kBlockM);
-  int const total_seq_len_q_padded_rounded =
-      round_multiple(total_seq_len_q + batch_size * kBlockM, kBlockM);
-
-  TORCH_CHECK(dq.dtype() == q_type, "dq must have the same dtype as q");
-  CHECK_DEVICE(dq);
-  TORCH_CHECK(dq.stride(-1) == 1, "dq must have contiguous last dimension");
-  if (!is_jagged) {
-    CHECK_SHAPE(dq, batch_size, max_q_len, num_heads, qk_head_size);
-  } else {
-    CHECK_SHAPE(dq, total_seq_len_q, num_heads, qk_head_size);
-  }
-  TORCH_CHECK(dk.dtype() == q_type, "dk must have the same dtype as q");
-  CHECK_DEVICE(dk);
-  TORCH_CHECK(dk.stride(-1) == 1, "dk must have contiguous last dimension");
-  if (!is_jagged) {
-    CHECK_SHAPE(dk, batch_size, max_seq_len, num_heads, qk_head_size);
-  } else {
-    CHECK_SHAPE(dk, total_seq_len_kv, num_heads, qk_head_size);
-  }
-  TORCH_CHECK(dv.dtype() == q_type, "dv must have the same dtype as q");
-  CHECK_DEVICE(dv);
-  TORCH_CHECK(dv.stride(-1) == 1, "dv must have contiguous last dimension");
-  if (!is_jagged) {
-    CHECK_SHAPE(dv, batch_size, max_seq_len, num_heads, v_head_size);
-  } else {
-    CHECK_SHAPE(dv, total_seq_len_kv, num_heads, v_head_size);
-  }
-
-  // Otherwise the kernel will be launched from cuda:0 device
-  // Cast to char to avoid compiler warning about narrowing
-  at::cuda::CUDAGuard device_guard{(char)q.get_device()};
-  auto opts = q.options();
-
-  at::Tensor dq_accum;
-  if (!is_jagged) {
-    dq_accum = torch::empty(
-        {batch_size, num_heads, max_q_len_rounded * qk_head_size_rounded},
-        opts.dtype(at::kFloat));
-  } else {
-    dq_accum = torch::empty(
-        {num_heads, total_seq_len_q_padded_rounded * qk_head_size_rounded},
-        opts.dtype(at::kFloat));
-  }
-  at::Tensor softmax_d, softmax_lse_log2;
-  if (!is_jagged) {
-    // Need softmax_d to have seqlen_q_rounded since we want its address to be
-    // aligned by 16/8 bytes for TMA / LDG.64
-    softmax_d = torch::empty(
-        {batch_size, num_softmax_heads, max_q_len_rounded},
-        opts.dtype(at::kFloat));
-    softmax_lse_log2 = torch::empty(
-        {batch_size, num_softmax_heads, max_q_len_rounded},
-        opts.dtype(at::kFloat));
-  } else {
-    softmax_d = torch::empty(
-        {num_softmax_heads, total_seq_len_q_padded_rounded},
-        opts.dtype(at::kFloat));
-    softmax_lse_log2 = torch::empty(
-        {num_softmax_heads, total_seq_len_q_padded_rounded},
-        opts.dtype(at::kFloat));
-  }
-
-  // Early return for empty sequences; analog to TMA prevention guard
-  // in hstu_mha_fwd
-  if (total_seq_len_kv == 0 || total_seq_len_q == 0) {
-    return {dq, dk, dv};
-  }
-
-  hstu::Flash_bwd_params params;
-  hstu::set_params_dgrad(
-      params,
-      batch_size,
-      total_seq_len_kv,
-      total_seq_len_q,
-      max_seq_len,
-      max_q_len,
-      max_q_len_rounded,
-      num_heads,
-      qk_head_size,
-      v_head_size,
-      qk_head_size_rounded,
-      v_head_size_rounded,
-      q,
-      k,
-      v,
-      out,
-      dout,
-      dq,
-      dk,
-      dv,
-      dq_accum.data_ptr(),
-      !is_jagged ? nullptr : seq_offsets_.data_ptr(),
-      !has_multiple_targets ? nullptr : num_targets_.data_ptr(),
-      !has_attn_scale ? nullptr : attn_scale_.data_ptr(),
-      !(sort_by_length && is_jagged) ? nullptr
-                                     : sort_by_length_indices_.data_ptr(),
-      !is_cross_attn ? nullptr : seq_offsets_q_.data_ptr(),
-      num_softmax_heads == 0 ? nullptr : softmax_lse.value().data_ptr(),
-      num_softmax_heads == 0 ? nullptr : softmax_d.data_ptr(),
-      num_softmax_heads == 0 ? nullptr : softmax_lse_log2.data_ptr(),
-      num_groups > 1 ? max_seq_len_tensor_.data_ptr() : nullptr,
-      ((num_groups > 1) && (!is_cross_attn))
-          ? contextual_seq_len_tensor_.data_ptr()
-          : nullptr,
-      ((num_groups > 1) && (!is_cross_attn)) ? max_attn_len_tensor_.data_ptr()
-                                             : nullptr,
-      ((num_groups > 1) && (!is_cross_attn))
-          ? min_full_attn_seq_len_tensor_.data_ptr()
-          : nullptr,
-      num_groups,
-      scalar_scale,
-      causal,
-      alpha,
-      max_attn_len,
-      min_full_attn_seq_len,
-      contextual_seq_len,
-      num_softmax_heads,
-      deterministic,
-      sm_margin);
-
-  // auto tile_count_semaphore = (params.is_causal || params.is_local) ?
-  // torch::zeros({1}, opts.dtype(torch::kInt32)) : torch::empty({1},
-  // opts.dtype(torch::kInt32)); params.tile_count_semaphore =
-  // tile_count_semaphore.data_ptr<int>(); Will be zero'ed out in the
-  // backward preprocess kernel
-  at::Tensor dq_semaphore = torch::empty(
-      {(max_seq_len + kBlockM - 1) / kBlockM, batch_size, num_heads},
-      opts.dtype(torch::kInt32));
-  params.dq_semaphore = dq_semaphore.data_ptr<int>();
-
-#ifdef FLASHATTENTION_DISABLE_LOCAL
-  TORCH_CHECK(
-      !params.is_local,
-      "This flash attention build does not support local attention.");
-#endif
-
-  if (total_seq_len_q > 0 && num_heads > 0) {
-    auto stream = at::cuda::getCurrentCUDAStream().stream();
-    run_mha_bwd(params, stream);
-  }
-  return {dq, dk, dv};
-}
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h
deleted file mode 100644
index 98ca009f5..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-// Include these 2 headers instead of torch/extension.h since we don't need all
-// of the torch headers.
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <c10/util/Optional.h>
-
-#include <torch/nn/functional.h>
-
-#include <torch/library.h> // @manual
-
-#define CHECK_DEVICE(x) TORCH_CHECK(x.is_cuda(), #x " must be on CUDA")
-#define CHECK_SHAPE(x, ...)                           \
-  TORCH_CHECK(                                        \
-      x.sizes() == torch::IntArrayRef({__VA_ARGS__}), \
-      #x " must have shape (" #__VA_ARGS__ ")")
-#define CHECK_CONTIGUOUS(x) \
-  TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
-
-inline int round_up_headdim(int head_size) {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-  if (head_size <= 64) {
-    return 64;
-  }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-  if (head_size <= 96) {
-    return 96;
-  }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-  if (head_size <= 128) {
-    return 128;
-  }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-  if (head_size <= 192) {
-    return 192;
-  }
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-  if (head_size <= 256) {
-    return 256;
-  }
-#endif
-  return 256;
-}
-
-inline int get_max_headdim() {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-  return 256;
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-  return 192;
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-  return 128;
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-  return 96;
-#endif
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-  return 64;
-#endif
-  return 0;
-}
-
-namespace hstu {
-
-std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-    const std::optional<at::Tensor>& k_descale, // (b, h_k)
-    const std::optional<at::Tensor>& v_descale, // (b, h_k)
-    const int64_t sm_margin = 0,
-    int64_t max_q_len = 0,
-    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
-    int64_t num_softmax_heads = 0,
-    bool training = true,
-    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
-        std::nullopt,
-    int64_t num_groups = 1);
-
-std::vector<at::Tensor> hstu_mha_bwd(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& dout,
-    at::Tensor& q,
-    at::Tensor& k,
-    at::Tensor& v,
-    at::Tensor& dq,
-    at::Tensor& dk,
-    at::Tensor& dv,
-    at::Tensor& out,
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    bool sort_by_length,
-    bool const deterministic,
-    const int64_t sm_margin = 0,
-    int64_t max_q_len = 0,
-    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
-    int64_t num_softmax_heads = 0,
-    const std::optional<at::Tensor>& softmax_lse = std::nullopt,
-    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
-        std::nullopt,
-    int64_t num_groups = 1);
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp
deleted file mode 100644
index 30d4f792c..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.cpp
+++ /dev/null
@@ -1,172 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#include <c10/util/Optional.h>
-#include <torch/nn/functional.h>
-
-#include "flash_common_cpu.h"
-
-namespace hstu {
-
-std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_meta(
-    const at::SymInt max_seq_len,
-    double alpha,
-    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-    const std::optional<at::Tensor>& k_descale, // (b, h_k)
-    const std::optional<at::Tensor>& v_descale, // (b, h_k)
-    const int64_t sm_margin,
-    int64_t max_q_len,
-    const std::optional<at::Tensor>& seq_offsets_q,
-    int64_t num_softmax_heads,
-    bool training,
-    const std::optional<at::Tensor>& max_seq_len_tensor,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor,
-    const std::optional<at::Tensor>& max_attn_len_tensor,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
-    int64_t num_groups) {
-  auto q_type = q.scalar_type();
-  auto const sizes = q.sym_sizes();
-  at::Tensor seq_offsets_;
-  bool const is_jagged = seq_offsets.has_value();
-  if (is_jagged) {
-    seq_offsets_ = seq_offsets.value();
-  }
-  const c10::SymInt batch_size =
-      !is_jagged ? sizes[0] : seq_offsets_.sym_sizes()[0] - 1;
-  auto total_seq_len = !is_jagged ? batch_size * max_seq_len : sizes[0];
-  const auto& num_heads = sizes[sizes.size() - 2];
-  auto v_head_size = v.sym_sizes()[v.sym_sizes().size() - 1];
-  auto out_type = q_type == at::ScalarType::Float8_e4m3fn
-      ? at::ScalarType::BFloat16
-      : q_type;
-  auto opts = q.options();
-
-  at::Tensor out;
-  if (!is_jagged) {
-    out = at::empty_symint(
-        {batch_size, max_seq_len, num_heads, v_head_size},
-        opts.dtype(out_type));
-  } else {
-    out = at::empty_symint(
-        {total_seq_len, num_heads, v_head_size}, opts.dtype(out_type));
-  }
-  return {out, std::nullopt};
-};
-
-std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_dummy(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-    const std::optional<at::Tensor>& k_descale, // (b, h_k)
-    const std::optional<at::Tensor>& v_descale, // (b, h_k)
-    const int64_t sm_margin,
-    const int64_t max_q_len,
-    const std::optional<at::Tensor>& seq_offsets_q,
-    int64_t num_softmax_heads,
-    bool training,
-    const std::optional<at::Tensor>& max_seq_len_tensor,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor,
-    const std::optional<at::Tensor>& max_attn_len_tensor,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
-    int64_t num_groups) {
-  auto q_type = q.scalar_type();
-  auto const sizes = q.sizes();
-  at::Tensor seq_offsets_;
-  bool const is_jagged = seq_offsets.has_value();
-  if (is_jagged) {
-    seq_offsets_ = seq_offsets.value();
-  }
-  const int batch_size = !is_jagged ? sizes[0] : seq_offsets_.size(0) - 1;
-  int total_seq_len = !is_jagged ? batch_size * max_seq_len : sizes[0];
-  int num_heads = q.size(-2);
-  //   int const qk_head_size = q.size(-1);
-  int const v_head_size = v.size(-1);
-  //   int const max_headdim = get_max_headdim();
-  auto out_type = q_type == at::ScalarType::Float8_e4m3fn
-      ? at::ScalarType::BFloat16
-      : q_type;
-  auto opts = q.options();
-
-  at::Tensor out;
-  if (!is_jagged) {
-    out = torch::empty(
-        {batch_size, max_seq_len, num_heads, v_head_size},
-        opts.dtype(out_type));
-  } else {
-    out = torch::empty(
-        {total_seq_len, num_heads, v_head_size}, opts.dtype(out_type));
-  }
-  return {out, std::nullopt};
-};
-
-std::vector<at::Tensor> hstu_mha_bwd_dummy(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& dout,
-    at::Tensor& q,
-    at::Tensor& k,
-    at::Tensor& v,
-    at::Tensor& dq,
-    at::Tensor& dk,
-    at::Tensor& dv,
-    at::Tensor& out,
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    bool sort_by_length,
-    bool const deterministic,
-    const int64_t sm_margin,
-    const int64_t max_q_len,
-    const std::optional<at::Tensor>& seq_offsets_q,
-    int64_t num_softmax_heads,
-    const std::optional<at::Tensor>& softmax_lse,
-    const std::optional<at::Tensor>& max_seq_len_tensor,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor,
-    const std::optional<at::Tensor>& max_attn_len_tensor,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor,
-    int64_t num_groups) {
-  return {dq, dk, dv};
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h
deleted file mode 100644
index 9d0e18a71..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_common_cpu.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#include <c10/util/Optional.h>
-#include <torch/library.h> // @manual
-#include <torch/nn/functional.h>
-
-namespace hstu {
-
-std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_dummy(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-    const std::optional<at::Tensor>& k_descale, // (b, h_k)
-    const std::optional<at::Tensor>& v_descale, // (b, h_k)
-    const int64_t sm_margin = 0,
-    int64_t max_q_len = 0,
-    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
-    int64_t num_softmax_heads = 0,
-    bool training = true,
-    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
-        std::nullopt,
-    int64_t num_groups = 1);
-
-std::vector<at::Tensor> hstu_mha_bwd_dummy(
-    int64_t max_seq_len,
-    double alpha,
-    at::Tensor& dout,
-    at::Tensor& q,
-    at::Tensor& k,
-    at::Tensor& v,
-    at::Tensor& dq,
-    at::Tensor& dk,
-    at::Tensor& dv,
-    at::Tensor& out,
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    bool sort_by_length,
-    bool const deterministic,
-    const int64_t sm_margin = 0,
-    int64_t max_q_len = 0,
-    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
-    int64_t num_softmax_heads = 0,
-    const std::optional<at::Tensor>& softmax_lse = std::nullopt,
-    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
-        std::nullopt,
-    int64_t num_groups = 1);
-
-std::tuple<at::Tensor, std::optional<at::Tensor>> hstu_mha_fwd_meta(
-    const at::SymInt max_seq_len,
-    double alpha,
-    at::Tensor& q, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& k, // (b, s, h, d) or (total_s, h, d)
-    at::Tensor& v, // (b, s, h, d) or (total_s, h, d)
-    const std::optional<at::Tensor>& seq_offsets,
-    bool causal,
-    const std::optional<at::Tensor>& num_targets,
-    const std::optional<at::Tensor>& attn_scale,
-    int64_t max_attn_len,
-    int64_t min_full_attn_seq_len,
-    int64_t contextual_seq_len,
-    const std::optional<at::Tensor>& q_descale, // (b, h_k), not (b, h)
-    const std::optional<at::Tensor>& k_descale, // (b, h_k)
-    const std::optional<at::Tensor>& v_descale, // (b, h_k)
-    const int64_t sm_margin = 0,
-    int64_t max_q_len = 0,
-    const std::optional<at::Tensor>& seq_offsets_q = std::nullopt,
-    int64_t num_softmax_heads = 0,
-    bool training = true,
-    const std::optional<at::Tensor>& max_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& contextual_seq_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& max_attn_len_tensor = std::nullopt,
-    const std::optional<at::Tensor>& min_full_attn_seq_len_tensor =
-        std::nullopt,
-    int64_t num_groups = 1);
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h
deleted file mode 100644
index 2e3d0916b..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_kernel_sm90.h
+++ /dev/null
@@ -1,511 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include "cute/tensor.hpp"
-
-#include <cutlass/arch/reg_reconfig.h>
-#include <cutlass/array.h>
-#include <cutlass/cutlass.h>
-#include <cutlass/kernel_hardware_info.h>
-#include <cutlass/numeric_conversion.h>
-#include <cutlass/numeric_types.h>
-#include <cutlass/pipeline/pipeline.hpp>
-
-#include "seqlen.h"
-#include "softmax.h"
-#include "tile_scheduler.h"
-#include "utils.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    bool Softmax,
-    class CollectiveMainloop_,
-    class CollectiveEpilogue_,
-    class TileScheduler_>
-class FlashAttnFwdSm90 {
- public:
-  // Type Aliases
-  using CollectiveMainloop = CollectiveMainloop_;
-  using CollectiveEpilogue = CollectiveEpilogue_;
-  static constexpr bool Is_FP8 = CollectiveMainloop::Is_FP8;
-  static constexpr bool Transpose_V = CollectiveMainloop::Transpose_V;
-  static constexpr bool Use_TMA_O = CollectiveEpilogue::Use_TMA_O;
-  static constexpr int NumProducerThreads =
-      CollectiveMainloop::NumProducerThreads;
-  using SeqlenInfo_t = typename CollectiveMainloop::SeqlenInfo_t;
-
-  // Mainloop derived types
-  using TileShape_MNK = typename CollectiveMainloop::TileShape_MNK;
-  using TiledMma0 = typename CollectiveMainloop::TiledMma0;
-  using TiledMma1 = typename CollectiveMainloop::TiledMma1;
-  using ArchTag = typename CollectiveMainloop::ArchTag;
-  using ClusterShape = typename CollectiveMainloop::ClusterShape;
-  using MainloopArguments = typename CollectiveMainloop::Arguments;
-  using MainloopParams = typename CollectiveMainloop::Params;
-  using BarrierQ = cutlass::arch::ClusterTransactionBarrier;
-
-  // Epilogue derived types
-  using EpilogueArguments = typename CollectiveEpilogue::Arguments;
-  using EpilogueParams = typename CollectiveEpilogue::Params;
-
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  using TileScheduler = TileScheduler_;
-  using TileSchedulerArguments = typename hstu::TileSchedulerArguments;
-  using TileSchedulerParams = typename TileScheduler::Params;
-
-  static constexpr uint32_t NumLoadWarpGroups = 1;
-  static constexpr uint32_t NumMmaWarpGroups =
-      CUTE_STATIC_V(size(TiledMma0{})) / cutlass::NumThreadsPerWarpGroup;
-  static constexpr uint32_t MaxThreadsPerBlock =
-      CUTE_STATIC_V(size(TiledMma0{})) +
-      (NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup);
-  static constexpr uint32_t MinBlocksPerMultiprocessor = 1;
-  static_assert(
-      NumMmaWarpGroups == 1 || NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
-
-  /// Register requirement for Load and Math WGs
-  // If we use cp.async to load K and V, we need more registers for the producer
-  // WG.
-  static constexpr uint32_t LoadRegisterRequirement =
-      NumMmaWarpGroups == 1 ? 56 : (NumMmaWarpGroups == 2 ? 24 : 32);
-  static constexpr uint32_t MmaRegisterRequirement =
-      NumMmaWarpGroups == 1 ? 256 : (NumMmaWarpGroups == 2 ? 240 : 160);
-  // If you want to print from the producer warp, you'd need to increase the
-  // number of registers Otherwise you'll get CUDA error. static constexpr
-  // uint32_t LoadRegisterRequirement = 40; static constexpr uint32_t
-  // MmaRegisterRequirement = NumMmaWarpGroups == 2 ? 232 : 152;
-
-  // Kernel level shared memory storage
-  // We overlap the shared memory for the mainloop and epilogue. However, we
-  // only want smem_o to overlap with smem_v and nothing else, so we'll pad in
-  // case sizeof(smem_o) > sizeof(smem_v).
-  static constexpr int mainloop_smem_padding_ =
-      int(sizeof(typename CollectiveEpilogue::TensorStorage)) -
-      int(sizeof(
-          decltype((typename CollectiveMainloop::TensorStorage{}).smem_v)));
-  static constexpr int mainloop_smem_padding =
-      mainloop_smem_padding_ < 0 ? 0 : mainloop_smem_padding_;
-  struct SharedStorage {
-    struct TensorStorage : cute::aligned_struct<128> {
-      union {
-        struct {
-          cute::array<uint32_t, mainloop_smem_padding / sizeof(uint32_t)>
-              padding_;
-          typename CollectiveMainloop::TensorStorage mainloop;
-        };
-        // We want smem_o to line up with the start of smem_v
-        typename CollectiveEpilogue::TensorStorage epilogue;
-      };
-    } tensors;
-
-    struct PipelineStorage : cute::aligned_struct<16> {
-      alignas(16) BarrierQ barrier_Q;
-      alignas(16) cutlass::arch::ClusterBarrier barrier_O;
-      alignas(16) typename CollectiveMainloop::MainloopPipelineK::SharedStorage
-          pipeline_k;
-      alignas(16) typename CollectiveMainloop::MainloopPipelineV::SharedStorage
-          pipeline_v;
-      alignas(16) typename CollectiveMainloop::MainloopPipelineVt::SharedStorage
-          pipeline_vt;
-      alignas(16) typename TileScheduler::SharedStorage smem_scheduler;
-    } pipelines;
-  };
-
-  static constexpr int SharedStorageSize = sizeof(SharedStorage);
-
-  // Device side arguments
-  struct Arguments {
-    MainloopArguments mainloop{};
-    EpilogueArguments epilogue{};
-    cutlass::KernelHardwareInfo hw_info{};
-    TileSchedulerArguments scheduler{};
-  };
-
-  // Kernel entry point API
-  struct Params {
-    MainloopParams mainloop{};
-    EpilogueParams epilogue{};
-    cutlass::KernelHardwareInfo hw_info{};
-    TileSchedulerParams scheduler{};
-  };
-
-  //
-  // Methods
-  //
-
-  // Convert to underlying arguments. In this case, a simple copy for the
-  // aliased type.
-  static Params to_underlying_arguments(Arguments const& args) {
-    CUTLASS_TRACE_HOST("to_underlying_arguments():");
-
-    // Get SM count if needed, otherwise use user supplied SM count
-    int sm_count = args.hw_info.sm_count;
-    if (sm_count <= 0) {
-      CUTLASS_TRACE_HOST(
-          "  WARNING: Arguments do not include a valid SM count.\n"
-          "  For optimal performance, populate the arguments KernelHardwareInfo struct with the SM count.");
-      sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
-          args.hw_info.device_id);
-    }
-
-    CUTLASS_TRACE_HOST(
-        "to_underlying_arguments(): Setting persistent grid SM count to "
-        << sm_count);
-
-    cutlass::KernelHardwareInfo hw_info{args.hw_info.device_id, sm_count};
-    return {
-        CollectiveMainloop::to_underlying_arguments(args.mainloop),
-        CollectiveEpilogue::to_underlying_arguments(args.epilogue),
-        hw_info,
-        TileScheduler::to_underlying_arguments(args.scheduler)};
-  }
-
-  // Computes the kernel launch grid shape based on runtime parameters
-  static dim3 get_grid_shape(Params const& params) {
-    return TileScheduler::get_grid_shape(
-        params.scheduler, params.hw_info.sm_count);
-  }
-
-  static dim3 get_block_shape() {
-    return dim3(MaxThreadsPerBlock, 1, 1);
-  }
-
-  CUTLASS_DEVICE
-  void operator()(Params const& params, char* smem_buf) {
-    static constexpr int NumMmaThreads =
-        NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup;
-    static constexpr int MmaThreadOffset =
-        NumLoadWarpGroups * cutlass::NumThreadsPerWarpGroup;
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-
-    using MainloopPipelineK = typename CollectiveMainloop::MainloopPipelineK;
-    using MainloopPipelineV = typename CollectiveMainloop::MainloopPipelineV;
-    using MainloopPipelineVt = typename CollectiveMainloop::MainloopPipelineVt;
-    using MainloopPipelineKVNew =
-        typename CollectiveMainloop::MainloopPipelineKVNew;
-    using PipelineState = typename CollectiveMainloop::PipelineState;
-    using PipelineParamsK = typename MainloopPipelineK::Params;
-    using PipelineParamsV = typename MainloopPipelineV::Params;
-    using PipelineParamsVt = typename MainloopPipelineVt::Params;
-    using PipelineParamsKVNew = typename MainloopPipelineKVNew::Params;
-
-    SharedStorage& shared_storage = *reinterpret_cast<SharedStorage*>(smem_buf);
-
-    int const lane_predicate = cute::elect_one_sync();
-    int const warp_idx = cutlass::canonical_warp_idx_sync();
-
-    // Issue Tma Descriptor Prefetch from a single thread
-    if (warp_idx == 0 && lane_predicate) {
-      CollectiveMainloop::prefetch_tma_descriptors(params.mainloop);
-      CollectiveEpilogue::prefetch_tma_descriptors(params.epilogue);
-    }
-
-    // Obtain warp index
-    int const warp_group_thread_idx =
-        threadIdx.x % cutlass::NumThreadsPerWarpGroup;
-    int warp_group_idx = cutlass::canonical_warp_group_idx();
-
-    if (warp_idx == 0 && lane_predicate) {
-      shared_storage.pipelines.barrier_Q.init(1 /*numThreads*/);
-      shared_storage.pipelines.barrier_O.init(
-          size(ClusterShape{}) *
-          (Use_TMA_O ? 1 : NumMmaThreads) /*numThreads*/);
-    }
-
-    // We're counting on pipeline_k to call cutlass::arch::fence_barrier_init();
-    PipelineParamsK pipeline_params_k;
-    pipeline_params_k.role = warp_group_idx == 0
-        ? MainloopPipelineK::ThreadCategory::Producer
-        : MainloopPipelineK::ThreadCategory::Consumer;
-    pipeline_params_k.transaction_bytes =
-        CollectiveMainloop::TmaTransactionBytesK;
-    pipeline_params_k.is_leader = warp_group_thread_idx == 0;
-    pipeline_params_k.num_consumers = NumMmaThreads;
-
-    MainloopPipelineK pipeline_k = [&] {
-      return MainloopPipelineK(
-          shared_storage.pipelines.pipeline_k,
-          pipeline_params_k,
-          ClusterShape{});
-    }();
-    // MainloopPipelineV pipeline_v(shared_storage.pipelines.pipeline_v,
-    // pipeline_params_v, ClusterShape{});
-    MainloopPipelineV pipeline_v = [&] {
-      if constexpr (!Transpose_V) {
-        static_assert(is_same_v<PipelineParamsK, PipelineParamsV>);
-        return MainloopPipelineV(
-            shared_storage.pipelines.pipeline_v,
-            pipeline_params_k,
-            ClusterShape{});
-      } else {
-        PipelineParamsV pipeline_params_v;
-        pipeline_params_v.role = warp_group_idx == 0
-            ? MainloopPipelineV::ThreadCategory::Producer
-            : MainloopPipelineV::ThreadCategory::Consumer;
-        pipeline_params_v.producer_arv_count = NumProducerThreads;
-        pipeline_params_v.consumer_arv_count = NumMmaThreads;
-        return MainloopPipelineV(
-            shared_storage.pipelines.pipeline_v, pipeline_params_v);
-      }
-    }();
-    static_assert(is_same_v<PipelineParamsK, PipelineParamsVt>);
-    // If we need to transpose V (e.g. FP8 and V is row-major), we use
-    // pipeline_vt for the TMA, then the producer WG will read from pipeline_vt
-    // and write to pipeline_v. If we don't need to transpose V, we use
-    // pipeline_v for the TMA, and pipeline_vt won't be used. Technically for
-    // pipeline_params_vt, warp0 of WG0 is the producer and all of WG0 are
-    // consumers. However, the thread role isn't used in the pipeline
-    // implementation.
-    MainloopPipelineVt pipeline_vt = [&] {
-      pipeline_params_k.num_consumers =
-          NumProducerThreads; // TMA_V is only consumed by the producer WG
-      return MainloopPipelineVt(
-          shared_storage.pipelines.pipeline_vt,
-          pipeline_params_k,
-          ClusterShape{});
-    }();
-
-    CollectiveMainloop collective_mainloop;
-    CollectiveEpilogue collective_epilogue;
-
-    // We need this to guarantee that the Pipeline init is visible to all
-    // producers and consumer blocks in the Cluster
-    if constexpr (size(ClusterShape{}) > 1) {
-      cute::cluster_arrive_relaxed();
-      cute::cluster_wait();
-    } else {
-      __syncthreads();
-    }
-
-    if (warp_group_idx == 0) { // Producer
-      cutlass::arch::warpgroup_reg_dealloc<LoadRegisterRequirement>();
-
-      // The pipelines for AppendKV and main attention are different, since e.g.
-      // main attention might use cp.async to load KV (if PagedKV) while
-      // AppendKV always uses TMA to load KV_new. Since the pipeline states are
-      // different, we have to manually sync to make sure the two pipelines
-      // don't race when accessing smem_k and smem_v.
-      PipelineState smem_pipe_write =
-          cutlass::make_producer_start_state<MainloopPipelineK>();
-      PipelineState smem_pipe_write_new =
-          cutlass::make_producer_start_state<MainloopPipelineKVNew>();
-      int work_idx = 0;
-
-      TileScheduler scheduler(
-          reinterpret_cast<typename TileScheduler::SharedStorage*>(
-              &shared_storage.pipelines.smem_scheduler));
-      int warp_idx_in_warpgroup =
-          __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
-      static constexpr bool SingleProducerWarp =
-          NumProducerThreads == cutlass::NumThreadsPerWarp;
-      if constexpr (SingleProducerWarp) {
-        if (warp_idx_in_warpgroup != 0) {
-          return;
-        }
-      }
-      if (!SingleProducerWarp && warp_idx_in_warpgroup != 0) {
-        scheduler.init_consumer();
-      }
-
-      // Load Q, K, V
-      for (auto work_tile_info = SingleProducerWarp ||
-                   warp_idx_in_warpgroup == 0
-               ? scheduler.template get_initial_work</*IsProducerWarp=*/true>(
-                     params.scheduler)
-               : scheduler.template get_initial_work</*IsProducerWarp=*/false>(
-                     params.scheduler);
-           work_tile_info.is_valid(params.scheduler);
-           work_tile_info = SingleProducerWarp || warp_idx_in_warpgroup == 0
-               ? scheduler.template get_next_work</*IsProducerWarp=*/true>(
-                     params.scheduler, work_tile_info)
-               : scheduler.template get_next_work</*IsProducerWarp=*/false>(
-                     params.scheduler, work_tile_info)) {
-        auto block_coord = work_tile_info.get_block_coord(params.scheduler);
-        SeqlenInfo_t seqlen_info{
-            get<2>(block_coord) /*bidb*/,
-            get<0>(params.mainloop.shape_Q),
-            get<0>(params.mainloop.shape_K),
-            params.mainloop.seq_offsets,
-            params.mainloop.seq_offsets_q,
-            params.mainloop.num_targets,
-        };
-        auto scheduler_prefetch = [&scheduler, &params, &work_tile_info]() {
-          scheduler.prefetch_next_work(params.scheduler, work_tile_info);
-        };
-        // pipeline_vt won't be used if we don't need to transpose V.
-        collective_mainloop.load(
-            params.mainloop,
-            pipeline_k,
-            pipeline_v,
-            pipeline_vt,
-            smem_pipe_write,
-            shared_storage,
-            scheduler_prefetch,
-            seqlen_info,
-            block_coord,
-            work_idx);
-      }
-      collective_mainloop.load_tail(
-          pipeline_k,
-          pipeline_v,
-          pipeline_vt,
-          smem_pipe_write,
-          shared_storage,
-          work_idx);
-    } else { // Consumer
-      cutlass::arch::warpgroup_reg_alloc<MmaRegisterRequirement>();
-
-      TileScheduler scheduler(
-          reinterpret_cast<typename TileScheduler::SharedStorage*>(
-              &shared_storage.pipelines.smem_scheduler));
-      // Initialize matmul objects.
-      TiledMma1 tiled_mma1;
-
-      PipelineState smem_pipe_read;
-      // We don't need separate variables smem_pipe_release_k and
-      // smem_pipe_release_v (like in Cutlass's gemm) because the read and
-      // release pipeline states are always the same.
-
-      scheduler.init_consumer();
-      collective_mainloop.mma_init();
-
-      int work_idx = 0;
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (auto work_tile_info =
-               scheduler.template get_initial_work</*IsProducerWarp=*/false>(
-                   params.scheduler);
-           work_tile_info.is_valid(params.scheduler);
-           work_tile_info =
-               scheduler.template get_next_work</*IsProducerWarp=*/false>(
-                   params.scheduler, work_tile_info)) {
-        // Attention output (GEMM-II) accumulator.
-        Tensor tOrO =
-            partition_fragment_C(tiled_mma1, select<0, 2>(TileShape_MNK{}));
-        // If there's tanh softcap, the scaling will be done before tanh.
-        auto block_coord = work_tile_info.get_block_coord(params.scheduler);
-        int const bidb = get<2>(block_coord);
-        int const bidh = get<1>(block_coord);
-        if constexpr (Is_FP8) {
-          int const bidh_kv = bidh;
-          float const q_descale = params.mainloop.ptr_q_descale == nullptr
-              ? 1.0f
-              : params.mainloop.ptr_q_descale
-                    [bidb * get<0>(params.mainloop.stride_q_descale) +
-                     bidh_kv * get<1>(params.mainloop.stride_q_descale)];
-          float const k_descale = params.mainloop.ptr_k_descale == nullptr
-              ? 1.0f
-              : params.mainloop.ptr_k_descale
-                    [bidb * get<0>(params.mainloop.stride_k_descale) +
-                     bidh_kv * get<1>(params.mainloop.stride_k_descale)];
-        }
-
-        SeqlenInfo_t seqlen_info{
-            bidb,
-            get<0>(params.mainloop.shape_Q),
-            get<0>(params.mainloop.shape_K),
-            params.mainloop.seq_offsets,
-            params.mainloop.seq_offsets_q,
-            params.mainloop.num_targets,
-        };
-        float alpha_log2 = params.mainloop.alpha_log2;
-        bool tile_valid;
-        if constexpr (Softmax) {
-          hstu::Softmax<
-              2 * (2 * kBlockM / NumMmaThreads),
-              /*Max_offset=*/!Is_FP8 ? 0 : 8>
-              softmax(alpha_log2);
-          tile_valid = collective_mainloop.mma_softmax(
-              params.mainloop,
-              pipeline_k,
-              pipeline_v,
-              smem_pipe_read,
-              tOrO,
-              softmax,
-              threadIdx.x - MmaThreadOffset,
-              work_idx,
-              seqlen_info,
-              block_coord,
-              shared_storage);
-          if (tile_valid) {
-            collective_epilogue.store(
-                params.epilogue,
-                tOrO,
-                shared_storage,
-                tiled_mma1,
-                threadIdx.x - MmaThreadOffset,
-                block_coord);
-            collective_epilogue.store_softmax(
-                params.epilogue,
-                softmax.row_sum,
-                tiled_mma1,
-                threadIdx.x - MmaThreadOffset,
-                block_coord);
-          } else {
-            // Write 0 to gO and -inf to gLSE.
-            // If Split, we don't have to write 0 to O if the mha_combine kernel
-            // is used, since it will not use the value of O if LSE is -inf.
-            collective_epilogue.template store_zero<true /*Clear_O*/>(
-                params.epilogue, threadIdx.x - MmaThreadOffset, block_coord);
-            // collective_epilogue.store_zero(params.epilogue, threadIdx.x -
-            // MmaThreadOffset, block_coord);
-          }
-        } else {
-          tile_valid = collective_mainloop.mma(
-              params.mainloop,
-              pipeline_k,
-              pipeline_v,
-              smem_pipe_read,
-              tOrO,
-              threadIdx.x - MmaThreadOffset,
-              work_idx,
-              seqlen_info,
-              block_coord,
-              shared_storage);
-          if (tile_valid) {
-            collective_epilogue.store(
-                params.epilogue,
-                tOrO,
-                shared_storage,
-                tiled_mma1,
-                threadIdx.x - MmaThreadOffset,
-                block_coord);
-          } else {
-            // Write 0 to gO and -inf to gLSE.
-            // If Split, we don't have to write 0 to O if the mha_combine kernel
-            // is used, since it will not use the value of O if LSE is -inf.
-            collective_epilogue.template store_zero<true /*Clear_O*/>(
-                params.epilogue, threadIdx.x - MmaThreadOffset, block_coord);
-            // collective_epilogue.store_zero(params.epilogue, threadIdx.x -
-            // MmaThreadOffset, block_coord);
-          }
-        }
-      }
-      collective_epilogue.store_tail();
-    }
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h
deleted file mode 100644
index c79ea3a3f..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/flash_fwd_launch_template.h
+++ /dev/null
@@ -1,376 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-// clang-format off
-#include "cute/tensor.hpp"
-
-#include "cutlass/cutlass.h"
-#include "cutlass/device_kernel.h"  // For device_kernel
-#include <cutlass/kernel_hardware_info.h>
-#include "cutlass/cluster_launch.hpp"
-#include "cutlass/kernel_launch.h"
-
-#include "static_switch.h"
-#include "flash.h"
-#include "tile_size.h"
-#include "tile_scheduler.h"
-#include "flash_fwd_kernel_sm90.h"
-#include "mainloop_fwd_sm90_tma_gmma_ws.h"
-#include "epilogue_fwd.h"
-// clang-format on
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    int Arch,
-    int kHeadDim,
-    int ClusterM,
-    typename Element,
-    typename ElementOut,
-    bool Causal,
-    bool Local,
-    bool Contexual_mask,
-    bool Jagged,
-    bool Has_targets,
-    bool V_colmajor,
-    bool Cross,
-    bool Softmax,
-    bool Training>
-void run_flash_fwd(hstu::Flash_fwd_params& params, cudaStream_t stream) {
-  static_assert(
-      !(Causal && Local),
-      "Causal and Local cannot be enabled at the same time");
-  static constexpr bool Is_FP8 =
-      cute::is_same_v<Element, cutlass::float_e4m3_t> ||
-      cute::is_same_v<Element, cutlass::float_e5m2_t>;
-  static constexpr bool FP8_TransposeV = Is_FP8 && !V_colmajor;
-  using ArchTag =
-      std::conditional_t<Arch >= 90, cutlass::arch::Sm90, cutlass::arch::Sm80>;
-
-  // Can't use structured binding since it's not compatible with constexpr
-  static constexpr std::tuple<int, int, bool> kBlockMN_RS =
-      hstu::tile_size_fwd_sm90(
-          kHeadDim,
-          Causal,
-          Local,
-          sizeof(Element) /*element_size*/,
-          V_colmajor,
-          Cross,
-          Training);
-  static constexpr std::tuple<int, int, int, int, bool>
-      kBlockMN_kNWarps_Stages_RS = hstu::tile_size_fwd_sm8x(
-          Arch == 86 || Arch == 89,
-          kHeadDim,
-          Causal,
-          Local,
-          sizeof(Element) /*element_size*/);
-  static constexpr int kBlockM = Arch >= 90
-      ? std::get<0>(kBlockMN_RS)
-      : std::get<0>(kBlockMN_kNWarps_Stages_RS);
-  static constexpr int kBlockN = Arch >= 90
-      ? std::get<1>(kBlockMN_RS)
-      : std::get<1>(kBlockMN_kNWarps_Stages_RS);
-  static constexpr bool Mma1_is_RS = std::get<2>(kBlockMN_RS);
-  static constexpr int kNWarps = std::get<2>(kBlockMN_kNWarps_Stages_RS);
-  static constexpr int kStages =
-      Arch >= 90 ? 2 : std::get<3>(kBlockMN_kNWarps_Stages_RS);
-  static constexpr bool Q_in_regs =
-      Arch >= 90 ? false : std::get<4>(kBlockMN_kNWarps_Stages_RS);
-
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-  std::printf(
-      "kBlockM: (%d), kBlockN: (%d), Mma1_is_RS: (%d), kNWarps: (%d), kStages: (%d), Q_in_regs: (%d)\n",
-      kBlockM,
-      kBlockN,
-      Mma1_is_RS,
-      kNWarps,
-      kStages,
-      Q_in_regs);
-#endif
-
-  using TileShape_MNK = cute::Shape<Int<kBlockM>, Int<kBlockN>, Int<kHeadDim>>;
-  using ClusterShape = cute::Shape<Int<ClusterM>, _1, _1>;
-  using CollectiveMainloop = hstu::CollectiveMainloopFwdSm90<
-      kStages,
-      ClusterShape,
-      TileShape_MNK,
-      Element,
-      float,
-      cutlass::arch::Sm90,
-      Causal,
-      Local,
-      Contexual_mask,
-      Jagged,
-      Has_targets,
-      Mma1_is_RS,
-      V_colmajor,
-      Cross>;
-  using CollectiveEpilogue = hstu::CollectiveEpilogueFwd<
-      TileShape_MNK,
-      ClusterShape,
-      ElementOut,
-      ArchTag,
-      CollectiveMainloop::NumMmaThreads,
-      Jagged,
-      FP8_TransposeV>;
-
-  static constexpr int NumProducerThreads = Arch >= 90
-      ? CollectiveMainloop::NumProducerThreads
-      : CollectiveMainloop::NumMmaThreads;
-  using SchedulerPersistent = std::conditional_t<
-      Jagged,
-      hstu::VarlenDynamicPersistentTileScheduler<
-          kBlockM,
-          CollectiveMainloop::NumMmaThreads,
-          NumProducerThreads,
-          Arch >= 90 /*WarpSpecialized*/>,
-      std::conditional_t<
-          !Causal && !Local,
-          hstu::StaticPersistentTileScheduler,
-          hstu::DynamicPersistentTileScheduler<
-              CollectiveMainloop::NumMmaThreads,
-              NumProducerThreads,
-              Arch >= 90 /*WarpSpecialized*/>>>;
-  using SchedulerSingleTile = hstu::
-      SingleTileScheduler<Jagged, kBlockM, false /*Sort_by_length_indices*/>;
-  // If Split then we probably don't have enough work for PersistentScheduler to
-  // be useful. However, if Jagged (e.g., during decode where we have
-  // max_seqlens), using PersistentScheduler is better since we'll avoid
-  // launching a bunch of thread blocks that immediately exit. On Sm80,
-  // noncausal persistent seems a bit slower.
-  using Scheduler = std::conditional_t<
-      Arch >= 90 ? false : !(Causal && !Jagged),
-      SchedulerSingleTile,
-      SchedulerPersistent>;
-  using AttnKernel = hstu::enable_sm90_or_later<hstu::FlashAttnFwdSm90<
-      Softmax,
-      CollectiveMainloop,
-      CollectiveEpilogue,
-      Scheduler>>;
-
-  int seqlen_q = !Jagged ? params.max_q_len : params.total_seq_len_q;
-  int seqlen_kv = !Jagged ? params.max_kv_len : params.total_seq_len_kv;
-  int batch = !Jagged ? params.b : 1;
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-  std::printf("max/total seqlen: (%d), batch: (%d)\n", seqlen, batch);
-#endif
-  typename CollectiveMainloop::StrideV v_strides =
-      cute::conditional_return<!V_colmajor>(
-          make_stride(
-              params.v_row_stride,
-              _1{},
-              params.v_head_stride,
-              !Jagged ? params.v_batch_stride : 0),
-          make_stride(
-              _1{},
-              params.v_dim_stride,
-              params.v_head_stride,
-              !Jagged ? params.v_batch_stride : 0));
-  typename CollectiveMainloop::Arguments mainloop_args{
-      static_cast<Element const*>(params.q_ptr),
-      {seqlen_q, params.qk_d, params.h, batch}, // shape_Q
-      {params.q_row_stride,
-       _1{},
-       params.q_head_stride,
-       !Jagged ? params.q_batch_stride : 0}, // stride_Q
-      static_cast<Element*>(params.k_ptr),
-      {seqlen_kv, params.qk_d, params.h, batch}, // shape_K
-      {params.k_row_stride,
-       _1{},
-       params.k_head_stride,
-       !Jagged ? params.k_batch_stride : 0}, // stride_K
-      static_cast<Element*>(params.v_ptr),
-      v_strides, // stride_V
-      params.q_descale_ptr,
-      params.k_descale_ptr,
-      params.v_descale_ptr,
-      {params.q_descale_batch_stride, params.q_descale_head_stride},
-      {params.k_descale_batch_stride, params.k_descale_head_stride},
-      {params.v_descale_batch_stride, params.v_descale_head_stride},
-      1.0f / params.max_kv_len,
-      params.alpha,
-      params.max_attn_len,
-      params.min_full_attn_seq_len,
-      params.contextual_seq_len,
-      params.num_softmax_heads,
-      params.num_groups,
-      params.batch_size_per_group,
-      params.seq_offsets,
-      params.seq_offsets_q,
-      params.num_targets,
-      params.max_seq_len_tensor,
-      params.contextual_seq_len_tensor,
-      params.max_attn_len_tensor,
-      params.min_full_attn_seq_len_tensor,
-      params.attn_scale,
-      params.scalar_scale,
-  };
-  typename CollectiveEpilogue::Arguments epilogue_args{
-      static_cast<ElementOut*>(params.o_ptr),
-      {seqlen_q, params.v_d, params.h, batch, 1}, // shape_O
-      {params.o_row_stride,
-       _1{},
-       params.o_head_stride,
-       !Jagged ? params.o_batch_stride : 0,
-       0}, // stride_O
-      params.h,
-      params.num_softmax_heads,
-      {_1{}, seqlen_q, !Jagged ? params.h * seqlen_q : 0, 0}, // stride_LSE}
-      static_cast<float*>(params.softmax_lse),
-      Cross ? params.seq_offsets_q : params.seq_offsets};
-
-  int num_blocks_m =
-      cutlass::ceil_div(params.max_q_len, get<0>(TileShape_MNK{}));
-  num_blocks_m = cutlass::round_up(num_blocks_m, size<0>(ClusterShape{}));
-  typename hstu::TileSchedulerArguments scheduler_args{
-      num_blocks_m,
-      params.h,
-      params.b,
-      params.max_q_len,
-      params.qk_d,
-      sizeof(Element),
-      params.tile_count_semaphore,
-      Cross ? params.seq_offsets_q : params.seq_offsets,
-      nullptr /*sort_by_length_indices*/};
-
-  int device;
-  CHECK_CUDA(cudaGetDevice(&device));
-  typename AttnKernel::Params kernel_params =
-      AttnKernel::to_underlying_arguments(
-          {mainloop_args,
-           epilogue_args,
-           {device, params.num_sm},
-           scheduler_args});
-
-  dim3 grid_dims = AttnKernel::get_grid_shape(kernel_params);
-  dim3 block_dims = AttnKernel::get_block_shape();
-  int smem_size = AttnKernel::SharedStorageSize;
-  // int smem_size_q = sizeof(decltype((typename
-  // CollectiveMainloop::TensorStorage{}).smem_q)); int smem_size_k =
-  // sizeof(decltype((typename CollectiveMainloop::TensorStorage{}).smem_k));
-  // int smem_size_v = sizeof(decltype((typename
-  // CollectiveMainloop::TensorStorage{}).smem_v)); printf("smem_size = %d, q =
-  // %d, k = %d, v = %d\n", smem_size, smem_size_q, smem_size_k, smem_size_v);
-  // Get the ptr to kernel function.
-  if constexpr (size(ClusterShape{}) > 1) {
-    void const* kernel = (void const*)cutlass::device_kernel<AttnKernel>;
-    if (smem_size >= 48 * 1024) {
-      CHECK_CUDA(cudaFuncSetAttribute(
-          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-    }
-    dim3 cluster_dims(
-        size<0>(ClusterShape{}),
-        size<1>(ClusterShape{}),
-        size<2>(ClusterShape{}));
-    cutlass::ClusterLaunchParams launch_params{
-        grid_dims, block_dims, cluster_dims, smem_size, stream};
-    cutlass::launch_kernel_on_cluster(launch_params, kernel, kernel_params);
-  } else {
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-    std::cout << "ClusterShape = 1" << std::endl;
-    std::cout << "grid_dims = " << grid_dims << std::endl;
-    std::cout << "block_dims = " << block_dims << std::endl;
-    std::cout << "smem_size = " << smem_size << std::endl;
-#endif
-    auto kernel = cutlass::device_kernel<AttnKernel>;
-    if (smem_size >= 48 * 1024) {
-      CHECK_CUDA(cudaFuncSetAttribute(
-          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-    }
-    kernel<<<grid_dims, block_dims, smem_size, stream>>>(kernel_params);
-  }
-  CHECK_CUDA_KERNEL_LAUNCH();
-}
-
-template <
-    int Arch,
-    int kHeadDim,
-    bool Causal,
-    bool Local,
-    bool Softmax,
-    typename T,
-    typename T_out>
-void run_mha_fwd_dispatch(hstu::Flash_fwd_params& params, cudaStream_t stream) {
-  static constexpr bool V_colmajor = false; // V_colmajor_ && sizeof(T) == 1;
-  BOOL_SWITCH(params.num_targets, Has_targets, [&] {
-    BOOL_SWITCH(params.seq_offsets, Jagged, [&] {
-      BOOL_SWITCH(params.seq_offsets_q, Cross, [&] {
-        BOOL_SWITCH(params.has_contexual_mask, Contexual_mask, [&] {
-          BOOL_SWITCH(params.training, Training, [&] {
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-            std::printf(
-                "[flash_fwd_launch_template] Local: (%d), Jagged: (%d), Has_targets: (%d), Causal: (%d), max_kv_len: (%d), kHeadDim: (%d)\n",
-                Local,
-                Jagged,
-                Has_targets,
-                Causal,
-                params.max_kv_len,
-                kHeadDim);
-#endif
-            // static constexpr bool Enable_cluster = Arch >= 90 &&
-            //     (sizeof(T) == 2 ? (kHeadDim >= 128) : (kHeadDim == 192)) &&
-            //     !Causal && !Local && !Jagged;
-            // static constexpr bool Enable_cluster = false;
-            // CLUSTER_SWITCH(
-            //     cutlass::ceil_div(params.max_q_len, kBlockM) % 2 == 0,
-            //     Use_cluster,
-            //     [&] {
-            // static constexpr int ClusterM =
-            //     Enable_cluster && Use_cluster ? 2 : 1;
-            run_flash_fwd<
-                Arch,
-                kHeadDim,
-                1, // ClusterM,
-                T,
-                T_out,
-                Causal,
-                Local,
-                Contexual_mask,
-                Jagged,
-                Has_targets,
-                V_colmajor,
-                Cross,
-                Softmax,
-                Training>(params, stream);
-          });
-        });
-      });
-    });
-  });
-}
-
-template <int Arch, typename T, int kHeadDim, bool Softmax>
-void run_mha_fwd_(hstu::Flash_fwd_params& params, cudaStream_t stream) {
-  static_assert(
-      sizeof(T) == 2 || sizeof(T) == 1, "Only 16bit and 8bit are supported");
-  static constexpr bool Is_FP8 = cute::is_same_v<T, cutlass::float_e4m3_t> ||
-      cute::is_same_v<T, cutlass::float_e5m2_t>;
-  using T_out = std::conditional_t<!Is_FP8, T, cutlass::bfloat16_t>;
-  CAUSAL_LOCAL_SWITCH(params.is_causal, params.is_local, Causal, Local, [&] {
-    // VCOLMAJOR_SWITCH(params.v_dim_stride != 1, V_colmajor_, [&] {
-    run_mha_fwd_dispatch<Arch, kHeadDim, Causal, Local, Softmax, T, T_out>(
-        params, stream);
-  });
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py
deleted file mode 100644
index 6c3a03188..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/generate_kernels.py
+++ /dev/null
@@ -1,236 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-
-# Copied from Driss Guessous's PR in PyTorch: https://github.com/pytorch/pytorch/pull/105602
-
-# This file is run to generate the kernel instantiations for the flash_attn kernels
-# They are written to several files in order to speed up compilation
-
-import argparse
-import itertools
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List, Optional, Union
-
-
-DTYPE_MAP = {
-    "fp16": "cutlass::half_t",
-    "bf16": "cutlass::bfloat16_t",
-    "e4m3": "cutlass::float_e4m3_t",
-}
-
-DTYPE_MAP_FWD_SM8x = {
-    "fp16": "cutlass::half_t",
-    "bf16": "cutlass::bfloat16_t",
-}
-
-DTYPE_MAP_BWD = {
-    "fp16": "cutlass::half_t",
-    "bf16": "cutlass::bfloat16_t",
-}
-
-SM = [90]  # Sm kernels support up to
-SOFTMAX = ["true", "false"]
-HEAD_DIMENSIONS = [64, 96, 128, 192, 256]
-
-KERNEL_IMPL_TEMPLATE_FWD_SM90 = """
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {{
-#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
-template void run_mha_fwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_fwd_params &params, cudaStream_t stream);
-#endif
-}} // namespace hstu
-"""
-
-KERNEL_IMPL_TEMPLATE_FWD_SM8x = """
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {{
-#ifndef FLASHATTENTION_DISABLE_SM8x
-#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
-template void run_mha_fwd_<80, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_fwd_params &params, cudaStream_t stream);
-#endif
-#endif
-}} // namespace hstu
-"""
-
-KERNEL_IMPL_TEMPLATE_BWD_SM90 = """
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {{
-#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
-template void run_mha_bwd_<{ARCH}, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_bwd_params &params, cudaStream_t stream);
-#endif
-}} // namespace hstu
-"""
-
-KERNEL_IMPL_TEMPLATE_BWD_SM8x = """
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {{
-#ifndef FLASHATTENTION_DISABLE_SM8x
-#ifndef FLASHATTENTION_DISABLE_HDIM{HEAD_DIM}
-template void run_mha_bwd_<80, {DTYPE}, {HEAD_DIM}, {SOFTMAX}>(Flash_bwd_params &params, cudaStream_t stream);
-#endif
-#endif
-}} // namespace hstu
-"""
-
-
-@dataclass
-class Kernel:
-    sm: int
-    dtype: str
-    head_dim: int
-    softmax: str
-    direction: str
-
-    @property
-    def template(self) -> str:
-        if self.direction == "fwd":
-            if self.sm == 90:
-                return KERNEL_IMPL_TEMPLATE_FWD_SM90.format(
-                    ARCH=str(self.sm),
-                    DTYPE=DTYPE_MAP[self.dtype],
-                    HEAD_DIM=self.head_dim,
-                    SOFTMAX=self.softmax,
-                )
-            else:
-                # Always enable PackGQA for Sm8x to reduce compilation
-                return KERNEL_IMPL_TEMPLATE_FWD_SM8x.format(
-                    DTYPE=DTYPE_MAP[self.dtype],
-                    HEAD_DIM=self.head_dim,
-                    SOFTMAX=self.softmax,
-                )
-        else:
-            assert self.direction == "bwd"
-            if self.sm == 90:
-                return KERNEL_IMPL_TEMPLATE_BWD_SM90.format(
-                    ARCH=str(self.sm),
-                    DTYPE=DTYPE_MAP[self.dtype],
-                    HEAD_DIM=self.head_dim,
-                    SOFTMAX=self.softmax,
-                )
-            else:
-                return KERNEL_IMPL_TEMPLATE_BWD_SM8x.format(
-                    DTYPE=DTYPE_MAP[self.dtype],
-                    HEAD_DIM=self.head_dim,
-                    SOFTMAX=self.softmax,
-                )
-
-    @property
-    def filename(self) -> str:
-        return f"flash_{self.direction}_hdim{self.head_dim}_{self.dtype}_softmax{self.softmax}_sm{self.sm}.cu"
-
-
-def get_all_kernels() -> List[Kernel]:
-    kernels: List[Kernel] = []
-    for dtype, head_dim, sm, softmax in itertools.product(
-        DTYPE_MAP.keys(), HEAD_DIMENSIONS, SM, SOFTMAX
-    ):
-        # We always enable PackGQA for Sm8x or Split
-        # so we should just pass in packgqa=False to avoid the `_packgqa` in the filename.
-        if sm >= 90 or dtype in DTYPE_MAP_FWD_SM8x:
-            kernels.append(
-                Kernel(
-                    sm=sm,
-                    dtype=dtype,
-                    head_dim=head_dim,
-                    direction="fwd",
-                    softmax=softmax,
-                )
-            )
-    for dtype, head_dim, sm, softmax in itertools.product(
-        DTYPE_MAP_BWD.keys(), HEAD_DIMENSIONS, SM, SOFTMAX
-    ):
-        kernels.append(
-            Kernel(
-                sm=sm,
-                dtype=dtype,
-                head_dim=head_dim,
-                direction="bwd",
-                softmax=softmax,
-            )
-        )
-    return kernels
-
-
-def write_kernel(kernel: Union[Kernel], autogen_dir: Path) -> None:
-    prelude = """
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */ \n
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar, Pradeep Ramani, Tri Dao.
-// Splitting the different template instantiations to different files to speed up compilation.
-// This file is auto-generated. See "generate_kernels.py"\n
-"""
-    (autogen_dir / kernel.filename).write_text(prelude + kernel.template)
-
-
-def main(output_dir_name: Optional[str]) -> None:
-    output_dir = (
-        Path(output_dir_name) if output_dir_name is not None else Path(__file__).parent
-    )
-    output_dir.mkdir(parents=True, exist_ok=True)
-    kernels_all = list(get_all_kernels())
-    for kernel in kernels_all:
-        write_kernel(kernel, output_dir)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog="generate_kernels",
-        description="Generate the flash_attention kernels template instantiations",
-    )
-    # Set an optional output directory
-    parser.add_argument(
-        "-o",
-        "--output_dir",
-        default="instantiations",
-        required=False,
-        help="Where to generate the kernels  will default to the current directory ",
-    )
-    args = parser.parse_args()
-    main(args.output_dir)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index da0eeb2df..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 128, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index 8d85c2235..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 128, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index 09226cd80..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_bwd_<90, cutlass::half_t, 128, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index 63e451d14..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim128_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_bwd_<90, cutlass::half_t, 128, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index e379d9918..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 192, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index 7faa31376..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 192, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index 5ddc7d7fc..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_bwd_<90, cutlass::half_t, 192, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index 530deae2b..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim192_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_bwd_<90, cutlass::half_t, 192, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index 185907c5e..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 256, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index 39df173bb..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 256, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index cdc0a9f7e..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_bwd_<90, cutlass::half_t, 256, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index 6f3182d34..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim256_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_bwd_<90, cutlass::half_t, 256, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index 89285d0be..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 64, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index ab39c7e06..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 64, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index 8d62b8827..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_bwd_<90, cutlass::half_t, 64, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index 5192d945f..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim64_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_bwd_<90, cutlass::half_t, 64, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index cbeeac64a..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 96, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index b654969e4..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_bwd_<90, cutlass::bfloat16_t, 96, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index ea81f7ee4..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_bwd_<90, cutlass::half_t, 96, false>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index 7439f322e..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_bwd_hdim96_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_bwd_launch_template.h"
-#else
-#include "flash_bwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_bwd_<90, cutlass::half_t, 96, true>(
-    Flash_bwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index a39bcd505..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 128, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index 464a0f443..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 128, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu
deleted file mode 100644
index 3075657bb..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 128, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu
deleted file mode 100644
index 1ab6e4394..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_e4m3_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 128, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index be5a6cb0d..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_fwd_<90, cutlass::half_t, 128, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index 7c303e7ef..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim128_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM128
-template void run_mha_fwd_<90, cutlass::half_t, 128, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index 6e8d906d5..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 192, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index 80367708f..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 192, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu
deleted file mode 100644
index 67ade004b..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 192, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu
deleted file mode 100644
index 9f40d2726..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_e4m3_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 192, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index 1779657c0..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_fwd_<90, cutlass::half_t, 192, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index 0037dbc17..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim192_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM192
-template void run_mha_fwd_<90, cutlass::half_t, 192, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index 93440571c..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 256, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index c0634db8f..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 256, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu
deleted file mode 100644
index a0eb625f5..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 256, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu
deleted file mode 100644
index 8b7216302..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_e4m3_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 256, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index fe89b532f..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_fwd_<90, cutlass::half_t, 256, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index c0857f941..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim256_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM256
-template void run_mha_fwd_<90, cutlass::half_t, 256, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index 841e9359e..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 64, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index 3da54d69f..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 64, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu
deleted file mode 100644
index 4761ca635..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 64, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu
deleted file mode 100644
index 33e66d0a7..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_e4m3_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 64, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index fab2951ee..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_fwd_<90, cutlass::half_t, 64, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index 2ef1f29c9..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim64_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM64
-template void run_mha_fwd_<90, cutlass::half_t, 64, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu
deleted file mode 100644
index bc52514e9..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 96, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu
deleted file mode 100644
index 11ea3bb20..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_bf16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_fwd_<90, cutlass::bfloat16_t, 96, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu
deleted file mode 100644
index 9e0b05a31..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 96, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu
deleted file mode 100644
index 7fa79aa76..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_e4m3_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_fwd_<90, cutlass::float_e4m3_t, 96, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu
deleted file mode 100644
index 83a25a649..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxfalse_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_fwd_<90, cutlass::half_t, 96, false>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu
deleted file mode 100644
index e0526dec8..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/instantiations/flash_fwd_hdim96_fp16_softmaxtrue_sm90.cu
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
-// Pradeep Ramani, Tri Dao. Splitting the different template instantiations to
-// different files to speed up compilation. This file is auto-generated. See
-// "generate_kernels.py"
-
-#ifdef OSS_ENV
-#include "hstu_attention/flash_fwd_launch_template.h"
-#else
-#include "flash_fwd_launch_template.h"
-#endif
-
-namespace hstu {
-#ifndef FLASHATTENTION_DISABLE_HDIM96
-template void run_mha_fwd_<90, cutlass::half_t, 96, true>(
-    Flash_fwd_params& params,
-    cudaStream_t stream);
-#endif
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h
deleted file mode 100644
index e702faf0b..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_bwd_sm90_tma_gmma_ws.h
+++ /dev/null
@@ -1,3166 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cutlass/array.h>
-#include <cutlass/barrier.h>
-#include <cutlass/cutlass.h>
-#include <cutlass/numeric_conversion.h>
-#include <cutlass/numeric_types.h>
-#include "cutlass/pipeline/pipeline.hpp"
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/gemm/collective/builders/sm90_common.inl"
-
-#include "copy_sm90_bulk_reduce.h"
-#include "mask.h"
-#include "named_barrier.h"
-#include "seqlen.h"
-#include "utils.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    int Stages,
-    int Stages_dO,
-    int Stages_dS,
-    class ClusterShape_,
-    class TileShape_MNK_,
-    class Element_,
-    class ElementAccum_,
-    class ArchTag_,
-    bool Causal,
-    bool Local,
-    bool Contexual_mask,
-    bool Jagged,
-    bool Has_targets,
-    bool Deterministic,
-    bool SdP_swapAB_,
-    bool dKV_swapAB_,
-    bool dQ_swapAB_,
-    int NumMmaWarpGroups = 2,
-    int AtomLayoutMSdP = 1,
-    int AtomLayoutNdKV = 2,
-    int AtomLayoutMdQ = 1,
-    bool Mma_dP_is_RS = false,
-    bool Cross = false,
-    bool Softmax = false>
-struct CollectiveMainloopBwdSm90 {
-  static constexpr int kStages = Stages;
-  static constexpr int kStages_dO = Stages_dO;
-  static constexpr int kStages_dS = Stages_dS;
-  static_assert(kStages >= kStages_dO);
-  static_assert(Stages_dS == 1 || Stages_dS == kStages);
-  static_assert(
-      !Mma_dP_is_RS || SdP_swapAB_); // If Mma_dP_is_RS, we need SdP_SwapAB
-  using ClusterShape = ClusterShape_;
-  using TileShape_MNK = TileShape_MNK_;
-  using Element = Element_;
-  using ElementAccum = ElementAccum_;
-  using ArchTag = ArchTag_;
-  using SeqlenInfo_t = hstu::SeqlenInfoQKBwd<
-      Jagged,
-      Cross,
-      Has_targets,
-      CUTE_STATIC_V(get<0>(TileShape_MNK{}))>;
-
-  static constexpr bool SdP_swapAB = SdP_swapAB_;
-  static constexpr bool dKV_swapAB = dKV_swapAB_;
-  static constexpr bool dQ_swapAB = dQ_swapAB_;
-
-  static constexpr bool Q_dO_same_stages = kStages == kStages_dO;
-
-  static constexpr int kBlockM = get<0>(TileShape_MNK{});
-  static constexpr int kBlockN = get<1>(TileShape_MNK{});
-  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
-
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-  static_assert(get<0>(ClusterShape{}) == 1 && get<2>(ClusterShape{}) == 1);
-
-  static constexpr int NumMmaThreads =
-      NumMmaWarpGroups * cutlass::NumThreadsPerWarpGroup;
-  static constexpr int NumProducerThreads = cutlass::NumThreadsPerWarp * 2;
-
-  static_assert(NumMmaWarpGroups % AtomLayoutMSdP == 0);
-  static_assert(NumMmaWarpGroups % AtomLayoutNdKV == 0);
-  static_assert(NumMmaWarpGroups % AtomLayoutMdQ == 0);
-  static constexpr bool Mma_dKV_is_RS = AtomLayoutMSdP == 1 &&
-      AtomLayoutNdKV == NumMmaWarpGroups && SdP_swapAB && !dKV_swapAB;
-  static constexpr bool Mma_dQ_is_RS = AtomLayoutMSdP == NumMmaWarpGroups &&
-      AtomLayoutMdQ == NumMmaWarpGroups && !SdP_swapAB &&
-      !dQ_swapAB; // If dQ_swapAB we can't use RS
-
-  static constexpr GMMA::Major PdS_Major = GMMA::Major::K;
-  // static constexpr GMMA::Major PdS_Major = GMMA::Major::MN;
-  static constexpr GMMA::Major PdSt_Major =
-      PdS_Major == GMMA::Major::K ? GMMA::Major::MN : GMMA::Major::K;
-
-  using TileShapeAtomSdP = std::conditional_t<
-      !SdP_swapAB,
-      Shape<
-          Int<kBlockM>,
-          Int<kBlockN / (NumMmaWarpGroups / AtomLayoutMSdP)>,
-          Int<kHeadDim>>,
-      Shape<Int<kBlockN>, Int<kBlockM / AtomLayoutMSdP>, Int<kHeadDim>>>;
-  using AtomLayoutSdP = std::conditional_t<
-      !SdP_swapAB,
-      Layout<Shape<
-          Int<AtomLayoutMSdP>,
-          Int<NumMmaWarpGroups / AtomLayoutMSdP>,
-          _1>>,
-      Layout<Shape<
-          Int<NumMmaWarpGroups / AtomLayoutMSdP>,
-          Int<AtomLayoutMSdP>,
-          _1>>>;
-  using TiledMmaSdP = decltype(cute::make_tiled_mma(
-      cute::GMMA::
-          ss_op_selector<Element, Element, ElementAccum, TileShapeAtomSdP>(),
-      AtomLayoutSdP{}));
-
-  using TiledMmadPRS = decltype(cute::make_tiled_mma(
-      cute::GMMA::
-          rs_op_selector<Element, Element, ElementAccum, TileShapeAtomSdP>(),
-      AtomLayoutSdP{}));
-
-  using TileShapeAtomdKV = std::conditional_t<
-      !dKV_swapAB,
-      Shape<
-          Int<kBlockN>,
-          Int<kHeadDim / (NumMmaWarpGroups / AtomLayoutNdKV)>,
-          Int<kBlockM>>,
-      Shape<Int<kHeadDim>, Int<kBlockN / AtomLayoutNdKV>, Int<kBlockM>>>;
-  using AtomLayoutdKV = std::conditional_t<
-      !dKV_swapAB,
-      Layout<Shape<
-          Int<AtomLayoutNdKV>,
-          Int<NumMmaWarpGroups / AtomLayoutNdKV>,
-          _1>>,
-      Layout<Shape<
-          Int<NumMmaWarpGroups / AtomLayoutNdKV>,
-          Int<AtomLayoutNdKV>,
-          _1>>>;
-  using TiledMmadKV = decltype(cute::make_tiled_mma(
-      std::conditional_t<
-          Mma_dKV_is_RS,
-          decltype(cute::GMMA::rs_op_selector<
-                   Element,
-                   Element,
-                   ElementAccum,
-                   TileShapeAtomdKV,
-                   GMMA::Major::K,
-                   GMMA::Major::MN>()),
-          decltype(cute::GMMA::ss_op_selector<
-                   Element,
-                   Element,
-                   ElementAccum,
-                   TileShapeAtomdKV,
-                   !dKV_swapAB ? PdSt_Major : GMMA::Major::MN,
-                   !dKV_swapAB ? GMMA::Major::MN : PdSt_Major>())>{},
-      AtomLayoutdKV{}));
-
-  static constexpr bool dQacc_use_TMA = kHeadDim < 256;
-  // For hdim256, we want to slice the dQ MMA (64 x 256 on 2 WGs) into two (64 x
-  // 128 on 2 WGs) so that we can do atomic add on one half before doing the
-  // other half of the MMA, to reduce register pressure.
-  static constexpr bool Slice_dQKV_Mma = kHeadDim == 256 && !dQacc_use_TMA &&
-      dQ_swapAB && AtomLayoutMdQ == 1 && NumMmaWarpGroups == 2;
-  static_assert(
-      !(Deterministic && Slice_dQKV_Mma),
-      "Deterministic mode not supported with Slice_dQKV_Mma");
-
-  static constexpr int TileShapeAtomdQ_BlockM = kBlockM / AtomLayoutMdQ;
-  static constexpr int TileShapeAtomdQ_HeadDim =
-      (Slice_dQKV_Mma ? kHeadDim / 2 : kHeadDim) /
-      (NumMmaWarpGroups / AtomLayoutMdQ);
-  static_assert(
-      !dQ_swapAB ? TileShapeAtomdQ_BlockM == 64 : TileShapeAtomdQ_HeadDim == 64,
-      "Tile_M must be 64.");
-  using TileShapeAtomdQ = std::conditional_t<
-      !dQ_swapAB,
-      Shape<
-          Int<TileShapeAtomdQ_BlockM>,
-          Int<TileShapeAtomdQ_HeadDim>,
-          Int<kBlockN>>,
-      Shape<
-          Int<TileShapeAtomdQ_HeadDim>,
-          Int<TileShapeAtomdQ_BlockM>,
-          Int<kBlockN>>>;
-  using AtomLayoutdQ = std::conditional_t<
-      !dQ_swapAB,
-      Layout<
-          Shape<Int<AtomLayoutMdQ>, Int<NumMmaWarpGroups / AtomLayoutMdQ>, _1>>,
-      Layout<Shape<
-          Int<NumMmaWarpGroups / AtomLayoutMdQ>,
-          Int<AtomLayoutMdQ>,
-          _1>>>;
-  using TiledMmadQ = decltype(cute::make_tiled_mma(
-      std::conditional_t<
-          Mma_dQ_is_RS,
-          decltype(cute::GMMA::rs_op_selector<
-                   Element,
-                   Element,
-                   ElementAccum,
-                   TileShapeAtomdQ,
-                   GMMA::Major::K,
-                   GMMA::Major::MN>()),
-          decltype(cute::GMMA::ss_op_selector<
-                   Element,
-                   Element,
-                   ElementAccum,
-                   TileShapeAtomdQ,
-                   !dQ_swapAB ? PdS_Major : GMMA::Major::MN,
-                   !dQ_swapAB ? GMMA::Major::MN : PdS_Major>())>{},
-      AtomLayoutdQ{}));
-
-  // We need to accommodate both Q and Q^T (and dO and dO^T) in shared memory.
-  // Q & dO are used in the SdP Mma and Q^T and dO^T are used in the dKV Mma.
-  // Since this is GMMA::Major::K, the M dimension (kBlockM) doesn't matter for
-  // the layout, only the K dimension changes the layout.
-  using SmemLayoutAtomQdO =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               GMMA::Major::K,
-               Element,
-               Int<kBlockM>,
-               Int<kHeadDim /
-                   (NumMmaWarpGroups / AtomLayoutNdKV)>>()); // for dKV_Mma
-  using SmemLayoutQ = decltype(tile_to_shape(
-      SmemLayoutAtomQdO{},
-      make_shape(
-          shape<0>(TileShape_MNK{}),
-          shape<2>(TileShape_MNK{}),
-          Int<kStages>{})));
-  using SmemLayoutdO = decltype(tile_to_shape(
-      SmemLayoutAtomQdO{},
-      make_shape(
-          shape<0>(TileShape_MNK{}),
-          shape<2>(TileShape_MNK{}),
-          Int<kStages_dO>{})));
-
-  using SmemLayoutAtomK =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               GMMA::Major::K,
-               Element,
-               Int<kBlockN>,
-               Int<kHeadDim / (NumMmaWarpGroups / AtomLayoutMdQ)>>());
-  using SmemLayoutK =
-      decltype(tile_to_shape(SmemLayoutAtomK{}, select<1, 2>(TileShape_MNK{})));
-
-  using SmemLayoutAtomV =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               GMMA::Major::K,
-               Element,
-               decltype(cute::get<1>(TileShape_MNK{})),
-               decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutV =
-      decltype(tile_to_shape(SmemLayoutAtomV{}, select<1, 2>(TileShape_MNK{})));
-
-  using SmemLayoutAtomPdS =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               PdS_Major,
-               Element,
-               Int<kBlockM / AtomLayoutMSdP>,
-               Int<kBlockN / (NumMmaWarpGroups / AtomLayoutMSdP)>>());
-  using SmemLayoutPdS = decltype(tile_to_shape(
-      SmemLayoutAtomPdS{},
-      make_shape(Int<kBlockM>{}, Int<kBlockN>{}, Int<kStages_dS>{}),
-      std::conditional_t<
-          PdS_Major == GMMA::Major::K,
-          cute::Step<_1, _2, _3>,
-          cute::Step<_2, _1, _3>>{}));
-  // Need stride to be multiple of 32, otherwise we get error (misaligned
-  // address) when doing TMA if e.g. kBlockM=80 We set stride to be multiple of
-  // 64 so that if ShuffleLSE, even if threads read from sLSE but out of bounds,
-  // it's still a valid smem address.
-  using SmemLayoutLSE = cute::Layout<
-      cute::Shape<Int<kBlockM>, Int<kStages>>,
-      cute::Stride<_1, Int<cute::round_up(kBlockM, 64)>>>;
-  using SmemLayoutLSEMma = std::conditional_t<
-      SdP_swapAB,
-      cute::Layout<
-          cute::Shape<Int<kBlockN>, Int<kBlockM>, Int<kStages>>,
-          cute::Stride<_0, _1, Int<cute::round_up(kBlockM, 64)>>>,
-      cute::Layout<
-          cute::Shape<Int<kBlockM>, Int<kBlockN>, Int<kStages>>,
-          cute::Stride<_1, _0, Int<cute::round_up(kBlockM, 64)>>>>;
-
-  // Note this is the transpose in terms of the view, not in terms of memory.
-  using SmemLayoutQt = decltype(cute::composition(
-      SmemLayoutQ{},
-      make_layout(
-          make_shape(
-              get<2>(TileShape_MNK{}),
-              get<0>(TileShape_MNK{}),
-              Int<kStages>{}),
-          make_stride(Int<kBlockM>{}, _1{}, Int<kBlockM * kHeadDim>{}))));
-  using SmemLayoutdOt = decltype(cute::composition(
-      SmemLayoutdO{},
-      make_layout(
-          make_shape(
-              get<2>(TileShape_MNK{}),
-              get<0>(TileShape_MNK{}),
-              Int<kStages_dO>{}),
-          make_stride(Int<kBlockM>{}, _1{}, Int<kBlockM * kHeadDim>{}))));
-  using SmemLayoutKt = decltype(cute::composition(
-      SmemLayoutK{},
-      make_layout(
-          make_shape(get<2>(TileShape_MNK{}), get<1>(TileShape_MNK{})),
-          make_stride(Int<kBlockN>{}, _1{}))));
-  using SmemLayoutPdSt = decltype(cute::composition(
-      SmemLayoutPdS{},
-      make_layout(
-          make_shape(Int<kBlockN>{}, Int<kBlockM>{}, Int<kStages_dS>{}),
-          make_stride(Int<kBlockM>{}, _1{}, Int<kBlockM * kBlockN>{}))));
-
-  // Thread layout, 256 or 384 threads per row
-  // We split into NumMmaWarpGroups so that we can do Bulk reduce add for each
-  // WG separately.
-  using R2SLayoutAtomdQaccum = Layout<
-      Shape<Int<cutlass::NumThreadsPerWarpGroup>, Int<NumMmaWarpGroups>>>;
-  using R2STiledCopydQaccum = decltype(make_tiled_copy(
-      Copy_Atom<AutoVectorizingCopyWithAssumedAlignment<128>, ElementAccum>{},
-      R2SLayoutAtomdQaccum{},
-      Layout<Shape<_4>>{})); // Val layout, 4 vals per store
-  using SmemLayoutdQaccum = Layout<
-      Shape<Int<kBlockM * kHeadDim / NumMmaWarpGroups>, Int<NumMmaWarpGroups>>>;
-
-  static constexpr int kNumPdSStore = kBlockM * kBlockN / NumMmaThreads;
-  // If !SdP_swapAB, the accum registers hold P / dS, otherwise they hold Pt /
-  // dSt. If PdS_major is MN, then we need to "transpose" the write.
-  using SmemCopyAtomPdS = Copy_Atom<
-      std::conditional_t<
-          (!SdP_swapAB) ^ (PdS_Major == GMMA::Major::MN),
-          std::conditional_t<
-              kNumPdSStore % 8 == 0,
-              cute::SM90_U32x4_STSM_N,
-              cute::SM90_U32x2_STSM_N>,
-          std::conditional_t<
-              kNumPdSStore % 8 == 0,
-              cute::SM90_U16x8_STSM_T,
-              cute::SM90_U16x4_STSM_T>>,
-      Element>;
-
-  using GmemTiledCopyQdO =
-      decltype(cutlass::gemm::collective::detail::
-                   sm90_cluster_shape_to_tma_atom(shape<1>(ClusterShape{})));
-  using GmemTiledCopyKV = cute::SM90_TMA_LOAD;
-
-  using ShapeQKV =
-      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen, d, head,
-                                                       // batch)
-  using StrideQKV = cute::Stride<int64_t, _1, int64_t, int64_t>;
-  using ShapeLSE =
-      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen, head, batch)
-  using StrideLSE = cute::Stride<_1, int64_t, int64_t>; // (seqlen, head, batch)
-  using ShapedQaccum =
-      cute::Shape<int32_t, int32_t, int32_t>; // (seqlen * d, head, batch)
-  using StridedQaccum = cute::Stride<_1, int64_t, int64_t>;
-
-  using TMA_QdO = decltype(make_tma_copy_A_sm90(
-      GmemTiledCopyQdO{},
-      make_tensor(
-          make_gmem_ptr(static_cast<Element const*>(nullptr)),
-          ShapeQKV{},
-          StrideQKV{}),
-      take<0, 2>(SmemLayoutQ{}),
-      TileShape_MNK{},
-      ClusterShape{})); // mcast along N mode for this M load, if any
-
-  using TMA_K = decltype(make_tma_copy_B_sm90(
-      GmemTiledCopyKV{},
-      make_tensor(
-          make_gmem_ptr(static_cast<Element const*>(nullptr)),
-          ShapeQKV{},
-          StrideQKV{}),
-      SmemLayoutK{},
-      TileShape_MNK{},
-      ClusterShape{})); // no mcast for KV
-
-  using TMA_V = decltype(make_tma_copy_B_sm90(
-      GmemTiledCopyKV{},
-      make_tensor(
-          make_gmem_ptr(static_cast<Element const*>(nullptr)),
-          ShapeQKV{},
-          StrideQKV{}),
-      SmemLayoutV{},
-      TileShape_MNK{},
-      ClusterShape{})); // no mcast for KV
-
-  using MainloopPipeline = typename cutlass::PipelineTmaAsync<kStages>;
-  using PipelineState = typename MainloopPipeline::PipelineState;
-  using MainloopPipeline_dO = typename cutlass::PipelineTmaAsync<kStages_dO>;
-  using PipelineState_dO = typename MainloopPipeline_dO::PipelineState;
-
-  // Set the bytes transferred in this TMA transaction (may involve multiple
-  // issues)
-  static constexpr uint32_t TmaTransactionBytesQ = static_cast<uint32_t>(
-      size(take<0, 2>(SmemLayoutQ{})) * cutlass::sizeof_bits_v<Element> / 8);
-  static constexpr uint32_t TmaTransactionBytesK = static_cast<uint32_t>(
-      size(SmemLayoutK{}) * cutlass::sizeof_bits_v<Element> / 8);
-  static constexpr uint32_t TmaTransactionBytesV = static_cast<uint32_t>(
-      size(SmemLayoutV{}) * cutlass::sizeof_bits_v<Element> / 8);
-  static constexpr uint32_t TmaTransactionBytesLSE = static_cast<uint32_t>(
-      size(select<0>(SmemLayoutLSE{})) * cutlass::sizeof_bits_v<ElementAccum> /
-      8);
-
-  // These are tuned for speed. They don't affect correctness.
-  // We have separate iterations with causal masking. Not necessary for hdim 128
-  // but for hdim 64 this helps quite a bit to not have to do causal masking for
-  // most of the iterations. For hdim 192, separating masking iterations results
-  // in register spills.
-  static constexpr bool SeparateMaskingIterations = false;
-  // Do we keep the LSE and dPsum in each thread, or split them across 8 threads
-  // that share them and then shuffle to get the value whenever we need? This
-  // can reduce register pressure when SdP_swapAB, where each thread needs to
-  // keep statistics for (kBlockM / 4) rows. If !SdP_swapAB, each thread only
-  // needs to keep statistic for 2 rows.
-  static constexpr bool ShuffleLSE = SdP_swapAB && kHeadDim <= 64;
-  static constexpr bool ShuffledPsum = SdP_swapAB && kHeadDim <= 64;
-  static constexpr size_t SmemAlignmentP =
-      cutlass::detail::alignment_for_swizzle(SmemLayoutPdS{});
-  static constexpr size_t SmemAlignmentdS =
-      cutlass::detail::alignment_for_swizzle(SmemLayoutPdS{});
-  // Without this SmemAlignment, with hdim 256 we get "misaligned address" error
-  // in TMA
-  static constexpr size_t SmemAlignmentQKVdO = kHeadDim % 256 == 0 ? 256 : 128;
-  static constexpr size_t SmemAlignmentV = !Mma_dP_is_RS
-      ? SmemAlignmentQKVdO
-      : cutlass::detail::alignment_for_swizzle(SmemLayoutV{});
-  static_assert(
-      SmemAlignmentP >= 128 && SmemAlignmentdS >= 128,
-      "Require at least 128B alignment");
-
-  // TODO: do we have to worry that smem_dk and smem_dv in the epilogue don't
-  // line up w smem_k and smem_v due to alignment?
-  using SmemdQacc_t = std::conditional_t<
-      !dQacc_use_TMA,
-      cute::array<ElementAccum, 0>,
-      cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutdQaccum>>>;
-  using SmemP_t = std::conditional_t<
-      Mma_dKV_is_RS,
-      cute::array<Element, 0>,
-      cute::array_aligned<
-          Element,
-          cute::cosize_v<SmemLayoutPdS>,
-          SmemAlignmentP>>;
-  struct TensorStorage
-      : cute::aligned_struct<
-            cute::max(SmemAlignmentP, SmemAlignmentdS, SmemAlignmentQKVdO)> {
-    cute::
-        array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentQKVdO>
-            smem_k;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutV>, SmemAlignmentV>
-        smem_v;
-    SmemdQacc_t smem_dqacc;
-    cute::
-        array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQKVdO>
-            smem_q;
-    cute::
-        array_aligned<Element, cute::cosize_v<SmemLayoutdO>, SmemAlignmentQKVdO>
-            smem_do;
-    cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutLSE>, 128>
-        smem_lse;
-    cute::array_aligned<ElementAccum, cute::cosize_v<SmemLayoutLSE>, 128>
-        smem_dpsum;
-    SmemP_t smem_p;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutPdS>, SmemAlignmentdS>
-        smem_ds;
-  };
-
-  // Host side kernel arguments
-  struct Arguments {
-    Element const* const ptr_Q;
-    ShapeQKV const shape_Q;
-    StrideQKV const stride_Q;
-    Element const* const ptr_K;
-    ShapeQKV const shape_K;
-    StrideQKV const stride_K;
-    Element const* const ptr_V;
-    ShapeQKV const shape_V;
-    StrideQKV const stride_V;
-    Element const* const ptr_dO;
-    ShapeQKV const shape_dO;
-    StrideQKV const stride_dO;
-    ElementAccum* const ptr_dQaccum;
-    ShapedQaccum const shape_dQaccum;
-    StridedQaccum const stride_dQaccum;
-    float const* const ptr_LSE_log2;
-    ShapeLSE const shape_LSE;
-    StrideLSE const stride_LSE_log2;
-    float const* const ptr_dPsum;
-    StrideLSE const stride_dPsum;
-    int const max_attn_len;
-    int const min_full_attn_seq_len;
-    int const contextual_seq_len;
-    float const max_seq_len_inv;
-    float const alpha;
-    int const num_batch;
-    int const num_softmax_heads;
-    int const num_groups;
-    int const batch_size_per_group;
-    int* const dq_semaphore;
-    int const* const seq_offsets = nullptr;
-    int const* const seq_offsets_q = nullptr;
-    int const* const num_targets = nullptr;
-    int const* const max_seq_len_tensor = nullptr;
-    int const* const contextual_seq_len_tensor = nullptr;
-    int const* const max_attn_len_tensor = nullptr;
-    int const* const min_full_attn_seq_len_tensor = nullptr;
-    float const* const attn_scale = nullptr;
-    bool const scalar_scale = true;
-  };
-
-  // Device side kernel params
-  struct Params {
-    ShapeQKV const shape_Q;
-    ShapeQKV const shape_K;
-    ShapeQKV const shape_V;
-    ShapeQKV const shape_dO;
-    ElementAccum* const ptr_dQaccum;
-    ShapedQaccum const shape_dQaccum;
-    StridedQaccum stride_dQaccum;
-    TMA_QdO tma_load_Q, tma_load_dO;
-    TMA_K tma_load_K;
-    TMA_V tma_load_V;
-    float const* const ptr_LSE_log2;
-    ShapeLSE const shape_LSE;
-    StrideLSE const stride_LSE_log2;
-    float const* const ptr_dPsum;
-    StrideLSE const stride_dPsum;
-    int const max_attn_len;
-    int const min_full_attn_seq_len;
-    int const contextual_seq_len;
-    float const max_seq_len_inv;
-    float const alpha;
-    float const alpha_log2;
-    int const num_batch;
-    int const num_softmax_heads;
-    int const num_groups;
-    int const batch_size_per_group;
-    int* const dq_semaphore;
-    int const* const seq_offsets = nullptr;
-    int const* const seq_offsets_q = nullptr;
-    int const* const num_targets;
-    int const* const max_seq_len_tensor = nullptr;
-    int const* const contextual_seq_len_tensor = nullptr;
-    int const* const max_attn_len_tensor = nullptr;
-    int const* const min_full_attn_seq_len_tensor = nullptr;
-    float const* const attn_scale;
-    bool const scalar_scale = true;
-  };
-
-  static Params to_underlying_arguments(Arguments const& args) {
-    Tensor mQ =
-        make_tensor(make_gmem_ptr(args.ptr_Q), args.shape_Q, args.stride_Q);
-    TMA_QdO tma_load_Q = make_tma_copy_A_sm90(
-        GmemTiledCopyQdO{},
-        mQ,
-        SmemLayoutQ{}(_, _, _0{}),
-        TileShape_MNK{},
-        ClusterShape{}); // mcast along N mode for this M load, if any
-    Tensor mdO =
-        make_tensor(make_gmem_ptr(args.ptr_dO), args.shape_Q, args.stride_dO);
-    TMA_QdO tma_load_dO = make_tma_copy_A_sm90(
-        GmemTiledCopyQdO{},
-        mdO,
-        SmemLayoutdO{}(_, _, _0{}),
-        TileShape_MNK{},
-        ClusterShape{}); // mcast along N mode for this M load, if any
-    Tensor mK =
-        make_tensor(make_gmem_ptr(args.ptr_K), args.shape_K, args.stride_K);
-    TMA_K tma_load_K = make_tma_copy_B_sm90(
-        GmemTiledCopyKV{},
-        mK,
-        SmemLayoutK{},
-        TileShape_MNK{},
-        ClusterShape{}); // no mcast for KV
-    Tensor mV =
-        make_tensor(make_gmem_ptr(args.ptr_V), args.shape_K, args.stride_V);
-    TMA_V tma_load_V = make_tma_copy_B_sm90(
-        GmemTiledCopyKV{},
-        mV,
-        SmemLayoutV{},
-        TileShape_MNK{},
-        ClusterShape{}); // no mcast for KV
-    if constexpr (Deterministic) {
-      assert(args.dq_semaphore != nullptr);
-    }
-    return {
-        args.shape_Q,
-        args.shape_K,
-        args.shape_V,
-        args.shape_dO,
-        args.ptr_dQaccum,
-        args.shape_dQaccum,
-        args.stride_dQaccum,
-        tma_load_Q,
-        tma_load_dO,
-        tma_load_K,
-        tma_load_V,
-        args.ptr_LSE_log2,
-        args.shape_LSE,
-        args.stride_LSE_log2,
-        args.ptr_dPsum,
-        args.stride_dPsum,
-        args.max_attn_len,
-        args.min_full_attn_seq_len,
-        args.contextual_seq_len,
-        args.max_seq_len_inv,
-        args.alpha,
-        float(args.alpha * M_LOG2E),
-        args.num_batch,
-        args.num_softmax_heads,
-        args.num_groups,
-        args.batch_size_per_group,
-        args.dq_semaphore,
-        args.seq_offsets,
-        args.seq_offsets_q,
-        args.num_targets,
-        args.max_seq_len_tensor,
-        args.contextual_seq_len_tensor,
-        args.max_attn_len_tensor,
-        args.min_full_attn_seq_len_tensor,
-        args.attn_scale,
-        args.scalar_scale};
-  }
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
-  /// performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& params) {
-    cute::prefetch_tma_descriptor(params.tma_load_Q.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(params.tma_load_dO.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(params.tma_load_K.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(params.tma_load_V.get_tma_descriptor());
-  }
-
-  CUTLASS_DEVICE
-  cute::tuple<int, int> get_m_block_min_max(
-      int const max_attn_len,
-      int const contextual_seq_len,
-      int const uihlen,
-      int const seqlen,
-      int const n_block) {
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-    if constexpr (Has_targets) {
-      int n_idx_min = n_block * kBlockN;
-      if (n_idx_min >= uihlen) {
-        int n_idx_max = (n_block + 1) * kBlockN;
-        return {
-            std::max(0, n_idx_min / kBlockM),
-            cute::ceil_div(std::min(n_idx_max, seqlen), kBlockM)};
-      }
-    }
-    // uih part
-    int m_block_max = cute::ceil_div(seqlen, kBlockM);
-    if constexpr (Local) {
-      int local_m_block_max =
-          cute::ceil_div((n_block + 1) * kBlockN + max_attn_len, kBlockM);
-      if constexpr (Contexual_mask) {
-        // row contexual without sink
-        if (n_block * kBlockN < contextual_seq_len) {
-          local_m_block_max = std::max(
-              local_m_block_max,
-              cute::ceil_div(contextual_seq_len + max_attn_len, kBlockM));
-        }
-      }
-      m_block_max = std::min(m_block_max, local_m_block_max);
-    }
-    int m_block_min = 0;
-    if constexpr (Causal || Local) {
-      m_block_min = std::max(m_block_min, (n_block * kBlockN) / kBlockM);
-    }
-    return {m_block_min, m_block_max};
-  }
-
-  CUTLASS_DEVICE
-  cute::tuple<int, int> get_full_m_block_min_max(
-      int const uihlen,
-      int const seqlen,
-      int const min_full_attn_seq_len,
-      int const m_block_max,
-      int const n_block) {
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-    if constexpr (Cross) {
-      return {0, 0};
-    }
-    if constexpr (!Local) {
-      return {0, 0};
-    }
-    if constexpr (Has_targets) {
-      int n_idx_min = n_block * kBlockN;
-      if (n_idx_min >= uihlen) {
-        return {0, 0};
-      }
-    }
-    if constexpr (Local) {
-      int full_m_block_max = cute::ceil_div(seqlen, kBlockM);
-      int full_m_block_min =
-          std::max(m_block_max, (uihlen - min_full_attn_seq_len) / kBlockM);
-      return {full_m_block_min, full_m_block_max};
-    }
-    return {0, 0};
-  }
-
-  CUTLASS_DEVICE
-  int get_contexual_m_block_max(
-      int const uihlen,
-      int const contextual_seq_len,
-      int const m_block_min,
-      int const n_block) {
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-    if constexpr (Cross) {
-      return 0;
-    }
-    if constexpr (!Contexual_mask) {
-      return 0;
-    }
-    if constexpr (Has_targets) {
-      int n_idx_min = n_block * kBlockN;
-      if (n_idx_min >= uihlen) {
-        return 0;
-      }
-    }
-    if constexpr (Causal || Local) {
-      int contexual_m_block_max =
-          std::min(m_block_min, cute::ceil_div(contextual_seq_len, kBlockM));
-      return contexual_m_block_max;
-    }
-    return 0;
-  }
-
-  CUTLASS_DEVICE
-  int get_next_m_block(
-      int const m_block,
-      int const m_block_min,
-      int const m_block_max,
-      int const contexual_m_block_max,
-      int const full_m_block_min,
-      int const full_m_block_max) {
-    int const out_m_block = m_block + 1;
-    if constexpr (Contexual_mask || Local) {
-      if (out_m_block == m_block_max) {
-        if (contexual_m_block_max > 0) {
-          return 0;
-        }
-        if (full_m_block_max > full_m_block_min) {
-          return full_m_block_min;
-        }
-        return -1;
-      }
-      if (out_m_block == contexual_m_block_max) {
-        if (full_m_block_max > full_m_block_min) {
-          return full_m_block_min;
-        }
-        return -1;
-      }
-      if (out_m_block == full_m_block_max) {
-        return -1;
-      }
-      return out_m_block;
-    }
-    if (out_m_block == m_block_max) {
-      return -1;
-    }
-    return out_m_block;
-  }
-
-  CUTLASS_DEVICE
-  cute::tuple<int, int> get_cross_m_block_min_max(
-      int const uihlen_q,
-      int const seqlen_q,
-      int const seqlen_kv,
-      int const n_block) {
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-    int m_block_max = cute::ceil_div(seqlen_q, kBlockM);
-    if constexpr (!Causal) {
-      return {0, m_block_max};
-    }
-    int m_block_min =
-        std::max(0, (n_block * kBlockN + uihlen_q - seqlen_kv) / kBlockM);
-    return {m_block_min, m_block_max};
-  }
-
-  template <typename SchedulerPrefetch, typename SharedStorage>
-  CUTLASS_DEVICE void load(
-      Params const& params,
-      MainloopPipeline pipeline_q,
-      MainloopPipeline_dO pipeline_do,
-      PipelineState& smem_pipe_write,
-      PipelineState_dO& smem_pipe_write_do,
-      SharedStorage& shared_storage,
-      SchedulerPrefetch const& scheduler_prefetch,
-      cute::tuple<int32_t, int32_t, int32_t> block_coord) {
-    auto [n_block, bidh, bidb] = block_coord;
-    SeqlenInfo_t seqlen_info{
-        bidb,
-        get<0>(params.shape_Q),
-        get<0>(params.shape_K),
-        params.seq_offsets,
-        params.seq_offsets_q,
-        params.num_targets};
-    if constexpr (Jagged) {
-      static constexpr int kBlockN = get<1>(TileShape_MNK{});
-      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
-        scheduler_prefetch();
-        return;
-      }
-    }
-    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
-    if constexpr (!Cross) {
-      if (params.num_groups > 1) {
-        int group_id = bidb / params.batch_size_per_group;
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
-        max_attn_len_ = params.max_attn_len_tensor[group_id];
-        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
-      } else {
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
-        max_attn_len_ = params.max_attn_len;
-        contextual_seq_len_ = params.contextual_seq_len;
-      }
-    }
-    int m_block_min, m_block_max;
-    if constexpr (Cross) {
-      auto m_block_min_max = get_cross_m_block_min_max(
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          seqlen_info.seqlen_kv,
-          n_block);
-      m_block_min = get<0>(m_block_min_max);
-      m_block_max = get<1>(m_block_min_max);
-    } else {
-      auto m_block_min_max = get_m_block_min_max(
-          max_attn_len_,
-          contextual_seq_len_,
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          n_block);
-      m_block_min = get<0>(m_block_min_max);
-      m_block_max = get<1>(m_block_min_max);
-    }
-    auto full_m_block_min_max = get_full_m_block_min_max(
-        seqlen_info.uihlen_q,
-        seqlen_info.seqlen_q,
-        min_full_attn_seq_len_,
-        m_block_max,
-        n_block);
-    int const full_m_block_min = get<0>(full_m_block_min_max);
-    int const full_m_block_max = get<1>(full_m_block_min_max);
-    int contexual_m_block_max = get_contexual_m_block_max(
-        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
-
-    Tensor sQ = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-        SmemLayoutQ{});
-    Tensor sdO = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
-        SmemLayoutdO{});
-    Tensor sK = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
-        SmemLayoutK{});
-    Tensor sV = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
-        SmemLayoutV{});
-    Tensor sLSE = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_lse.data()),
-        SmemLayoutLSE{});
-    Tensor sdPsum = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_dpsum.data()),
-        SmemLayoutLSE{});
-
-    // Prepare the TMA loads
-    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
-    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-    uint2 cluster_local_block_id = {
-        block_rank_in_cluster % cluster_shape_x,
-        block_rank_in_cluster / cluster_shape_x};
-    Tensor mQ = params.tma_load_Q.get_tma_tensor(params.shape_Q)(
-        _, _, bidh, !Jagged ? bidb : 0);
-    Tensor mdO = params.tma_load_dO.get_tma_tensor(params.shape_Q)(
-        _, _, bidh, !Jagged ? bidb : 0);
-    Tensor mK = params.tma_load_K.get_tma_tensor(params.shape_K)(
-        _, _, bidh, !Jagged ? bidb : 0);
-    Tensor mV = params.tma_load_V.get_tma_tensor(params.shape_K)(
-        _, _, bidh, !Jagged ? bidb : 0);
-    Tensor mLSE = make_tensor(
-        make_gmem_ptr(params.ptr_LSE_log2),
-        params.shape_LSE,
-        params.stride_LSE_log2)(_, bidh, !Jagged ? bidb : 0);
-    Tensor mdPsum = make_tensor(
-        make_gmem_ptr(params.ptr_dPsum), params.shape_LSE, params.stride_dPsum)(
-        _, bidh, !Jagged ? bidb : 0);
-
-    Tensor gQ = local_tile(
-        domain_offset(make_coord(seqlen_info.offset_q, _0{}), mQ),
-        select<0, 2>(TileShape_MNK{}),
-        make_coord(_, _0{})); // (M, K, _)
-    Tensor gdO = local_tile(
-        domain_offset(make_coord(seqlen_info.offset_q, _0{}), mdO),
-        select<0, 2>(TileShape_MNK{}),
-        make_coord(_, _0{})); // (M, K, _)
-    Tensor gK = local_tile(
-        domain_offset(make_coord(seqlen_info.offset_k, _0{}), mK),
-        select<1, 2>(TileShape_MNK{}),
-        make_coord(n_block, _0{})); // (N, K)
-    Tensor gV = local_tile(
-        domain_offset(make_coord(seqlen_info.offset_k, _0{}), mV),
-        select<1, 2>(TileShape_MNK{}),
-        make_coord(n_block, _0{})); // (N, K)
-    Tensor gLSE = local_tile(
-        domain_offset(make_coord(seqlen_info.offset_q_padded), mLSE),
-        select<0>(TileShape_MNK{}),
-        make_coord(_)); // (M, _)
-    Tensor gdPsum = local_tile(
-        domain_offset(make_coord(seqlen_info.offset_q_padded), mdPsum),
-        select<0>(TileShape_MNK{}),
-        make_coord(_)); // (M, _)
-
-    Tensor sK_x =
-        make_tensor(sK.data(), make_layout(sK.layout(), Layout<_1>{}));
-    Tensor gK_x =
-        make_tensor(gK.data(), make_layout(gK.layout(), Layout<_1>{}));
-    Tensor sV_x =
-        make_tensor(sV.data(), make_layout(sV.layout(), Layout<_1>{}));
-    Tensor gV_x =
-        make_tensor(gV.data(), make_layout(gV.layout(), Layout<_1>{}));
-    // auto [tQgQ, tQsQ] = tma_partition(params.tma_load_Q,
-    // block_rank_in_cluster, Layout<ClusterShape>{},
-    //                                   group_modes<0, 2>(sQ), group_modes<0,
-    //                                   2>(gQ));  // (TMA, k), (TMA, PIPE)
-    // auto [tdOgdO, tdOsdO] = tma_partition(params.tma_load_dO,
-    // block_rank_in_cluster, Layout<ClusterShape>{},
-    //                                   group_modes<0, 2>(sdO), group_modes<0,
-    //                                   2>(gdO));  // (TMA, k), (TMA, PIPE)
-    auto block_tma_Q = params.tma_load_Q.get_slice(cluster_local_block_id.y);
-    auto block_tma_dO = params.tma_load_dO.get_slice(cluster_local_block_id.y);
-    Tensor tQgQ = group_modes<0, 3>(block_tma_Q.partition_S(gQ));
-    Tensor tQsQ = group_modes<0, 3>(block_tma_Q.partition_D(sQ));
-    Tensor tdOgdO = group_modes<0, 3>(block_tma_dO.partition_S(gdO));
-    Tensor tdOsdO = group_modes<0, 3>(block_tma_dO.partition_D(sdO));
-    auto [tKgK, tKsK] = tma_partition(
-        params.tma_load_K,
-        _0{},
-        Layout<_1>{},
-        group_modes<0, 2>(sK_x),
-        group_modes<0, 2>(gK_x)); // (TMA), (TMA)
-    auto [tVgV, tVsV] = tma_partition(
-        params.tma_load_V,
-        _0{},
-        Layout<_1>{},
-        group_modes<0, 2>(sV_x),
-        group_modes<0, 2>(gV_x)); // (TMA), (TMA)
-    auto bulk_copy = Copy_Traits<SM90_BULK_COPY_AUTO>{};
-
-    uint16_t mcast_mask_qdo = 0;
-    if constexpr (cute::is_same_v<GmemTiledCopyQdO, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<ClusterShape>{}; // (m,n) -> block_id
-      for (int n = 0; n < size<1>(block_layout); ++n) {
-        mcast_mask_qdo |=
-            (uint16_t(1) << block_layout(cluster_local_block_id.x, n, _0{}));
-      }
-    }
-
-    int m_block = m_block_min;
-    int next_m_block = -1;
-    int lane_predicate = cute::elect_one_sync();
-
-    if (lane_predicate) {
-      pipeline_q.producer_acquire(smem_pipe_write);
-      copy(
-          params.tma_load_Q.with(
-              *pipeline_q.producer_get_barrier(smem_pipe_write),
-              mcast_mask_qdo,
-              TMA::CacheHintSm90::EVICT_LAST),
-          tQgQ(_, m_block),
-          tQsQ(_, smem_pipe_write.index()));
-      if constexpr (Softmax) {
-        copy(
-            bulk_copy.with(*pipeline_q.producer_get_barrier(smem_pipe_write)),
-            gLSE(_, m_block),
-            sLSE(_, smem_pipe_write.index()));
-      }
-    }
-
-    // // Wait for the MMA warpgroups to say that smem_k and smem_v are ready
-    // cutlass::arch::NamedBarrier::sync(NumMmaThreads +
-    // cutlass::NumThreadsPerWarp,
-    // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/);
-
-    auto load_step = [&](int m_block) {
-      // If Q and dO have the same number of stages, we can use the same
-      // pipeline state variable to reduce registers
-      PipelineState_dO smem_pipe_write_do_cur =
-          cute::conditional_return<Q_dO_same_stages>(
-              smem_pipe_write, smem_pipe_write_do);
-      pipeline_do.producer_acquire(smem_pipe_write_do_cur);
-      copy(
-          params.tma_load_dO.with(
-              *pipeline_do.producer_get_barrier(smem_pipe_write_do_cur),
-              mcast_mask_qdo,
-              TMA::CacheHintSm90::EVICT_LAST),
-          tdOgdO(_, m_block),
-          tdOsdO(_, smem_pipe_write_do_cur.index()));
-      if constexpr (Softmax) {
-        copy(
-            bulk_copy.with(
-                *pipeline_do.producer_get_barrier(smem_pipe_write_do_cur)),
-            gdPsum(_, m_block),
-            sdPsum(_, smem_pipe_write_do_cur.index()));
-      }
-      if constexpr (!Q_dO_same_stages) {
-        ++smem_pipe_write_do;
-      }
-      ++smem_pipe_write;
-      next_m_block = get_next_m_block(
-          m_block,
-          m_block_min,
-          m_block_max,
-          contexual_m_block_max,
-          full_m_block_min,
-          full_m_block_max);
-      if (next_m_block != -1) {
-        pipeline_q.producer_acquire(smem_pipe_write);
-        copy(
-            params.tma_load_Q.with(
-                *pipeline_q.producer_get_barrier(smem_pipe_write),
-                mcast_mask_qdo,
-                TMA::CacheHintSm90::EVICT_LAST),
-            tQgQ(_, next_m_block),
-            tQsQ(_, smem_pipe_write.index()));
-        if constexpr (Softmax) {
-          copy(
-              bulk_copy.with(*pipeline_q.producer_get_barrier(smem_pipe_write)),
-              gLSE(_, next_m_block),
-              sLSE(_, smem_pipe_write.index()));
-        }
-      }
-    };
-
-    if (lane_predicate) {
-      // Copy K tile and V tile from GMEM to SMEM.
-      shared_storage.pipelines.barrier_KV.arrive_and_expect_tx(
-          TmaTransactionBytesK + TmaTransactionBytesV);
-      copy(
-          params.tma_load_K.with(
-              reinterpret_cast<
-                  cutlass::arch::ClusterTransactionBarrier::ValueType&>(
-                  shared_storage.pipelines.barrier_KV),
-              0 /*mcast_mask*/),
-          tKgK,
-          tKsK);
-      copy(
-          params.tma_load_V.with(
-              reinterpret_cast<
-                  cutlass::arch::ClusterTransactionBarrier::ValueType&>(
-                  shared_storage.pipelines.barrier_KV),
-              0 /*mcast_mask*/),
-          tVgV,
-          tVsV);
-
-#pragma unroll(kHeadDim < 256 ? 2 : 1)
-      for (; m_block < m_block_max; ++m_block) {
-        load_step(m_block);
-      }
-    }
-    scheduler_prefetch();
-    m_block = next_m_block;
-    if constexpr (Contexual_mask) {
-      if (lane_predicate) {
-        if (m_block >= 0) {
-#pragma unroll(kHeadDim < 256 ? 2 : 1)
-          for (; m_block < contexual_m_block_max; ++m_block) {
-            load_step(m_block);
-          }
-        }
-      }
-    }
-    m_block = next_m_block;
-    if constexpr (Local) {
-      if (lane_predicate) {
-        if (m_block >= 0) {
-#pragma unroll(kHeadDim < 256 ? 2 : 1)
-          for (; m_block < full_m_block_max; ++m_block) {
-            load_step(m_block);
-          }
-        }
-      }
-    }
-    if constexpr (Q_dO_same_stages) {
-      smem_pipe_write_do = smem_pipe_write;
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void load_tail(
-      MainloopPipeline pipeline_q,
-      MainloopPipeline_dO pipeline_do,
-      PipelineState& smem_pipe_write) {
-    static_assert(
-        Q_dO_same_stages, "Q and dO must have the same number of stages");
-    // Need to copy since pipeline_q.producer_tail(smem_pipe_write) will
-    // increment smem_pipe_write
-    PipelineState smem_pipe_write_do = smem_pipe_write;
-    // Issue the epilogue waits
-    if (cute::elect_one_sync()) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all Consumer UNLOCKs), or
-       * if the stage was never used then would just be acquired since the phase
-       * was still inverted from make_producer_start_state
-       */
-      pipeline_q.producer_tail(smem_pipe_write);
-      pipeline_do.producer_tail(smem_pipe_write_do);
-    }
-  }
-
-  /// Perform a Producer Epilogue to prevent early exit of blocks in a Cluster
-  CUTLASS_DEVICE void load_tail(
-      MainloopPipeline pipeline_q,
-      MainloopPipeline_dO pipeline_do,
-      PipelineState& smem_pipe_write,
-      PipelineState_dO& smem_pipe_write_do) {
-    // Issue the epilogue waits
-    if (cute::elect_one_sync()) {
-      /* This helps avoid early exit of blocks in Cluster
-       * Waits for all stages to either be released (all Consumer UNLOCKs), or
-       * if the stage was never used then would just be acquired since the phase
-       * was still inverted from make_producer_start_state
-       */
-      pipeline_q.producer_tail(smem_pipe_write);
-      pipeline_do.producer_tail(smem_pipe_write_do);
-    }
-  }
-
-  template <typename SharedStorage>
-  CUTLASS_DEVICE void store_dq(
-      Params const& params,
-      SharedStorage& shared_storage,
-      cute::tuple<int32_t, int32_t, int32_t> block_coord) {
-    if constexpr (!dQacc_use_TMA) {
-      return;
-    }
-
-    auto [n_block, bidh, bidb] = block_coord;
-    SeqlenInfo_t seqlen_info{
-        bidb,
-        get<0>(params.shape_Q),
-        get<0>(params.shape_K),
-        params.seq_offsets,
-        params.seq_offsets_q,
-        params.num_targets};
-    if constexpr (Jagged) {
-      static constexpr int kBlockN = get<1>(TileShape_MNK{});
-      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
-        return;
-      }
-    }
-    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
-    if constexpr (!Cross) {
-      if (params.num_groups > 1) {
-        int group_id = bidb / params.batch_size_per_group;
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
-        max_attn_len_ = params.max_attn_len_tensor[group_id];
-        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
-      } else {
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
-        max_attn_len_ = params.max_attn_len;
-        contextual_seq_len_ = params.contextual_seq_len;
-      }
-    }
-    int m_block_min, m_block_max;
-    if constexpr (Cross) {
-      auto m_block_min_max = get_cross_m_block_min_max(
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          seqlen_info.seqlen_kv,
-          n_block);
-      m_block_min = get<0>(m_block_min_max);
-      m_block_max = get<1>(m_block_min_max);
-    } else {
-      auto m_block_min_max = get_m_block_min_max(
-          max_attn_len_,
-          contextual_seq_len_,
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          n_block);
-      m_block_min = get<0>(m_block_min_max);
-      m_block_max = get<1>(m_block_min_max);
-    }
-    auto full_m_block_min_max = get_full_m_block_min_max(
-        seqlen_info.uihlen_q,
-        seqlen_info.seqlen_q,
-        min_full_attn_seq_len_,
-        m_block_max,
-        n_block);
-    int const full_m_block_min = get<0>(full_m_block_min_max);
-    int const full_m_block_max = get<1>(full_m_block_min_max);
-    int contexual_m_block_max = get_contexual_m_block_max(
-        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
-
-    Tensor sdQ = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_dqacc.data()),
-        SmemLayoutdQaccum{});
-    static constexpr int dQ_TMA_num_bytes =
-        CUTE_STATIC_V(size<0>(sdQ)) * sizeof(ElementAccum);
-
-    Tensor mdQaccum = make_tensor(
-        make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.ptr_dQaccum)),
-        params.shape_dQaccum,
-        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
-    Tensor gdQaccum_ = local_tile(
-        domain_offset(
-            make_coord(seqlen_info.offset_q_padded * kHeadDim), mdQaccum),
-        Shape<Int<kBlockM * kHeadDim>>{},
-        make_coord(_)); // (M * K, _)
-    Tensor gdQaccum = cute::flat_divide(
-        gdQaccum_,
-        Int<kBlockM * kHeadDim / NumMmaWarpGroups>{}); // (M * K / WG, WG, _)
-
-    int const num_batch = params.num_batch;
-    int const num_head = get<2>(params.shape_Q);
-    int* lock_ptr =
-        !Deterministic ? nullptr : params.dq_semaphore + bidb * num_head + bidh;
-    using Barrier = cutlass::GenericBarrier<cutlass::detail::SyncwarpSync>;
-    bool const lane_predicate = cute::elect_one_sync();
-
-    auto store_dq_step = [&](int m_block) {
-      if constexpr (Deterministic) {
-        Barrier::wait_eq(
-            lock_ptr,
-            threadIdx.x % cutlass::NumThreadsPerWarp,
-            m_block * num_batch * num_head,
-            n_block);
-      }
-#pragma unroll
-      for (int warpgroup_idx = 0; warpgroup_idx < NumMmaWarpGroups;
-           ++warpgroup_idx) {
-        cutlass::arch::NamedBarrier::sync(
-            cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
-            static_cast<uint32_t>(BwdNamedBarriers::dQFullWG1) +
-                warpgroup_idx /*id*/); // sdQ full, to be written to gmem
-        if (lane_predicate) {
-          SM90_BULK_REDUCE_ADD::copy(
-              raw_pointer_cast(sdQ(_, warpgroup_idx).data()),
-              raw_pointer_cast(gdQaccum(_, warpgroup_idx, m_block).data()),
-              dQ_TMA_num_bytes,
-              static_cast<uint64_t>(TMA::CacheHintSm90::EVICT_LAST));
-          tma_store_arrive();
-        }
-      }
-      // Note, the for_each() function is required here to ensure
-      // `warpgroup_idx` is of type Int<x>.
-      for_each(make_int_sequence<NumMmaWarpGroups>{}, [&](auto warpgroup_idx) {
-        if (lane_predicate) {
-          tma_store_wait<NumMmaWarpGroups - 1 - CUTE_STATIC_V(warpgroup_idx)>();
-        }
-        cutlass::arch::NamedBarrier::arrive(
-            cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
-            static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) +
-                warpgroup_idx /*id*/); // sdQ empty, ready to be written to
-      });
-      if constexpr (Deterministic) {
-        Barrier::arrive_inc(
-            lock_ptr,
-            threadIdx.x % cutlass::NumThreadsPerWarp,
-            m_block * num_batch * num_head);
-      }
-    };
-
-#pragma unroll 2
-    for (int m_block = m_block_min; m_block < m_block_max; ++m_block) {
-      store_dq_step(m_block);
-    }
-    if constexpr (Contexual_mask) {
-#pragma unroll 2
-      for (int m_block = 0; m_block < contexual_m_block_max; ++m_block) {
-        store_dq_step(m_block);
-      }
-    }
-    if constexpr (Local) {
-#pragma unroll 2
-      for (int m_block = full_m_block_min; m_block < full_m_block_max;
-           ++m_block) {
-        store_dq_step(m_block);
-      }
-    }
-    if constexpr (Local && Deterministic) {
-      constexpr int kBlockM = get<0>(TileShape_MNK{});
-      int const m_block_global_max =
-          cute::ceil_div(seqlen_info.seqlen_q, kBlockM);
-#pragma unroll 2
-      for (int m_block = m_block_max; m_block < m_block_global_max; ++m_block) {
-        Barrier::arrive_inc(
-            lock_ptr,
-            threadIdx.x % cutlass::NumThreadsPerWarp,
-            m_block * num_batch * num_head);
-      }
-    }
-  }
-
-  CUTLASS_DEVICE void mma_init() {
-    // We're not currently using this bc we're not using persistent scheduler
-    // // Tell producer (warp 0) that smem_k and smem_v are ready
-    // cutlass::arch::NamedBarrier::arrive(NumMmaThreads +
-    // cutlass::NumThreadsPerWarp,
-    // static_cast<uint32_t>(BwdNamedBarriers::KVEmpty) /*id*/);
-    int warp_idx_in_warpgroup =
-        __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
-    if constexpr (dQacc_use_TMA) {
-      if (warp_idx_in_warpgroup == 0) {
-        cutlass::arch::NamedBarrier::arrive(
-            cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
-            static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) - 1 +
-                hstu::canonical_warp_group_idx_nosync() /*id*/); // sdQ empty,
-                                                                 // ready to be
-                                                                 // written to
-      }
-    }
-  }
-
-  template <typename SharedStorage, typename FrgTensordKV>
-  CUTLASS_DEVICE bool mma(
-      Params const& params,
-      MainloopPipeline pipeline_q,
-      MainloopPipeline_dO pipeline_do,
-      PipelineState& smem_pipe_read,
-      PipelineState_dO& smem_pipe_read_do,
-      FrgTensordKV& tdKrdK,
-      FrgTensordKV& tdVrdV,
-      int thread_idx,
-      int& work_idx,
-      cute::tuple<int32_t, int32_t, int32_t> block_coord,
-      SharedStorage& shared_storage) {
-    static_assert(
-        is_rmem<FrgTensordKV>::value,
-        "dK and dV tensor must be rmem resident.");
-
-    int n_block = get<0>(block_coord);
-    int bidb = get<2>(block_coord);
-    SeqlenInfo_t seqlen_info{
-        bidb,
-        get<0>(params.shape_Q),
-        get<0>(params.shape_K),
-        params.seq_offsets,
-        params.seq_offsets_q,
-        params.num_targets};
-    if constexpr (Jagged) {
-      static constexpr int kBlockN = get<1>(TileShape_MNK{});
-      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
-        return false;
-      }
-    }
-    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
-    float scalar_scale_val_;
-    if constexpr (!Cross) {
-      if (params.num_groups > 1) {
-        int group_id = bidb / params.batch_size_per_group;
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
-        max_attn_len_ = params.max_attn_len_tensor[group_id];
-        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
-      } else {
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
-        max_attn_len_ = params.max_attn_len;
-        contextual_seq_len_ = params.contextual_seq_len;
-      }
-    }
-    if (params.num_groups > 1) {
-      int group_id = bidb / params.batch_size_per_group;
-      int max_seq_len_per_group = params.max_seq_len_tensor[group_id];
-      // attention scale
-      scalar_scale_val_ = params.scalar_scale
-          ? (params.attn_scale == nullptr ? 1.0f / max_seq_len_per_group
-                                          : params.attn_scale[group_id])
-          : 0;
-    } else {
-      // attention scale
-      scalar_scale_val_ = params.scalar_scale
-          ? (params.attn_scale == nullptr ? params.max_seq_len_inv
-                                          : params.attn_scale[0])
-          : 0;
-    }
-    int m_block_min, m_block_max;
-    if constexpr (Cross) {
-      auto m_block_min_max = get_cross_m_block_min_max(
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          seqlen_info.seqlen_kv,
-          n_block);
-      m_block_min = get<0>(m_block_min_max);
-      m_block_max = get<1>(m_block_min_max);
-    } else {
-      auto m_block_min_max = get_m_block_min_max(
-          max_attn_len_,
-          contextual_seq_len_,
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          n_block);
-      m_block_min = get<0>(m_block_min_max);
-      m_block_max = get<1>(m_block_min_max);
-    }
-    auto full_m_block_min_max = get_full_m_block_min_max(
-        seqlen_info.uihlen_q,
-        seqlen_info.seqlen_q,
-        min_full_attn_seq_len_,
-        m_block_max,
-        n_block);
-    int const full_m_block_min = get<0>(full_m_block_min_max);
-    int const full_m_block_max = get<1>(full_m_block_min_max);
-    int contexual_m_block_max = get_contexual_m_block_max(
-        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
-
-    Tensor sQ = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-        SmemLayoutQ{});
-    Tensor sdO = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
-        SmemLayoutdO{});
-    Tensor sK = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
-        SmemLayoutK{});
-    Tensor sV = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
-        SmemLayoutV{});
-    Tensor sQt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-        SmemLayoutQt{});
-    Tensor sdOt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
-        SmemLayoutdOt{});
-    Tensor sKt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
-        SmemLayoutKt{});
-    Tensor sP = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
-        SmemLayoutPdS{});
-    Tensor sP_pi = cute::as_position_independent_swizzle_tensor(sP);
-    Tensor sPt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
-        SmemLayoutPdSt{});
-    Tensor sPt_pi = cute::as_position_independent_swizzle_tensor(sPt);
-    Tensor sdS = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
-        SmemLayoutPdS{});
-    Tensor sdS_pi = cute::as_position_independent_swizzle_tensor(sdS);
-    Tensor sdSt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
-        SmemLayoutPdSt{});
-    Tensor sdSt_pi = cute::as_position_independent_swizzle_tensor(sdSt);
-    Tensor sdQ = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_dqacc.data()),
-        SmemLayoutdQaccum{});
-
-    static_assert(
-        stride<0>(typename TiledMmaSdP::ALayout{}) == 0 and
-            stride<0>(typename TiledMmaSdP::BLayout{}) == 0 and
-            size<0>(typename TiledMmaSdP::ALayout{}) ==
-                cutlass::NumThreadsPerWarpGroup and
-            size<0>(typename TiledMmaSdP::BLayout{}) ==
-                cutlass::NumThreadsPerWarpGroup,
-        "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-    constexpr int MmaWarpGroups =
-        NumMmaThreads / cutlass::NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(
-        make_shape(Int<MmaWarpGroups>{}),
-        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
-    Layout warp_group_thread_layout_dq = make_layout(
-        make_shape(Int<NumMmaWarpGroups>{}),
-        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
-
-    int warp_group_idx = __shfl_sync(
-        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
-    TiledMmaSdP tiled_mma_SdP;
-    using TiledMmadP =
-        std::conditional_t<!Mma_dP_is_RS, TiledMmaSdP, TiledMmadPRS>;
-    TiledMmadP tiled_mma_dP;
-    TiledMmadKV tiled_mma_dKV;
-    TiledMmadQ tiled_mma_dQ;
-
-    auto wg_mma_SdP =
-        tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx));
-    auto wg_mma_dP =
-        tiled_mma_dP.get_slice(warp_group_thread_layout(warp_group_idx));
-    auto thread_mma_SdP = tiled_mma_SdP.get_thread_slice(thread_idx);
-    auto wg_mma_dKV =
-        tiled_mma_dKV.get_slice(warp_group_thread_layout(warp_group_idx));
-    auto wg_mma_dQ =
-        tiled_mma_dQ.get_slice(warp_group_thread_layout_dq(warp_group_idx));
-
-    auto smem_tiled_copy_PdS =
-        make_tiled_copy_C(SmemCopyAtomPdS{}, tiled_mma_SdP);
-    auto smem_thr_copy_PdS = smem_tiled_copy_PdS.get_thread_slice(thread_idx);
-
-    R2STiledCopydQaccum r2s_tiled_copy_dQaccum;
-    auto r2s_thr_copy_dQaccum =
-        r2s_tiled_copy_dQaccum.get_thread_slice(thread_idx);
-    Tensor tdQsdQaccum = r2s_thr_copy_dQaccum.partition_D(sdQ);
-    // if (thread_idx == 0) { print(sdQ); printf("\n"); print(tdQsdQaccum);
-    // printf("\n"); }
-
-    // Allocate "fragments/descriptors"
-    // We have to use the templated mma_partition_fragment_AB instead of
-    // cute::conditional_return or lambda, because some partition_fragment_A/B
-    // don't compile.
-    // https://stackoverflow.com/questions/50051473/if-constexpr-in-c17-does-not-work-in-a-non-templated-function
-    Tensor tSrQ = mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sQ);
-    Tensor tSrK = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_SdP, sK);
-    Tensor tdPrdO =
-        mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sdO);
-    Tensor tdPrV = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_dP, sV);
-    Tensor tdVrdO =
-        mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sdOt);
-    Tensor tdKrQ = mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sQt);
-    Tensor tdQrdS = mma_partition_fragment_AB</*A=*/!dQ_swapAB>(wg_mma_dQ, sdS);
-    Tensor tdQrK = mma_partition_fragment_AB</*A=*/dQ_swapAB>(wg_mma_dQ, sKt);
-
-    Tensor tPsP = smem_thr_copy_PdS.partition_D(
-        cute::conditional_return<!SdP_swapAB>(
-            sP_pi, sPt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
-    Tensor tdSsdS = smem_thr_copy_PdS.partition_D(
-        cute::conditional_return<!SdP_swapAB>(
-            sdS_pi, sdSt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
-    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_PdS);
-    // print(sP_pi); printf("\n"); print(sPt_pi); printf("\n"); print(tPsP);
-    // printf("\n"); print(tdSsdS); printf("\n"); }
-
-    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-    };
-
-    int bidh = get<1>(block_coord);
-    // For the case where we do atomicAdd directly to gdQaccum instead of using
-    // TMA
-    Tensor mdQaccum = make_tensor(
-        make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.ptr_dQaccum)),
-        params.shape_dQaccum,
-        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
-    Tensor gdQaccum_ = local_tile(
-        domain_offset(
-            make_coord(seqlen_info.offset_q_padded * kHeadDim), mdQaccum),
-        Shape<Int<kBlockM * kHeadDim>>{},
-        make_coord(_)); // (M * K, _)
-    Tensor gdQaccum = cute::flat_divide(
-        gdQaccum_,
-        Int<kBlockM * kHeadDim / NumMmaWarpGroups>{}); // (M * K / WG, WG, _)
-    // We can reuse r2s_thr_copy_dQaccum for this partitioning
-    Tensor tdQgdQaccum = r2s_thr_copy_dQaccum.partition_D(gdQaccum);
-    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(mdQaccum);
-    // printf("\n"); print(gdQaccum_); printf("\n"); print(gdQaccum);
-    // printf("\n"); print(tdQgdQaccum); printf("\n"); }
-
-    hstu::Mask<kBlockM, kBlockN, TiledMmaSdP, SdP_swapAB> mask(
-        thread_idx,
-        seqlen_info.seqlen_q,
-        seqlen_info.seqlen_kv,
-        max_attn_len_,
-        min_full_attn_seq_len_,
-        contextual_seq_len_,
-        seqlen_info.uihlen_q);
-
-    int m_block = m_block_min;
-
-    clear(tdKrdK);
-    clear(tdVrdV);
-    // tiled_mma_dKV.accumulate_ = GMMA::ScaleOut::Zero;
-
-    cutlass::ConsumerToken barrier_token = static_cast<cutlass::BarrierStatus>(
-        shared_storage.pipelines.barrier_KV.try_wait(work_idx % 2));
-    if (barrier_token == cutlass::BarrierStatus::WaitAgain) {
-      shared_storage.pipelines.barrier_KV.wait(work_idx % 2);
-    }
-
-    if constexpr (Mma_dP_is_RS) {
-      using SmemCopyAtomV = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
-      auto smem_tiled_copy_V = make_tiled_copy_A(SmemCopyAtomV{}, tiled_mma_dP);
-      auto smem_thr_copy_V = smem_tiled_copy_V.get_thread_slice(thread_idx);
-      Tensor tdPrV_copy_view = smem_thr_copy_V.retile_D(tdPrV);
-      Tensor tdPsV_copy_view = smem_thr_copy_V.partition_S(
-          cute::as_position_independent_swizzle_tensor(sV));
-      cute::copy(smem_tiled_copy_V, tdPsV_copy_view, tdPrV_copy_view);
-    }
-    static constexpr int Qdim = !SdP_swapAB ? 0 : 1;
-    auto thread0_mma_SdP = tiled_mma_SdP.get_thread_slice(_0{});
-    Tensor cS = cute::make_identity_tensor(
-        Shape<
-            Int<!SdP_swapAB ? kBlockM : kBlockN>,
-            Int<!SdP_swapAB ? kBlockN : kBlockM>>{});
-    Tensor tScS = thread_mma_SdP.partition_C(cS);
-    Tensor tScS_rowcol = make_tensor(
-        tScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
-            tScS.layout()));
-    Tensor t0ScS = thread0_mma_SdP.partition_C(cS);
-    Tensor t0ScS_rowcol = make_tensor(
-        t0ScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
-            t0ScS.layout()));
-    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
-
-    auto bwd_step = [&](int m_block, auto mask_fn) {
-      Tensor tSrS = partition_fragment_C(
-          tiled_mma_SdP,
-          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
-      consumer_wait(pipeline_q, smem_pipe_read);
-      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
-          tiled_mma_SdP, tSrQ(_, _, _, smem_pipe_read.index()), tSrK, tSrS);
-      Tensor tdPrdP = partition_fragment_C(
-          tiled_mma_SdP,
-          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
-      PipelineState_dO smem_pipe_read_do_cur =
-          cute::conditional_return<Q_dO_same_stages>(
-              smem_pipe_read, smem_pipe_read_do);
-      consumer_wait(pipeline_do, smem_pipe_read_do_cur);
-      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
-          tiled_mma_dP,
-          tdPrdO(_, _, _, smem_pipe_read_do_cur.index()),
-          tdPrV,
-          tdPrdP);
-      warpgroup_wait<1>();
-      // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
-      // ncol=(2, MMA_N))
-      Tensor scores = make_tensor(
-          tSrS.data(),
-          hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
-              tSrS.layout()));
-      Tensor tSrS_sigmoid = make_tensor_like(tSrS);
-      Tensor sigmoid = make_tensor(
-          tSrS_sigmoid.data(),
-          hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
-              tSrS_sigmoid.layout()));
-      int qdim_offset = params.scalar_scale
-          ? 0
-          : m_block * kBlockM + thread_qdim_offset + seqlen_info.offset_q;
-      mask_fn(tSrS, m_block);
-#pragma unroll
-      for (int mi = 0; mi < size<0>(scores); ++mi) {
-        float scale = scalar_scale_val_;
-        if (!params.scalar_scale) {
-          int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
-          int q_local = q_index - seqlen_info.offset_q;
-          if (q_local < seqlen_info.seqlen_q) {
-            scale = params.attn_scale[q_index];
-          }
-        }
-#pragma unroll
-        for (int ni = 0; ni < size<1>(scores); ++ni) {
-          scores(mi, ni) = scores(mi, ni) * params.alpha;
-          sigmoid(mi, ni) =
-              __fdividef(1., 1.0f + cutlass::fast_exp(-scores(mi, ni)));
-          scores(mi, ni) = sigmoid(mi, ni) * scores(mi, ni) * scale;
-        }
-      }
-      mask_fn(tSrS_sigmoid, m_block);
-
-      warpgroup_wait<0>();
-      // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
-      // ncol=(2, MMA_N))
-      Tensor dS = make_tensor(tdPrdP.data(), scores.layout());
-#pragma unroll
-      for (int mi = 0; mi < size<0>(dS); ++mi) {
-        float scale = scalar_scale_val_;
-        if (!params.scalar_scale) {
-          int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
-          int q_local = q_index - seqlen_info.offset_q;
-          if (q_local < seqlen_info.seqlen_q) {
-            scale = params.attn_scale[q_index];
-          }
-        }
-#pragma unroll
-        for (int ni = 0; ni < size<1>(dS); ++ni) {
-          dS(mi, ni) = dS(mi, ni) * sigmoid(mi, ni) * scale +
-              dS(mi, ni) * scores(mi, ni) * (1.f - sigmoid(mi, ni));
-          dS(mi, ni) = dS(mi, ni) * params.alpha;
-          //   if (dS(mi, ni) > 0.0001) {
-          //     std::printf(
-          //         "dS(mi, ni) is (%f), (m, n) is (%d, %d), thread_idx is
-          //         (%d), blockIdx.z is (%d)\n", dS(mi, ni), mi, ni,
-          //         threadIdx.x,
-          //         blockIdx.z);
-          //   }
-        }
-      }
-      // Convert scores from fp32 to fp16/bf16
-      Tensor rP = make_tensor_like<Element>(tSrS);
-      hstu::convert_type_out(tSrS, rP);
-      if constexpr (!Mma_dKV_is_RS) {
-        // Need to sync to make sure P has already been used in the previous
-        // iteration before writing new values
-        if constexpr (kStages_dS == 1) {
-          cutlass::arch::NamedBarrier::sync(
-              NumMmaThreads,
-              static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
-        }
-        Tensor tPaP =
-            smem_thr_copy_PdS.retile_S(rP); // ((Atom,AtomNum), MMA_N, MMA_N)
-        cute::copy(
-            smem_tiled_copy_PdS,
-            tPaP,
-            tPsP(
-                _,
-                _,
-                _,
-                cute::conditional_return<kStages_dS == 1>(
-                    _0{}, smem_pipe_read.index())));
-      }
-      Tensor rdS = make_tensor_like<Element>(tdPrdP);
-      hstu::convert_type_out(tdPrdP, rdS);
-      // If there's double buffering on dS, we don't need to sync here.
-      // Otherwise we might have WG1 writing to dS before WG2 is done reading
-      // from it during MmadQ. But because both WGs have to sync at the end of
-      // the loop and double buffering, this race condition is not possible.
-      // This sync is to ensure (1) P is written in case of !Mma_dKV_is_RS and
-      // (2) dS is already read by the Mma in the previous iteration in case of
-      // Mma_dKV_is_RS.
-      if constexpr (!Mma_dKV_is_RS || (kStages_dS == 1 && Mma_dKV_is_RS)) {
-        cutlass::arch::fence_view_async_shared();
-        cutlass::arch::NamedBarrier::sync(
-            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
-      }
-      // For hdim 64, It's faster to write to smem_dS first before the dV gemm
-      Tensor tdSadS =
-          smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N)
-      cute::copy(
-          smem_tiled_copy_PdS,
-          tdSadS,
-          tdSsdS(
-              _,
-              _,
-              _,
-              cute::conditional_return<kStages_dS == 1>(
-                  _0{}, smem_pipe_read.index())));
-
-      if constexpr (!Slice_dQKV_Mma) {
-        // Most cases take this path, except for hdim256 where we want to slice
-        // to reduce register pressure
-        if constexpr (Mma_dKV_is_RS) {
-          Tensor tdVrP = make_tensor(
-              rP.data(), convert_layout_acc_Aregs<TiledMmadKV>(tSrS.layout()));
-          hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
-              tiled_mma_dKV,
-              tdVrP,
-              tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
-              tdVrdV);
-        } else {
-          Tensor tdVrP =
-              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
-          Tensor tdVrP_cur = tdVrP(
-              _,
-              _,
-              _,
-              cute::conditional_return<kStages_dS == 1>(
-                  _0{}, smem_pipe_read.index()));
-          hstu::
-              gemm</*zero_init=*/false, /*wg_wait=*/-1, /*SwapAB=*/dKV_swapAB>(
-                  tiled_mma_dKV,
-                  tdVrP_cur,
-                  tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
-                  tdVrdV);
-        }
-        // SMEM fence to make sure sdS is written before it's read by WGMMA
-        cutlass::arch::fence_view_async_shared();
-        cutlass::arch::NamedBarrier::sync(
-            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
-        Tensor tdQrdQ = partition_fragment_C(
-            tiled_mma_dQ,
-            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
-        Tensor tdQrdS_cur = tdQrdS(
-            _,
-            _,
-            _,
-            cute::conditional_return<kStages_dS == 1>(
-                _0{}, smem_pipe_read.index()));
-        hstu::gemm</*zero_init=*/true, /*wg_wait=*/1, /*SwapAB=*/dQ_swapAB>(
-            tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
-        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dQ
-
-        if constexpr (Mma_dKV_is_RS) {
-          Tensor tdKrdS = make_tensor(
-              rdS.data(),
-              convert_layout_acc_Aregs<TiledMmadKV>(tdPrdP.layout()));
-          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1>(
-              tiled_mma_dKV,
-              tdKrdS,
-              tdKrQ(_, _, _, smem_pipe_read.index()),
-              tdKrdK);
-        } else {
-          Tensor tdKrdS =
-              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
-          Tensor tdKrdS_cur = tdKrdS(
-              _,
-              _,
-              _,
-              cute::conditional_return<kStages_dS == 1>(
-                  _0{}, smem_pipe_read.index()));
-          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1, /*SwapAB=*/dKV_swapAB>(
-              tiled_mma_dKV,
-              tdKrdS_cur,
-              tdKrQ(_, _, _, smem_pipe_read.index()),
-              tdKrdK);
-        }
-        if constexpr (dQacc_use_TMA) {
-          int const warp_group_idx =
-              hstu::canonical_warp_group_idx_nosync() - 1;
-          cutlass::arch::NamedBarrier::sync(
-              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
-              static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) +
-                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
-          Tensor taccdQrdQ = r2s_thr_copy_dQaccum.retile_S(tdQrdQ);
-          cute::copy(r2s_tiled_copy_dQaccum, taccdQrdQ, tdQsdQaccum);
-          cutlass::arch::fence_view_async_shared();
-          cutlass::arch::NamedBarrier::arrive(
-              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
-              static_cast<uint32_t>(BwdNamedBarriers::dQFullWG1) +
-                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
-        } else {
-          // We can reuse r2s_thr_copy_dQaccum for this partitioning
-          Tensor tdQrdQ_atomic =
-              recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
-          Tensor tdQgdQaccum_atomic =
-              recast<float4>(tdQgdQaccum(_, _, _, m_block));
-          static_assert(
-              CUTE_STATIC_V(size(tdQrdQ_atomic)) ==
-              CUTE_STATIC_V(size(tdQgdQaccum_atomic)));
-#pragma unroll
-          for (int i = 0; i < size(tdQrdQ_atomic); ++i) {
-            atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
-          }
-        }
-
-      } else { // Slice_dQKV_Mma
-
-        static_assert(!(Slice_dQKV_Mma && Mma_dKV_is_RS));
-        Tensor tdVrP =
-            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
-        Tensor tdVrP_cur = tdVrP(
-            _,
-            _,
-            _,
-            cute::conditional_return<kStages_dS == 1>(
-                _0{}, smem_pipe_read.index()));
-        hstu::gemm<
-            /*zero_init=*/false,
-            /*wg_wait=*/-1,
-            /*SwapAB=*/dKV_swapAB,
-            /*M_slice=*/0>(
-            tiled_mma_dKV,
-            tdVrP_cur,
-            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
-            tdVrdV);
-
-        cutlass::arch::fence_view_async_shared();
-        cutlass::arch::NamedBarrier::sync(
-            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
-        Tensor tdQrdQ = partition_fragment_C(
-            tiled_mma_dQ,
-            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
-        Tensor tdQrdS_cur = tdQrdS(
-            _,
-            _,
-            _,
-            cute::conditional_return<kStages_dS == 1>(
-                _0{}, smem_pipe_read.index()));
-        hstu::gemm<
-            /*zero_init=*/true,
-            /*wg_wait=*/-1,
-            /*SwapAB=*/dQ_swapAB,
-            /*M_slice=*/0>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
-        hstu::gemm<
-            /*zero_init=*/false,
-            /*wg_wait=*/1,
-            /*SwapAB=*/dKV_swapAB,
-            /*M_slice=*/1>(
-            tiled_mma_dKV,
-            tdVrP_cur,
-            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
-            tdVrdV);
-        Tensor tdQrdQ_atomic =
-            recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
-        Tensor tdQgdQaccum_atomic =
-            recast<float4>(tdQgdQaccum(_, _, _, m_block));
-#pragma unroll
-        for (int i = 0; i < size(tdQrdQ_atomic) / 2; ++i) {
-          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
-        }
-
-        Tensor tdKrdS =
-            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
-        Tensor tdKrdS_cur = tdKrdS(
-            _,
-            _,
-            _,
-            cute::conditional_return<kStages_dS == 1>(
-                _0{}, smem_pipe_read.index()));
-        hstu::gemm<
-            /*zero_init=*/false,
-            /*wg_wait=*/1,
-            /*SwapAB=*/dKV_swapAB,
-            /*M_slice=*/0>(
-            tiled_mma_dKV,
-            tdKrdS_cur,
-            tdKrQ(_, _, _, smem_pipe_read.index()),
-            tdKrdK);
-        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dO
-
-        hstu::gemm<
-            /*zero_init=*/true,
-            /*wg_wait=*/0,
-            /*SwapAB=*/dQ_swapAB,
-            /*M_slice=*/1>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
-#pragma unroll
-        for (int i = size(tdQrdQ_atomic) / 2; i < size(tdQrdQ_atomic); ++i) {
-          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
-        }
-
-        hstu::gemm<
-            /*zero_init=*/false,
-            /*wg_wait=*/-1,
-            /*SwapAB=*/dKV_swapAB,
-            /*M_slice=*/1>(
-            tiled_mma_dKV,
-            tdKrdS_cur,
-            tdKrQ(_, _, _, smem_pipe_read.index()),
-            tdKrdK);
-      }
-
-      warpgroup_wait<0>();
-      pipeline_q.consumer_release(smem_pipe_read); // release Q
-      ++smem_pipe_read;
-      if constexpr (!Q_dO_same_stages) {
-        ++smem_pipe_read_do;
-      }
-    };
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-
-    if constexpr (Cross) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            Causal,
-            false /*Local*/,
-            false /*Contexual_mask*/,
-            false /*Target_mask*/,
-            Cross,
-            false /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (; m_block < m_block_max; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-      if constexpr (Q_dO_same_stages) {
-        smem_pipe_read_do = smem_pipe_read;
-      }
-      ++work_idx;
-      return true;
-    }
-    if constexpr (Has_targets) {
-      if (n_block * kBlockN >= seqlen_info.uihlen_q) {
-        auto mask_fn = [&](auto& tSrS, int m_block) {
-          mask.template apply<
-              true /*Seqlenq_mask*/,
-              true /*Seqlenk_mask*/,
-              false /*Causal*/,
-              false /*Local*/,
-              false /*Contexual_mask*/,
-              Has_targets /*Target_mask*/,
-              false /*Cross*/,
-              false /*Softmax*/>(tSrS, m_block, n_block);
-        };
-        CUTLASS_PRAGMA_NO_UNROLL
-        for (; m_block < m_block_max; ++m_block) {
-          bwd_step(m_block, mask_fn);
-        }
-        if constexpr (Q_dO_same_stages) {
-          smem_pipe_read_do = smem_pipe_read;
-        }
-        ++work_idx;
-        return true;
-      } else if ((n_block + 1) * kBlockN >= seqlen_info.uihlen_q) {
-        if constexpr ((Causal || Local) && SeparateMaskingIterations) {
-          auto mask_fn = [&](auto& tSrS, int m_block) {
-            mask.template apply<
-                true /*Seqlenq_mask*/,
-                true /*Seqlenk_mask*/,
-                Causal,
-                Local,
-                Contexual_mask,
-                Has_targets /*Target_mask*/,
-                false /*Cross*/,
-                false /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          int const m_block_masking_max =
-              ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (; m_block < std::min(m_block_max, m_block_masking_max);
-               ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        }
-
-        auto mask_fn = [&](auto& tSrS, int m_block) {
-          mask.template apply<
-              true /*Seqlenq_mask*/,
-              true /*Seqlenk_mask*/,
-              Causal && !SeparateMaskingIterations,
-              Local && !SeparateMaskingIterations,
-              Contexual_mask,
-              Has_targets /*Target_mask*/,
-              false /*Cross*/,
-              false /*Softmax*/>(tSrS, m_block, n_block);
-        };
-        if constexpr (SeparateMaskingIterations) {
-          int const m_block_max_before_local_mask =
-              !Local || !SeparateMaskingIterations
-              ? m_block_max
-              : std::min(
-                    m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (; m_block < m_block_max_before_local_mask; ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        } else {
-          int num_m_block = m_block_max - m_block_min;
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (int i = 0; i < num_m_block + full_m_block_max -
-                   full_m_block_min + contexual_m_block_max;
-               ++i) {
-            if (i < num_m_block) {
-              m_block = m_block_min + i;
-            } else if (i < num_m_block + contexual_m_block_max) {
-              m_block = i - num_m_block;
-            } else {
-              m_block =
-                  i - num_m_block - contexual_m_block_max + full_m_block_min;
-            }
-            bwd_step(m_block, mask_fn);
-          }
-        }
-
-        if constexpr (Local && SeparateMaskingIterations) {
-          auto mask_fn = [&](auto& tSrS, int m_block) {
-            mask.template apply<
-                true /*Seqlenq_mask*/,
-                true /*Seqlenk_mask*/,
-                false /*Causal_mask*/,
-                Local,
-                Contexual_mask,
-                Has_targets /*Target_mask*/,
-                false /*Cross*/,
-                false /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (; m_block < m_block_max; ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        }
-        if constexpr (Contexual_mask && SeparateMaskingIterations) {
-          auto mask_fn = [&](auto& tSrS, int m_block) {
-            mask.template apply<
-                true /*Seqlenq_mask*/,
-                true /*Seqlenk_mask*/,
-                Causal /*Causal_mask*/,
-                Local /*Local_mask*/,
-                Contexual_mask,
-                Has_targets,
-                false /*Cross*/,
-                false /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        }
-
-        if constexpr (Local && SeparateMaskingIterations) {
-          auto mask_fn = [&](auto& tSrS, int m_block) {
-            mask.template apply<
-                true /*Seqlenq_mask*/,
-                true /*Seqlenk_mask*/,
-                false /*Causal_mask*/,
-                Local,
-                Contexual_mask,
-                Has_targets,
-                false /*Cross*/,
-                false /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (m_block = full_m_block_min; m_block < full_m_block_max;
-               ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        }
-        if constexpr (Q_dO_same_stages) {
-          smem_pipe_read_do = smem_pipe_read;
-        }
-        ++work_idx;
-        return true;
-      }
-    }
-    // We have separate iterations with causal masking. Not necessary for hdim
-    // 128 but for hdim 64 this helps quite a bit to not have to do causal
-    // masking for most of the iterations.
-    if constexpr ((Causal || Local) && SeparateMaskingIterations) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            Causal,
-            Local,
-            Contexual_mask,
-            false /*Target_mask*/,
-            false /*Cross*/,
-            false /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      static constexpr int kBlockM = get<0>(TileShape_MNK{});
-      int const m_block_masking_max =
-          ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (; m_block < std::min(m_block_max, m_block_masking_max); ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    }
-
-    auto mask_fn = [&](auto& tSrS, int m_block) {
-      mask.template apply<
-          true /*Seqlenq_mask*/,
-          true /*Seqlenk_mask*/,
-          Causal && !SeparateMaskingIterations,
-          Local && !SeparateMaskingIterations,
-          Contexual_mask,
-          false /*Target_mask*/,
-          false /*Cross*/,
-          false /*Softmax*/>(tSrS, m_block, n_block);
-    };
-    if constexpr (SeparateMaskingIterations) {
-      int const m_block_max_before_local_mask =
-          !Local || !SeparateMaskingIterations
-          ? m_block_max
-          : std::min(
-                m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (; m_block < m_block_max_before_local_mask; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    } else {
-      int num_m_block = m_block_max - m_block_min;
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int i = 0; i < num_m_block + full_m_block_max - full_m_block_min +
-               contexual_m_block_max;
-           ++i) {
-        if (i < num_m_block) {
-          m_block = m_block_min + i;
-        } else if (i < num_m_block + contexual_m_block_max) {
-          m_block = i - num_m_block;
-        } else {
-          m_block = i - num_m_block - contexual_m_block_max + full_m_block_min;
-        }
-        bwd_step(m_block, mask_fn);
-      }
-    }
-
-    if constexpr (Local && SeparateMaskingIterations) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            false /*Causal_mask*/,
-            Local,
-            Contexual_mask,
-            false /*Target_mask*/,
-            false /*Cross*/,
-            false /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (; m_block < m_block_max; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    }
-    if constexpr (Contexual_mask && SeparateMaskingIterations) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            Causal /*Causal_mask*/,
-            Local /*Local_mask*/,
-            Contexual_mask,
-            false /*Target_mask*/,
-            false /*Cross*/,
-            false /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    }
-
-    if constexpr (Local && SeparateMaskingIterations) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            false /*Causal_mask*/,
-            Local,
-            Contexual_mask,
-            false /*Target_mask*/,
-            false /*Cross*/,
-            false /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (m_block = full_m_block_min; m_block < full_m_block_max; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    }
-
-    // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tdVrdV); }
-    if constexpr (Q_dO_same_stages) {
-      smem_pipe_read_do = smem_pipe_read;
-    }
-    ++work_idx;
-    return true;
-  }
-
-  template <typename SharedStorage, typename FrgTensordKV>
-  CUTLASS_DEVICE bool mma_softmax(
-      Params const& params,
-      MainloopPipeline pipeline_q,
-      MainloopPipeline_dO pipeline_do,
-      PipelineState& smem_pipe_read,
-      PipelineState_dO& smem_pipe_read_do,
-      FrgTensordKV& tdKrdK,
-      FrgTensordKV& tdVrdV,
-      int thread_idx,
-      int& work_idx,
-      cute::tuple<int32_t, int32_t, int32_t> block_coord,
-      SharedStorage& shared_storage) {
-    static_assert(
-        is_rmem<FrgTensordKV>::value,
-        "dK and dV tensor must be rmem resident.");
-
-    int n_block = get<0>(block_coord);
-    int bidb = get<2>(block_coord);
-    SeqlenInfo_t seqlen_info{
-        bidb,
-        get<0>(params.shape_Q),
-        get<0>(params.shape_K),
-        params.seq_offsets,
-        params.seq_offsets_q,
-        params.num_targets};
-    if constexpr (Jagged) {
-      static constexpr int kBlockN = get<1>(TileShape_MNK{});
-      if (n_block * kBlockN >= seqlen_info.seqlen_kv) {
-        return false;
-      }
-    }
-    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
-    if constexpr (!Cross) {
-      if (params.num_groups > 1) {
-        int group_id = bidb / params.num_groups;
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
-        max_attn_len_ = params.max_attn_len_tensor[group_id];
-        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
-      } else {
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
-        max_attn_len_ = params.max_attn_len;
-        contextual_seq_len_ = params.contextual_seq_len;
-      }
-    }
-    int m_block_min, m_block_max;
-    if constexpr (Cross) {
-      auto m_block_min_max = get_cross_m_block_min_max(
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          seqlen_info.seqlen_kv,
-          n_block);
-      m_block_min = get<0>(m_block_min_max);
-      m_block_max = get<1>(m_block_min_max);
-    } else {
-      auto m_block_min_max = get_m_block_min_max(
-          max_attn_len_,
-          contextual_seq_len_,
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          n_block);
-      m_block_min = get<0>(m_block_min_max);
-      m_block_max = get<1>(m_block_min_max);
-    }
-    auto full_m_block_min_max = get_full_m_block_min_max(
-        seqlen_info.uihlen_q,
-        seqlen_info.seqlen_q,
-        min_full_attn_seq_len_,
-        m_block_max,
-        n_block);
-    int const full_m_block_min = get<0>(full_m_block_min_max);
-    int const full_m_block_max = get<1>(full_m_block_min_max);
-    int contexual_m_block_max = get_contexual_m_block_max(
-        seqlen_info.uihlen_q, contextual_seq_len_, m_block_min, n_block);
-
-    Tensor sQ = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-        SmemLayoutQ{});
-    Tensor sdO = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
-        SmemLayoutdO{});
-    Tensor sK = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
-        SmemLayoutK{});
-    Tensor sV = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
-        SmemLayoutV{});
-    Tensor sQt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-        SmemLayoutQt{});
-    Tensor sdOt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_do.data()),
-        SmemLayoutdOt{});
-    Tensor sKt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
-        SmemLayoutKt{});
-    Tensor sP = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
-        SmemLayoutPdS{});
-    Tensor sP_pi = cute::as_position_independent_swizzle_tensor(sP);
-    Tensor sPt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
-        SmemLayoutPdSt{});
-    Tensor sPt_pi = cute::as_position_independent_swizzle_tensor(sPt);
-    Tensor sdS = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
-        SmemLayoutPdS{});
-    Tensor sdS_pi = cute::as_position_independent_swizzle_tensor(sdS);
-    Tensor sdSt = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_ds.data()),
-        SmemLayoutPdSt{});
-    Tensor sdSt_pi = cute::as_position_independent_swizzle_tensor(sdSt);
-    Tensor sdQ = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_dqacc.data()),
-        SmemLayoutdQaccum{});
-    Tensor sLSEMma = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_lse.data()),
-        SmemLayoutLSEMma{});
-    Tensor sdPsumMma = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_dpsum.data()),
-        SmemLayoutLSEMma{});
-
-    static_assert(
-        stride<0>(typename TiledMmaSdP::ALayout{}) == 0 and
-            stride<0>(typename TiledMmaSdP::BLayout{}) == 0 and
-            size<0>(typename TiledMmaSdP::ALayout{}) ==
-                cutlass::NumThreadsPerWarpGroup and
-            size<0>(typename TiledMmaSdP::BLayout{}) ==
-                cutlass::NumThreadsPerWarpGroup,
-        "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-    constexpr int MmaWarpGroups =
-        NumMmaThreads / cutlass::NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(
-        make_shape(Int<MmaWarpGroups>{}),
-        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
-    Layout warp_group_thread_layout_dq = make_layout(
-        make_shape(Int<NumMmaWarpGroups>{}),
-        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
-
-    int warp_group_idx = __shfl_sync(
-        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
-    TiledMmaSdP tiled_mma_SdP;
-    using TiledMmadP =
-        std::conditional_t<!Mma_dP_is_RS, TiledMmaSdP, TiledMmadPRS>;
-    TiledMmadP tiled_mma_dP;
-    TiledMmadKV tiled_mma_dKV;
-    TiledMmadQ tiled_mma_dQ;
-
-    auto wg_mma_SdP =
-        tiled_mma_SdP.get_slice(warp_group_thread_layout(warp_group_idx));
-    auto wg_mma_dP =
-        tiled_mma_dP.get_slice(warp_group_thread_layout(warp_group_idx));
-    auto thread_mma_SdP = tiled_mma_SdP.get_thread_slice(thread_idx);
-    auto wg_mma_dKV =
-        tiled_mma_dKV.get_slice(warp_group_thread_layout(warp_group_idx));
-    auto wg_mma_dQ =
-        tiled_mma_dQ.get_slice(warp_group_thread_layout_dq(warp_group_idx));
-
-    auto smem_tiled_copy_PdS =
-        make_tiled_copy_C(SmemCopyAtomPdS{}, tiled_mma_SdP);
-    auto smem_thr_copy_PdS = smem_tiled_copy_PdS.get_thread_slice(thread_idx);
-
-    R2STiledCopydQaccum r2s_tiled_copy_dQaccum;
-    auto r2s_thr_copy_dQaccum =
-        r2s_tiled_copy_dQaccum.get_thread_slice(thread_idx);
-    Tensor tdQsdQaccum = r2s_thr_copy_dQaccum.partition_D(sdQ);
-    // if (thread_idx == 0) { print(sdQ); printf("\n"); print(tdQsdQaccum);
-    // printf("\n"); }
-
-    // Allocate "fragments/descriptors"
-    // We have to use the templated mma_partition_fragment_AB instead of
-    // cute::conditional_return or lambda, because some partition_fragment_A/B
-    // don't compile.
-    // https://stackoverflow.com/questions/50051473/if-constexpr-in-c17-does-not-work-in-a-non-templated-function
-    Tensor tSrQ = mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sQ);
-    Tensor tSrK = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_SdP, sK);
-    Tensor tdPrdO =
-        mma_partition_fragment_AB</*A=*/!SdP_swapAB>(wg_mma_SdP, sdO);
-    Tensor tdPrV = mma_partition_fragment_AB</*A=*/SdP_swapAB>(wg_mma_dP, sV);
-    Tensor tdVrdO =
-        mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sdOt);
-    Tensor tdKrQ = mma_partition_fragment_AB</*A=*/dKV_swapAB>(wg_mma_dKV, sQt);
-    Tensor tdQrdS = mma_partition_fragment_AB</*A=*/!dQ_swapAB>(wg_mma_dQ, sdS);
-    Tensor tdQrK = mma_partition_fragment_AB</*A=*/dQ_swapAB>(wg_mma_dQ, sKt);
-
-    Tensor tPsP = smem_thr_copy_PdS.partition_D(
-        cute::conditional_return<!SdP_swapAB>(
-            sP_pi, sPt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
-    Tensor tdSsdS = smem_thr_copy_PdS.partition_D(
-        cute::conditional_return<!SdP_swapAB>(
-            sdS_pi, sdSt_pi)); // ((Atom,AtomNum),PIPE_M,PIPE_N)
-    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_PdS);
-    // print(sP_pi); printf("\n"); print(sPt_pi); printf("\n"); print(tPsP);
-    // printf("\n"); print(tdSsdS); printf("\n"); }
-
-    // thread_mma_SdP.partition_C(sLSEMma) has shape ((2, 2, V), MMA_M, MMA_N,
-    // PIPE), we only take the col indices or row indices, depending on whether
-    // SdP_swapAB.
-    Tensor tLSEsLSE = cute::conditional_return<!SdP_swapAB>(
-        group_modes<0, 2>(thread_mma_SdP.partition_C(sLSEMma)(
-            make_coord(_0{}, _, _0{}), _, _0{}, _)), // (2, MMA_M, PIPE)
-        group_modes<0, 3>(thread_mma_SdP.partition_C(sLSEMma)(
-            make_coord(_, _0{}, _), _0{}, _, _))); // (2, V, MMA_N, PIPE)
-    Tensor tLSEsdPsum = cute::conditional_return<!SdP_swapAB>(
-        group_modes<0, 2>(thread_mma_SdP.partition_C(sdPsumMma)(
-            make_coord(_0{}, _, _0{}), _, _0{}, _)),
-        group_modes<0, 3>(thread_mma_SdP.partition_C(sdPsumMma)(
-            make_coord(_, _0{}, _), _0{}, _, _)));
-    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(sLSEMma);
-    // printf("\n"); print(tLSEsLSE); printf("\n"); } If we want to split the
-    // stats among the 8 threads that share the same rows.
-    static constexpr int kStatsPerThread =
-        cute::ceil_div(decltype(size(tLSEsLSE))::value, 8);
-
-    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-    };
-
-    int bidh = get<1>(block_coord);
-    // For the case where we do atomicAdd directly to gdQaccum instead of using
-    // TMA
-    Tensor mdQaccum = make_tensor(
-        make_gmem_ptr(reinterpret_cast<ElementAccum*>(params.ptr_dQaccum)),
-        params.shape_dQaccum,
-        params.stride_dQaccum)(_, bidh, !Jagged ? bidb : 0);
-    Tensor gdQaccum_ = local_tile(
-        domain_offset(
-            make_coord(seqlen_info.offset_q_padded * kHeadDim), mdQaccum),
-        Shape<Int<kBlockM * kHeadDim>>{},
-        make_coord(_)); // (M * K, _)
-    Tensor gdQaccum = cute::flat_divide(
-        gdQaccum_,
-        Int<kBlockM * kHeadDim / NumMmaWarpGroups>{}); // (M * K / WG, WG, _)
-    // We can reuse r2s_thr_copy_dQaccum for this partitioning
-    Tensor tdQgdQaccum = r2s_thr_copy_dQaccum.partition_D(gdQaccum);
-    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(mdQaccum);
-    // printf("\n"); print(gdQaccum_); printf("\n"); print(gdQaccum);
-    // printf("\n"); print(tdQgdQaccum); printf("\n"); }
-
-    hstu::Mask<kBlockM, kBlockN, TiledMmaSdP, SdP_swapAB> mask(
-        thread_idx,
-        seqlen_info.seqlen_q,
-        seqlen_info.seqlen_kv,
-        max_attn_len_,
-        min_full_attn_seq_len_,
-        contextual_seq_len_,
-        seqlen_info.uihlen_q);
-
-    int m_block = m_block_min;
-
-    clear(tdKrdK);
-    clear(tdVrdV);
-    // tiled_mma_dKV.accumulate_ = GMMA::ScaleOut::Zero;
-
-    cutlass::ConsumerToken barrier_token = static_cast<cutlass::BarrierStatus>(
-        shared_storage.pipelines.barrier_KV.try_wait(work_idx % 2));
-    if (barrier_token == cutlass::BarrierStatus::WaitAgain) {
-      shared_storage.pipelines.barrier_KV.wait(work_idx % 2);
-    }
-
-    if constexpr (Mma_dP_is_RS) {
-      using SmemCopyAtomV = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
-      auto smem_tiled_copy_V = make_tiled_copy_A(SmemCopyAtomV{}, tiled_mma_dP);
-      auto smem_thr_copy_V = smem_tiled_copy_V.get_thread_slice(thread_idx);
-      Tensor tdPrV_copy_view = smem_thr_copy_V.retile_D(tdPrV);
-      Tensor tdPsV_copy_view = smem_thr_copy_V.partition_S(
-          cute::as_position_independent_swizzle_tensor(sV));
-      cute::copy(smem_tiled_copy_V, tdPsV_copy_view, tdPrV_copy_view);
-    }
-    static constexpr int Qdim = !SdP_swapAB ? 0 : 1;
-    auto thread0_mma_SdP = tiled_mma_SdP.get_thread_slice(_0{});
-    Tensor cS = cute::make_identity_tensor(
-        Shape<
-            Int<!SdP_swapAB ? kBlockM : kBlockN>,
-            Int<!SdP_swapAB ? kBlockN : kBlockM>>{});
-    Tensor tScS = thread_mma_SdP.partition_C(cS);
-    Tensor tScS_rowcol = make_tensor(
-        tScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
-            tScS.layout()));
-    Tensor t0ScS = thread0_mma_SdP.partition_C(cS);
-    Tensor t0ScS_rowcol = make_tensor(
-        t0ScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
-            t0ScS.layout()));
-    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
-
-    auto bwd_step = [&](int m_block, auto mask_fn) {
-      Tensor tSrS = partition_fragment_C(
-          tiled_mma_SdP,
-          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
-      consumer_wait(pipeline_q, smem_pipe_read);
-      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
-          tiled_mma_SdP, tSrQ(_, _, _, smem_pipe_read.index()), tSrK, tSrS);
-      Tensor tLSErLSE = cute::conditional_return<!ShuffleLSE>(
-          make_fragment_like(tLSEsLSE(_, _0{})),
-          make_tensor<ElementAccum>(Int<kStatsPerThread>{}));
-      if constexpr (!ShuffleLSE) {
-        cute::copy(tLSEsLSE(_, smem_pipe_read.index()), tLSErLSE);
-      } else {
-#pragma unroll
-        for (int i = 0; i < kStatsPerThread; ++i) {
-          // It's ok to read OOB, since we made sure sLSE is large enough and we
-          // won't use the OOB values
-          tLSErLSE(i) =
-              tLSEsLSE((thread_idx % 32) / 4 + i * 8, smem_pipe_read.index());
-        }
-      }
-      Tensor tdPrdP = partition_fragment_C(
-          tiled_mma_SdP,
-          select<!SdP_swapAB ? 0 : 1, !SdP_swapAB ? 1 : 0>(TileShape_MNK{}));
-      PipelineState_dO smem_pipe_read_do_cur =
-          cute::conditional_return<Q_dO_same_stages>(
-              smem_pipe_read, smem_pipe_read_do);
-      consumer_wait(pipeline_do, smem_pipe_read_do_cur);
-      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1, /*SwapAB=*/SdP_swapAB>(
-          tiled_mma_dP,
-          tdPrdO(_, _, _, smem_pipe_read_do_cur.index()),
-          tdPrV,
-          tdPrdP);
-      warpgroup_wait<1>();
-      // Reshape tSrS from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
-      // ncol=(2, MMA_N))
-      Tensor scores = make_tensor(
-          tSrS.data(),
-          hstu::convert_layout_acc_rowcol</*Transposed=*/SdP_swapAB>(
-              tSrS.layout()));
-      mask_fn(tSrS, m_block);
-#pragma unroll
-      for (int mi = 0; mi < size<0>(scores); ++mi) {
-        float const lse_scaled = [&] {
-          if constexpr (!ShuffleLSE)
-            return tLSErLSE(mi);
-          else
-            return __shfl_sync(
-                0xffffffff, tLSErLSE(mi / 8), (mi % 8) * 4 + (thread_idx % 4));
-        }();
-#pragma unroll
-        for (int ni = 0; ni < size<1>(scores); ++ni) {
-          scores(mi, ni) =
-              exp2f(scores(mi, ni) * params.alpha_log2 - lse_scaled);
-        }
-      }
-      Tensor tLSErdPsum = cute::conditional_return<!ShuffledPsum>(
-          make_fragment_like(tLSEsdPsum(_, _0{})),
-          make_tensor<ElementAccum>(Int<kStatsPerThread>{}));
-      if constexpr (!ShuffledPsum) {
-        cute::copy(tLSEsdPsum(_, smem_pipe_read_do_cur.index()), tLSErdPsum);
-      } else {
-#pragma unroll
-        for (int i = 0; i < kStatsPerThread; ++i) {
-          tLSErdPsum(i) = tLSEsdPsum(
-              (thread_idx % 32) / 4 + i * 8, smem_pipe_read_do_cur.index());
-        }
-      }
-
-      warpgroup_wait<0>();
-      // Reshape tdPrdP from ((2, 2, V), MMA_N, MMA_M) to (nrow=(2, V, MMA_M),
-      // ncol=(2, MMA_N))
-      Tensor dS = make_tensor(tdPrdP.data(), scores.layout());
-#pragma unroll
-      for (int mi = 0; mi < size<0>(dS); ++mi) {
-        float const dP_sum_cur = [&] {
-          if constexpr (!ShuffledPsum)
-            return tLSErdPsum(mi);
-          else
-            return __shfl_sync(
-                0xffffffff,
-                tLSErdPsum(mi / 8),
-                (mi % 8) * 4 + (thread_idx % 4));
-        }();
-#pragma unroll
-        for (int ni = 0; ni < size<1>(dS); ++ni) {
-          dS(mi, ni) =
-              scores(mi, ni) * (dS(mi, ni) - dP_sum_cur) * params.alpha;
-        }
-      }
-      // Convert scores from fp32 to fp16/bf16
-      Tensor rP = make_tensor_like<Element>(tSrS);
-      hstu::convert_type_out(tSrS, rP);
-      if constexpr (!Mma_dKV_is_RS) {
-        // Need to sync to make sure P has already been used in the previous
-        // iteration before writing new values
-        if constexpr (kStages_dS == 1) {
-          cutlass::arch::NamedBarrier::sync(
-              NumMmaThreads,
-              static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
-        }
-        Tensor tPaP =
-            smem_thr_copy_PdS.retile_S(rP); // ((Atom,AtomNum), MMA_N, MMA_N)
-        cute::copy(
-            smem_tiled_copy_PdS,
-            tPaP,
-            tPsP(
-                _,
-                _,
-                _,
-                cute::conditional_return<kStages_dS == 1>(
-                    _0{}, smem_pipe_read.index())));
-      }
-      Tensor rdS = make_tensor_like<Element>(tdPrdP);
-      hstu::convert_type_out(tdPrdP, rdS);
-      // If there's double buffering on dS, we don't need to sync here.
-      // Otherwise we might have WG1 writing to dS before WG2 is done reading
-      // from it during MmadQ. But because both WGs have to sync at the end of
-      // the loop and double buffering, this race condition is not possible.
-      // This sync is to ensure (1) P is written in case of !Mma_dKV_is_RS and
-      // (2) dS is already read by the Mma in the previous iteration in case of
-      // Mma_dKV_is_RS.
-      if constexpr (!Mma_dKV_is_RS || (kStages_dS == 1 && Mma_dKV_is_RS)) {
-        cutlass::arch::fence_view_async_shared();
-        cutlass::arch::NamedBarrier::sync(
-            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
-      }
-      // For hdim 64, It's faster to write to smem_dS first before the dV gemm
-      Tensor tdSadS =
-          smem_thr_copy_PdS.retile_S(rdS); // ((Atom,AtomNum), MMA_N, MMA_N)
-      cute::copy(
-          smem_tiled_copy_PdS,
-          tdSadS,
-          tdSsdS(
-              _,
-              _,
-              _,
-              cute::conditional_return<kStages_dS == 1>(
-                  _0{}, smem_pipe_read.index())));
-
-      if constexpr (!Slice_dQKV_Mma) {
-        // Most cases take this path, except for hdim256 where we want to slice
-        // to reduce register pressure
-        if constexpr (Mma_dKV_is_RS) {
-          Tensor tdVrP = make_tensor(
-              rP.data(), convert_layout_acc_Aregs<TiledMmadKV>(tSrS.layout()));
-          hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
-              tiled_mma_dKV,
-              tdVrP,
-              tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
-              tdVrdV);
-        } else {
-          Tensor tdVrP =
-              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
-          Tensor tdVrP_cur = tdVrP(
-              _,
-              _,
-              _,
-              cute::conditional_return<kStages_dS == 1>(
-                  _0{}, smem_pipe_read.index()));
-          hstu::
-              gemm</*zero_init=*/false, /*wg_wait=*/-1, /*SwapAB=*/dKV_swapAB>(
-                  tiled_mma_dKV,
-                  tdVrP_cur,
-                  tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
-                  tdVrdV);
-        }
-        // SMEM fence to make sure sdS is written before it's read by WGMMA
-        cutlass::arch::fence_view_async_shared();
-        cutlass::arch::NamedBarrier::sync(
-            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
-        Tensor tdQrdQ = partition_fragment_C(
-            tiled_mma_dQ,
-            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
-        Tensor tdQrdS_cur = tdQrdS(
-            _,
-            _,
-            _,
-            cute::conditional_return<kStages_dS == 1>(
-                _0{}, smem_pipe_read.index()));
-        hstu::gemm</*zero_init=*/true, /*wg_wait=*/1, /*SwapAB=*/dQ_swapAB>(
-            tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
-        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dQ
-
-        if constexpr (Mma_dKV_is_RS) {
-          Tensor tdKrdS = make_tensor(
-              rdS.data(),
-              convert_layout_acc_Aregs<TiledMmadKV>(tdPrdP.layout()));
-          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1>(
-              tiled_mma_dKV,
-              tdKrdS,
-              tdKrQ(_, _, _, smem_pipe_read.index()),
-              tdKrdK);
-        } else {
-          Tensor tdKrdS =
-              mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
-          Tensor tdKrdS_cur = tdKrdS(
-              _,
-              _,
-              _,
-              cute::conditional_return<kStages_dS == 1>(
-                  _0{}, smem_pipe_read.index()));
-          hstu::gemm</*zero_init=*/false, /*wg_wait=*/1, /*SwapAB=*/dKV_swapAB>(
-              tiled_mma_dKV,
-              tdKrdS_cur,
-              tdKrQ(_, _, _, smem_pipe_read.index()),
-              tdKrdK);
-        }
-        if constexpr (dQacc_use_TMA) {
-          int const warp_group_idx =
-              hstu::canonical_warp_group_idx_nosync() - 1;
-          cutlass::arch::NamedBarrier::sync(
-              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
-              static_cast<uint32_t>(BwdNamedBarriers::dQEmptyWG1) +
-                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
-          Tensor taccdQrdQ = r2s_thr_copy_dQaccum.retile_S(tdQrdQ);
-          cute::copy(r2s_tiled_copy_dQaccum, taccdQrdQ, tdQsdQaccum);
-          cutlass::arch::fence_view_async_shared();
-          cutlass::arch::NamedBarrier::arrive(
-              cutlass::NumThreadsPerWarpGroup + cutlass::NumThreadsPerWarp,
-              static_cast<uint32_t>(BwdNamedBarriers::dQFullWG1) +
-                  warp_group_idx /*id*/); // sdQ full, to be written to gmem
-        } else {
-          // We can reuse r2s_thr_copy_dQaccum for this partitioning
-          Tensor tdQrdQ_atomic =
-              recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
-          Tensor tdQgdQaccum_atomic =
-              recast<float4>(tdQgdQaccum(_, _, _, m_block));
-          static_assert(
-              CUTE_STATIC_V(size(tdQrdQ_atomic)) ==
-              CUTE_STATIC_V(size(tdQgdQaccum_atomic)));
-#pragma unroll
-          for (int i = 0; i < size(tdQrdQ_atomic); ++i) {
-            atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
-          }
-        }
-
-      } else { // Slice_dQKV_Mma
-
-        static_assert(!(Slice_dQKV_Mma && Mma_dKV_is_RS));
-        Tensor tdVrP =
-            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sPt);
-        Tensor tdVrP_cur = tdVrP(
-            _,
-            _,
-            _,
-            cute::conditional_return<kStages_dS == 1>(
-                _0{}, smem_pipe_read.index()));
-        hstu::gemm<
-            /*zero_init=*/false,
-            /*wg_wait=*/-1,
-            /*SwapAB=*/dKV_swapAB,
-            /*M_slice=*/0>(
-            tiled_mma_dKV,
-            tdVrP_cur,
-            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
-            tdVrdV);
-
-        cutlass::arch::fence_view_async_shared();
-        cutlass::arch::NamedBarrier::sync(
-            NumMmaThreads, static_cast<uint32_t>(BwdNamedBarriers::PdS) /*id*/);
-        Tensor tdQrdQ = partition_fragment_C(
-            tiled_mma_dQ,
-            select<!dQ_swapAB ? 0 : 2, !dQ_swapAB ? 2 : 0>(TileShape_MNK{}));
-        Tensor tdQrdS_cur = tdQrdS(
-            _,
-            _,
-            _,
-            cute::conditional_return<kStages_dS == 1>(
-                _0{}, smem_pipe_read.index()));
-        hstu::gemm<
-            /*zero_init=*/true,
-            /*wg_wait=*/-1,
-            /*SwapAB=*/dQ_swapAB,
-            /*M_slice=*/0>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
-        hstu::gemm<
-            /*zero_init=*/false,
-            /*wg_wait=*/1,
-            /*SwapAB=*/dKV_swapAB,
-            /*M_slice=*/1>(
-            tiled_mma_dKV,
-            tdVrP_cur,
-            tdVrdO(_, _, _, smem_pipe_read_do_cur.index()),
-            tdVrdV);
-        Tensor tdQrdQ_atomic =
-            recast<float4>(r2s_thr_copy_dQaccum.retile_S(tdQrdQ));
-        Tensor tdQgdQaccum_atomic =
-            recast<float4>(tdQgdQaccum(_, _, _, m_block));
-#pragma unroll
-        for (int i = 0; i < size(tdQrdQ_atomic) / 2; ++i) {
-          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
-        }
-
-        Tensor tdKrdS =
-            mma_partition_fragment_AB</*A=*/!dKV_swapAB>(wg_mma_dKV, sdSt);
-        Tensor tdKrdS_cur = tdKrdS(
-            _,
-            _,
-            _,
-            cute::conditional_return<kStages_dS == 1>(
-                _0{}, smem_pipe_read.index()));
-        hstu::gemm<
-            /*zero_init=*/false,
-            /*wg_wait=*/1,
-            /*SwapAB=*/dKV_swapAB,
-            /*M_slice=*/0>(
-            tiled_mma_dKV,
-            tdKrdS_cur,
-            tdKrQ(_, _, _, smem_pipe_read.index()),
-            tdKrdK);
-        pipeline_do.consumer_release(smem_pipe_read_do_cur); // release dO
-
-        hstu::gemm<
-            /*zero_init=*/true,
-            /*wg_wait=*/0,
-            /*SwapAB=*/dQ_swapAB,
-            /*M_slice=*/1>(tiled_mma_dQ, tdQrdS_cur, tdQrK, tdQrdQ);
-#pragma unroll
-        for (int i = size(tdQrdQ_atomic) / 2; i < size(tdQrdQ_atomic); ++i) {
-          atomicAdd(&tdQgdQaccum_atomic(i), tdQrdQ_atomic(i));
-        }
-
-        hstu::gemm<
-            /*zero_init=*/false,
-            /*wg_wait=*/-1,
-            /*SwapAB=*/dKV_swapAB,
-            /*M_slice=*/1>(
-            tiled_mma_dKV,
-            tdKrdS_cur,
-            tdKrQ(_, _, _, smem_pipe_read.index()),
-            tdKrdK);
-      }
-
-      warpgroup_wait<0>();
-      pipeline_q.consumer_release(smem_pipe_read); // release Q
-      ++smem_pipe_read;
-      if constexpr (!Q_dO_same_stages) {
-        ++smem_pipe_read_do;
-      }
-    };
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-
-    if constexpr (Cross) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            Causal,
-            false /*Local*/,
-            false /*Contexual_mask*/,
-            false /*Target_mask*/,
-            Cross,
-            true /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (; m_block < m_block_max; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-      if constexpr (Q_dO_same_stages) {
-        smem_pipe_read_do = smem_pipe_read;
-      }
-      ++work_idx;
-      return true;
-    }
-    if constexpr (Has_targets) {
-      if (n_block * kBlockN >= seqlen_info.uihlen_q) {
-        auto mask_fn = [&](auto& tSrS, int m_block) {
-          mask.template apply<
-              true /*Seqlenq_mask*/,
-              true /*Seqlenk_mask*/,
-              false /*Causal*/,
-              false /*Local*/,
-              false /*Contexual_mask*/,
-              Has_targets /*Target_mask*/,
-              false /*Cross*/,
-              true /*Softmax*/>(tSrS, m_block, n_block);
-        };
-        CUTLASS_PRAGMA_NO_UNROLL
-        for (; m_block < m_block_max; ++m_block) {
-          bwd_step(m_block, mask_fn);
-        }
-        if constexpr (Q_dO_same_stages) {
-          smem_pipe_read_do = smem_pipe_read;
-        }
-        ++work_idx;
-        return true;
-      } else if ((n_block + 1) * kBlockN >= seqlen_info.uihlen_q) {
-        if constexpr ((Causal || Local) && SeparateMaskingIterations) {
-          auto mask_fn = [&](auto& tSrS, int m_block) {
-            mask.template apply<
-                true /*Seqlenq_mask*/,
-                true /*Seqlenk_mask*/,
-                Causal,
-                Local,
-                Contexual_mask,
-                Has_targets /*Target_mask*/,
-                false /*Cross*/,
-                true /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          int const m_block_masking_max =
-              ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (; m_block < std::min(m_block_max, m_block_masking_max);
-               ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        }
-
-        auto mask_fn = [&](auto& tSrS, int m_block) {
-          mask.template apply<
-              true /*Seqlenq_mask*/,
-              true /*Seqlenk_mask*/,
-              Causal && !SeparateMaskingIterations,
-              Local && !SeparateMaskingIterations,
-              Contexual_mask,
-              Has_targets /*Target_mask*/,
-              false /*Cross*/,
-              true /*Softmax*/>(tSrS, m_block, n_block);
-        };
-        if constexpr (SeparateMaskingIterations) {
-          int const m_block_max_before_local_mask =
-              !Local || !SeparateMaskingIterations
-              ? m_block_max
-              : std::min(
-                    m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (; m_block < m_block_max_before_local_mask; ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        } else {
-          int num_m_block = m_block_max - m_block_min;
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (int i = 0; i < num_m_block + full_m_block_max -
-                   full_m_block_min + contexual_m_block_max;
-               ++i) {
-            if (i < num_m_block) {
-              m_block = m_block_min + i;
-            } else if (i < num_m_block + contexual_m_block_max) {
-              m_block = i - num_m_block;
-            } else {
-              m_block =
-                  i - num_m_block - contexual_m_block_max + full_m_block_min;
-            }
-            bwd_step(m_block, mask_fn);
-          }
-        }
-
-        if constexpr (Local && SeparateMaskingIterations) {
-          auto mask_fn = [&](auto& tSrS, int m_block) {
-            mask.template apply<
-                true /*Seqlenq_mask*/,
-                true /*Seqlenk_mask*/,
-                false /*Causal_mask*/,
-                Local,
-                Contexual_mask,
-                Has_targets /*Target_mask*/,
-                false /*Cross*/,
-                true /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (; m_block < m_block_max; ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        }
-        if constexpr (Contexual_mask && SeparateMaskingIterations) {
-          auto mask_fn = [&](auto& tSrS, int m_block) {
-            mask.template apply<
-                true /*Seqlenq_mask*/,
-                true /*Seqlenk_mask*/,
-                Causal /*Causal_mask*/,
-                Local /*Local_mask*/,
-                Contexual_mask,
-                Has_targets,
-                false /*Cross*/,
-                true /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        }
-
-        if constexpr (Local && SeparateMaskingIterations) {
-          auto mask_fn = [&](auto& tSrS, int m_block) {
-            mask.template apply<
-                true /*Seqlenq_mask*/,
-                true /*Seqlenk_mask*/,
-                false /*Causal_mask*/,
-                Local,
-                Contexual_mask,
-                Has_targets,
-                false /*Cross*/,
-                true /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          CUTLASS_PRAGMA_NO_UNROLL
-          for (m_block = full_m_block_min; m_block < full_m_block_max;
-               ++m_block) {
-            bwd_step(m_block, mask_fn);
-          }
-        }
-        if constexpr (Q_dO_same_stages) {
-          smem_pipe_read_do = smem_pipe_read;
-        }
-        ++work_idx;
-        return true;
-      }
-    }
-    // We have separate iterations with causal masking. Not necessary for hdim
-    // 128 but for hdim 64 this helps quite a bit to not have to do causal
-    // masking for most of the iterations.
-    if constexpr ((Causal || Local) && SeparateMaskingIterations) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            Causal,
-            Local,
-            Contexual_mask,
-            false /*Target_mask*/,
-            false /*Cross*/,
-            true /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      static constexpr int kBlockM = get<0>(TileShape_MNK{});
-      int const m_block_masking_max =
-          ((n_block + 1) * kBlockN - 1) / kBlockM + 1;
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (; m_block < std::min(m_block_max, m_block_masking_max); ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    }
-
-    auto mask_fn = [&](auto& tSrS, int m_block) {
-      mask.template apply<
-          true /*Seqlenq_mask*/,
-          true /*Seqlenk_mask*/,
-          Causal && !SeparateMaskingIterations,
-          Local && !SeparateMaskingIterations,
-          Contexual_mask,
-          false /*Target_mask*/,
-          false /*Cross*/,
-          true /*Softmax*/>(tSrS, m_block, n_block);
-    };
-    if constexpr (SeparateMaskingIterations) {
-      int const m_block_max_before_local_mask =
-          !Local || !SeparateMaskingIterations
-          ? m_block_max
-          : std::min(
-                m_block_max, (n_block * kBlockN + max_attn_len_) / kBlockM);
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (; m_block < m_block_max_before_local_mask; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    } else {
-      int num_m_block = m_block_max - m_block_min;
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (int i = 0; i < num_m_block + full_m_block_max - full_m_block_min +
-               contexual_m_block_max;
-           ++i) {
-        if (i < num_m_block) {
-          m_block = m_block_min + i;
-        } else if (i < num_m_block + contexual_m_block_max) {
-          m_block = i - num_m_block;
-        } else {
-          m_block = i - num_m_block - contexual_m_block_max + full_m_block_min;
-        }
-        bwd_step(m_block, mask_fn);
-      }
-    }
-
-    if constexpr (Local && SeparateMaskingIterations) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            false /*Causal_mask*/,
-            Local,
-            Contexual_mask,
-            false /*Target_mask*/,
-            false /*Cross*/,
-            true /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (; m_block < m_block_max; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    }
-    if constexpr (Contexual_mask && SeparateMaskingIterations) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            Causal /*Causal_mask*/,
-            Local /*Local_mask*/,
-            Contexual_mask,
-            false /*Target_mask*/,
-            false /*Cross*/,
-            true /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (m_block = 0; m_block < contexual_m_block_max; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    }
-
-    if constexpr (Local && SeparateMaskingIterations) {
-      auto mask_fn = [&](auto& tSrS, int m_block) {
-        mask.template apply<
-            true /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            false /*Causal_mask*/,
-            Local,
-            Contexual_mask,
-            false /*Target_mask*/,
-            false /*Cross*/,
-            true /*Softmax*/>(tSrS, m_block, n_block);
-      };
-      CUTLASS_PRAGMA_NO_UNROLL
-      for (m_block = full_m_block_min; m_block < full_m_block_max; ++m_block) {
-        bwd_step(m_block, mask_fn);
-      }
-    }
-
-    // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(tdVrdV); }
-    if constexpr (Q_dO_same_stages) {
-      smem_pipe_read_do = smem_pipe_read;
-    }
-    ++work_idx;
-    return true;
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h
deleted file mode 100644
index 7c8a447af..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mainloop_fwd_sm90_tma_gmma_ws.h
+++ /dev/null
@@ -1,2180 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cutlass/array.h>
-#include <cutlass/cutlass.h>
-#include <cutlass/numeric_conversion.h>
-#include <cutlass/numeric_types.h>
-#include "cutlass/pipeline/pipeline.hpp"
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/gemm/collective/builders/sm90_common.inl"
-
-#include "mask.h"
-#include "named_barrier.h"
-#include "seqlen.h"
-#include "sm90_pipeline_no_cluster.h"
-#include "utils.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <
-    int Stages,
-    class ClusterShape_,
-    class TileShape_MNK_,
-    class Element_,
-    class ElementAccum_,
-    class ArchTag_,
-    bool Causal,
-    bool Local,
-    bool Contexual_mask,
-    bool Jagged,
-    bool Has_targets,
-    bool Mma1_is_RS,
-    bool V_colmajor_,
-    bool Cross>
-struct CollectiveMainloopFwdSm90 {
-  static constexpr int kStages = Stages;
-  using ClusterShape = ClusterShape_;
-  using TileShape_MNK = TileShape_MNK_;
-  using Element = Element_;
-  using ElementAccum = ElementAccum_;
-  using ArchTag = ArchTag_;
-  static constexpr bool Is_FP8 =
-      cute::is_same_v<Element, cutlass::float_e4m3_t> ||
-      cute::is_same_v<Element, cutlass::float_e5m2_t>;
-  ;
-  static constexpr bool V_colmajor = V_colmajor_;
-  static constexpr bool Transpose_V = Is_FP8 && !V_colmajor;
-  using SeqlenInfo_t = hstu::SeqlenInfoQKFwd<Jagged, Cross, Has_targets>;
-
-  static_assert(ArchTag::kMinComputeCapability >= 90);
-
-  static constexpr cute::GMMA::Major MmaMajorV =
-      !Is_FP8 && !V_colmajor ? GMMA::Major::MN : GMMA::Major::K;
-  static constexpr cute::GMMA::Major TmaMajorV =
-      !V_colmajor ? GMMA::Major::MN : GMMA::Major::K;
-
-  static constexpr int kBlockM = get<0>(TileShape_MNK{});
-  static constexpr int kBlockN = get<1>(TileShape_MNK{});
-  static constexpr int kHeadDim = get<2>(TileShape_MNK{});
-
-  // Register bandwidth is actually a bottleneck so we don't want Q to be in
-  // registers. Leaving this option here for reference.
-  static constexpr bool Mma0_is_RS = false;
-  // We can have Mma1 (P @ V) with P in smem in rmem to reduce register pressure
-  // at the cost of more smem.
-  static_assert(!(!Mma1_is_RS && Is_FP8), "Mma1 must be RS if FP8");
-  static_assert(
-      !(!Mma1_is_RS && Transpose_V),
-      "Mma1 must be RS if Transpose_V");
-
-  using AtomLayoutMNK = Layout<Shape<Int<kBlockM / 64>, _1, _1>>;
-  using TiledMma0 = decltype(cute::make_tiled_mma(
-      std::conditional_t<
-          !Mma0_is_RS,
-          decltype(cute::GMMA::ss_op_selector<
-                   Element,
-                   Element,
-                   ElementAccum,
-                   TileShape_MNK>()),
-          decltype(cute::GMMA::rs_op_selector<
-                   Element,
-                   Element,
-                   ElementAccum,
-                   TileShape_MNK>())>{},
-      AtomLayoutMNK{}));
-  using TiledMma1 = decltype(cute::make_tiled_mma(
-      std::conditional_t<
-          !Mma1_is_RS,
-          decltype(cute::GMMA::ss_op_selector<
-                   Element,
-                   Element,
-                   ElementAccum,
-                   decltype(select<0, 2, 1>(TileShape_MNK{})),
-                   GMMA::Major::K,
-                   MmaMajorV>()),
-          decltype(cute::GMMA::rs_op_selector<
-                   Element,
-                   Element,
-                   ElementAccum,
-                   decltype(select<0, 2, 1>(TileShape_MNK{})),
-                   GMMA::Major::K,
-                   MmaMajorV>())>{},
-      AtomLayoutMNK{}));
-
-  static constexpr int NumMmaThreads = size(TiledMma0{});
-  static constexpr int NumProducerThreads = !Transpose_V
-      ? cutlass::NumThreadsPerWarp
-      : cutlass::NumThreadsPerWarpGroup;
-  static_assert(NumMmaThreads % cutlass::NumThreadsPerWarpGroup == 0);
-  static constexpr int NumMmaWarpGroups =
-      NumMmaThreads / cutlass::NumThreadsPerWarpGroup;
-  static_assert(
-      NumMmaWarpGroups == 1 || NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
-
-  using SmemLayoutAtomQ =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               GMMA::Major::K,
-               Element,
-               decltype(cute::get<0>(TileShape_MNK{})),
-               decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutQ =
-      decltype(tile_to_shape(SmemLayoutAtomQ{}, select<0, 2>(TileShape_MNK{})));
-
-  using SmemLayoutAtomK =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               GMMA::Major::K,
-               Element,
-               decltype(cute::get<1>(TileShape_MNK{})),
-               decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutK = decltype(tile_to_shape(
-      SmemLayoutAtomK{},
-      make_shape(
-          shape<1>(TileShape_MNK{}),
-          shape<2>(TileShape_MNK{}),
-          Int<kStages>{})));
-
-  using SmemLayoutAtomVt =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               TmaMajorV,
-               Element,
-               decltype(cute::get<2>(TileShape_MNK{})),
-               decltype(cute::get<1>(TileShape_MNK{}))>());
-  using SmemLayoutVt = decltype(tile_to_shape(
-      SmemLayoutAtomVt{},
-      make_shape(
-          shape<2>(TileShape_MNK{}),
-          shape<1>(TileShape_MNK{}),
-          Int<kStages>{}),
-      std::conditional_t<
-          TmaMajorV == GMMA::Major::K,
-          cute::Step<_1, _2, _3>,
-          cute::Step<_2, _1, _3>>{}));
-
-  using SmemLayoutAtomVtMma =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               MmaMajorV,
-               Element,
-               decltype(cute::get<2>(TileShape_MNK{})),
-               decltype(cute::get<1>(TileShape_MNK{}))>());
-  using SmemLayoutVtMma = decltype(tile_to_shape(
-      SmemLayoutAtomVtMma{},
-      make_shape(
-          shape<2>(TileShape_MNK{}),
-          shape<1>(TileShape_MNK{}),
-          Int<kStages>{}),
-      std::conditional_t<
-          MmaMajorV == GMMA::Major::K,
-          cute::Step<_1, _2, _3>,
-          cute::Step<_2, _1, _3>>{}));
-
-  // Only used if we're using cp.async to load V
-  using SmemLayoutAtomVCpAsync =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               GMMA::Major::K,
-               Element,
-               decltype(cute::get<1>(TileShape_MNK{})),
-               decltype(cute::get<2>(TileShape_MNK{}))>());
-  using SmemLayoutVCpAsync = decltype(tile_to_shape(
-      SmemLayoutAtomVCpAsync{},
-      make_shape(
-          shape<1>(TileShape_MNK{}),
-          shape<2>(TileShape_MNK{}),
-          Int<kStages>{})));
-
-  using SmemLayoutAtomP =
-      decltype(cutlass::gemm::collective::detail::ss_smem_selector<
-               GMMA::Major::K,
-               Element,
-               decltype(cute::get<0>(TileShape_MNK{})),
-               decltype(cute::get<1>(TileShape_MNK{}))>());
-  using SmemLayoutP =
-      decltype(tile_to_shape(SmemLayoutAtomP{}, select<0, 1>(TileShape_MNK{})));
-
-  using SmemCopyAtomP = Copy_Atom<cute::SM90_U32x4_STSM_N, Element>;
-
-  // Use LDSM.T and STSM to transpose V in the case of FP8 and V being
-  // row-major. For FP16/BF16 we don't do any transposing.
-  static_assert(!Transpose_V || (kHeadDim % 32 == 0 && kBlockN % 32 == 0));
-  static constexpr bool kHeadDim_multiple_64 = kHeadDim % 64 == 0;
-  // Either kHeadDim is a multiple of 64 (in which case we use a block size of
-  // 64 x 32 for the transpose), or we need kBlockN to be a multiple of 64 (in
-  // which case we use a block size of 32 x 64 for the transpose).
-  static_assert(!Transpose_V || (kHeadDim_multiple_64 || kBlockN % 64 == 0));
-  using LDSM_thread_shape = std::conditional_t<
-      kHeadDim_multiple_64,
-      Shape<_32, _4, _1, _1>,
-      Shape<_16, _4, _1, _2>>;
-  using LDSM_thread_stride = std::conditional_t<
-      kHeadDim_multiple_64,
-      Stride<_4, _1, _0, _0>,
-      Stride<_4, _1, _0, _64>>;
-  using LDSM_value_shape = Shape<_2, _2, _1, _4>;
-  using LDSM_value_stride = Stride<_1, _2, _16, _4>;
-  using LDSM_divide_shape =
-      std::conditional_t<kHeadDim_multiple_64, Shape<_64, _8>, Shape<_32, _8>>;
-  using S2RTiledCopyVt = decltype(make_tiled_copy(
-      Copy_Atom<SM75_U16x8_LDSM_T, Element>{},
-      Layout<LDSM_thread_shape, LDSM_thread_stride>{},
-      Layout<LDSM_value_shape, LDSM_value_stride>{}));
-
-  using STSM_thread_shape = std::conditional_t<
-      kHeadDim_multiple_64,
-      Shape<_8, _4, _4, _1>,
-      Shape<_8, _4, _2, _2>>;
-  using STSM_thread_stride = std::conditional_t<
-      kHeadDim_multiple_64,
-      Stride<_4, _1, _32, _0>,
-      Stride<_4, _1, _32, _64>>;
-  using STSM_value_shape = Shape<_1, _4, _2, _2>;
-  using STSM_value_stride = Stride<_0, _1, _4, _8>;
-  using STSM_divide_shape = Shape<_8, _16>;
-  // These will not permute the columns of V (the kHeadDim dimension) but incur
-  // bank conflicts so a little slower (e.g. 1150 TFLOPS for hdim 256 instead of
-  // 1200 TFLOPS). Instead we will permute the cols of V, and un-permute the
-  // cols of O in the epilogue. using STSM_value_shape = Shape<_2, _4, _1, _2>;
-  // using STSM_value_stride = Stride<_4, _1, _0, _8>;
-  // using STSM_divide_shape = Shape<_16, _16>;
-  using R2STiledCopyV = decltype(make_tiled_copy(
-      Copy_Atom<SM90_U32x4_STSM_N, Element>{},
-      Layout<STSM_thread_shape, STSM_thread_stride>{},
-      Layout<STSM_value_shape, STSM_value_stride>{}));
-
-  using GmemTiledCopyQ = cute::SM90_TMA_LOAD;
-  using GmemTiledCopyKV =
-      decltype(cutlass::gemm::collective::detail::
-                   sm90_cluster_shape_to_tma_atom(shape<0>(ClusterShape{})));
-
-  // We use CpAsync for K and V if PagedKV and AppendKV, since TMA doesn't work
-  // there
-  static constexpr int kGmemElemsPerLoad =
-      sizeof(cute::uint128_t) / sizeof(Element);
-  static_assert(
-      kHeadDim % kGmemElemsPerLoad == 0,
-      "Headdim must be a multiple of kGmemElemsPerLoad");
-  // We want each "row" to have 64 elements (128 bytes, i.e. 1 cache line). E.g.
-  // if hdim=128, we want each thread to have 4 loads in the M direction and 2
-  // vectorized load in the K direction. We want each thread to have at least 2
-  // loads in the K direction since in the case of non-interleaved rotary
-  // (combining elements at indices 0 and rotary_dim/2, 1 and rotary_dim/2+1,
-  // etc), each thread will load twice from the same row.
-  static constexpr int kBytePerHalfRow = kHeadDim / 2 * sizeof(Element);
-  static constexpr int kBlockKGmem =
-      (kBytePerHalfRow % 128 == 0 ? 128
-                                  : (kBytePerHalfRow % 64 == 0 ? 64 : 32)) /
-      sizeof(Element);
-  static constexpr int kGmemThreadsPerRow = kBlockKGmem / kGmemElemsPerLoad;
-  static_assert(
-      NumMmaThreads % kGmemThreadsPerRow == 0,
-      "NumMmaThreads must be a multiple of kGmemThreadsPerRow");
-  // We assume threads loading the same row are in the same warp. This is for an
-  // optimization in PagedKV where these threads share the same page table entry
-  // and share the work of computing pointers to paged K and paged V.
-  static_assert(
-      cutlass::NumThreadsPerWarp % kGmemThreadsPerRow == 0,
-      "kGmemThreadsPerRow must divide NumThreadsPerWarp");
-  using GmemLayoutAtom = Layout<
-      Shape<Int<NumMmaThreads / kGmemThreadsPerRow>, Int<kGmemThreadsPerRow>>,
-      Stride<Int<kGmemThreadsPerRow>, _1>>;
-  // If AppendKV, we'll be loading Q for rotary, and we assume divisibility to
-  // avoid predication
-  static_assert(
-      kBlockM % CUTE_STATIC_V(shape<0>(GmemLayoutAtom{})) == 0,
-      "kBlockM must be a multiple of NumMmaThreads / kGmemThreadsPerRow");
-
-  using ShapeQKV =
-      cute::Shape<int32_t, int32_t, int32_t, int32_t>; // (seqlen, d, head,
-                                                       // batch)
-  using StrideQK = cute::Stride<int64_t, _1, int64_t, int64_t>;
-  using StrideV = std::conditional_t<
-      !V_colmajor,
-      StrideQK,
-      cute::Stride<_1, int64_t, int64_t, int64_t>>;
-  // ((qhead_per_khead, seqlen), d, nheads_kv, batch, num_splits)
-  using ShapeQPacked = ShapeQKV;
-  using StrideQPacked = StrideQK;
-  using StrideDescale = cute::Stride<int64_t, int64_t>;
-
-  using TMA_Q = decltype(make_tma_copy_A_sm90(
-      GmemTiledCopyQ{},
-      make_tensor(
-          make_gmem_ptr(static_cast<Element const*>(nullptr)),
-          ShapeQKV{},
-          StrideQK{}),
-      SmemLayoutQ{},
-      TileShape_MNK{},
-      ClusterShape{}));
-
-  using TMA_K = decltype(make_tma_copy_B_sm90(
-      GmemTiledCopyKV{},
-      make_tensor(
-          make_gmem_ptr(static_cast<Element const*>(nullptr)),
-          ShapeQKV{},
-          StrideQK{}),
-      take<0, 2>(SmemLayoutK{}),
-      TileShape_MNK{},
-      ClusterShape{})); // mcast along M mode for this N load, if any
-
-  using TMA_V = decltype(make_tma_copy(
-      GmemTiledCopyKV{},
-      make_tensor(
-          make_gmem_ptr(static_cast<Element const*>(nullptr)),
-          ShapeQKV{},
-          select<1, 0, 2, 3>(StrideV{})),
-      take<0, 2>(SmemLayoutVt{}),
-      select<2, 1>(TileShape_MNK{}),
-      size<0>(ClusterShape{}))); // mcast along M mode for this N load, if any
-
-  // Set the bytes transferred in this TMA transaction (may involve multiple
-  // issues)
-  static constexpr uint32_t TmaTransactionBytesQ = static_cast<uint32_t>(
-      size(SmemLayoutQ{}) * cutlass::sizeof_bits_v<Element> / 8);
-  static constexpr uint32_t TmaTransactionBytesK = static_cast<uint32_t>(
-      size(take<0, 2>(SmemLayoutK{})) * cutlass::sizeof_bits_v<Element> / 8);
-  static constexpr uint32_t TmaTransactionBytesV = static_cast<uint32_t>(
-      size(take<0, 2>(SmemLayoutVt{})) * cutlass::sizeof_bits_v<Element> / 8);
-  static_assert(TmaTransactionBytesK == TmaTransactionBytesV);
-
-  using PipelineTmaAsync = std::conditional_t<
-      CUTE_STATIC_V(size(ClusterShape{})) == 1,
-      typename cutlass::PipelineTmaAsyncNoCluster<kStages>,
-      typename cutlass::PipelineTmaAsync<kStages>>;
-  using MainloopPipelineK = PipelineTmaAsync;
-  using MainloopPipelineV = std::conditional_t<
-      !Transpose_V,
-      PipelineTmaAsync,
-      typename cutlass::PipelineAsync<kStages>>;
-  using MainloopPipelineVt = PipelineTmaAsync;
-  // We always use TMA for K_new and V_new
-  using MainloopPipelineKVNew = PipelineTmaAsync;
-  using PipelineState = cutlass::PipelineState<kStages>;
-
-  // If PackGQA, we use cp.async (instead of TMA) to load Q, so we want smem_q
-  // to be aligned and have sQ being position_independent_swizzle_tensor. If
-  // !Use_TMA_KV, we use cp.async (instead of TMA) to load K & V, so we want
-  // smem_k and smem_v to be aligned.
-  static constexpr size_t SmemAlignmentQ =
-      !Mma0_is_RS ? 128 : cutlass::detail::alignment_for_swizzle(SmemLayoutQ{});
-  static constexpr size_t SmemAlignmentK = 128;
-  static constexpr size_t SmemAlignmentVtNoTranspose =
-      cutlass::detail::alignment_for_swizzle(SmemLayoutVt{});
-  static_assert(
-      SmemAlignmentQ >= 128 and SmemAlignmentK >= 128 &&
-          SmemAlignmentVtNoTranspose >= 128,
-      "Require at least 128B alignment");
-  static constexpr size_t SmemAlignmentP =
-      cutlass::detail::alignment_for_swizzle(SmemLayoutP{});
-  static_assert(SmemAlignmentP >= 128, "Require at least 128B alignment");
-
-  using SmemP_t = std::conditional_t<
-      Mma1_is_RS,
-      cute::array<Element, 0>,
-      cute::
-          array_aligned<Element, cute::cosize_v<SmemLayoutP>, SmemAlignmentP>>;
-  // Sometimes even with SmemP_t = cute::array<Element, 0>, putting it in the
-  // TensorStorage struct causes smem size to go from 227KB to 228KB and we get
-  // "invalid argument".
-
-  struct TensorStorageWithoutPNoTranspose : cute::aligned_struct<cute::max(
-                                                SmemAlignmentQ,
-                                                SmemAlignmentK,
-                                                SmemAlignmentVtNoTranspose)> {
-    cute::array_aligned<
-        Element,
-        cute::cosize_v<SmemLayoutVt>,
-        SmemAlignmentVtNoTranspose>
-        smem_v;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQ>
-        smem_q;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentK>
-        smem_k;
-  };
-
-  struct TensorStorageWithPNoTranspose : cute::aligned_struct<cute::max(
-                                             SmemAlignmentQ,
-                                             SmemAlignmentK,
-                                             SmemAlignmentVtNoTranspose,
-                                             SmemAlignmentP)> {
-    cute::array_aligned<
-        Element,
-        cute::cosize_v<SmemLayoutVt>,
-        SmemAlignmentVtNoTranspose>
-        smem_v;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQ>
-        smem_q;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentK>
-        smem_k;
-    SmemP_t smem_p;
-  };
-
-  using TensorStorageNoTranspose = std::conditional_t<
-      Mma1_is_RS,
-      TensorStorageWithoutPNoTranspose,
-      TensorStorageWithPNoTranspose>;
-
-  static constexpr size_t SmemAlignmentVt =
-      cutlass::detail::alignment_for_swizzle(SmemLayoutVt{});
-  static constexpr size_t SmemAlignmentV =
-      cutlass::detail::alignment_for_swizzle(SmemLayoutVtMma{});
-  static_assert(
-      SmemAlignmentVt >= 128 and SmemAlignmentV >= 128,
-      "Require at least 128B alignment");
-  struct TensorStorageTransposeV
-      : cute::aligned_struct<
-            cute::max(SmemAlignmentQ, SmemAlignmentK, SmemAlignmentV)> {
-    cute::
-        array_aligned<Element, cute::cosize_v<SmemLayoutVtMma>, SmemAlignmentV>
-            smem_v;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutVt>, SmemAlignmentVt>
-        smem_vt;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutQ>, SmemAlignmentQ>
-        smem_q;
-    cute::array_aligned<Element, cute::cosize_v<SmemLayoutK>, SmemAlignmentK>
-        smem_k;
-  };
-
-  using TensorStorage = std::conditional_t<
-      !Transpose_V,
-      TensorStorageNoTranspose,
-      TensorStorageTransposeV>;
-
-  // These are tuned for speed. They don't affect correctness.
-  static constexpr bool UseSchedulerBarrier =
-      (NumMmaWarpGroups >= 2) && (!Is_FP8 ? kHeadDim <= 128 : kHeadDim >= 128);
-  static constexpr bool RescaleOBeforeGemm =
-      kHeadDim > 128 && (!Is_FP8 || V_colmajor);
-
-  // Host side kernel arguments
-  struct Arguments {
-    Element const* const ptr_Q;
-    ShapeQKV const shape_Q;
-    StrideQK const stride_Q;
-    Element* const
-        ptr_K; // Not Element const* since we might append to KV cache in-place
-    ShapeQKV const shape_K;
-    StrideQK const stride_K;
-    Element* const ptr_V;
-    StrideV const stride_V;
-    float const *ptr_q_descale, *ptr_k_descale, *ptr_v_descale;
-    StrideDescale const stride_q_descale, stride_k_descale, stride_v_descale;
-    float const max_seq_len_inv;
-    float const alpha;
-    int const max_attn_len;
-    int const min_full_attn_seq_len;
-    int const contextual_seq_len;
-    int const num_softmax_heads;
-    int const num_groups;
-    int const batch_size_per_group;
-    int const* const seq_offsets = nullptr;
-    int const* const seq_offsets_q = nullptr;
-    int const* const num_targets = nullptr;
-    int const* const max_seq_len_tensor = nullptr;
-    int const* const contextual_seq_len_tensor = nullptr;
-    int const* const max_attn_len_tensor = nullptr;
-    int const* const min_full_attn_seq_len_tensor = nullptr;
-    float const* const attn_scale = nullptr;
-    bool const scalar_scale = true;
-  };
-
-  // Device side kernel params
-  struct Params {
-    Element const* const ptr_Q;
-    ShapeQKV const shape_Q;
-    StrideQK const stride_Q;
-    ShapeQPacked const shape_Q_packed;
-    StrideQPacked const stride_Q_packed;
-    Element* const ptr_K;
-    ShapeQKV const shape_K;
-    StrideQK const stride_K;
-    Element* const ptr_V;
-    StrideV const stride_V;
-    TMA_Q tma_load_Q;
-    TMA_K tma_load_K;
-    TMA_V tma_load_V;
-    float const *ptr_q_descale, *ptr_k_descale, *ptr_v_descale;
-    StrideDescale const stride_q_descale, stride_k_descale, stride_v_descale;
-    float const max_seq_len_inv;
-    float const alpha;
-    float const alpha_log2;
-    int const max_attn_len;
-    int const min_full_attn_seq_len;
-    int const contextual_seq_len;
-    int const num_softmax_heads;
-    int const num_groups;
-    int const batch_size_per_group;
-    int const* const seq_offsets = nullptr;
-    int const* const seq_offsets_q = nullptr;
-    int const* const num_targets = nullptr;
-    int const* const max_seq_len_tensor = nullptr;
-    int const* const contextual_seq_len_tensor = nullptr;
-    int const* const max_attn_len_tensor = nullptr;
-    int const* const min_full_attn_seq_len_tensor = nullptr;
-    float const* const attn_scale = nullptr;
-    bool const scalar_scale = true;
-  };
-
-  static Params to_underlying_arguments(Arguments const& args) {
-    Tensor mQ =
-        make_tensor(make_gmem_ptr(args.ptr_Q), args.shape_Q, args.stride_Q);
-    TMA_Q tma_load_Q = make_tma_copy_A_sm90(
-        GmemTiledCopyQ{},
-        mQ,
-        SmemLayoutQ{},
-        TileShape_MNK{},
-        ClusterShape{}); // no mcast for Q
-    Tensor mK =
-        make_tensor(make_gmem_ptr(args.ptr_K), args.shape_K, args.stride_K);
-    TMA_K tma_load_K = make_tma_copy_B_sm90(
-        GmemTiledCopyKV{},
-        mK,
-        take<0, 2>(SmemLayoutK{}),
-        TileShape_MNK{},
-        ClusterShape{}); // mcast along M mode for this N load, if any
-    Tensor mV = make_tensor(
-        make_gmem_ptr(args.ptr_V),
-        select<1, 0, 2, 3>(args.shape_K),
-        select<1, 0, 2, 3>(args.stride_V));
-    TMA_V tma_load_V = make_tma_copy(
-        GmemTiledCopyKV{},
-        mV,
-        take<0, 2>(SmemLayoutVt{}),
-        select<2, 1>(TileShape_MNK{}),
-        size<0>(ClusterShape{})); // mcast along M mode for this N load, if any
-    auto const shape_Q_packed = cute::conditional_return<true>(
-        args.shape_Q,
-        make_shape(
-            make_shape(1, get<0>(args.shape_Q)),
-            get<1>(args.shape_Q),
-            get<2>(args.shape_K),
-            get<3>(args.shape_Q)));
-    auto const stride_Q_packed = cute::conditional_return<true>(
-        args.stride_Q,
-        make_stride(
-            make_stride(get<2>(args.stride_Q), get<0>(args.stride_Q)),
-            get<1>(args.stride_Q),
-            get<2>(args.stride_Q),
-            get<3>(args.stride_Q)));
-    return {
-        args.ptr_Q,
-        args.shape_Q,
-        args.stride_Q,
-        shape_Q_packed,
-        stride_Q_packed,
-        args.ptr_K,
-        args.shape_K,
-        args.stride_K,
-        args.ptr_V,
-        args.stride_V,
-        tma_load_Q,
-        tma_load_K,
-        tma_load_V,
-        args.ptr_q_descale,
-        args.ptr_k_descale,
-        args.ptr_v_descale,
-        args.stride_q_descale,
-        args.stride_k_descale,
-        args.stride_v_descale,
-        args.max_seq_len_inv,
-        args.alpha,
-        float(args.alpha * M_LOG2E),
-        args.max_attn_len,
-        args.min_full_attn_seq_len,
-        args.contextual_seq_len,
-        args.num_softmax_heads,
-        args.num_groups,
-        args.batch_size_per_group,
-        args.seq_offsets,
-        args.seq_offsets_q,
-        args.num_targets,
-        args.max_seq_len_tensor,
-        args.contextual_seq_len_tensor,
-        args.max_attn_len_tensor,
-        args.min_full_attn_seq_len_tensor,
-        args.attn_scale,
-        args.scalar_scale};
-  }
-
-  /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best
-  /// performance
-  CUTLASS_DEVICE
-  static void prefetch_tma_descriptors(Params const& params) {
-    cute::prefetch_tma_descriptor(params.tma_load_Q.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(params.tma_load_K.get_tma_descriptor());
-    cute::prefetch_tma_descriptor(params.tma_load_V.get_tma_descriptor());
-  }
-
-  CUTLASS_DEVICE
-  cute::tuple<int, int> get_n_block_min_max(
-      int max_attn_len,
-      int min_full_attn_seq_len,
-      int contextual_seq_len,
-      int uihlen,
-      int m_block) {
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-    if constexpr (Contexual_mask) {
-      if (m_block * kBlockM < contextual_seq_len) {
-        return {0, cute::ceil_div(uihlen, kBlockN)};
-      }
-    }
-    if constexpr (Has_targets) {
-      int m_idx_max = (m_block + 1) * kBlockM;
-      if (m_idx_max > uihlen) {
-        return {0, cute::ceil_div(uihlen, kBlockN)};
-      }
-    }
-    int n_block_max;
-    int n_block_min;
-    // Non-target part, n_block_max
-    if constexpr (Causal || Local) {
-      int m_idx_max = (m_block + 1) * kBlockM;
-      n_block_max = cute::ceil_div(std::min(m_idx_max, uihlen), kBlockN);
-    } else {
-      n_block_max = cute::ceil_div(uihlen, kBlockN);
-    }
-    // Non-target part, n_block_min
-    if constexpr (Local) {
-      int m_idx_min = m_block * kBlockM;
-      int m_idx_max = (m_block + 1) * kBlockM;
-      if (min_full_attn_seq_len == 0 ||
-          m_idx_max <= uihlen - min_full_attn_seq_len) {
-        n_block_min = std::max(int(0), (m_idx_min - max_attn_len) / kBlockN);
-        if constexpr (Contexual_mask) {
-          // row contexual without sink
-          if (n_block_min * kBlockN < contextual_seq_len) {
-            n_block_min = 0;
-          }
-        }
-      } else {
-        n_block_min = 0;
-      }
-    } else {
-      n_block_min = 0;
-    }
-    return {n_block_min, n_block_max};
-  }
-
-  CUTLASS_DEVICE
-  cute::tuple<int, int> get_target_n_block_min_max(
-      int n_block_max,
-      int uihlen,
-      int seqlen,
-      int m_block) {
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-    int m_idx_max = (m_block + 1) * kBlockM;
-    if (m_idx_max <= uihlen) { // Non-target part
-      return {n_block_max, n_block_max};
-    } else { // Target part
-      int m_idx_min = m_block * kBlockM;
-      return {
-          std::max(n_block_max, m_idx_min / kBlockN),
-          cute::ceil_div(std::min(m_idx_max, seqlen), kBlockN)};
-    }
-  }
-
-  CUTLASS_DEVICE
-  int get_contexual_n_block_max(
-      int n_block_min,
-      int min_full_attn_seq_len,
-      int contextual_seq_len,
-      int uihlen,
-      int m_block) {
-    return 0;
-    // TODO: reenable below once contexual + semi local implementation is
-    // finalized
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-    if constexpr (!Local) {
-      return 0;
-    }
-    if (m_block * kBlockM < contextual_seq_len) {
-      return 0;
-    }
-    int m_idx_max = (m_block + 1) * kBlockM;
-    if constexpr (Has_targets) {
-      if (m_idx_max > uihlen) {
-        return 0;
-      }
-    }
-    if (min_full_attn_seq_len == 0 ||
-        m_idx_max <= uihlen - min_full_attn_seq_len) {
-      return std::min(n_block_min, cute::ceil_div(contextual_seq_len, kBlockN));
-    }
-    return 0;
-  }
-
-  CUTLASS_DEVICE
-  cute::tuple<int, int> get_cross_n_block_min_max(
-      int const uihlen_q,
-      int const seqlen_q,
-      int const seqlen_kv,
-      int const m_block) {
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-    if constexpr (!Causal) {
-      return {0, cute::ceil_div(seqlen_kv, kBlockN)};
-    }
-    int n_block_max =
-        std::min(seqlen_kv, (m_block + 1) * kBlockM + seqlen_kv - uihlen_q);
-    return {0, cute::ceil_div(n_block_max, kBlockN)};
-  }
-
-  template <typename SchedulerPrefetch, typename SharedStorage>
-  CUTLASS_DEVICE void load(
-      Params const& params,
-      MainloopPipelineK pipeline_k,
-      MainloopPipelineV pipeline_v,
-      MainloopPipelineVt pipeline_vt,
-      PipelineState& smem_pipe_write,
-      SharedStorage& shared_storage,
-      SchedulerPrefetch const& scheduler_prefetch,
-      SeqlenInfo_t const& seqlen_info,
-      cute::tuple<int32_t, int32_t, int32_t, int32_t> block_coord,
-      int& work_idx) {
-    auto [m_block, bidh, bidb, split_idx] = block_coord;
-    if constexpr (Jagged) {
-      static constexpr int kBlockM = get<0>(TileShape_MNK{});
-      if (m_block * kBlockM >= seqlen_info.seqlen_q) {
-        scheduler_prefetch();
-        return;
-      }
-    }
-    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
-    if constexpr (!Cross) {
-      if (params.num_groups > 1) {
-        int group_id = bidb / params.batch_size_per_group;
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
-        max_attn_len_ = params.max_attn_len_tensor[group_id];
-        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
-      } else {
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
-        max_attn_len_ = params.max_attn_len;
-        contextual_seq_len_ = params.contextual_seq_len;
-      }
-    }
-    int n_block_min, n_block_max;
-    if constexpr (Cross) {
-      auto n_block_min_max = get_cross_n_block_min_max(
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          seqlen_info.seqlen_kv,
-          m_block);
-      n_block_min = get<0>(n_block_min_max);
-      n_block_max = get<1>(n_block_min_max);
-    } else {
-      auto n_block_min_max = get_n_block_min_max(
-          max_attn_len_,
-          min_full_attn_seq_len_,
-          contextual_seq_len_,
-          seqlen_info.uihlen_q,
-          m_block);
-      n_block_min = get<0>(n_block_min_max);
-      n_block_max = get<1>(n_block_min_max);
-    }
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-    if (n_block_max <= n_block_min) {
-      std::printf(
-          "mainloop_fwd_sm90: n_block_max <= n_block_min not expected.");
-      scheduler_prefetch();
-      return;
-    }
-#endif
-
-    Tensor sQ = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-        SmemLayoutQ{});
-    Tensor sK = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
-        SmemLayoutK{});
-    Tensor sK_pi = as_position_independent_swizzle_tensor(sK);
-    // as_position_independent_swizzle_tensor makes address calculation easier
-    // when we do LDSM & STSM to transpose. But it requires smem_vt and smem_v
-    // to be aligned to e.g 512 bytes.
-    Tensor sVt = [&] {
-      if constexpr (!Transpose_V) {
-        return make_tensor(
-            make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
-            SmemLayoutVt{});
-      } else {
-        return cute::as_position_independent_swizzle_tensor(make_tensor(
-            make_smem_ptr(shared_storage.tensors.mainloop.smem_vt.data()),
-            SmemLayoutVt{}));
-      }
-    }();
-    // Only used if Transpose_V
-    Tensor sV = cute::as_position_independent_swizzle_tensor(make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
-        SmemLayoutVtMma{}));
-
-    int const thread_idx = threadIdx.x % NumProducerThreads;
-
-    // Prepare the TMA loads
-    uint32_t block_rank_in_cluster = cute::block_rank_in_cluster();
-    constexpr uint32_t cluster_shape_x = get<0>(ClusterShape());
-    uint2 cluster_local_block_id = {
-        block_rank_in_cluster % cluster_shape_x,
-        block_rank_in_cluster / cluster_shape_x};
-
-    Tensor mQ = params.tma_load_Q.get_tma_tensor(params.shape_Q)(
-        _, _, bidh, !Jagged ? bidb : 0);
-    Tensor mK_TMA = params.tma_load_K.get_tma_tensor(params.shape_K)(
-        _, _, bidh, !Jagged ? bidb : 0);
-    Tensor mVt_TMA = params.tma_load_V.get_tma_tensor(
-        select<1, 0, 2, 3>(params.shape_K))(_, _, bidh, !Jagged ? bidb : 0);
-
-    Tensor gQ = local_tile(
-        domain_offset(make_coord(seqlen_info.offset_q, _0{}), mQ),
-        select<0, 2>(TileShape_MNK{}),
-        make_coord(m_block, _0{})); // (M, K)
-    // if (cute::thread0()) { printf("Jagged = %d, params.leftpad_k = %p,
-    // leftpad_k = %d\n", Jagged, params.leftpad_k, leftpad_k); }
-    Tensor gK_TMA = local_tile(
-        domain_offset(make_coord(seqlen_info.offset_k, _0{}), mK_TMA),
-        select<1, 2>(TileShape_MNK{}),
-        make_coord(_, _0{})); // (N, K, _)
-    Tensor gVt_TMA = local_tile(
-        domain_offset(make_coord(_0{}, seqlen_info.offset_k), mVt_TMA),
-        select<2, 1>(TileShape_MNK{}),
-        make_coord(_0{}, _)); // (K, N, _)
-
-    auto block_tma_Q = params.tma_load_Q.get_slice(_0{});
-    Tensor tQgQ = group_modes<0, 3>(block_tma_Q.partition_S(gQ)); // (TMA)
-    Tensor tQsQ = group_modes<0, 3>(block_tma_Q.partition_D(sQ)); // (TMA)
-    // tma_partition doesn't handle position_independent_swizzle_tensor
-    // correctly, so we need to do it manually
-    auto block_tma_K = params.tma_load_K.get_slice(cluster_local_block_id.x);
-    Tensor tKgK_TMA =
-        group_modes<0, 3>(block_tma_K.partition_S(gK_TMA)); // (TMA, k)
-    Tensor tKsK_TMA =
-        group_modes<0, 3>(block_tma_K.partition_D(sK)); // (TMA, PIPE)
-    auto block_tma_V = params.tma_load_V.get_slice(cluster_local_block_id.x);
-    Tensor tVgVt_TMA =
-        group_modes<0, 3>(block_tma_V.partition_S(gVt_TMA)); // (TMA, k)
-    Tensor tVsVt_TMA =
-        group_modes<0, 3>(block_tma_V.partition_D(sVt)); // (TMA, PIPE)
-
-    // Set up for transposing V, only used if Transpose_V
-    S2RTiledCopyVt s2r_tiled_copy_vt;
-    R2STiledCopyV r2s_tiled_copy_v;
-    auto s2r_thr_copy_vt = s2r_tiled_copy_vt.get_thread_slice(thread_idx);
-    auto r2s_thr_copy_v = r2s_tiled_copy_v.get_thread_slice(thread_idx);
-    // flat_divide(sVt, LDSM_divide_shape{}):  (64, 8, kHeadDim / 64, kBlockN /
-    // 8, kStages)
-    Tensor tTranssVt_ = s2r_thr_copy_vt.partition_S(
-        flat_divide(sVt, LDSM_divide_shape{})); // ((16, 1), 1, 1, kHeadDim /
-                                                // 64, kBlockN / 32, kStages)
-    // flat_divide(sV, STSM_divide_shape{}):  (8, 16, kHeadDim / 8, (4, kBlockN
-    // / 64), kStages)
-    Tensor tTranssV_ = r2s_thr_copy_v.partition_D(
-        flat_divide(sV, STSM_divide_shape{})); // ((16, 1), 1, 1, kHeadDim / 64,
-                                               // (2, kBlockN / 64), kStages)
-    CUTE_STATIC_ASSERT_V(rank(tTranssVt_) == rank(tTranssV_));
-    CUTE_STATIC_ASSERT_V(size<0>(tTranssVt_) == size<0>(tTranssV_));
-    CUTE_STATIC_ASSERT_V(size<1>(tTranssVt_) == size<1>(tTranssV_));
-    CUTE_STATIC_ASSERT_V(size<2>(tTranssVt_) == size<2>(tTranssV_));
-    CUTE_STATIC_ASSERT_V(size<3>(tTranssVt_) == size<3>(tTranssV_));
-    CUTE_STATIC_ASSERT_V(size<4>(tTranssVt_) == size<4>(tTranssV_));
-    // Faster to have 2 LDSM.T, byte permute, STSM for better ILP
-    static constexpr int Transpose_ILP =
-        (size<2>(tTranssVt_) * size<3>(tTranssVt_)) % 2 == 0 ? 2 : 1;
-    Tensor tTranssVt = logical_divide(
-        group_modes<1, rank(tTranssVt_) - 1>(tTranssVt_),
-        Shape<Underscore, Int<Transpose_ILP>>{}); // ((16, 1), (2, kHeadDim / 64
-                                                  // * kBlockN / 32 / 2),
-                                                  // kStages)
-    Tensor tTranssV = logical_divide(
-        group_modes<1, rank(tTranssV_) - 1>(tTranssV_),
-        Shape<Underscore, Int<Transpose_ILP>>{}); // ((16, 1), (2, kHeadDim / 64
-                                                  // * kBlockN / 32 / 2),
-                                                  // kStages)
-    auto transpose_V = [&](int stage) {
-      if constexpr (Transpose_V) {
-#pragma unroll
-        for (int i = 0; i < size<1, 1>(tTranssVt); ++i) {
-          Tensor tTransrV =
-              make_fragment_like(tTranssV(_, make_coord(_, _0{}), _0{}));
-          static_assert(size<0>(tTransrV) == 16);
-          Tensor tTransrV_64 = recast<uint2>(tTransrV);
-          cute::copy(
-              s2r_tiled_copy_vt,
-              tTranssVt(_, make_coord(_, i), stage),
-              tTransrV);
-#pragma unroll
-          for (int j = 0; j < size(tTransrV_64); ++j) {
-            uint32_t upper = tTransrV_64[j].x;
-            uint32_t lower = tTransrV_64[j].y;
-            tTransrV_64[j].x = __byte_perm(upper, lower, 0x6420);
-            tTransrV_64[j].y = __byte_perm(upper, lower, 0x7531);
-          }
-          cute::copy(
-              r2s_tiled_copy_v, tTransrV, tTranssV(_, make_coord(_, i), stage));
-        }
-      }
-    };
-
-    uint16_t mcast_mask_kv = 0;
-    if constexpr (cute::is_same_v<GmemTiledCopyKV, SM90_TMA_LOAD_MULTICAST>) {
-      auto block_layout = Layout<ClusterShape>{}; // (m,n) -> block_id
-      for (int m = 0; m < size<0>(block_layout); ++m) {
-        mcast_mask_kv |=
-            (uint16_t(1) << block_layout(m, cluster_local_block_id.y, _0{}));
-      }
-    }
-
-    auto load_K = [&](int const n_block, auto const& smem_pipe_write) {
-      pipeline_k.producer_acquire(smem_pipe_write);
-      copy(
-          params.tma_load_K.with(
-              *pipeline_k.producer_get_barrier(smem_pipe_write),
-              mcast_mask_kv,
-              TMA::CacheHintSm90::EVICT_LAST),
-          tKgK_TMA(_, n_block),
-          tKsK_TMA(_, smem_pipe_write.index()));
-    };
-
-    auto load_V = [&](int const n_block, auto const& smem_pipe_write) {
-      auto pipeline_v_load =
-          cute::conditional_return<!Transpose_V>(pipeline_v, pipeline_vt);
-      pipeline_v_load.producer_acquire(smem_pipe_write);
-      copy(
-          params.tma_load_V.with(
-              *pipeline_v_load.producer_get_barrier(smem_pipe_write),
-              mcast_mask_kv,
-              TMA::CacheHintSm90::EVICT_LAST),
-          tVgVt_TMA(_, n_block),
-          tVsVt_TMA(_, smem_pipe_write.index()));
-    };
-
-    auto copy_Vt_to_V = [&](auto const& smem_pipe_write) {
-      // Instead of maintaining smem_pipe_read as a separate variable, we can
-      // just use smem_pipe_write, and exploit the invariance that
-      // smem_pipe_write.phase() == smem_pipe_read.phase() ^ 1. This saves 1 or
-      // 2 registers.
-      PipelineState smem_pipe_read{
-          smem_pipe_write.index(),
-          smem_pipe_write.phase() ^ 1,
-          smem_pipe_write.count()};
-      pipeline_vt.consumer_wait(smem_pipe_read);
-      pipeline_v.producer_acquire(smem_pipe_write);
-      transpose_V(smem_pipe_write.index());
-      // SMEM fence to make sure V is transposed before math
-      cutlass::arch::fence_view_async_shared();
-      pipeline_v.producer_commit(smem_pipe_write);
-      // Very important: PipelineTmaAsync::consumer_release assumes that the
-      // warpgroup is synchronized before calling. Without this we get race
-      // conditions.
-      cutlass::arch::NamedBarrier::sync(
-          cutlass::NumThreadsPerWarpGroup,
-          static_cast<uint32_t>(FwdNamedBarriers::ProducerWG) /*id*/);
-      pipeline_vt.consumer_release(smem_pipe_read);
-    };
-
-    int n_block = n_block_max - 1;
-
-    int warp_idx_in_warpgroup =
-        __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
-    // If this is true, we're guaranteed that only the first warp will execute
-    // this function
-    static constexpr bool SingleProducerWarp =
-        NumProducerThreads == cutlass::NumThreadsPerWarp;
-    bool should_load_KV =
-        ((SingleProducerWarp || warp_idx_in_warpgroup == 0) &&
-         cute::elect_one_sync());
-
-    if (should_load_KV) {
-      if constexpr (Transpose_V) {
-        load_V(n_block, smem_pipe_write);
-      }
-      // if (thread_idx == 0) { printf("Producer: main load, before load_K,
-      // index = %d\n", smem_pipe_write.index());}
-      load_K(n_block, smem_pipe_write);
-      // if (thread_idx == 0) { printf("Producer: main load, after load K, index
-      // = %d\n", smem_pipe_write.index());}
-    }
-
-    // TMA_Q, Wait for the MMA warpgroups to signal that smem_q is ready
-    if (SingleProducerWarp || warp_idx_in_warpgroup == 0) {
-      cutlass::arch::NamedBarrier::sync(
-          NumMmaThreads + cutlass::NumThreadsPerWarp,
-          static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
-    }
-    if ((SingleProducerWarp || warp_idx_in_warpgroup == 0) &&
-        cute::elect_one_sync()) {
-      shared_storage.pipelines.barrier_Q.arrive_and_expect_tx(
-          TmaTransactionBytesQ);
-      copy(
-          params.tma_load_Q.with(
-              reinterpret_cast<typename cutlass::arch::
-                                   ClusterTransactionBarrier::ValueType&>(
-                  shared_storage.pipelines.barrier_Q),
-              0 /*mcast_mask*/,
-              TMA::CacheHintSm90::EVICT_FIRST),
-          tQgQ,
-          tQsQ);
-    }
-
-    // Wait for the MMA WGs to signal that smem_v are ready and V can be copied
-    // from gmem Need ClusterBarrier, not just NamedBarrier. Otherwise we might
-    // have CTA 0 finishing the TMA store on O first, call TMA multicast load on
-    // V, before CTA 1 can finishing TMA store on O. if (thread_idx == 0) {
-    // printf("Producer: main load, before barrier_O, work_idx = %d\n",
-    // work_idx);}
-    shared_storage.pipelines.barrier_O.wait((work_idx + 1) % 2);
-    // if (thread_idx == 0) { printf("Producer: main load, after barrier_O\n");}
-
-    int n_block_prev = n_block;
-    --n_block;
-#pragma unroll(!Transpose_V ? 2 : 1)
-    for (; n_block >= n_block_min; --n_block) {
-      PipelineState smem_pipe_write_v =
-          smem_pipe_write; // copy the state, write_v is always 1 step behind
-      ++smem_pipe_write;
-      if (should_load_KV) {
-        if constexpr (Transpose_V) {
-          load_V(n_block, smem_pipe_write);
-        } else {
-          load_V(n_block_prev, smem_pipe_write_v);
-        }
-        load_K(n_block, smem_pipe_write);
-      }
-      n_block_prev = n_block;
-      if constexpr (Transpose_V) {
-        copy_Vt_to_V(smem_pipe_write_v);
-      }
-    }
-    scheduler_prefetch();
-    if constexpr (!Transpose_V) {
-      if (should_load_KV) {
-        load_V(n_block_prev, smem_pipe_write);
-      }
-    }
-    if constexpr (Transpose_V) {
-      copy_Vt_to_V(smem_pipe_write);
-    }
-    ++smem_pipe_write;
-    if constexpr (!Cross) {
-      if constexpr (Has_targets) {
-        auto [target_n_block_min, target_n_block_max] =
-            get_target_n_block_min_max(
-                n_block_max,
-                seqlen_info.uihlen_q,
-                seqlen_info.seqlen_kv,
-                m_block);
-#pragma unroll 1
-        for (n_block = target_n_block_max - 1; n_block >= target_n_block_min;
-             --n_block) {
-          if (should_load_KV) {
-            load_V(n_block, smem_pipe_write);
-            load_K(n_block, smem_pipe_write);
-          }
-          if constexpr (Transpose_V) {
-            copy_Vt_to_V(smem_pipe_write);
-          }
-          ++smem_pipe_write;
-        }
-      }
-      if constexpr (Contexual_mask) {
-        int contexual_n_block_max = get_contexual_n_block_max(
-            n_block_min,
-            min_full_attn_seq_len_,
-            contextual_seq_len_,
-            seqlen_info.uihlen_q,
-            m_block);
-#pragma unroll 1
-        for (n_block = contexual_n_block_max - 1; n_block >= 0; --n_block) {
-          if (should_load_KV) {
-            load_V(n_block, smem_pipe_write);
-            load_K(n_block, smem_pipe_write);
-          }
-          if constexpr (Transpose_V) {
-            copy_Vt_to_V(smem_pipe_write);
-          }
-          ++smem_pipe_write;
-        }
-      }
-    }
-    // At the end, all threads have the correct smem_pipe_write.
-    ++work_idx;
-  }
-
-  template <typename SharedStorage>
-  CUTLASS_DEVICE void load_tail(
-      MainloopPipelineK pipeline_k,
-      MainloopPipelineV pipeline_v,
-      MainloopPipelineVt pipeline_vt,
-      PipelineState& smem_pipe_write,
-      SharedStorage& shared_storage,
-      int const work_idx) {
-    // If we don't wait for barrier_O here, when using Cluster, CTA0 might exit
-    // early and CTA1 will try to arrive on barrier_O of CTA0, causing
-    // "unspecified launch failure".
-    shared_storage.pipelines.barrier_O.wait((work_idx + 1) % 2);
-    int warp_idx_in_warpgroup =
-        __shfl_sync(0xffffffff, (threadIdx.x / 32) % 4, 0);
-    // Issue the epilogue waits
-    // TODO: check if this should be called by 1 thread or more
-    if (warp_idx_in_warpgroup == 0 && cute::elect_one_sync()) {
-      /* This helps avoid early exit of blocks in Cluster
-       *  Waits for all stages to either be released (all Consumer UNLOCKs), or
-       * if the stage was never used then would just be acquired since the phase
-       * was still inverted from make_producer_start_state
-       */
-      pipeline_k.producer_tail(smem_pipe_write);
-      pipeline_v.producer_tail(smem_pipe_write);
-      if constexpr (Transpose_V) {
-        pipeline_vt.producer_tail(smem_pipe_write);
-      }
-    }
-  }
-
-  CUTLASS_DEVICE void warp_scheduler_barrier_sync() {
-    if constexpr (UseSchedulerBarrier) {
-      cutlass::arch::NamedBarrier::sync(
-          2 * cutlass::NumThreadsPerWarpGroup,
-          static_cast<uint32_t>(FwdNamedBarriers::WarpSchedulerWG1) - 1 +
-              hstu::canonical_warp_group_idx_nosync() /*id*/);
-    }
-  }
-
-  CUTLASS_DEVICE void warp_scheduler_barrier_arrive() {
-    if constexpr (UseSchedulerBarrier) {
-      static_assert(NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
-      int const cur_WG = hstu::canonical_warp_group_idx_nosync() - 1;
-      int const next_WG = NumMmaWarpGroups == 2
-          ? 1 - cur_WG
-          : (cur_WG < NumMmaWarpGroups - 1 ? cur_WG + 1 : 0);
-      cutlass::arch::NamedBarrier::arrive(
-          2 * cutlass::NumThreadsPerWarpGroup,
-          static_cast<uint32_t>(FwdNamedBarriers::WarpSchedulerWG1) +
-              next_WG /*id*/);
-    }
-  }
-
-  CUTLASS_DEVICE void mma_init() {
-    // Tell producers that smem_q is ready
-    cutlass::arch::NamedBarrier::arrive(
-        NumMmaThreads + cutlass::NumThreadsPerWarp,
-        static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
-    if constexpr (UseSchedulerBarrier) {
-      // We have NamedBarrier for up to 3 WGs
-      static_assert(NumMmaWarpGroups == 2 || NumMmaWarpGroups == 3);
-      // WG1 needs the very first signal to start
-      if (hstu::canonical_warp_group_idx_nosync() == 1) {
-        cutlass::arch::NamedBarrier::arrive(
-            2 * cutlass::NumThreadsPerWarpGroup,
-            static_cast<uint32_t>(FwdNamedBarriers::WarpSchedulerWG1) /*id*/);
-      }
-    }
-  }
-
-  template <typename SharedStorage, typename FrgTensorO>
-  CUTLASS_DEVICE bool mma(
-      Params const& params,
-      MainloopPipelineK pipeline_k,
-      MainloopPipelineV pipeline_v,
-      PipelineState& smem_pipe_read,
-      FrgTensorO& tOrO,
-      int const thread_idx,
-      int& work_idx,
-      SeqlenInfo_t const& seqlen_info,
-      cute::tuple<int32_t, int32_t, int32_t, int32_t> block_coord,
-      SharedStorage& shared_storage) {
-    static_assert(
-        is_rmem<FrgTensorO>::value, "O tensor must be rmem resident.");
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-
-    // can't use auto [m_block, ...] = block_coord since structured binding
-    // cannot be captured in lambda
-    int const m_block = get<0>(block_coord);
-    int const bidh = get<1>(block_coord);
-    int const bidb = get<2>(block_coord);
-    int const split_idx = get<3>(block_coord);
-    if constexpr (Jagged) {
-      static constexpr int kBlockM = get<0>(TileShape_MNK{});
-      if (m_block * kBlockM >= seqlen_info.seqlen_q) {
-        return false;
-      }
-    }
-    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
-    float scalar_scale_val_;
-    if constexpr (!Cross) {
-      if (params.num_groups > 1) {
-        int group_id = bidb / params.batch_size_per_group;
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
-        max_attn_len_ = params.max_attn_len_tensor[group_id];
-        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
-      } else {
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
-        max_attn_len_ = params.max_attn_len;
-        contextual_seq_len_ = params.contextual_seq_len;
-      }
-    }
-    if (params.num_groups > 1) {
-      int group_id = bidb / params.batch_size_per_group;
-      int max_seq_len_per_group = params.max_seq_len_tensor[group_id];
-      // attention scale
-      scalar_scale_val_ = params.scalar_scale
-          ? (params.attn_scale == nullptr ? 1.0f / max_seq_len_per_group
-                                          : params.attn_scale[group_id])
-          : 0;
-    } else {
-      // attention scale
-      scalar_scale_val_ = params.scalar_scale
-          ? (params.attn_scale == nullptr ? params.max_seq_len_inv
-                                          : params.attn_scale[0])
-          : 0;
-    }
-    int n_block_min, n_block_max;
-    if constexpr (Cross) {
-      auto n_block_min_max = get_cross_n_block_min_max(
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          seqlen_info.seqlen_kv,
-          m_block);
-      n_block_min = get<0>(n_block_min_max);
-      n_block_max = get<1>(n_block_min_max);
-    } else {
-      auto n_block_min_max = get_n_block_min_max(
-          max_attn_len_,
-          min_full_attn_seq_len_,
-          contextual_seq_len_,
-          seqlen_info.uihlen_q,
-          m_block);
-      n_block_min = get<0>(n_block_min_max);
-      n_block_max = get<1>(n_block_min_max);
-    }
-
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-    if (n_block_max <= n_block_min) {
-      std::printf(
-          "mainloop_fwd_sm90: n_block_max <= n_block_min not expected.");
-      return false;
-    }
-#endif
-
-    Tensor sQ = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-        SmemLayoutQ{});
-    Tensor sK = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
-        SmemLayoutK{});
-    Tensor sV = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
-        SmemLayoutVtMma{});
-    Tensor sP = [&] {
-      if constexpr (Mma1_is_RS) {
-        // We might not have smem_p if !Mma1_is_RS1, just use smem_q as a
-        // placeholder since we don't use it
-        return make_tensor(
-            make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-            SmemLayoutP{});
-      } else {
-        return make_tensor(
-            make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
-            SmemLayoutP{});
-      }
-    }();
-
-    if constexpr (!Mma0_is_RS) {
-      static_assert(
-          stride<0>(typename TiledMma0::ALayout{}) == 0 and
-              stride<0>(typename TiledMma0::BLayout{}) == 0 and
-              size<0>(typename TiledMma0::ALayout{}) ==
-                  cutlass::NumThreadsPerWarpGroup and
-              size<0>(typename TiledMma0::BLayout{}) ==
-                  cutlass::NumThreadsPerWarpGroup,
-          "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-    }
-    constexpr int MmaWarpGroups =
-        size(TiledMma0{}) / cutlass::NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(
-        make_shape(Int<MmaWarpGroups>{}),
-        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
-
-    int warp_group_idx = __shfl_sync(
-        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
-    TiledMma0 tiled_mma0;
-    TiledMma1 tiled_mma1;
-    auto wg_mma0 =
-        tiled_mma0.get_slice(warp_group_thread_layout(warp_group_idx));
-    auto wg_mma1 =
-        tiled_mma1.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    auto smem_tiled_copy_P = make_tiled_copy_C(SmemCopyAtomP{}, tiled_mma0);
-    auto smem_thr_copy_P = smem_tiled_copy_P.get_thread_slice(thread_idx);
-
-    // Allocate "fragments/descriptors"
-    Tensor tSrQ = wg_mma0.partition_fragment_A(sQ);
-    Tensor tSrK = wg_mma0.partition_fragment_B(sK);
-    Tensor tOrV = wg_mma1.partition_fragment_B(sV);
-    Tensor tOsP = wg_mma1.partition_fragment_A(sP);
-    Tensor tPsP = smem_thr_copy_P.partition_D(
-        cute::as_position_independent_swizzle_tensor(sP));
-
-    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-    };
-
-    clear(tOrO);
-
-    int n_block = n_block_max - 1;
-
-    hstu::Mask<kBlockM, kBlockN, TiledMma0> mask(
-        thread_idx,
-        seqlen_info.seqlen_q,
-        seqlen_info.seqlen_kv,
-        max_attn_len_,
-        min_full_attn_seq_len_,
-        contextual_seq_len_,
-        seqlen_info.uihlen_q);
-
-    auto& barrier_Q = shared_storage.pipelines.barrier_Q;
-    barrier_Q.wait(work_idx % 2);
-
-    static constexpr int Qdim = 0;
-    auto thread_mma = tiled_mma0.get_thread_slice(thread_idx);
-    auto thread0_mma = tiled_mma0.get_thread_slice(_0{});
-    Tensor cS = cute::make_identity_tensor(Shape<Int<kBlockM>, Int<kBlockN>>{});
-    Tensor tScS = thread_mma.partition_C(cS);
-    Tensor tScS_rowcol = make_tensor(
-        tScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tScS.layout()));
-    Tensor t0ScS = thread0_mma.partition_C(cS);
-    Tensor t0ScS_rowcol = make_tensor(
-        t0ScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(t0ScS.layout()));
-    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
-    SiluScaleOp<float> silu_scale_op;
-    int qdim_offset = params.scalar_scale
-        ? 0
-        : m_block * kBlockM + thread_qdim_offset + seqlen_info.offset_q;
-
-    if constexpr (Mma0_is_RS) {
-      using SmemCopyAtomQ = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
-      auto smem_tiled_copy_Q = make_tiled_copy_A(SmemCopyAtomQ{}, tiled_mma0);
-      auto smem_thr_copy_Q = smem_tiled_copy_Q.get_thread_slice(thread_idx);
-      Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
-      Tensor tSsQ_copy_view = smem_thr_copy_Q.partition_S(
-          cute::as_position_independent_swizzle_tensor(sQ));
-      cute::copy(smem_tiled_copy_Q, tSsQ_copy_view, tSrQ_copy_view);
-    }
-
-    Tensor tSrS =
-        partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
-    Tensor tSrS_rowcol = make_tensor(
-        tSrS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
-    consumer_wait(pipeline_k, smem_pipe_read);
-    hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
-        tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
-    warpgroup_wait<0>();
-    pipeline_k.consumer_release(smem_pipe_read);
-#pragma unroll
-    for (int mi = 0; mi < size<0>(tSrS_rowcol); ++mi) {
-      float scale = scalar_scale_val_;
-      if (!params.scalar_scale) {
-        int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
-        // Convert global index to local sequence position for bounds checking
-        int q_local = q_index - seqlen_info.offset_q;
-        if (q_local < seqlen_info.seqlen_q) {
-          scale = params.attn_scale[q_index];
-        }
-      }
-#pragma unroll
-      for (int ni = 0; ni < size<1>(tSrS_rowcol); ++ni) {
-        tSrS_rowcol(mi, ni) =
-            silu_scale_op(tSrS_rowcol(mi, ni) * params.alpha, scale);
-      }
-    }
-    int const m_idx_max = (m_block + 1) * kBlockM;
-    if constexpr (Cross) {
-      mask.template apply<
-          false /*Seqlenq_mask*/,
-          true /*Seqlenk_mask*/,
-          Causal,
-          false /*Local*/,
-          false /*Contexual_mask*/,
-          false /*Target_mask*/,
-          Cross,
-          false /*Softmax*/>(tSrS, m_block, n_block);
-    } else {
-      if (m_idx_max <= seqlen_info.uihlen_q) {
-        mask.template apply<
-            false /*Seqlenq_mask*/,
-            false /*Seqlenk_mask*/,
-            Causal,
-            Local,
-            Contexual_mask,
-            false /*Target_mask*/,
-            Cross,
-            false /*Softmax*/>(tSrS, m_block, n_block);
-      } else if (
-          m_idx_max <=
-          cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
-        mask.template apply<
-            false /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            Causal,
-            Local,
-            Contexual_mask,
-            Has_targets,
-            Cross,
-            false /*Softmax*/>(tSrS, m_block, n_block);
-      } else {
-        mask.template apply<
-            false /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            false /*Causal*/,
-            false,
-            Contexual_mask,
-            Has_targets,
-            Cross,
-            false /*Softmax*/>(tSrS, m_block, n_block);
-      }
-    }
-    if constexpr (Is_FP8 && !V_colmajor) {
-      hstu::permute_Cregs_fp8(tSrS);
-    }
-    Tensor tOrP_acc = make_tensor(
-        tSrS.data(), hstu::convert_layout_acc_Aregs<TiledMma1>(tSrS.layout()));
-    Tensor tOrP = make_tensor_like<Element>(tOrP_acc);
-    convert_type_out(tOrP_acc, tOrP);
-    if constexpr (Is_FP8 && V_colmajor) {
-      hstu::permute_Aregs_fp8(tOrP);
-    }
-    if constexpr (!Mma1_is_RS) {
-      cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
-      cutlass::arch::fence_view_async_shared();
-      __syncwarp(); // Only need syncwarp since each warp is using its own P
-                    // values for Mma1
-    }
-    --n_block;
-
-    // Each step does gemm0 and silu for iter n_block and gemm1 for prev iter.
-    auto fwd_step_intra_warp_pipeline = [&](int const n_block, auto mask_fn) {
-      PipelineState smem_pipe_read_v(
-          smem_pipe_read.index(),
-          smem_pipe_read.phase(),
-          smem_pipe_read.count());
-      ++smem_pipe_read;
-      Tensor tSrS =
-          partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
-      Tensor tSrS_rowcol = make_tensor(
-          tSrS.data(),
-          hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
-      if (!UseSchedulerBarrier || warp_group_idx == 0) {
-        consumer_wait(pipeline_k, smem_pipe_read);
-      }
-      warp_scheduler_barrier_sync();
-      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
-          tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
-      if (!UseSchedulerBarrier || warp_group_idx == 0) {
-        consumer_wait(pipeline_v, smem_pipe_read_v);
-      }
-      hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
-          tiled_mma1,
-          cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
-          tOrV(_, _, _, smem_pipe_read_v.index()),
-          tOrO);
-      warp_scheduler_barrier_arrive();
-      warpgroup_wait<1>();
-      pipeline_k.consumer_release(smem_pipe_read); // release K
-#pragma unroll
-      for (int mi = 0; mi < size<0>(tSrS_rowcol); ++mi) {
-        float scale = scalar_scale_val_;
-        if (!params.scalar_scale) {
-          int q_index = qdim_offset + int(get<Qdim>(t0ScS_rowcol(mi, _0{})));
-          // Convert global index to local sequence position for bounds checking
-          int q_local = q_index - seqlen_info.offset_q;
-          if (q_local < seqlen_info.seqlen_q) {
-            scale = params.attn_scale[q_index];
-          }
-        }
-#pragma unroll
-        for (int ni = 0; ni < size<1>(tSrS_rowcol); ++ni) {
-          tSrS_rowcol(mi, ni) =
-              silu_scale_op(tSrS_rowcol(mi, ni) * params.alpha, scale);
-        }
-      }
-      mask_fn(tSrS, n_block);
-      warpgroup_wait<0>();
-      pipeline_v.consumer_release(smem_pipe_read_v); // release V
-      if constexpr (Is_FP8 && !V_colmajor) {
-        hstu::permute_Cregs_fp8(tSrS);
-      }
-      convert_type_out(make_tensor(tSrS.data(), tOrP.layout()), tOrP);
-      if constexpr (Is_FP8 && V_colmajor) {
-        hstu::permute_Aregs_fp8(tOrP);
-      }
-      if constexpr (!Mma1_is_RS) {
-        cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
-      }
-      if constexpr (!Mma1_is_RS) {
-        cutlass::arch::fence_view_async_shared();
-        __syncwarp();
-      }
-    };
-
-    if constexpr (Cross) {
-      if constexpr (Causal) {
-        if (m_idx_max <=
-            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
-          auto mask_fn = [&](auto& tSrS, int n_block) {
-            mask.template apply<
-                false /*Seqlenq_mask*/,
-                false /*Seqlenk_mask*/,
-                Causal,
-                false /*Local*/,
-                false /*Contexual_mask*/,
-                false /*Target_mask*/,
-                Cross,
-                false /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          int const m_idx_min = m_block * kBlockM;
-          int const n_block_min_causal_mask = std::max(
-              n_block_min,
-              (m_idx_min + seqlen_info.seqlen_kv - seqlen_info.uihlen_q) /
-                  kBlockN);
-#pragma unroll 1
-          for (; n_block >= n_block_min_causal_mask; --n_block) {
-            fwd_step_intra_warp_pipeline(n_block, mask_fn);
-          }
-        }
-      }
-      auto no_mask_fn = [](auto& tSrS, int n_block) {};
-#pragma unroll 1
-      for (; n_block >= n_block_min; --n_block) {
-        fwd_step_intra_warp_pipeline(n_block, no_mask_fn);
-      }
-    } else {
-      if constexpr (Causal || Local) { // Separate iterations with causal
-                                       // or local masking
-        if (m_idx_max <=
-            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
-          auto mask_fn = [&](auto& tSrS, int n_block) {
-            mask.template apply<
-                false /*Seqlenq_mask*/,
-                false /*Seqlenk_mask*/,
-                Causal,
-                Local,
-                Contexual_mask,
-                false /*Has_targets*/,
-                Cross,
-                false /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          int const m_idx_min = m_block * kBlockM;
-          int const n_block_min_causal_local_mask =
-              std::max(n_block_min, m_idx_min / kBlockN);
-#pragma unroll 1
-          for (; n_block >= n_block_min_causal_local_mask; --n_block) {
-            fwd_step_intra_warp_pipeline(n_block, mask_fn);
-          }
-        }
-      }
-      int n_block_min_before_local_mask = n_block_min;
-      if constexpr (Local) {
-        if (m_idx_max <=
-            cute::ceil_div(
-                seqlen_info.uihlen_q - min_full_attn_seq_len_, kBlockM) *
-                kBlockM) {
-          n_block_min_before_local_mask = std::max(
-              n_block_min, cute::ceil_div(m_idx_max - max_attn_len_, kBlockN));
-        }
-      }
-      auto no_mask_fn = [](auto& tSrS, int n_block) {};
-#pragma unroll 1
-      for (; n_block >= n_block_min_before_local_mask; --n_block) {
-        fwd_step_intra_warp_pipeline(n_block, no_mask_fn);
-      }
-      // Separate masking iterations on the left for local attention
-      if constexpr (Local) {
-        auto local_mask_fn = [&](auto& tSrS, int n_block) {
-          mask.template apply<
-              false /*Seqlenq_mask*/,
-              false /*Seqlenk_mask*/,
-              false /*Causal_mask*/,
-              Local,
-              Contexual_mask,
-              false /*Has_targets*/,
-              Cross,
-              false /*Softmax*/>(tSrS, m_block, n_block);
-        };
-#pragma unroll 1
-        for (; n_block >= n_block_min; --n_block) {
-          fwd_step_intra_warp_pipeline(n_block, local_mask_fn);
-        }
-      }
-      // Target part GEMM
-      if constexpr (Has_targets) {
-        auto [target_n_block_min, target_n_block_max] =
-            get_target_n_block_min_max(
-                n_block_max,
-                seqlen_info.uihlen_q,
-                seqlen_info.seqlen_kv,
-                m_block);
-        auto target_mask_fn = [&](auto& tSrS, int n_block) {
-          mask.template apply<
-              false /*Seqlenq_mask*/,
-              true /*Seqlenk_mask*/,
-              false /*Causal_mask*/,
-              false /*Local*/,
-              Contexual_mask,
-              Has_targets,
-              Cross,
-              false /*Softmax*/>(tSrS, m_block, n_block);
-        };
-#pragma unroll 1
-        for (n_block = target_n_block_max - 1; n_block >= target_n_block_min;
-             --n_block) {
-          fwd_step_intra_warp_pipeline(n_block, target_mask_fn);
-        }
-      }
-      if constexpr (Contexual_mask) {
-        int contexual_n_block_max = get_contexual_n_block_max(
-            n_block_min,
-            min_full_attn_seq_len_,
-            contextual_seq_len_,
-            seqlen_info.uihlen_q,
-            m_block);
-        auto contexual_mask_fn = [&](auto& tSrS, int n_block) {
-          mask.template apply<
-              false /*Seqlenq_mask*/,
-              true /*Seqlenk_mask*/,
-              false /*Causal_mask*/,
-              Local,
-              Contexual_mask,
-              Has_targets,
-              Cross,
-              false /*Softmax*/>(tSrS, m_block, n_block);
-        };
-#pragma unroll 1
-        for (n_block = contexual_n_block_max - 1; n_block >= 0; --n_block) {
-          fwd_step_intra_warp_pipeline(n_block, contexual_mask_fn);
-        }
-      }
-    }
-    // Tell producers that smem_q is ready
-    cutlass::arch::NamedBarrier::arrive(
-        NumMmaThreads + cutlass::NumThreadsPerWarp,
-        static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
-    consumer_wait(pipeline_v, smem_pipe_read);
-    hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
-        tiled_mma1,
-        cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
-        tOrV(_, _, _, smem_pipe_read.index()),
-        tOrO);
-    warpgroup_wait<0>();
-    pipeline_v.consumer_release(
-        smem_pipe_read); // release V, otherwise producers will hang
-    if constexpr (Is_FP8 && !V_colmajor) {
-      hstu::permute_output_fp8(tOrO);
-    }
-    ++smem_pipe_read;
-    ++work_idx;
-    return true;
-  }
-
-  template <typename SharedStorage, typename FrgTensorO, typename Softmax>
-  CUTLASS_DEVICE bool mma_softmax(
-      Params const& params,
-      MainloopPipelineK pipeline_k,
-      MainloopPipelineV pipeline_v,
-      PipelineState& smem_pipe_read,
-      FrgTensorO& tOrO,
-      Softmax& softmax,
-      int const thread_idx,
-      int& work_idx,
-      SeqlenInfo_t const& seqlen_info,
-      cute::tuple<int32_t, int32_t, int32_t, int32_t> block_coord,
-      SharedStorage& shared_storage) {
-    static_assert(
-        is_rmem<FrgTensorO>::value, "O tensor must be rmem resident.");
-    static constexpr int kBlockM = get<0>(TileShape_MNK{});
-    static constexpr int kBlockN = get<1>(TileShape_MNK{});
-
-    // can't use auto [m_block, ...] = block_coord since structured binding
-    // cannot be captured in lambda
-    int const m_block = get<0>(block_coord);
-    int const bidh = get<1>(block_coord);
-    int const bidb = get<2>(block_coord);
-    int const split_idx = get<3>(block_coord);
-    if constexpr (Jagged) {
-      static constexpr int kBlockM = get<0>(TileShape_MNK{});
-      if (m_block * kBlockM >= seqlen_info.seqlen_q) {
-        return false;
-      }
-    }
-    int min_full_attn_seq_len_, max_attn_len_, contextual_seq_len_;
-    if constexpr (!Cross) {
-      if (params.num_groups > 1) {
-        int group_id = bidb / params.batch_size_per_group;
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len_tensor[group_id];
-        max_attn_len_ = params.max_attn_len_tensor[group_id];
-        contextual_seq_len_ = params.contextual_seq_len_tensor[group_id];
-      } else {
-        min_full_attn_seq_len_ = params.min_full_attn_seq_len;
-        max_attn_len_ = params.max_attn_len;
-        contextual_seq_len_ = params.contextual_seq_len;
-      }
-    }
-    int n_block_min, n_block_max;
-    if constexpr (Cross) {
-      auto n_block_min_max = get_cross_n_block_min_max(
-          seqlen_info.uihlen_q,
-          seqlen_info.seqlen_q,
-          seqlen_info.seqlen_kv,
-          m_block);
-      n_block_min = get<0>(n_block_min_max);
-      n_block_max = get<1>(n_block_min_max);
-    } else {
-      auto n_block_min_max = get_n_block_min_max(
-          max_attn_len_,
-          min_full_attn_seq_len_,
-          contextual_seq_len_,
-          seqlen_info.uihlen_q,
-          m_block);
-      n_block_min = get<0>(n_block_min_max);
-      n_block_max = get<1>(n_block_min_max);
-    }
-
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-    if (n_block_max <= n_block_min) {
-      std::printf(
-          "mainloop_fwd_sm90: n_block_max <= n_block_min not expected.");
-      return false;
-    }
-#endif
-
-    Tensor sQ = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-        SmemLayoutQ{});
-    Tensor sK = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_k.data()),
-        SmemLayoutK{});
-    Tensor sV = make_tensor(
-        make_smem_ptr(shared_storage.tensors.mainloop.smem_v.data()),
-        SmemLayoutVtMma{});
-    Tensor sP = [&] {
-      if constexpr (Mma1_is_RS) {
-        // We might not have smem_p if !Mma1_is_RS1, just use smem_q as a
-        // placeholder since we don't use it
-        return make_tensor(
-            make_smem_ptr(shared_storage.tensors.mainloop.smem_q.data()),
-            SmemLayoutP{});
-      } else {
-        return make_tensor(
-            make_smem_ptr(shared_storage.tensors.mainloop.smem_p.data()),
-            SmemLayoutP{});
-      }
-    }();
-
-    if constexpr (!Mma0_is_RS) {
-      static_assert(
-          stride<0>(typename TiledMma0::ALayout{}) == 0 and
-              stride<0>(typename TiledMma0::BLayout{}) == 0 and
-              size<0>(typename TiledMma0::ALayout{}) ==
-                  cutlass::NumThreadsPerWarpGroup and
-              size<0>(typename TiledMma0::BLayout{}) ==
-                  cutlass::NumThreadsPerWarpGroup,
-          "Stride of the first mode must be 0 and the size of the mode must be NumThreadsPerWarpGroup");
-    }
-    constexpr int MmaWarpGroups =
-        size(TiledMma0{}) / cutlass::NumThreadsPerWarpGroup;
-    Layout warp_group_thread_layout = make_layout(
-        make_shape(Int<MmaWarpGroups>{}),
-        make_stride(Int<cutlass::NumThreadsPerWarpGroup>{}));
-
-    int warp_group_idx = __shfl_sync(
-        0xFFFFFFFF, thread_idx / cutlass::NumThreadsPerWarpGroup, 0);
-    TiledMma0 tiled_mma0;
-    TiledMma1 tiled_mma1;
-    auto wg_mma0 =
-        tiled_mma0.get_slice(warp_group_thread_layout(warp_group_idx));
-    auto wg_mma1 =
-        tiled_mma1.get_slice(warp_group_thread_layout(warp_group_idx));
-
-    auto smem_tiled_copy_P = make_tiled_copy_C(SmemCopyAtomP{}, tiled_mma0);
-    auto smem_thr_copy_P = smem_tiled_copy_P.get_thread_slice(thread_idx);
-
-    // Allocate "fragments/descriptors"
-    Tensor tSrQ = wg_mma0.partition_fragment_A(sQ);
-    Tensor tSrK = wg_mma0.partition_fragment_B(sK);
-    Tensor tOrV = wg_mma1.partition_fragment_B(sV);
-    Tensor tOsP = wg_mma1.partition_fragment_A(sP);
-    Tensor tPsP = smem_thr_copy_P.partition_D(
-        cute::as_position_independent_swizzle_tensor(sP));
-
-    auto consumer_wait = [](auto& pipeline, auto& smem_pipe_read) {
-      auto barrier_token = pipeline.consumer_try_wait(smem_pipe_read);
-      pipeline.consumer_wait(smem_pipe_read, barrier_token);
-    };
-
-    clear(tOrO);
-
-    int n_block = n_block_max - 1;
-
-    hstu::Mask<kBlockM, kBlockN, TiledMma0> mask(
-        thread_idx,
-        seqlen_info.seqlen_q,
-        seqlen_info.seqlen_kv,
-        max_attn_len_,
-        min_full_attn_seq_len_,
-        contextual_seq_len_,
-        seqlen_info.uihlen_q);
-
-    auto& barrier_Q = shared_storage.pipelines.barrier_Q;
-    barrier_Q.wait(work_idx % 2);
-    static constexpr int Qdim = 0;
-    auto thread_mma = tiled_mma0.get_thread_slice(thread_idx);
-    auto thread0_mma = tiled_mma0.get_thread_slice(_0{});
-    Tensor cS = cute::make_identity_tensor(Shape<Int<kBlockM>, Int<kBlockN>>{});
-    Tensor tScS = thread_mma.partition_C(cS);
-    Tensor tScS_rowcol = make_tensor(
-        tScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tScS.layout()));
-    Tensor t0ScS = thread0_mma.partition_C(cS);
-    Tensor t0ScS_rowcol = make_tensor(
-        t0ScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(t0ScS.layout()));
-    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
-    int qdim_offset = params.scalar_scale
-        ? 0
-        : m_block * kBlockM + thread_qdim_offset + seqlen_info.offset_q;
-
-    if constexpr (Mma0_is_RS) {
-      using SmemCopyAtomQ = Copy_Atom<cute::SM75_U32x4_LDSM_N, Element>;
-      auto smem_tiled_copy_Q = make_tiled_copy_A(SmemCopyAtomQ{}, tiled_mma0);
-      auto smem_thr_copy_Q = smem_tiled_copy_Q.get_thread_slice(thread_idx);
-      Tensor tSrQ_copy_view = smem_thr_copy_Q.retile_D(tSrQ);
-      Tensor tSsQ_copy_view = smem_thr_copy_Q.partition_S(
-          cute::as_position_independent_swizzle_tensor(sQ));
-      cute::copy(smem_tiled_copy_Q, tSsQ_copy_view, tSrQ_copy_view);
-    }
-
-    Tensor tSrS =
-        partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
-    Tensor tSrS_rowcol = make_tensor(
-        tSrS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
-    consumer_wait(pipeline_k, smem_pipe_read);
-    hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
-        tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
-    warpgroup_wait<0>();
-    pipeline_k.consumer_release(smem_pipe_read);
-    int const m_idx_max = (m_block + 1) * kBlockM;
-    if constexpr (Cross) {
-      mask.template apply<
-          false /*Seqlenq_mask*/,
-          true /*Seqlenk_mask*/,
-          Causal,
-          false /*Local*/,
-          false /*Contexual_mask*/,
-          false /*Target_mask*/,
-          Cross,
-          true /*Softmax*/>(tSrS, m_block, n_block);
-    } else {
-      if (m_idx_max <= seqlen_info.uihlen_q) {
-        mask.template apply<
-            false /*Seqlenq_mask*/,
-            false /*Seqlenk_mask*/,
-            Causal,
-            Local,
-            Contexual_mask,
-            false /*Target_mask*/,
-            Cross,
-            true /*Softmax*/>(tSrS, m_block, n_block);
-      } else if (
-          m_idx_max <=
-          cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
-        mask.template apply<
-            false /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            Causal,
-            Local,
-            Contexual_mask,
-            Has_targets,
-            Cross,
-            true /*Softmax*/>(tSrS, m_block, n_block);
-      } else {
-        mask.template apply<
-            false /*Seqlenq_mask*/,
-            true /*Seqlenk_mask*/,
-            false /*Causal*/,
-            false,
-            Contexual_mask,
-            Has_targets,
-            Cross,
-            true /*Softmax*/>(tSrS, m_block, n_block);
-      }
-    }
-    Tensor scores_scale = softmax.template max_get_scale<
-        /*Is_first=*/true,
-        /*Check_inf=*/true>(tSrS);
-    softmax.template online_softmax</*Is_first=*/true, /*Check_inf=*/true>(
-        tSrS);
-    if constexpr (Is_FP8 && !V_colmajor) {
-      hstu::permute_Cregs_fp8(tSrS);
-    }
-    Tensor tOrP_acc = make_tensor(
-        tSrS.data(), hstu::convert_layout_acc_Aregs<TiledMma1>(tSrS.layout()));
-    Tensor tOrP = make_tensor_like<Element>(tOrP_acc);
-    convert_type_out(tOrP_acc, tOrP);
-    if constexpr (Is_FP8 && V_colmajor) {
-      hstu::permute_Aregs_fp8(tOrP);
-    }
-    if constexpr (!Mma1_is_RS) {
-      cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
-      cutlass::arch::fence_view_async_shared();
-      __syncwarp(); // Only need syncwarp since each warp is using its own P
-                    // values for Mma1
-    }
-    --n_block;
-
-    // Each step does gemm0 and softmax for iter n_block and gemm1 for prev
-    auto fwd_step_intra_warp_pipeline = [&](int const n_block,
-                                            auto mask_fn,
-                                            auto check_inf_type) {
-      static constexpr bool Check_inf = decltype(check_inf_type)::value;
-      PipelineState smem_pipe_read_v(
-          smem_pipe_read.index(),
-          smem_pipe_read.phase(),
-          smem_pipe_read.count());
-      ++smem_pipe_read;
-      Tensor tSrS =
-          partition_fragment_C(tiled_mma0, select<0, 1>(TileShape_MNK{}));
-      Tensor tSrS_rowcol = make_tensor(
-          tSrS.data(),
-          hstu::convert_layout_acc_rowcol</*Transposed=*/false>(tSrS.layout()));
-      if (!UseSchedulerBarrier || warp_group_idx == 0) {
-        consumer_wait(pipeline_k, smem_pipe_read);
-      }
-      warp_scheduler_barrier_sync();
-      hstu::gemm</*zero_init=*/true, /*wg_wait=*/-1>(
-          tiled_mma0, tSrQ, tSrK(_, _, _, smem_pipe_read.index()), tSrS);
-      if (!UseSchedulerBarrier || warp_group_idx == 0) {
-        consumer_wait(pipeline_v, smem_pipe_read_v);
-      }
-      hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
-          tiled_mma1,
-          cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
-          tOrV(_, _, _, smem_pipe_read_v.index()),
-          tOrO);
-      warp_scheduler_barrier_arrive();
-      warpgroup_wait<1>();
-      pipeline_k.consumer_release(smem_pipe_read); // release K
-      mask_fn(tSrS, n_block);
-      cute::copy(
-          softmax.template max_get_scale</*Is_first=*/false, Check_inf>(tSrS),
-          scores_scale);
-      softmax.template online_softmax</*Is_first=*/false, Check_inf>(tSrS);
-      warpgroup_wait<0>();
-      pipeline_v.consumer_release(smem_pipe_read_v); // release V
-      if constexpr (Is_FP8 && !V_colmajor) {
-        hstu::permute_Cregs_fp8(tSrS);
-      }
-      convert_type_out(make_tensor(tSrS.data(), tOrP.layout()), tOrP);
-      if constexpr (Is_FP8 && V_colmajor) {
-        hstu::permute_Aregs_fp8(tOrP);
-      }
-      softmax.rescale_o(tOrO, scores_scale);
-      if constexpr (!Mma1_is_RS) {
-        cute::copy(smem_tiled_copy_P, smem_thr_copy_P.retile_S(tOrP), tPsP);
-      }
-      if constexpr (!Mma1_is_RS) {
-        cutlass::arch::fence_view_async_shared();
-        __syncwarp();
-      }
-    };
-
-    if constexpr (Cross) {
-      if constexpr (Causal) {
-        if (m_idx_max <=
-            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
-          auto mask_fn = [&](auto& tSrS, int n_block) {
-            mask.template apply<
-                false /*Seqlenq_mask*/,
-                false /*Seqlenk_mask*/,
-                Causal,
-                false /*Local*/,
-                false /*Contexual_mask*/,
-                false /*Target_mask*/,
-                Cross,
-                true /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          int const m_idx_min = m_block * kBlockM;
-          int const n_block_min_causal_mask = std::max(
-              n_block_min,
-              (m_idx_min + seqlen_info.seqlen_kv - seqlen_info.uihlen_q) /
-                  kBlockN);
-#pragma unroll 1
-          for (; n_block >= n_block_min_causal_mask; --n_block) {
-            fwd_step_intra_warp_pipeline(n_block, mask_fn, cute::true_type{});
-          }
-        }
-      }
-      auto no_mask_fn = [](auto& tSrS, int n_block) {};
-#pragma unroll 1
-      for (; n_block >= n_block_min; --n_block) {
-        fwd_step_intra_warp_pipeline(n_block, no_mask_fn, cute::false_type{});
-      }
-    } else {
-      if constexpr (Causal || Local) { // Separate iterations with causal
-                                       // or local masking
-        if (m_idx_max <=
-            cute::ceil_div(seqlen_info.uihlen_q, kBlockM) * kBlockM) {
-          auto mask_fn = [&](auto& tSrS, int n_block) {
-            mask.template apply<
-                false /*Seqlenq_mask*/,
-                false /*Seqlenk_mask*/,
-                Causal,
-                Local,
-                Contexual_mask,
-                false /*Has_targets*/,
-                Cross,
-                true /*Softmax*/>(tSrS, m_block, n_block);
-          };
-          int const m_idx_min = m_block * kBlockM;
-          int const n_block_min_causal_local_mask =
-              std::max(n_block_min, m_idx_min / kBlockN);
-#pragma unroll 1
-          for (; n_block >= n_block_min_causal_local_mask; --n_block) {
-            fwd_step_intra_warp_pipeline(n_block, mask_fn, cute::true_type{});
-          }
-        }
-      }
-      int n_block_min_before_local_mask = n_block_min;
-      if constexpr (Local) {
-        if (m_idx_max <=
-            cute::ceil_div(
-                seqlen_info.uihlen_q - min_full_attn_seq_len_, kBlockM) *
-                kBlockM) {
-          n_block_min_before_local_mask = std::max(
-              n_block_min, cute::ceil_div(m_idx_max - max_attn_len_, kBlockN));
-        }
-      }
-      auto no_mask_fn = [](auto& tSrS, int n_block) {};
-#pragma unroll 1
-      for (; n_block >= n_block_min_before_local_mask; --n_block) {
-        fwd_step_intra_warp_pipeline(n_block, no_mask_fn, cute::false_type{});
-      }
-      // Separate masking iterations on the left for local attention
-      if constexpr (Local) {
-        auto local_mask_fn = [&](auto& tSrS, int n_block) {
-          mask.template apply<
-              false /*Seqlenq_mask*/,
-              false /*Seqlenk_mask*/,
-              false /*Causal_mask*/,
-              Local,
-              Contexual_mask,
-              false /*Has_targets*/,
-              Cross,
-              true /*Softmax*/>(tSrS, m_block, n_block);
-        };
-#pragma unroll 1
-        for (; n_block >= n_block_min; --n_block) {
-          fwd_step_intra_warp_pipeline(
-              n_block, local_mask_fn, cute::true_type{});
-        }
-      }
-      // Target part GEMM
-      if constexpr (Has_targets) {
-        auto [target_n_block_min, target_n_block_max] =
-            get_target_n_block_min_max(
-                n_block_max,
-                seqlen_info.uihlen_q,
-                seqlen_info.seqlen_kv,
-                m_block);
-        auto target_mask_fn = [&](auto& tSrS, int n_block) {
-          mask.template apply<
-              false /*Seqlenq_mask*/,
-              true /*Seqlenk_mask*/,
-              false /*Causal_mask*/,
-              false /*Local*/,
-              Contexual_mask,
-              Has_targets,
-              Cross,
-              true /*Softmax*/>(tSrS, m_block, n_block);
-        };
-#pragma unroll 1
-        for (n_block = target_n_block_max - 1; n_block >= target_n_block_min;
-             --n_block) {
-          fwd_step_intra_warp_pipeline(
-              n_block, target_mask_fn, cute::true_type{});
-        }
-      }
-      if constexpr (Contexual_mask) {
-        int contexual_n_block_max = get_contexual_n_block_max(
-            n_block_min,
-            min_full_attn_seq_len_,
-            contextual_seq_len_,
-            seqlen_info.uihlen_q,
-            m_block);
-        auto contexual_mask_fn = [&](auto& tSrS, int n_block) {
-          mask.template apply<
-              false /*Seqlenq_mask*/,
-              true /*Seqlenk_mask*/,
-              false /*Causal_mask*/,
-              Local,
-              Contexual_mask,
-              Has_targets,
-              Cross,
-              true /*Softmax*/>(tSrS, m_block, n_block);
-        };
-#pragma unroll 1
-        for (n_block = contexual_n_block_max - 1; n_block >= 0; --n_block) {
-          fwd_step_intra_warp_pipeline(
-              n_block, contexual_mask_fn, cute::true_type{});
-        }
-      }
-    }
-    // Tell producers that smem_q is ready
-    cutlass::arch::NamedBarrier::arrive(
-        NumMmaThreads + cutlass::NumThreadsPerWarp,
-        static_cast<uint32_t>(FwdNamedBarriers::QueryEmpty) /*id*/);
-    consumer_wait(pipeline_v, smem_pipe_read);
-    hstu::gemm</*zero_init=*/false, /*wg_wait=*/-1>(
-        tiled_mma1,
-        cute::conditional_return<Mma1_is_RS>(tOrP, tOsP),
-        tOrV(_, _, _, smem_pipe_read.index()),
-        tOrO);
-    cute::copy(softmax.finalize(1.0f), scores_scale);
-    warpgroup_wait<0>();
-    pipeline_v.consumer_release(
-        smem_pipe_read); // release V, otherwise producers will hang
-    softmax.rescale_o(tOrO, scores_scale);
-    if constexpr (Is_FP8 && !V_colmajor) {
-      hstu::permute_output_fp8(tOrO);
-    }
-    ++smem_pipe_read;
-    ++work_idx;
-    return true;
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h
deleted file mode 100644
index e35af5193..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/mask.h
+++ /dev/null
@@ -1,396 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cute/tensor.hpp>
-
-#include "utils.h"
-
-namespace hstu {
-
-using namespace cute;
-
-template <int BlockM, int BlockN, typename TiledMma, bool SwapAB = false>
-struct Mask {
-  int const thread_idx;
-  int const max_q_len;
-  int const max_kv_len;
-  int const max_attn_len;
-  int const min_full_attn_seq_len;
-  int const contextual_seq_len;
-  int const max_uih_len;
-
-  CUTLASS_DEVICE
-  Mask(
-      const int thread_idx,
-      const int max_q_len,
-      const int max_kv_len,
-      const int max_attn_len,
-      const int min_full_attn_seq_len,
-      const int contextual_seq_len,
-      const int max_uih_len)
-      : thread_idx(thread_idx),
-        max_q_len(max_q_len),
-        max_kv_len(max_kv_len),
-        max_attn_len(max_attn_len),
-        min_full_attn_seq_len(min_full_attn_seq_len),
-        contextual_seq_len(contextual_seq_len),
-        max_uih_len(max_uih_len) {};
-
-  template <
-      bool Seqlenq_mask = false,
-      bool Seqlenk_mask = false,
-      bool Causal_mask = false,
-      bool Local_mask = false,
-      bool Contexual_mask = false,
-      bool Target_mask = false, // If Target_mask, Seqlenk_mask will be disabled
-      bool Cross = false,
-      bool Softmax = false,
-      typename Engine,
-      typename Layout>
-  CUTLASS_DEVICE void apply(
-      Tensor<Engine, Layout>& tSrS,
-      const int m_block,
-      const int n_block) const {
-    static_assert(
-        !(Causal_mask && Local_mask), "Cannot be both causal and local");
-    static_assert(Layout::rank == 3, "Only support 3D Tensor");
-    if constexpr (Cross) {
-      static_assert(
-          (!Local_mask) && (!Contexual_mask) && (!Target_mask),
-          "Local, contexual, and target masks not supported under cross attention");
-    }
-    if (!Seqlenq_mask && !Seqlenk_mask && !Causal_mask && !Local_mask &&
-        !Target_mask) {
-      return;
-    }
-
-    auto thread_mma = TiledMma{}.get_thread_slice(thread_idx);
-    auto thread0_mma = TiledMma{}.get_thread_slice(_0{});
-
-    static constexpr int Qdim = !SwapAB ? 0 : 1, Kdim = !SwapAB ? 1 : 0;
-
-    Tensor cS = cute::make_identity_tensor(
-        Shape<
-            Int<!SwapAB ? BlockM : BlockN>,
-            Int<!SwapAB ? BlockN : BlockM>>{});
-    Tensor tScS = thread_mma.partition_C(cS);
-    Tensor tSrS_rowcol = make_tensor(
-        tSrS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
-    Tensor tScS_rowcol = make_tensor(
-        tScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tScS.layout()));
-    Tensor t0ScS = thread0_mma.partition_C(cS);
-    Tensor t0ScS_rowcol = make_tensor(
-        t0ScS.data(),
-        hstu::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(t0ScS.layout()));
-    // We want to use the col indices of thread0 to compare, since that is known
-    // at compile time. So we subtract the limit by the first col index of this
-    // thread
-    int const thread_kdim_offset = get<Kdim>(tScS_rowcol(_0{}, _0{}));
-    int const thread_qdim_offset = get<Qdim>(tScS_rowcol(_0{}, _0{}));
-    int const seqlen_k_limit =
-        max_kv_len - n_block * BlockN - thread_kdim_offset;
-    int const uihlen_k_limit =
-        max_uih_len - n_block * BlockN - thread_kdim_offset;
-    int const seqlen_q_limit =
-        max_q_len - m_block * BlockM - thread_qdim_offset;
-    if constexpr (Seqlenq_mask) {
-#pragma unroll
-      for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
-        if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
-#pragma unroll
-          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-            if constexpr (Softmax) {
-              tSrS_rowcol(m, n) = -INFINITY;
-            } else {
-              tSrS_rowcol(m, n) = 0.0f;
-            }
-          }
-        }
-      }
-    }
-    if constexpr (Cross) {
-      if constexpr (!Causal_mask) {
-        if constexpr (Seqlenk_mask) {
-#pragma unroll
-          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-            int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
-            if (t0_col_idx >= seqlen_k_limit) {
-#pragma unroll
-              for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
-                if constexpr (Softmax) {
-                  tSrS_rowcol(m, n) = -INFINITY;
-                } else {
-                  tSrS_rowcol(m, n) = 0.0f;
-                }
-              }
-            }
-          }
-        }
-      } else {
-        int const causal_row_offset = max_kv_len - max_uih_len + 1 -
-            n_block * BlockN - thread_kdim_offset;
-#pragma unroll
-        for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
-          if constexpr (Seqlenq_mask) {
-            if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
-              continue;
-            }
-          }
-          int const row_idx = get<Qdim>(t0ScS_rowcol(m, _0{})) +
-              m_block * BlockM + thread_qdim_offset;
-          int const col_limit_right = !Seqlenk_mask
-              ? row_idx + causal_row_offset
-              : __viaddmin_s32(row_idx, causal_row_offset, seqlen_k_limit);
-#pragma unroll
-          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-            int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
-            if (t0_col_idx >= col_limit_right) {
-              if constexpr (Softmax) {
-                tSrS_rowcol(m, n) = -INFINITY;
-              } else {
-                tSrS_rowcol(m, n) = 0.0f;
-              }
-            }
-          }
-        }
-      }
-    } else {
-      if constexpr (!Causal_mask && !Local_mask) {
-        if constexpr (Seqlenk_mask || Target_mask) {
-#pragma unroll
-          for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-            int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
-            if constexpr (Target_mask) {
-              if (t0_col_idx >= uihlen_k_limit) {
-                bool const oob_predicate = (t0_col_idx >= seqlen_k_limit);
-                int const col_offset =
-                    t0_col_idx - seqlen_k_limit + seqlen_q_limit;
-#pragma unroll
-                for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
-                  int const t0_row_idx = int(get<Qdim>(t0ScS_rowcol(m, _0{})));
-                  if ((t0_row_idx != col_offset) || oob_predicate) {
-                    if constexpr (Softmax) {
-                      tSrS_rowcol(m, n) = -INFINITY;
-                    } else {
-                      tSrS_rowcol(m, n) = 0.0f;
-                    }
-                  }
-                }
-              }
-            } else if constexpr (Seqlenk_mask) {
-              if (t0_col_idx >= seqlen_k_limit) {
-#pragma unroll
-                for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
-                  if constexpr (Softmax) {
-                    tSrS_rowcol(m, n) = -INFINITY;
-                  } else {
-                    tSrS_rowcol(m, n) = 0.0f;
-                  }
-                }
-              }
-            }
-          }
-        }
-      } else { // Causal_mask or Local_mask
-        int const causal_row_offset = 1 - n_block * BlockN - thread_kdim_offset;
-        if constexpr (Causal_mask) {
-#pragma unroll
-          for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
-            if constexpr (Seqlenq_mask) {
-              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
-                continue;
-              }
-            }
-            if constexpr (Contexual_mask) {
-              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) <
-                  contextual_seq_len - m_block * BlockM - thread_qdim_offset) {
-#pragma unroll
-                for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-                  int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
-                  if (t0_col_idx >= uihlen_k_limit) {
-                    if constexpr (Softmax) {
-                      tSrS_rowcol(m, n) = -INFINITY;
-                    } else {
-                      tSrS_rowcol(m, n) = 0.0f;
-                    }
-                  }
-                }
-                continue;
-              }
-            }
-            int const row_idx = get<Qdim>(t0ScS_rowcol(m, _0{})) +
-                m_block * BlockM + thread_qdim_offset;
-            if constexpr (!Target_mask) {
-              int const col_limit_right = !Seqlenk_mask
-                  ? row_idx + causal_row_offset
-                  : __viaddmin_s32(row_idx, causal_row_offset, seqlen_k_limit);
-#pragma unroll
-              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
-                if (t0_col_idx >= col_limit_right) {
-                  if constexpr (Softmax) {
-                    tSrS_rowcol(m, n) = -INFINITY;
-                  } else {
-                    tSrS_rowcol(m, n) = 0.0f;
-                  }
-                }
-              }
-            } else {
-#pragma unroll
-              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
-                int const col_idx =
-                    t0_col_idx + n_block * BlockN + thread_kdim_offset;
-                bool const uih_cond =
-                    (t0_col_idx >= row_idx + causal_row_offset) &&
-                    (row_idx < max_uih_len);
-                bool const target_cond = (row_idx != col_idx) &&
-                    (row_idx >= max_uih_len) && (col_idx >= max_uih_len);
-                bool const seqlen_k_cond = (t0_col_idx >= seqlen_k_limit);
-                if (uih_cond || target_cond || seqlen_k_cond) {
-                  if constexpr (Softmax) {
-                    tSrS_rowcol(m, n) = -INFINITY;
-                  } else {
-                    tSrS_rowcol(m, n) = 0.0f;
-                  }
-                }
-              }
-            }
-          }
-        } else { // Local_mask
-          int const local_row_offset_left =
-              causal_row_offset - 1 - max_attn_len;
-          int const col_limit_sink = 0 - n_block * BlockN - thread_kdim_offset;
-#pragma unroll
-          for (int m = 0; m < size<0>(tSrS_rowcol); ++m) {
-            if constexpr (Seqlenq_mask) {
-              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) >= seqlen_q_limit) {
-                continue;
-              }
-            }
-            if constexpr (Contexual_mask) {
-              if (int(get<Qdim>(t0ScS_rowcol(m, _0{}))) <
-                  contextual_seq_len - m_block * BlockM - thread_qdim_offset) {
-#pragma unroll
-                for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-                  int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
-                  if (t0_col_idx >= uihlen_k_limit) {
-                    if constexpr (Softmax) {
-                      tSrS_rowcol(m, n) = -INFINITY;
-                    } else {
-                      tSrS_rowcol(m, n) = 0.0f;
-                    }
-                  }
-                }
-                continue;
-              }
-            }
-            int const row_idx = get<Qdim>(t0ScS_rowcol(m, _0{})) +
-                m_block * BlockM + thread_qdim_offset;
-            int col_limit_left = row_idx + local_row_offset_left;
-            if constexpr (Contexual_mask) {
-              // row contexual without sink
-              if (col_limit_left + n_block * BlockN + thread_kdim_offset <
-                  contextual_seq_len) {
-                col_limit_left = 0;
-              }
-            }
-            if constexpr (!Target_mask) {
-              int const col_limit_right = !Seqlenk_mask
-                  ? row_idx + causal_row_offset
-                  : __viaddmin_s32(row_idx, causal_row_offset, seqlen_k_limit);
-#pragma unroll
-              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(m, n)));
-                if (row_idx < max_uih_len - min_full_attn_seq_len) {
-                  bool const local_left_cond = Contexual_mask
-                      ? (t0_col_idx < col_limit_left &&
-                         t0_col_idx >= col_limit_sink)
-                      : (t0_col_idx < col_limit_left);
-                  if (local_left_cond) {
-                    if constexpr (Softmax) {
-                      tSrS_rowcol(m, n) = -INFINITY;
-                    } else {
-                      tSrS_rowcol(m, n) = 0.0f;
-                    }
-                  }
-                }
-                if (t0_col_idx >= col_limit_right) {
-                  if constexpr (Softmax) {
-                    tSrS_rowcol(m, n) = -INFINITY;
-                  } else {
-                    tSrS_rowcol(m, n) = 0.0f;
-                  }
-                }
-              }
-            } else {
-              int const col_limit_right = row_idx + causal_row_offset;
-#pragma unroll
-              for (int n = 0; n < size<1>(tSrS_rowcol); ++n) {
-                int const t0_col_idx = int(get<Kdim>(t0ScS_rowcol(_0{}, n)));
-                if (row_idx < max_uih_len) {
-                  if (row_idx < max_uih_len - min_full_attn_seq_len) {
-                    bool const local_left_cond = Contexual_mask
-                        ? (t0_col_idx < col_limit_left &&
-                           t0_col_idx >= col_limit_sink)
-                        : (t0_col_idx < col_limit_left);
-                    if (local_left_cond) {
-                      if constexpr (Softmax) {
-                        tSrS_rowcol(m, n) = -INFINITY;
-                      } else {
-                        tSrS_rowcol(m, n) = 0.0f;
-                      }
-                    }
-                  }
-                  if (t0_col_idx >= col_limit_right) {
-                    if constexpr (Softmax) {
-                      tSrS_rowcol(m, n) = -INFINITY;
-                    } else {
-                      tSrS_rowcol(m, n) = 0.0f;
-                    }
-                  }
-                } else {
-                  int const col_idx =
-                      t0_col_idx + n_block * BlockN + thread_kdim_offset;
-                  bool const target_cond = (row_idx != col_idx) &&
-                      (row_idx >= max_uih_len) && (col_idx >= max_uih_len);
-                  bool const seqlen_k_cond = (t0_col_idx >= seqlen_k_limit);
-                  if (target_cond || seqlen_k_cond) {
-                    if constexpr (Softmax) {
-                      tSrS_rowcol(m, n) = -INFINITY;
-                    } else {
-                      tSrS_rowcol(m, n) = 0.0f;
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  };
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h
deleted file mode 100644
index 79dce0dd9..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/named_barrier.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include "cutlass/arch/barrier.h"
-
-namespace hstu {
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// cutlass::arch::NamedBarrier::sync/arrive are only enabled Sm90 even though
-// they work for Sm80 as well. We reimplement them here, enabled for both Sm90
-// and Sm80.
-
-CUTLASS_DEVICE
-static void named_barrier_sync(uint32_t num_threads, uint32_t barrier_id_) {
-  static constexpr uint32_t ReservedNamedBarrierCount = static_cast<uint32_t>(
-      cutlass::arch::ReservedNamedBarriers::FirstUserBarrier);
-  uint32_t barrier_id = barrier_id_ + ReservedNamedBarrierCount;
-  asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
-  cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(
-      __LINE__, num_threads, barrier_id);
-}
-
-CUTLASS_DEVICE
-static void named_barrier_sync(
-    uint32_t num_threads,
-    cutlass::arch::ReservedNamedBarriers reserved_named_barriers) {
-  uint32_t barrier_id = static_cast<uint32_t>(reserved_named_barriers);
-  asm volatile("bar.sync %0, %1;" : : "r"(barrier_id), "r"(num_threads));
-  cutlass::arch::synclog_emit_named_barrier_arrive_and_wait(
-      __LINE__, num_threads, barrier_id);
-}
-
-CUTLASS_DEVICE
-static void named_barrier_arrive(uint32_t num_threads, uint32_t barrier_id_) {
-  static constexpr uint32_t ReservedNamedBarrierCount = static_cast<uint32_t>(
-      cutlass::arch::ReservedNamedBarriers::FirstUserBarrier);
-  uint32_t barrier_id = barrier_id_ + ReservedNamedBarrierCount;
-  cutlass::arch::synclog_emit_named_barrier_arrive(
-      __LINE__, num_threads, barrier_id);
-  asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
-}
-
-CUTLASS_DEVICE
-static void named_barrier_arrive(
-    uint32_t num_threads,
-    cutlass::arch::ReservedNamedBarriers reserved_named_barriers) {
-  uint32_t barrier_id = static_cast<uint32_t>(reserved_named_barriers);
-  cutlass::arch::synclog_emit_named_barrier_arrive(
-      __LINE__, num_threads, barrier_id);
-  asm volatile("bar.arrive %0, %1;" : : "r"(barrier_id), "r"(num_threads));
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-// Enumerates the reserved named barriers to avoid potential conflicts
-
-enum class FwdNamedBarriers {
-  QueryEmpty = 0,
-  ProducerWG = 1,
-  TileCountSmemEmpty = 2,
-  TileCountSmemFull = 3,
-  WarpSchedulerWG1 = 4,
-  WarpSchedulerWG2 = 5,
-  WarpSchedulerWG3 = 6,
-};
-
-enum class BwdNamedBarriers {
-  KVEmpty = 0,
-  PdS = 1,
-  // This needs to match FwdNamedBarriers::TileCountSmemEmpty since
-  // TileScheduler uses it
-  TileCountSmemEmpty = 2,
-  TileCountSmemFull = 3,
-  dQEmptyWG1 = 4,
-  dQEmptyWG2 = 5,
-  dQEmptyWG3 = 6,
-  dQFullWG1 = 7,
-  dQFullWG2 = 8,
-  dQFullWG3 = 9,
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h
deleted file mode 100644
index c5721b272..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/seqlen.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cute/tensor.hpp>
-
-namespace hstu {
-
-// We consolidate all the info related to sequence length here. This is so that
-// we can do all the gmem reads once at the beginning of each tile, rather than
-// having to repeat these reads to compute various things like n_block_min,
-// n_block_max, etc.
-
-template <bool Jagged, int kBlock>
-struct SeqlenInfo {
-  int const offset, offset_padded;
-  int const seqlen;
-
-  CUTLASS_DEVICE
-  SeqlenInfo(
-      int const bidb,
-      int const seqlen_static,
-      int const* const seq_offsets)
-      : offset(!Jagged ? 0 : seq_offsets[bidb]),
-        offset_padded(
-            !Jagged ? 0
-                    : (seq_offsets[bidb] + bidb * kBlock) / kBlock * kBlock),
-        seqlen(
-            !Jagged ? seqlen_static
-                    : (seq_offsets[bidb + 1] - seq_offsets[bidb])) {}
-};
-
-template <bool Jagged, bool Cross, bool Has_targets, int kBlockM>
-struct SeqlenInfoQKBwd {
-  int const offset_q, offset_k, offset_q_padded;
-  int const seqlen_q, seqlen_kv, uihlen_q;
-
-  CUTLASS_DEVICE
-  SeqlenInfoQKBwd(
-      int const bidb,
-      int const max_q_len,
-      int const max_kv_len,
-      int const* const seq_offsets,
-      int const* const seq_offsets_q,
-      int const* const num_targets)
-      : offset_q(
-            !Jagged ? 0 : (Cross ? seq_offsets_q[bidb] : seq_offsets[bidb])),
-        offset_k(!Jagged ? 0 : seq_offsets[bidb])
-        // If jagged, the layout for dQaccum is that we pad
-        // each sequence in the batch by an extra kBlockM, so that the write for
-        // each sequence doesn't touch the next sequence. Sequence i starts at
-        // seq_offsets[i] + i * kBlockM and ends at seq_offsets[i + 1] + i *
-        // kBlockM However, the start must align to multiples of kBlockM.
-        ,
-        offset_q_padded(
-            !Jagged ? 0
-                : Cross
-                ? ((seq_offsets_q[bidb] + bidb * kBlockM) / kBlockM * kBlockM)
-                : ((seq_offsets[bidb] + bidb * kBlockM) / kBlockM * kBlockM)),
-        seqlen_q(
-            !Jagged ? max_q_len
-                    : (Cross ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
-                             : (seq_offsets[bidb + 1] - seq_offsets[bidb]))),
-        seqlen_kv(
-            !Jagged ? max_kv_len : (seq_offsets[bidb + 1] - seq_offsets[bidb])),
-        uihlen_q(
-            !Jagged
-                ? (Has_targets ? max_q_len - num_targets[bidb] : max_q_len)
-                : (Has_targets
-                       ? (Cross ? (seq_offsets_q[bidb + 1] -
-                                   seq_offsets_q[bidb] - num_targets[bidb])
-                                : (seq_offsets[bidb + 1] - seq_offsets[bidb] -
-                                   num_targets[bidb]))
-                       : (Cross
-                              ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
-                              : (seq_offsets[bidb + 1] - seq_offsets[bidb])))) {
-  }
-};
-
-template <bool Jagged, bool Cross, bool Has_targets>
-struct SeqlenInfoQKFwd {
-  int const offset_q, offset_k;
-  int const seqlen_q, seqlen_kv, uihlen_q;
-
-  CUTLASS_DEVICE
-  SeqlenInfoQKFwd(
-      int const bidb,
-      int const max_q_len,
-      int const max_kv_len,
-      int const* const seq_offsets,
-      int const* const seq_offsets_q,
-      int const* const num_targets)
-      : offset_q(
-            !Jagged ? 0 : (Cross ? seq_offsets_q[bidb] : seq_offsets[bidb])),
-        offset_k(!Jagged ? 0 : seq_offsets[bidb]),
-        seqlen_q(
-            !Jagged ? max_q_len
-                    : (Cross ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
-                             : (seq_offsets[bidb + 1] - seq_offsets[bidb]))),
-        seqlen_kv(
-            !Jagged ? max_kv_len : (seq_offsets[bidb + 1] - seq_offsets[bidb])),
-        uihlen_q(
-            !Jagged
-                ? (Has_targets ? max_q_len - num_targets[bidb] : max_q_len)
-                : (Has_targets
-                       ? (Cross ? (seq_offsets_q[bidb + 1] -
-                                   seq_offsets_q[bidb] - num_targets[bidb])
-                                : (seq_offsets[bidb + 1] - seq_offsets[bidb] -
-                                   num_targets[bidb]))
-                       : (Cross
-                              ? (seq_offsets_q[bidb + 1] - seq_offsets_q[bidb])
-                              : (seq_offsets[bidb + 1] - seq_offsets[bidb])))) {
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h
deleted file mode 100644
index b6428e79c..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/sm90_pipeline_no_cluster.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cutlass/pipeline/sm90_pipeline.hpp>
-
-namespace cutlass {
-
-using namespace cute;
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// As of Cutlass v3.6.0, if size(ClusterShape) == 1, PipelineTmaAsync has all
-// threads signaling the barrier during consumer_release. This causes a perf
-// regression in FA3 forward pass (especially hdim 128 causal). We instead
-// reimplement the version of PipelineTmaAsync before v3.6.0 where only 1 out of
-// 128 threads signals the barrier.
-//
-// Assumption: params.num_consumers % NumThreadsPerWarpGroup == 0
-template <int Stages_, class Base = cutlass::PipelineTmaAsync<Stages_>>
-class PipelineTmaAsyncNoCluster : public Base {
- public:
-  using FullBarrier = typename Base::FullBarrier;
-  using EmptyBarrier = typename Base::EmptyBarrier;
-  static constexpr uint32_t Stages = Stages_;
-  using PipelineState = typename Base::PipelineState;
-
-  using SharedStorage = typename Base::SharedStorage;
-  using ThreadCategory = typename Base::ThreadCategory;
-  using Params = typename Base::Params;
-
-  static CUTLASS_DEVICE void init_barriers(
-      SharedStorage& storage,
-      Params params) {
-    int warp_idx = canonical_warp_idx_sync();
-    bool is_initializing_warp = (warp_idx == 0);
-    if (is_initializing_warp) {
-      // Barrier FULL and EMPTY init
-      constexpr int producer_arv_cnt = 1;
-      uint32_t const num_consumer_warpgroups_per_cluster =
-          params.num_consumers / NumThreadsPerWarpGroup;
-      uint32_t const multicast_consumer_arrival_count =
-          num_consumer_warpgroups_per_cluster;
-
-      cutlass::arch::detail::initialize_barrier_array_pair_aligned<
-          decltype(storage.full_barrier_),
-          decltype(storage.empty_barrier_),
-          Stages>(
-          storage.full_barrier_,
-          storage.empty_barrier_,
-          producer_arv_cnt,
-          multicast_consumer_arrival_count);
-    }
-    cutlass::arch::fence_barrier_init();
-  }
-
-  template <class ClusterShape, class InitBarriers, class InitMasks>
-  CUTLASS_DEVICE PipelineTmaAsyncNoCluster(
-      SharedStorage& storage,
-      Params params,
-      ClusterShape cluster_shape,
-      InitBarriers = {},
-      InitMasks = {})
-      : Base(
-            storage,
-            params,
-            make_shape(_1{}, _1{}, _1{}) /*cluster_shape*/,
-            cute::false_type{} /*init_barriers*/,
-            cute::false_type{} /*init_masks*/),
-        empty_barrier_ptr_(&storage.empty_barrier_[0]) {
-    int warp_idx = canonical_warp_idx_sync();
-    int lane_predicate = cute::elect_one_sync();
-
-    static_assert(
-        cute::is_same_v<InitBarriers, cute::true_type> ||
-        cute::is_same_v<InitBarriers, cute::false_type>);
-    static_assert(
-        cute::is_same_v<InitMasks, cute::true_type> ||
-        cute::is_same_v<InitMasks, cute::false_type>);
-    if constexpr (cute::is_same_v<InitBarriers, cute::true_type>) {
-      init_barriers(storage, params);
-    }
-  }
-
-  // Constructor
-  template <class ClusterShape>
-  CUTLASS_DEVICE PipelineTmaAsyncNoCluster(
-      SharedStorage& storage,
-      Params params,
-      ClusterShape cluster_shape)
-      : PipelineTmaAsyncNoCluster(
-            storage,
-            params,
-            cluster_shape,
-            cute::true_type{},
-            cute::true_type{}) {}
-
-  template <class ClusterShape, class InitBarriers>
-  CUTLASS_DEVICE PipelineTmaAsyncNoCluster(
-      SharedStorage& storage,
-      Params params,
-      ClusterShape cluster_shape,
-      InitBarriers = {})
-      : PipelineTmaAsyncNoCluster(
-            storage,
-            params,
-            cluster_shape,
-            InitBarriers{},
-            cute::true_type{}) {}
-
-  CUTLASS_DEVICE
-  void consumer_release(PipelineState state) {
-    consumer_release(state.index());
-  }
-
- private:
-  EmptyBarrier* const empty_barrier_ptr_ = nullptr;
-
-  // Consumer signalling Producer of completion
-  // Ensures all blocks in the Same Row and Column get notifed.
-  CUTLASS_DEVICE
-  void consumer_release(uint32_t stage, uint32_t skip = false) {
-    empty_barrier_ptr_[stage].arrive(
-        0 /*dst_blockid_*/,
-        uint32_t(threadIdx.x % cutlass::NumThreadsPerWarpGroup == 0) &
-            (!skip) /*is_signaling_thread*/);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // end namespace cutlass
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h
deleted file mode 100644
index 1bd6131c6..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/softmax.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <cute/tensor.hpp>
-#include <cutlass/numeric_types.h>
-#include <cmath>
-#include "utils.h"
-
-namespace hstu {
-
-using namespace cute;
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    bool zero_init = true,
-    typename Engine0,
-    typename Layout0,
-    typename Engine1,
-    typename Layout1,
-    typename Operator>
-__device__ __forceinline__ void thread_reduce_(
-    Tensor<Engine0, Layout0> const& tensor,
-    Tensor<Engine1, Layout1>& summary,
-    Operator& op) {
-  static_assert(Layout0::rank == 2, "Only support 2D Tensor");
-  static_assert(Layout1::rank == 1, "Only support 1D Tensor");
-  CUTE_STATIC_ASSERT_V(size<0>(summary) == size<0>(tensor));
-#pragma unroll
-  for (int ni = 0; ni < size<1>(tensor); ni++) {
-#pragma unroll
-    for (int mi = 0; mi < size<0>(tensor); mi++) {
-      summary(mi) = zero_init && ni == 0 ? tensor(mi, ni)
-                                         : op(summary(mi), tensor(mi, ni));
-    }
-  }
-}
-
-template <
-    typename Engine0,
-    typename Layout0,
-    typename Engine1,
-    typename Layout1,
-    typename Operator>
-__device__ __forceinline__ void quad_allreduce_(
-    Tensor<Engine0, Layout0>& dst,
-    Tensor<Engine1, Layout1>& src,
-    Operator& op) {
-  CUTE_STATIC_ASSERT_V(size(dst) == size(src));
-#pragma unroll
-  for (int i = 0; i < size(dst); i++) {
-    dst(i) = Allreduce<4>::run(src(i), op);
-  }
-}
-
-template <
-    bool zero_init = true,
-    typename Engine0,
-    typename Layout0,
-    typename Engine1,
-    typename Layout1,
-    typename Operator>
-__device__ __forceinline__ void reduce_(
-    Tensor<Engine0, Layout0> const& tensor,
-    Tensor<Engine1, Layout1>& summary,
-    Operator& op) {
-  thread_reduce_<zero_init>(tensor, summary, op);
-  quad_allreduce_(summary, summary, op);
-}
-
-template <
-    bool zero_init = true,
-    typename Engine0,
-    typename Layout0,
-    typename Engine1,
-    typename Layout1>
-__device__ __forceinline__ void reduce_max(
-    Tensor<Engine0, Layout0> const& tensor,
-    Tensor<Engine1, Layout1>& max) {
-  MaxOp<float> max_op;
-  reduce_<zero_init>(tensor, max, max_op);
-}
-
-template <
-    bool zero_init = true,
-    bool warp_reduce = true,
-    typename Engine0,
-    typename Layout0,
-    typename Engine1,
-    typename Layout1>
-__device__ __forceinline__ void reduce_sum(
-    Tensor<Engine0, Layout0> const& tensor,
-    Tensor<Engine1, Layout1>& sum) {
-  SumOp<float> sum_op;
-  thread_reduce_<zero_init>(tensor, sum, sum_op);
-  if constexpr (warp_reduce) {
-    quad_allreduce_(sum, sum, sum_op);
-  }
-}
-
-// Apply the exp to all the elements.
-template <
-    bool Scale_max = true,
-    bool Check_inf = true,
-    int Max_offset = 0,
-    typename Engine0,
-    typename Layout0,
-    typename Engine1,
-    typename Layout1>
-__forceinline__ __device__ void scale_apply_exp2(
-    Tensor<Engine0, Layout0>& tensor,
-    Tensor<Engine1, Layout1> const& max,
-    const float scale) {
-  // For FP8, we can subtract max by 8.0 so that the value after exp2 is in the
-  // range of [0, 256]. This lets us use more of the FP8 range (instead of just
-  // [0, 1]) to reduce underflow.
-  static constexpr float max_offset =
-      float(Max_offset); // We can only template on int, not float
-  static_assert(Layout0::rank == 2, "Only support 2D Tensor");
-  static_assert(Layout1::rank == 1, "Only support 1D Tensor");
-  CUTE_STATIC_ASSERT_V(size<0>(max) == size<0>(tensor));
-#pragma unroll
-  for (int mi = 0; mi < size<0>(tensor); ++mi) {
-    // If max is -inf, then all elements must have been -inf (possibly due to
-    // masking). We don't want (-inf - (-inf)) since that would give NaN.
-    const float max_scaled = Check_inf
-        ? (max(mi) == -INFINITY
-               ? 0.f
-               : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset)
-        : (!Scale_max ? max(mi) : max(mi) * scale) - max_offset;
-#pragma unroll
-    for (int ni = 0; ni < size<1>(tensor); ++ni) {
-      // Instead of computing exp(x - max), we compute exp2(x * log_2(e) -
-      // max * log_2(e)). This allows the compiler to use the ffma
-      // instruction instead of fadd and fmul separately.
-      tensor(mi, ni) = exp2f(tensor(mi, ni) * scale - max_scaled);
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <int kNRows, int Max_offset = 0>
-struct Softmax {
-  using TensorT = decltype(make_tensor<float>(Shape<Int<kNRows>>{}));
-  TensorT row_max, row_sum;
-  float const softmax_scale_log2;
-
-  CUTLASS_DEVICE Softmax(float const softmax_scale_log2_)
-      : softmax_scale_log2(softmax_scale_log2_) {};
-
-  template <bool Is_first, bool Check_inf = false, typename Tensor0>
-  __forceinline__ __device__ TensorT max_get_scale(Tensor0& acc_s) {
-    // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M),
-    // ncol=(2, V, MMA_N))
-    Tensor scores = make_tensor(
-        acc_s.data(), hstu::convert_layout_acc_rowcol(acc_s.layout()));
-    static_assert(CUTE_STATIC_V(size<0>(scores)) == kNRows);
-    TensorT scores_scale;
-    if constexpr (Is_first) {
-      hstu::template reduce_max</*zero_init=*/true>(scores, row_max);
-      cute::fill(scores_scale, 1.f);
-    } else {
-      Tensor scores_max_prev = make_fragment_like(row_max);
-      cute::copy(row_max, scores_max_prev);
-      hstu::template reduce_max</*zero_init=*/false>(scores, row_max);
-#pragma unroll
-      for (int mi = 0; mi < size(row_max); ++mi) {
-        float scores_max_cur = !Check_inf
-            ? row_max(mi)
-            : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi));
-        scores_scale(mi) =
-            exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2);
-        row_sum(mi) *= scores_scale(mi);
-      }
-    }
-    return scores_scale;
-  };
-
-  template <bool Is_first, bool Check_inf = false, typename Tensor0>
-  __forceinline__ __device__ void online_softmax(Tensor0& acc_s) {
-    // Reshape acc_s from ((2, 2, V), MMA_M, MMA_N) to (nrow=(2, MMA_M),
-    // ncol=(2, V, MMA_N))
-    Tensor scores = make_tensor(
-        acc_s.data(), hstu::convert_layout_acc_rowcol(acc_s.layout()));
-    static_assert(CUTE_STATIC_V(size<0>(scores)) == kNRows);
-    hstu::template scale_apply_exp2</*Scale_max=*/true, Check_inf, Max_offset>(
-        scores, row_max, softmax_scale_log2);
-    // We don't do the reduce across threads here since we don't need to use the
-    // row_sum. We do that reduce at the end when we need to normalize the
-    // softmax.
-    hstu::reduce_sum</*zero_init=*/Is_first, /*warp_reduce=*/false>(
-        scores, row_sum);
-  };
-
-  __forceinline__ __device__ TensorT finalize(float const final_scale = 1.f) {
-    SumOp<float> sum_op;
-    quad_allreduce_(row_sum, row_sum, sum_op);
-    TensorT scores_scale;
-#pragma unroll
-    for (int mi = 0; mi < size(row_sum); ++mi) {
-      float sum = row_sum(mi);
-      float inv_sum = (sum == 0.f || sum != sum) ? 0.f : 1.f / sum;
-      scores_scale(mi) = inv_sum * final_scale;
-      // For FP8, we might have scaled the output of exp by 2**8 so we need to
-      // divide sum by that amount.
-      if constexpr (Max_offset != 0) {
-        static constexpr float sum_scale = 1.f / float(1 << Max_offset);
-        sum *= sum_scale;
-      }
-      row_sum(mi) = (sum == 0.f || sum != sum)
-          ? -INFINITY
-          : row_max(mi) * (softmax_scale_log2 * float(M_LN2)) + __logf(sum);
-    }
-    return scores_scale;
-  };
-
-  template <typename Tensor1>
-  __forceinline__ __device__ void rescale_o(
-      Tensor1& acc_o,
-      TensorT const& scores_scale) {
-    // Reshape acc_o from (MMA=4, MMA_M, MMA_K) to (nrow=(2, MMA_M), ncol=(2,
-    // MMA_K))
-    Tensor acc_o_rowcol = make_tensor(
-        acc_o.data(), hstu::convert_layout_acc_rowcol(acc_o.layout()));
-    static_assert(CUTE_STATIC_V(size<0>(acc_o_rowcol)) == kNRows);
-#pragma unroll
-    for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) {
-#pragma unroll
-      for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) {
-        acc_o_rowcol(mi, ni) *= scores_scale(mi);
-      }
-    }
-  };
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h
deleted file mode 100644
index c5759c9d2..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/static_switch.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Inspired by
-// https://github.com/NVIDIA/DALI/blob/main/include/dali/core/static_switch.h
-// and https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Dispatch.h
-
-#pragma once
-
-/// @param COND       - a boolean expression to switch by
-/// @param CONST_NAME - a name given for the constexpr bool variable.
-/// @param ...       - code to execute for true and false
-///
-/// Usage:
-/// ```
-/// BOOL_SWITCH(flag, BoolConst, [&] {
-///     some_function<BoolConst>(...);
-/// });
-/// ```
-//
-
-#define BOOL_SWITCH(COND, CONST_NAME, ...)      \
-  [&] {                                         \
-    if (COND) {                                 \
-      constexpr static bool CONST_NAME = true;  \
-      return __VA_ARGS__();                     \
-    } else {                                    \
-      constexpr static bool CONST_NAME = false; \
-      return __VA_ARGS__();                     \
-    }                                           \
-  }()
-
-#ifdef FLASHATTENTION_DISABLE_LOCAL
-#define CAUSAL_LOCAL_SWITCH(                                           \
-    CAUSAL_COND, LOCAL_COND, CAUSAL_CONST_NAME, LOCAL_CONST_NAME, ...) \
-  [&] {                                                                \
-    constexpr static bool LOCAL_CONST_NAME = false;                    \
-    if (CAUSAL_COND) {                                                 \
-      constexpr static bool CAUSAL_CONST_NAME = true;                  \
-      return __VA_ARGS__();                                            \
-    } else {                                                           \
-      constexpr static bool CAUSAL_CONST_NAME = false;                 \
-      return __VA_ARGS__();                                            \
-    }                                                                  \
-  }()
-#else
-#define CAUSAL_LOCAL_SWITCH(                                           \
-    CAUSAL_COND, LOCAL_COND, CAUSAL_CONST_NAME, LOCAL_CONST_NAME, ...) \
-  [&] {                                                                \
-    if (CAUSAL_COND) {                                                 \
-      constexpr static bool CAUSAL_CONST_NAME = true;                  \
-      constexpr static bool LOCAL_CONST_NAME = false;                  \
-      return __VA_ARGS__();                                            \
-    } else if (LOCAL_COND) {                                           \
-      constexpr static bool CAUSAL_CONST_NAME = false;                 \
-      constexpr static bool LOCAL_CONST_NAME = true;                   \
-      return __VA_ARGS__();                                            \
-    } else {                                                           \
-      constexpr static bool CAUSAL_CONST_NAME = false;                 \
-      constexpr static bool LOCAL_CONST_NAME = false;                  \
-      return __VA_ARGS__();                                            \
-    }                                                                  \
-  }()
-#endif
-
-#ifdef FLASHATTENTION_DISABLE_CLUSTER
-#define CLUSTER_SWITCH(COND, CONST_NAME, ...) \
-  [&] {                                       \
-    constexpr static bool CONST_NAME = false; \
-    return __VA_ARGS__();                     \
-  }()
-#else
-#define CLUSTER_SWITCH BOOL_SWITCH
-#endif
-
-// #ifdef FLASHATTENTION_DISABLE_SM8x
-#define ARCH_SWITCH(ARCH, ARCH_NAME, ...) \
-  [&] {                                   \
-    constexpr static int ARCH_NAME = 90;  \
-    return __VA_ARGS__();                 \
-  }()
-// #else
-// #define ARCH_SWITCH(ARCH, ARCH_NAME, ...)  \
-//   [&] {                                    \
-//     if (ARCH < 90) {                       \
-//       constexpr static int ARCH_NAME = 80; \
-//       return __VA_ARGS__();                \
-//     } else {                               \
-//       constexpr static int ARCH_NAME = 90; \
-//       return __VA_ARGS__();                \
-//     }                                      \
-//   }()
-// #endif
-
-#ifndef FLASHATTENTION_ENABLE_VCOLMAJOR
-#define VCOLMAJOR_SWITCH(COND, CONST_NAME, ...) \
-  [&] {                                         \
-    constexpr static bool CONST_NAME = false;   \
-    return __VA_ARGS__();                       \
-  }()
-#else
-#define VCOLMAJOR_SWITCH BOOL_SWITCH
-#endif
-
-#define HEADDIM_SWITCH(HEADDIM, ...)        \
-  [&] {                                     \
-    if (HEADDIM == 64) {                    \
-      constexpr static int kHeadSize = 64;  \
-      return __VA_ARGS__();                 \
-    } else if (HEADDIM == 96) {             \
-      constexpr static int kHeadSize = 96;  \
-      return __VA_ARGS__();                 \
-    } else if (HEADDIM == 128) {            \
-      constexpr static int kHeadSize = 128; \
-      return __VA_ARGS__();                 \
-    } else if (HEADDIM == 96) {             \
-      constexpr static int kHeadSize = 96;  \
-      return __VA_ARGS__();                 \
-    } else if (HEADDIM == 256) {            \
-      constexpr static int kHeadSize = 256; \
-      return __VA_ARGS__();                 \
-    }                                       \
-  }()
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h
deleted file mode 100644
index cde7837ce..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_scheduler.h
+++ /dev/null
@@ -1,616 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include "cute/tensor.hpp"
-#include "cutlass/arch/barrier.h"
-#include "cutlass/fast_math.h"
-
-#include "named_barrier.h"
-
-namespace hstu {
-
-///////////////////////////////////////////////////////////////////////////////
-
-// Host side kernel arguments
-struct TileSchedulerArguments {
-  int const num_blocks, num_head, num_batch;
-  int const max_seq_len, headdim,
-      element_size; // Used to calculate L2 swizzling
-  int* const tile_count_semaphore = nullptr;
-  int* const seq_offsets = nullptr;
-  int* const sort_by_length_indices = nullptr;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <
-    bool Jagged = false,
-    int kBlock = 128,
-    bool Sort_by_length_indices = false>
-class SingleTileScheduler {
- public:
-  using SharedStorage = int;
-
-  // Device side kernel params
-  struct Params {
-    int const num_blocks, num_head, num_batch;
-    int const max_seq_len;
-    int* const seq_offsets;
-    int* const sort_by_length_indices;
-  };
-
-  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
-    return {
-        args.num_blocks,
-        args.num_head,
-        args.num_batch,
-        args.max_seq_len,
-        !Jagged ? nullptr : args.seq_offsets,
-        !Sort_by_length_indices ? nullptr : args.sort_by_length_indices};
-  }
-
-  static dim3 get_grid_shape(Params const& params, int num_sm) {
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-    std::printf(
-        "SingleTileScheduler::get_grid_shape: %d, %d, %d\n",
-        params.num_blocks,
-        params.num_head,
-        params.num_batch);
-#endif
-    return {
-        uint32_t(params.num_blocks),
-        uint32_t(params.num_head),
-        uint32_t(params.num_batch)};
-  }
-
-  struct WorkTileInfo {
-    int block_idx = 0;
-    int bidh = 0;
-    int bidb = 0;
-    bool is_valid_tile = false;
-
-    CUTLASS_DEVICE
-    bool is_valid(Params const& params) const {
-      return is_valid_tile;
-    }
-
-    CUTLASS_DEVICE
-    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
-        Params const& params) const {
-      return {block_idx, bidh, bidb, 0 /*split_idx*/};
-    }
-  };
-
-  CUTLASS_DEVICE
-  SingleTileScheduler(SharedStorage* const smem_scheduler) {}
-
-  template <bool IsProducerWarp = false>
-  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
-    int bidb = int(blockIdx.z);
-    if constexpr (Sort_by_length_indices) {
-      bidb = params.sort_by_length_indices[bidb];
-    }
-    WorkTileInfo work_info{int(blockIdx.x), int(blockIdx.y), bidb, true};
-    if constexpr (Jagged) {
-      int seqlen =
-          (params.seq_offsets ? params.seq_offsets[work_info.bidb + 1] -
-                   params.seq_offsets[work_info.bidb]
-                              : params.max_seq_len);
-      work_info.is_valid_tile = work_info.block_idx * kBlock < seqlen;
-    }
-    return work_info;
-  }
-
-  CUTLASS_DEVICE
-  void init_consumer() const {}
-
-  CUTLASS_DEVICE
-  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
-      const {}
-
-  template <bool IsProducerWarp = false>
-  CUTLASS_DEVICE WorkTileInfo
-  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
-    return {-1, -1, -1, false};
-  }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-class StaticPersistentTileScheduler {
- public:
-  using SharedStorage = int;
-
-  // Device side kernel params
-  struct Params {
-    int total_blocks;
-    cutlass::FastDivmod m_block_divmod, head_divmod;
-    cutlass::FastDivmod nsplits_divmod;
-  };
-
-  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
-    return {
-        args.num_blocks * args.num_head * args.num_batch,
-        cutlass::FastDivmod(args.num_blocks),
-        cutlass::FastDivmod(args.num_head),
-        cutlass::FastDivmod(1)};
-  }
-
-  static dim3 get_grid_shape(Params const& params, int num_sm) {
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-    std::printf("StaticPersistentTileScheduler::get_grid_shape %d\n", num_sm);
-#endif
-    return {uint32_t(num_sm)};
-  }
-
-  struct WorkTileInfo {
-    int tile_idx;
-
-    CUTLASS_DEVICE
-    bool is_valid(Params const& params) const {
-      return tile_idx < params.total_blocks;
-    }
-
-    CUTLASS_DEVICE
-    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
-        Params const& params) const {
-      int block, bidh, bidb;
-      bidb = params.head_divmod.divmod(
-          bidh, params.m_block_divmod.divmod(block, tile_idx));
-      int split_idx = 0;
-      return {block, bidh, bidb, split_idx};
-    }
-  };
-
-  CUTLASS_DEVICE
-  StaticPersistentTileScheduler(SharedStorage* const smem_scheduler) {};
-
-  template <bool IsProducerWarp = false>
-  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
-    return {int(blockIdx.x)};
-  }
-
-  CUTLASS_DEVICE
-  void init_consumer() const {}
-
-  CUTLASS_DEVICE
-  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
-      const {}
-
-  template <bool IsProducerWarp = false>
-  CUTLASS_DEVICE WorkTileInfo
-  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
-    return {current_work.tile_idx + int(gridDim.x)};
-  }
-};
-
-template <
-    int NumMmaThreads = 2 * cutlass::NumThreadsPerWarpGroup,
-    int NumProducerThreads = cutlass::NumThreadsPerWarp,
-    bool WarpSpecialized = true>
-class DynamicPersistentTileScheduler {
-  // This scheduler targets the causal (or local) case where each tile takes
-  // different amount of time. We use longest-processing-time-first scheduling:
-  // the longest remaining tile is assigned to the first SM that's free.
-  // SM indicates they are free by incrementing a semaphore.
-  // However, we have to make sure K & V still fit into L2 cache, so we perform
-  // scheduling on "sections" of the head & batch dimension, each section
-  // consisting of e.g. 8 heads. This is the L2 swizzling part. The size of each
-  // section is precomputed based on the size of K & V and the L2 cache size.
-
-  static_assert(WarpSpecialized || NumProducerThreads == NumMmaThreads);
-  static constexpr int NumThreads =
-      WarpSpecialized ? NumMmaThreads + NumProducerThreads : NumMmaThreads;
-
- public:
-  using SharedStorage = int;
-
- protected:
-  SharedStorage* const tile_count_smem;
-
- public:
-  // Device side kernel params
-  struct Params {
-    int const total_blocks;
-    cutlass::FastDivmod const m_block_divmod, head_divmod;
-    cutlass::FastDivmod const l2_minor_divmod, l2_major_divmod;
-    cutlass::FastDivmod const l2_minor_residual_divmod;
-    int const num_hb_quotient;
-    int* const tile_count_semaphore;
-  };
-
-  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
-    int const size_one_kv_head =
-        args.max_seq_len * args.headdim * args.element_size * 2;
-    int const size_l2 = 32 * 1024 * 1024; // 32 MB for K & V
-    // Swizzle is the size of each "section". Round swizzle to a power of 2
-    // If not PackGQA already, the size of each section can increase by
-    // qhead_per_khead
-    int const swizzle = (1 << cutlass::find_log2(size_l2 / size_one_kv_head));
-    // If we're in the last section (called residual), we don't want to divide
-    // by swizzle. Instead we want to divide by the remainder.
-    int const num_hb_remainder = (args.num_head * args.num_batch) % swizzle;
-    int const num_split_blocks = args.num_blocks;
-    // printf("num_split_blocks = %d, num_head = %d, num_batch = %d, swizzle =
-    // %d, PackGQA = %d, qhead_per_khead = %d, num_hb_remainder = %d\n",
-    // num_split_blocks, args.num_head, args.num_batch, swizzle, int(PackGQA),
-    // args.qhead_per_khead, num_hb_remainder);
-    assert(args.tile_count_semaphore != nullptr);
-    return {
-        num_split_blocks * args.num_head * args.num_batch,
-        cutlass::FastDivmod(args.num_blocks),
-        cutlass::FastDivmod(args.num_head),
-        cutlass::FastDivmod(swizzle),
-        cutlass::FastDivmod(swizzle * num_split_blocks),
-        // don't divide by 0
-        cutlass::FastDivmod(num_hb_remainder > 0 ? num_hb_remainder : 1),
-        (args.num_head * args.num_batch) / swizzle,
-        args.tile_count_semaphore};
-  }
-
-  static dim3 get_grid_shape(Params const& params, int num_sm) {
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-    std::printf("DynamicPersistentTileScheduler::get_grid_shape %d\n", num_sm);
-#endif
-    return {uint32_t(num_sm)};
-  }
-
-  struct WorkTileInfo {
-    int tile_idx;
-
-    CUTLASS_DEVICE
-    bool is_valid(Params const& params) const {
-      return tile_idx < params.total_blocks;
-    }
-
-    CUTLASS_DEVICE
-    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
-        Params const& params) const {
-      int block, bidh, bidb;
-      int l2_mod, bidhb, bidhb_residual;
-      bidhb = params.l2_major_divmod.divmod(l2_mod, tile_idx);
-      // If we're in the last section (called residual), we don't want to divide
-      // by swizzle. Instead we want to divide by the remainder.
-      if (bidhb < params.num_hb_quotient) {
-        block = params.l2_minor_divmod.divmod(bidhb_residual, l2_mod);
-      } else {
-        block = params.l2_minor_residual_divmod.divmod(bidhb_residual, l2_mod);
-      }
-      bidb = params.head_divmod.divmod(
-          bidh, bidhb * params.l2_minor_divmod.divisor + bidhb_residual);
-      int split_idx = 0;
-      // Longest-processing-time-first
-      block = params.m_block_divmod.divisor - 1 - block;
-      return {block, bidh, bidb, split_idx};
-    }
-  };
-
-  CUTLASS_DEVICE
-  DynamicPersistentTileScheduler(SharedStorage* const smem_scheduler)
-      : tile_count_smem(smem_scheduler) {};
-
-  template <bool IsProducerWarp = false>
-  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
-    return {int(blockIdx.x)};
-  }
-
-  CUTLASS_DEVICE
-  void init_consumer() const {
-    if (WarpSpecialized || cutlass::canonical_warp_idx_sync() > 0) {
-      hstu::named_barrier_arrive(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
-    }
-  }
-
-  CUTLASS_DEVICE
-  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
-      const {
-    if (threadIdx.x % NumProducerThreads == 0) {
-      current_work.tile_idx =
-          atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x);
-    }
-  }
-
-  template <bool IsProducerWarp = false>
-  CUTLASS_DEVICE WorkTileInfo
-  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
-    if constexpr (IsProducerWarp) {
-      // thread 0 already has the right tile_idx, just need to broadcast to the
-      // rest of warp 0
-      int new_tile_idx =
-          __shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/);
-      hstu::named_barrier_sync(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
-      if (threadIdx.x % NumProducerThreads == 0) {
-        *tile_count_smem = current_work.tile_idx;
-      }
-      hstu::named_barrier_arrive(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
-      return {new_tile_idx};
-    } else {
-      hstu::named_barrier_sync(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
-      int tile_idx = *tile_count_smem;
-      hstu::named_barrier_arrive(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
-      return {tile_idx};
-    }
-  }
-};
-
-template <
-    int kBlock,
-    int NumMmaThreads = 2 * cutlass::NumThreadsPerWarpGroup,
-    int NumProducerThreads = cutlass::NumThreadsPerWarp,
-    bool WarpSpecialized = true>
-class VarlenDynamicPersistentTileScheduler {
-  static_assert(WarpSpecialized || NumProducerThreads == NumMmaThreads);
-  static constexpr int NumThreads =
-      WarpSpecialized ? NumMmaThreads + NumProducerThreads : NumMmaThreads;
-
- public:
-  using SharedStorage = int4;
-
- protected:
-  SharedStorage* const work_info_smem;
-
- public:
-  // Device side kernel params
-  struct Params {
-    int num_head, num_batch;
-    int const max_seq_len;
-    cutlass::FastDivmod nsplits_divmod;
-    int* const tile_count_semaphore;
-    int* const seq_offsets;
-  };
-
-  static Params to_underlying_arguments(TileSchedulerArguments const& args) {
-    // If Split, for the purpose of scheduling, we pretend that instead there
-    // are (args.num_splits * args.num_head) number of heads.
-    assert(args.tile_count_semaphore != nullptr);
-    return {
-        args.num_head,
-        args.num_batch,
-        args.max_seq_len,
-        cutlass::FastDivmod(1),
-        args.tile_count_semaphore,
-        args.seq_offsets};
-  }
-
-  static dim3 get_grid_shape(Params const& params, int num_sm) {
-#ifdef HSTU_FLASH_ATTN_DEBUG_INFO
-    std::printf(
-        "VarlenDynamicPersistentTileScheduler::get_grid_shape %d\n", num_sm);
-#endif
-    return {uint32_t(num_sm)};
-  }
-
-  struct WorkTileInfo {
-    int tile_idx, block, bidh, bidb;
-
-    CUTLASS_DEVICE
-    bool is_valid(Params const& params) const {
-      // if (blockIdx.x >= 0 && (threadIdx.x == 128 || threadIdx.x == 0)) {
-      // printf("blockIdx.x = %d, threadIdx.x = %d, checking valid, bidb = %d,
-      // params.num_batch = %d\n", blockIdx.x, threadIdx.x, bidb,
-      // params.num_batch); }
-      return bidb < params.num_batch;
-    }
-
-    CUTLASS_DEVICE
-    cute::tuple<int32_t, int32_t, int32_t, int32_t> get_block_coord(
-        Params const& params) const {
-      return {block, bidh, bidb, 0 /*split_idx*/};
-    }
-  };
-
-  CUTLASS_DEVICE
-  VarlenDynamicPersistentTileScheduler(SharedStorage* const smem_scheduler)
-      : work_info_smem(smem_scheduler) {};
-
-  CUTLASS_DEVICE
-  WorkTileInfo tile_idx_to_work_tile(
-      Params const& params,
-      int next_tile_idx,
-      WorkTileInfo const& current_work) const {
-    auto prefix_sum = [](int val) {
-      auto lane = threadIdx.x % cutlass::NumThreadsPerWarp;
-      CUTLASS_PRAGMA_UNROLL
-      for (int i = 1; i < cutlass::NumThreadsPerWarp; i <<= 1) {
-        int32_t partial_sum = __shfl_up_sync(0xffffffff, val, i);
-        if (lane >= i) {
-          val += partial_sum;
-        }
-      }
-      return val;
-    };
-
-    auto get_num_m_blocks = [&](int bidb_start) {
-      auto lane = threadIdx.x % cutlass::NumThreadsPerWarp;
-      int seqlen;
-      if (params.seq_offsets) {
-        int cur_cu_seqlen = lane + bidb_start <= params.num_batch
-            ? params.seq_offsets[lane + bidb_start]
-            : 0;
-        int next_cu_seqlen = __shfl_down_sync(0xffffffff, cur_cu_seqlen, 1);
-        seqlen = next_cu_seqlen - cur_cu_seqlen;
-      } else {
-        seqlen = params.max_seq_len;
-      }
-      return lane + bidb_start < params.num_batch &&
-              lane < cutlass::NumThreadsPerWarp - 1
-          ? cute::ceil_div(seqlen, kBlock)
-          : 0;
-    };
-
-    int num_m_blocks =
-        get_num_m_blocks(current_work.bidb); // Different for each lane
-    // Cumulative number of blocks for the next 31 batches
-    int num_m_blocks_cumulative = prefix_sum(num_m_blocks);
-    // Total number of blocks for the next 31 batches
-    int m_blocks_in_group = __shfl_sync(
-        0xffffffff, num_m_blocks_cumulative, cutlass::NumThreadsPerWarp - 1);
-    int group_end_tile = current_work.tile_idx - current_work.block -
-        current_work.bidh * __shfl_sync(0xffffffff, num_m_blocks, 0 /*lane*/) +
-        m_blocks_in_group * params.num_head; // Same for all lanes
-    int bidb = current_work.bidb;
-    // if (blockIdx.x <= 9 && threadIdx.x == 0) {
-    //     printf("Before while, blockIdx.x = %d, threadIdx.x = %d, bidb = %d,
-    //     num_m_blocks = %d, next_tile_idx = %d, group_end_tile = %d,
-    //     m_blocks_in_group = %d\n", blockIdx.x, threadIdx.x, bidb,
-    //     num_m_blocks, next_tile_idx, group_end_tile, m_blocks_in_group);
-    // }
-    while (group_end_tile <= next_tile_idx) {
-      bidb += cutlass::NumThreadsPerWarp - 1;
-      if (bidb >= params.num_batch) {
-        // if (blockIdx.x <= 9 && threadIdx.x == 0) {
-        //     printf("Returning early, blockIdx.x = %d, threadIdx.x = %d, bidb
-        //     = %d, num_m_blocks = %d, next_tile_idx = %d, group_end_tile = %d,
-        //     m_blocks_in_group = %d\n", blockIdx.x, threadIdx.x, bidb,
-        //     num_m_blocks, next_tile_idx, group_end_tile, m_blocks_in_group);
-        // }
-        return {next_tile_idx, 0, 0, params.num_batch};
-      }
-      num_m_blocks = get_num_m_blocks(bidb);
-      num_m_blocks_cumulative = prefix_sum(num_m_blocks);
-      m_blocks_in_group = __shfl_sync(
-          0xffffffff, num_m_blocks_cumulative, cutlass::NumThreadsPerWarp - 1);
-      group_end_tile += m_blocks_in_group * params.num_head;
-      // if (blockIdx.x <= 9 && threadIdx.x == 0) {
-      //     printf("Bottom of while, blockIdx.x = %d, threadIdx.x = %d, bidb =
-      //     %d, num_m_blocks = %d, next_tile_idx = %d, group_end_tile = %d,
-      //     m_blocks_in_group = %d\n", blockIdx.x, threadIdx.x, bidb,
-      //     num_m_blocks, next_tile_idx, group_end_tile, m_blocks_in_group);
-      // }
-    }
-    int group_start_tile = group_end_tile - m_blocks_in_group * params.num_head;
-    // The next problem to process is the first one that does not have ending
-    // tile position that is greater than or equal to tile index.
-    int batch_idx_in_group = __popc(__ballot_sync(
-        0xffffffff,
-        group_start_tile + num_m_blocks_cumulative * params.num_head <=
-            next_tile_idx));
-    bidb += batch_idx_in_group;
-    num_m_blocks = __shfl_sync(0xffffffff, num_m_blocks, batch_idx_in_group);
-    int mh_block = next_tile_idx - group_start_tile -
-        (batch_idx_in_group == 0 ? 0
-                                 : __shfl_sync(
-                                       0xffffffff,
-                                       num_m_blocks_cumulative,
-                                       batch_idx_in_group - 1)) *
-            params.num_head;
-    int bidh = mh_block / num_m_blocks;
-    int block = mh_block - bidh * num_m_blocks;
-    // if (blockIdx.x <= 9 && threadIdx.x == 0) {
-    //     printf("blockIdx.x = %d, threadIdx.x = %d, batch_idx_in_group = %d,
-    //     bidb = %d, num_m_blocks = %d, next_tile_idx = %d, group_end_tile =
-    //     %d, m_blocks_in_group = %d, mh_block = %d, bidh = %d, block = %d\n",
-    //     blockIdx.x, threadIdx.x, batch_idx_in_group, bidb, num_m_blocks,
-    //     next_tile_idx, group_end_tile, m_blocks_in_group, mh_block, bidh,
-    //     block);
-    // }
-    return {next_tile_idx, block, bidh, bidb};
-  }
-
-  template <bool IsProducerWarp = false>
-  CUTLASS_DEVICE WorkTileInfo get_initial_work(Params const& params) const {
-    if constexpr (IsProducerWarp) {
-      WorkTileInfo work_info =
-          tile_idx_to_work_tile(params, int(blockIdx.x), {0, 0, 0, 0});
-      if (threadIdx.x % cutlass::NumThreadsPerWarp == 0) {
-        *work_info_smem = make_int4(
-            work_info.tile_idx,
-            work_info.block,
-            work_info.bidh,
-            work_info.bidb);
-      }
-      hstu::named_barrier_arrive(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
-      return work_info;
-    } else {
-      return get_next_work<false>(params, {0, 0, 0, 0});
-    }
-  }
-
-  CUTLASS_DEVICE
-  void init_consumer() const {
-    // Don't arrive at the TileCountSmemEmpty barrier here, because
-    // get_initial_work will do that
-  }
-
-  CUTLASS_DEVICE
-  void prefetch_next_work(Params const& params, WorkTileInfo& current_work)
-      const {
-    if (threadIdx.x % NumProducerThreads == 0) {
-      current_work.tile_idx =
-          atomicAdd(params.tile_count_semaphore, 1) + int(gridDim.x);
-    }
-  }
-
-  template <bool IsProducerWarp = false>
-  CUTLASS_DEVICE WorkTileInfo
-  get_next_work(Params const& params, WorkTileInfo const& current_work) const {
-    if constexpr (IsProducerWarp) {
-      // thread 0 has the next tile_idx, just need to broadcast to the rest of
-      // warp 0
-      int new_tile_idx =
-          __shfl_sync(0xffffffff, current_work.tile_idx, 0 /*lane*/);
-      WorkTileInfo work_info = {
-          __shfl_sync(0xffffffff, current_work.tile_idx, 1 /*lane*/),
-          current_work.block,
-          current_work.bidh,
-          current_work.bidb};
-      work_info = tile_idx_to_work_tile(params, new_tile_idx, work_info);
-      hstu::named_barrier_sync(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
-      if (threadIdx.x % cutlass::NumThreadsPerWarp == 0) {
-        *work_info_smem = make_int4(
-            work_info.tile_idx,
-            work_info.block,
-            work_info.bidh,
-            work_info.bidb);
-      }
-      hstu::named_barrier_arrive(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
-      return work_info;
-    } else {
-      hstu::named_barrier_sync(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemFull) /*id*/);
-      int4 work_info = *work_info_smem;
-      hstu::named_barrier_arrive(
-          NumThreads,
-          static_cast<uint32_t>(FwdNamedBarriers::TileCountSmemEmpty) /*id*/);
-      return WorkTileInfo{work_info.x, work_info.y, work_info.z, work_info.w};
-    }
-  }
-};
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h
deleted file mode 100644
index 3c8968bda..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/tile_size.h
+++ /dev/null
@@ -1,220 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Jay Shah, Ganesh Bikshandi, Ying Zhang, Vijay Thakkar,
- *Pradeep Ramani, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <tuple>
-
-namespace hstu {
-
-constexpr int kBlockM_bwd(
-    const int arch,
-    const int headdim,
-    const bool causal,
-    const bool is_local) {
-  int const kBlockM_sm90 = headdim <= 64
-      ? 64
-      : (headdim <= 96
-             ? 64
-             : (headdim <= 128 ? (causal || is_local ? 64 : 80) : 64));
-  int const kBlockM_sm80 = headdim <= 64 ? 128 : 64;
-  int const kBlockM = arch >= 90 ? kBlockM_sm90 : kBlockM_sm80;
-  return kBlockM;
-}
-
-constexpr int kBlockN_bwd(const int arch, const int headdim) {
-  int const kBlockN_sm90 = headdim <= 128 ? 128 : (headdim <= 192 ? 96 : 80);
-  int const kBlockN_sm80 = headdim <= 128 ? 128 : (headdim <= 192 ? 80 : 64);
-  int const kBlockN = arch >= 90 ? kBlockN_sm90 : kBlockN_sm80;
-  return kBlockN;
-}
-
-constexpr int NumMmaWarpGroups_bwd(const int arch, const int headdim) {
-  if (headdim <= 128) {
-    return 2;
-  } else if (headdim == 192) {
-    return arch >= 90 ? 3 : 2;
-  } else {
-    return 2;
-  }
-}
-
-constexpr bool V_in_regs_bwd(const int arch, const int headdim) {
-  if (arch >= 90 && headdim == 96) {
-    return true;
-  }
-  return false;
-}
-
-// Stages_dO, Stages_dS_or_QSm80
-constexpr std::tuple<int, int> Stages_bwd(const int arch, const int headdim) {
-  if (headdim <= 128) {
-    return {2, 2};
-  }
-  if (headdim == 192) {
-    if (arch >= 90) {
-      return {1, 1};
-    } else {
-      return {1, 2};
-    }
-  } else {
-    return {1, 1};
-  }
-}
-
-// AtomLayoutMSdP, AtomLayoutNdKV, AtomLayoutMdQ
-constexpr std::tuple<int, int, int> AtomLayout_bwd(
-    const int arch,
-    const int headdim) {
-  if (headdim <= 64) {
-    if (arch >= 90) {
-      return {1, 2, 1};
-    } else {
-      return {4, 4, 4};
-    }
-  } else if (headdim <= 96) {
-    if (arch >= 90) {
-      return {1, 2, 1};
-    } else {
-      return {2, 4, 2};
-    }
-  } else if (headdim <= 128) {
-    if (arch >= 90) {
-      return {1, 2, 1};
-    } else {
-      return {2, 2, 2};
-    }
-  } else {
-    if (arch >= 90) {
-      return {1, 1, 1};
-    } else {
-      return {4, 2, 2};
-    }
-  }
-}
-
-// SdP_swapAB, dKV_swapAB, dQ_swapAB
-constexpr std::tuple<bool, bool, bool> swapAB_bwd(
-    const int arch,
-    const int headdim,
-    const bool causal,
-    const bool local) {
-  if (headdim <= 96) {
-    return {arch >= 90 ? true : false, false, false};
-  } else if (headdim == 128) {
-    bool SdP_swapAB = arch >= 90 ? true : false;
-    bool dKV_swapAB = false;
-    bool dQ_swapAB = arch >= 90 ? ((causal || local) ? false : true) : false;
-    return {SdP_swapAB, dKV_swapAB, dQ_swapAB};
-  } else if (headdim == 192) {
-    return {false, true, false};
-  } else {
-    return {false, arch >= 90 ? true : false, arch >= 90 ? true : false};
-  }
-}
-
-// Return {kBlockM, kBlockN, Mma1_is_RS}
-constexpr std::tuple<int, int, bool> tile_size_fwd_sm90(
-    int headdim,
-    bool is_causal,
-    bool is_local,
-    int element_size = 2,
-    bool v_colmajor = false,
-    bool Cross = false,
-    bool Training = true) {
-  // for cross attention, q is usually much smaller than k/v, so we reduce the
-  // BlockM size to increase parallelism
-  bool small_blockm = Cross && (!Training);
-  if (element_size == 2) {
-    if (headdim <= 64) {
-      return {small_blockm ? 64 : 192, 128, true};
-      // Good for long seqlen (>= 4k) but suffers from tile quantization at
-      // short seqlen return {192, is_causal || is_local ? 192 : 176, true,
-      // false};
-    } else if (headdim <= 96) {
-      return {small_blockm ? 64 : 192, is_local ? 128 : 144, false};
-    } else if (headdim <= 128) {
-      return {small_blockm ? 64 : 128, is_causal || is_local ? 128 : 176, true};
-      // {128, 192, false, false} and {192, 128, false, true} are quite good too
-      // 128 x 192 hits the limit of smem if Mma1_is_RS, 128 x 144 hits the
-      // limit if !Mma1_is_RS
-    } else if (headdim <= 192) {
-      return {
-          small_blockm ? 64 : 128,
-          is_local ? 96 : 112,
-          true}; // 128 x 112 hits the limit of smem
-    } else {
-      return {
-          small_blockm ? 64 : 128,
-          is_local ? 64 : 80,
-          true}; // 128 x 80 hits the limit of smem
-    }
-  } else {
-    if (headdim <= 64) {
-      return {192, 160, true};
-    } else if (headdim <= 96) {
-      return {192, 128, true};
-    } else if (headdim <= 128) {
-      return {128, (v_colmajor ? 192 : 224), true};
-    } else if (headdim <= 192) {
-      return {128, 160, true};
-    } else {
-      return {128, is_local ? 64 : 128, true};
-    }
-  }
-}
-
-// Return {kBlockM, kBlockN, kNWarps, kStages, Q_in_regs}
-constexpr std::tuple<int, int, int, int, bool> tile_size_fwd_sm8x(
-    bool sm86_or_89,
-    int headdim,
-    bool is_causal,
-    bool is_local,
-    int element_size = 2) {
-  if (element_size == 2) {
-    if (headdim <= 64) {
-      return {128, (is_local ? 96 : 112), 4, 1, false};
-    } else if (headdim <= 96) {
-      return {128, is_local ? 48 : 64, 4, 1, false};
-    } else if (headdim <= 128) {
-      bool const use_8_warps = sm86_or_89;
-      return {
-          128,
-          use_8_warps ? (is_local ? 96 : 128) : (is_local ? 48 : 64),
-          use_8_warps ? 8 : 4,
-          1,
-          use_8_warps};
-    } else if (headdim <= 192) {
-      bool const kBlockN_64 = is_local;
-      return {128, kBlockN_64 ? 64 : 96, 8, sm86_or_89 ? 1 : 2, !kBlockN_64};
-    } else {
-      return {
-          128,
-          sm86_or_89 ? (is_local ? 48 : 64) : (is_local ? 64 : 96),
-          8,
-          1,
-          false};
-    }
-  } else {
-    // Placeholder for now
-    return {128, 64, 8, 2, false};
-  }
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h
deleted file mode 100644
index 50a065ed4..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/utils.h
+++ /dev/null
@@ -1,789 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/******************************************************************************
- * Copyright (c) 2024, Tri Dao.
- ******************************************************************************/
-
-#pragma once
-
-#include <assert.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include <cuda_fp16.h>
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
-#include <cuda_bf16.h>
-#endif
-
-#include <cute/tensor.hpp>
-#include <cutlass/gemm/device/gemm.h>
-
-#include <cutlass/array.h>
-#include <cutlass/cutlass.h>
-#include <cutlass/numeric_conversion.h>
-#include <cutlass/numeric_types.h>
-
-#define CHECK_CUDA(call)                \
-  do {                                  \
-    cudaError_t status_ = call;         \
-    if (status_ != cudaSuccess) {       \
-      fprintf(                          \
-          stderr,                       \
-          "CUDA error (%s:%d): %s\n",   \
-          __FILE__,                     \
-          __LINE__,                     \
-          cudaGetErrorString(status_)); \
-      exit(1);                          \
-    }                                   \
-  } while (0)
-
-#define CHECK_CUDA_KERNEL_LAUNCH() CHECK_CUDA(cudaGetLastError())
-
-#ifndef M_LOG2E
-#define M_LOG2E 1.44269504088896340735992468100 /* log_2 (e) */
-#endif
-
-namespace hstu {
-
-using namespace cute;
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// A wrapper for the kernel that is used to guard against compilation on
-// architectures that will never use the kernel. The purpose of this is to
-// reduce the size of the compiled binary.
-// Adapted from
-// https://github.com/vllm-project/vllm/blob/4d29e91be84d27ca313d657eee92c067439a4c23/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cuh#L55
-template <typename Kernel>
-struct enable_sm90_or_later : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
-    Kernel::operator()(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-template <typename Kernel>
-struct enable_sm80_to_sm89 : Kernel {
-  template <typename... Args>
-  CUTLASS_DEVICE void operator()(Args&&... args) {
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800) && (__CUDA_ARCH__ <= 890)
-    Kernel::operator()(std::forward<Args>(args)...);
-#endif
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct MaxOp {
-  __device__ __forceinline__ T operator()(T const& x, T const& y) {
-    return x > y ? x : y;
-  }
-};
-
-template <>
-struct MaxOp<float> {
-  // This is slightly faster
-  __device__ __forceinline__ float operator()(float const& x, float const& y) {
-    return max(x, y);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct SumOp {
-  __device__ __forceinline__ T operator()(T const& x, T const& y) {
-    return x + y;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename T>
-struct SiluScaleOp {
-  cutlass::epilogue::thread::SiLu<T> silu;
-  __device__ __forceinline__ T operator()(T const& t, T const& scale) {
-    float t2 = t / 2;
-    return t2 * (1 + cutlass::fast_tanh(t2)) *
-        scale; // __fdividef(t, 1.0f + cutlass::fast_exp(-t)) * scale
-  }
-};
-
-template <typename T, typename Engine, typename Layout>
-CUTLASS_DEVICE void inplace_silu_scale(
-    Tensor<Engine, Layout>& tensor,
-    T const& scale_before,
-    T const& scale_after) {
-  SiluScaleOp<T> silu_scale_op;
-#pragma unroll
-  for (int i = 0; i < size(tensor); ++i) {
-    tensor(i) = silu_scale_op(tensor(i) * scale_before, scale_after);
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <int THREADS>
-struct Allreduce {
-  static_assert(THREADS == 32 || THREADS == 16 || THREADS == 8 || THREADS == 4);
-  template <typename T, typename Operator>
-  static __device__ __forceinline__ T run(T x, Operator& op) {
-    constexpr int OFFSET = THREADS / 2;
-    x = op(x, __shfl_xor_sync(uint32_t(-1), x, OFFSET));
-    return Allreduce<OFFSET>::run(x, op);
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <>
-struct Allreduce<2> {
-  template <typename T, typename Operator>
-  static __device__ __forceinline__ T run(T x, Operator& op) {
-    x = op(x, __shfl_xor_sync(uint32_t(-1), x, 1));
-    return x;
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to (nrow=(2,
-// MMA_M), ncol=(2, MMA_N)). For SM90, convert acc_layout from ((2, 2, V),
-// MMA_M, MMA_N) to (nrow=(2, MMA_M), ncol=(2, V, MMA_N))
-template <bool Transposed = false, typename Layout0>
-CUTLASS_DEVICE auto convert_layout_acc_rowcol(Layout0 acc_layout) {
-  if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90
-    static_assert(decltype(size<0, 0>(acc_layout))::value == 2);
-    static_assert(decltype(size<0, 1>(acc_layout))::value == 2);
-    static_assert(decltype(rank(acc_layout))::value == 3);
-    auto l = acc_layout;
-    if constexpr (!Transposed) {
-      return make_layout(
-          make_layout(get<0, 1>(l), get<1>(l)),
-          make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l)));
-    } else {
-      return make_layout(
-          make_layout(get<0, 0>(l), get<0, 2>(l), get<2>(l)),
-          make_layout(get<0, 1>(l), get<1>(l)));
-    }
-
-  } else { // SM80
-    static_assert(decltype(size<0>(acc_layout))::value == 4);
-    static_assert(decltype(rank(acc_layout))::value == 3);
-    auto l = logical_divide(acc_layout, Shape<_2>{}); // ((2, 2), MMA_M, MMA_N)
-    if constexpr (!Transposed) {
-      return make_layout(
-          make_layout(get<0, 1>(l), get<1>(l)),
-          make_layout(get<0, 0>(l), get<2>(l)));
-    } else {
-      return make_layout(
-          make_layout(get<0, 0>(l), get<2>(l)),
-          make_layout(get<0, 1>(l), get<1>(l)));
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// For SM80, convert acc_layout from (MMA=4, MMA_M, MMA_N) to ((4, 2), MMA_M,
-// MMA_N / 2) if using m16n8k16, or to (4, MMA_M, MMA_N) if using m16n8k8. For
-// SM90, FP16/BF16, convert acc_layout from ((2, 2, N / 8), MMA_M, MMA_N) to
-// ((2, 2, 2), MMA_M, (N / 16, MMA_N)) For SM90, FP8, convert acc_layout from
-// ((2, 2, N / 8), MMA_M, MMA_N) to ((4, 2, 2), MMA_M, (N / 32, MMA_N))
-template <typename MMA_Traits, typename Layout0>
-CUTLASS_DEVICE auto convert_layout_acc_Aregs(Layout0 acc_layout) {
-  using X = Underscore;
-  if constexpr (decltype(rank<0>(acc_layout))::value == 3) { // SM90
-    static_assert(decltype(size<0, 0>(acc_layout))::value == 2);
-    static_assert(decltype(size<0, 1>(acc_layout))::value == 2);
-    static_assert(decltype(rank(acc_layout))::value == 3);
-    static_assert(decltype(rank(get<0>(acc_layout)))::value == 3);
-    if constexpr (sizeof(typename MMA_Traits::ValTypeA) == 2) {
-      auto l =
-          logical_divide(get<0, 2>(acc_layout), Tile<_2>{}); // ((2, N / 16))
-      return make_layout(
-          make_layout(
-              get<0, 0>(acc_layout), get<0, 1>(acc_layout), get<0, 0>(l)),
-          get<1>(acc_layout),
-          coalesce(make_layout(get<0, 1>(l), get<2>(acc_layout))));
-    } else {
-      static_assert(sizeof(typename MMA_Traits::ValTypeA) == 1);
-      static_assert(decltype(stride<0, 0>(acc_layout))::value == 1);
-      static_assert(decltype(stride<0, 1>(acc_layout))::value == 2);
-      auto l = logical_divide(
-          get<0, 2>(acc_layout),
-          Tile<Layout<Shape<_2, _2>>>{}); // (((2, 2), N / 32))
-      // This combines the first two modes (<0, 0> and <0, 1>) into one mode.
-      // Will require register shuffling later to be correct.
-      return make_layout(
-          make_layout(Layout<_4>{}, get<0, 0, 0>(l), get<0, 0, 1>(l)),
-          get<1>(acc_layout),
-          coalesce(make_layout(
-              get<0, 1>(l),
-              get<2>(acc_layout)))); // ((4, 2, 2), MMA_M, N / 32 * MMA_N)
-      // This combination is right but doesn't work with register shuffling.
-      // return make_layout(make_layout(coalesce(make_layout(get<0,
-      // 0>(acc_layout), get<0, 0, 0>(l))), get<0, 1>(acc_layout), get<0, 0,
-      // 1>(l)),
-      //                    get<1>(acc_layout),
-      //                    coalesce(make_layout(get<0, 1>(l),
-      //                    get<2>(acc_layout))));
-    }
-  } else { // SM80
-    static_assert(decltype(size<0>(acc_layout))::value == 4);
-    static_assert(decltype(rank(acc_layout))::value == 3);
-    constexpr int mma_shape_K = get<2>(typename MMA_Traits::Shape_MNK{});
-    static_assert(mma_shape_K == 8 || mma_shape_K == 16);
-    if constexpr (mma_shape_K == 8) {
-      return acc_layout;
-    } else {
-      auto l = logical_divide(
-          acc_layout, Shape<X, X, _2>{}); // (4, MMA_M, (2, MMA_N / 2)))
-      return make_layout(
-          make_layout(get<0>(l), get<2, 0>(l)), get<1>(l), get<2, 1>(l));
-    }
-  }
-};
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename To_type, typename Engine, typename Layout>
-CUTLASS_DEVICE auto convert_type_unsafe(Tensor<Engine, Layout> const& tensor) {
-  using From_type = typename Engine::value_type;
-  static constexpr int numel = decltype(size(tensor))::value;
-  cutlass::NumericArrayConverter<To_type, From_type, numel> convert_op;
-  // HACK: this requires tensor to be "contiguous"
-  auto frag =
-      convert_op(*reinterpret_cast<const cutlass::Array<From_type, numel>*>(
-          tensor.data()));
-  return make_tensor(make_rmem_ptr<To_type>(&frag), tensor.layout());
-  // Unsafe because we're returning a tensor with memory allocated on the
-  // stack. If the compiler does not inline this function, then the memory
-  // might not be valid.
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Engine, typename Layout, typename EngineOut>
-CUTLASS_DEVICE void convert_type_out(
-    Tensor<Engine, Layout> const& tensor,
-    Tensor<EngineOut, Layout>& out) {
-  // Somehow if we allocate out inside this function and return it, e2e is
-  // slower and the output can be wrong.
-  using From_type = typename Engine::value_type;
-  using To_type = typename EngineOut::value_type;
-  static constexpr int FragmentSize = std::max(
-      sizeof(From_type) / sizeof(To_type), sizeof(To_type) / sizeof(From_type));
-  static_assert(
-      CUTE_STATIC_V(size(tensor)) % FragmentSize == 0,
-      "Fragment size does not vectorize properly");
-  Tensor frag = recast<cutlass::Array<From_type, FragmentSize> const>(tensor);
-  Tensor out_frg = recast<cutlass::Array<To_type, FragmentSize>>(out);
-  static_assert(size(frag) == size(out_frg));
-  cutlass::NumericArrayConverter<To_type, From_type, FragmentSize> convert_op;
-#pragma unroll
-  for (int i = 0; i < size(frag); ++i) {
-    out_frg[i] = convert_op(frag[i]);
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Blocks until all but N previous cp.async.commit_group operations have
-// committed. This differs from cute::cp_async_wait in that when N = 0 we
-// don't call cp.async.wait_all (which is equivalent to commit_group then
-// wait_group 0). Instead we just call cp.async.wait_group 0, which is
-// slightly faster.
-// https://github.com/NVIDIA/cutlass/blob/master/include/cute/arch/copy_sm80.hpp#L113
-template <int N>
-CUTE_HOST_DEVICE void cp_async_wait() {
-#if defined(CUTE_ARCH_CP_ASYNC_SM80_ENABLED)
-  asm volatile("cp.async.wait_group %0;\n" ::"n"(N));
-#endif
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <bool A, class Mma, class Tensor0>
-CUTLASS_DEVICE auto mma_partition_fragment_AB(
-    Mma const& mma,
-    Tensor0 const& tensor0) {
-  if constexpr (A) {
-    return mma.partition_fragment_A(tensor0);
-  } else {
-    return mma.partition_fragment_B(tensor0);
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    bool zero_init = false,
-    int wg_wait = 0,
-    bool SwapAB = false,
-    int M_slice = -1,
-    typename Tensor0,
-    typename Tensor1,
-    typename Tensor2,
-    typename TiledMma>
-CUTLASS_DEVICE void gemm(
-    TiledMma& tiled_mma,
-    Tensor0 const& tCrA,
-    Tensor1 const& tCrB,
-    Tensor2& tCrC) {
-  if constexpr (M_slice >= 0) {
-    static constexpr int MMA_M = decltype(size<1>(tCrC))::value;
-    static_assert(M_slice < MMA_M);
-    // After logical_divide, C has shape ((2,2,V), (MMA_M, 1), MMA_N)
-    Tensor tCrC_slice =
-        cute::logical_divide(tCrC, Shape<cute::Underscore, Int<MMA_M>>{})(
-            _, make_coord(Int<M_slice>{}, _), _);
-    if constexpr (!SwapAB) {
-      Tensor tCrA_slice =
-          cute::logical_divide(tCrA, Shape<cute::Underscore, Int<MMA_M>>{})(
-              _, make_coord(Int<M_slice>{}, _), _);
-      gemm<zero_init, wg_wait, SwapAB, /*M_slice=*/-1>(
-          tiled_mma, tCrA_slice, tCrB, tCrC_slice);
-    } else {
-      Tensor tCrB_slice =
-          cute::logical_divide(tCrB, Shape<cute::Underscore, Int<MMA_M>>{})(
-              _, make_coord(Int<M_slice>{}, _), _);
-      gemm<zero_init, wg_wait, SwapAB, /*M_slice=*/-1>(
-          tiled_mma, tCrA, tCrB_slice, tCrC_slice);
-    }
-  } else {
-    constexpr bool Is_RS = !cute::is_base_of<
-        cute::GMMA::DescriptorIterator,
-        typename TiledMma::FrgTypeA>::value;
-    // Need to cast away const on tCrA since warpgroup_fence_operand doesn't
-    // take const
-    if constexpr (Is_RS) {
-      if constexpr (!SwapAB) {
-        warpgroup_fence_operand(const_cast<Tensor0&>(tCrA));
-      } else {
-        warpgroup_fence_operand(const_cast<Tensor1&>(tCrB));
-      }
-    }
-    warpgroup_fence_operand(tCrC);
-    warpgroup_arrive();
-    if constexpr (zero_init) {
-      tiled_mma.accumulate_ = GMMA::ScaleOut::Zero;
-    }
-    // Unroll the K mode manually to set scale D to 1
-    CUTLASS_PRAGMA_UNROLL
-    for (int k_block = 0; k_block < size<2>(tCrA); ++k_block) {
-      if constexpr (!SwapAB) {
-        cute::gemm(tiled_mma, tCrA(_, _, k_block), tCrB(_, _, k_block), tCrC);
-      } else {
-        cute::gemm(tiled_mma, tCrB(_, _, k_block), tCrA(_, _, k_block), tCrC);
-      }
-      tiled_mma.accumulate_ = GMMA::ScaleOut::One;
-    }
-    warpgroup_commit_batch();
-    if constexpr (wg_wait >= 0) {
-      warpgroup_wait<wg_wait>();
-    }
-    warpgroup_fence_operand(tCrC);
-    if constexpr (Is_RS) {
-      if constexpr (!SwapAB) {
-        warpgroup_fence_operand(const_cast<Tensor0&>(tCrA));
-      } else {
-        warpgroup_fence_operand(const_cast<Tensor1&>(tCrB));
-      }
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    bool A_in_regs = false,
-    bool B_in_regs = false,
-    bool SwapAB = false,
-    typename Tensor0,
-    typename Tensor1,
-    typename Tensor2,
-    typename Tensor3,
-    typename Tensor4,
-    typename TiledMma,
-    typename TiledCopyA,
-    typename TiledCopyB,
-    typename ThrCopyA,
-    typename ThrCopyB,
-    typename Hook>
-CUTLASS_DEVICE void gemm_sm80(
-    Tensor0& acc,
-    Tensor1& tCrA,
-    Tensor2& tCrB,
-    Tensor3 const& tCsA,
-    Tensor4 const& tCsB,
-    TiledMma tiled_mma,
-    TiledCopyA smem_tiled_copy_A,
-    TiledCopyB smem_tiled_copy_B,
-    ThrCopyA smem_thr_copy_A,
-    ThrCopyB smem_thr_copy_B,
-    Hook fn) {
-  if constexpr (SwapAB) {
-    gemm_sm80<B_in_regs, A_in_regs>(
-        acc,
-        tCrB,
-        tCrA,
-        tCsB,
-        tCsA,
-        tiled_mma,
-        smem_tiled_copy_B,
-        smem_tiled_copy_A,
-        smem_thr_copy_B,
-        smem_thr_copy_A,
-        fn);
-  } else {
-    CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc)); // MMA_M
-    CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc)); // MMA_N
-    CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB)); // MMA_K
-    Tensor tCrA_copy_view = smem_thr_copy_A.retile_D(tCrA);
-    CUTE_STATIC_ASSERT_V(size<1>(tCsA) == size<1>(tCrA_copy_view)); // M
-    Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
-    CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N
-    if (!A_in_regs) {
-      cute::copy(
-          smem_tiled_copy_A, tCsA(_, _, _0{}), tCrA_copy_view(_, _, _0{}));
-    }
-    if (!B_in_regs) {
-      cute::copy(
-          smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
-    }
-#pragma unroll
-    for (int i = 0; i < size<2>(tCrA); ++i) {
-      if (i < size<2>(tCrA) - 1) {
-        if (!A_in_regs) {
-          cute::copy(
-              smem_tiled_copy_A,
-              tCsA(_, _, i + 1),
-              tCrA_copy_view(_, _, i + 1));
-        }
-        if (!B_in_regs) {
-          cute::copy(
-              smem_tiled_copy_B,
-              tCsB(_, _, i + 1),
-              tCrB_copy_view(_, _, i + 1));
-        }
-      }
-      if constexpr (!std::is_same_v<Hook, std::nullptr_t>) {
-        if (i == 0) {
-          fn();
-        }
-      }
-      cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    typename Tensor0,
-    typename Tensor1,
-    typename Tensor2,
-    typename Tensor3,
-    typename TiledMma,
-    typename TiledCopy,
-    typename ThrCopy>
-CUTLASS_DEVICE void gemm_rs_sm80(
-    Tensor0& acc,
-    Tensor1& tCrA,
-    Tensor2& tCrB,
-    Tensor3 const& tCsB,
-    TiledMma tiled_mma,
-    TiledCopy smem_tiled_copy_B,
-    ThrCopy smem_thr_copy_B) {
-  CUTE_STATIC_ASSERT_V(size<1>(tCrA) == size<1>(acc)); // MMA_M
-  CUTE_STATIC_ASSERT_V(size<1>(tCrB) == size<2>(acc)); // MMA_N
-  CUTE_STATIC_ASSERT_V(size<2>(tCrA) == size<2>(tCrB)); // MMA_K
-  Tensor tCrB_copy_view = smem_thr_copy_B.retile_D(tCrB);
-  CUTE_STATIC_ASSERT_V(size<1>(tCsB) == size<1>(tCrB_copy_view)); // N
-  cute::copy(smem_tiled_copy_B, tCsB(_, _, _0{}), tCrB_copy_view(_, _, _0{}));
-#pragma unroll
-  for (int i = 0; i < size<2>(tCrA); ++i) {
-    if (i < size<2>(tCrA) - 1) {
-      cute::copy(
-          smem_tiled_copy_B, tCsB(_, _, i + 1), tCrB_copy_view(_, _, i + 1));
-    }
-    cute::gemm(tiled_mma, tCrA(_, _, i), tCrB(_, _, i), acc);
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <
-    bool Is_even_MN = true,
-    bool Is_even_K = true,
-    bool Clear_OOB_MN = false,
-    bool Clear_OOB_K = true,
-    class CopyAtom,
-    class TV,
-    class Tiler,
-    typename Engine0,
-    typename Layout0,
-    typename Engine1,
-    typename Layout1,
-    typename Engine2,
-    typename Layout2,
-    typename Engine3,
-    typename Layout3>
-CUTLASS_DEVICE void copy(
-    TiledCopy<CopyAtom, TV, Tiler> const& tiled_copy,
-    Tensor<Engine0, Layout0> const& S,
-    Tensor<Engine1, Layout1>& D,
-    Tensor<Engine2, Layout2> const& identity_MN,
-    Tensor<Engine3, Layout3> const& predicate_K,
-    const int max_MN = 0) {
-  // Decay TiledCopy to CopyAtom
-  auto copy_atom = static_cast<CopyAtom const&>(tiled_copy);
-  CUTE_STATIC_ASSERT_V(rank(S) == Int<3>{});
-  CUTE_STATIC_ASSERT_V(rank(D) == Int<3>{});
-  CUTE_STATIC_ASSERT_V(size<0>(S) == size<0>(D)); // MMA
-  CUTE_STATIC_ASSERT_V(size<1>(S) == size<1>(D)); // MMA_M
-  CUTE_STATIC_ASSERT_V(size<2>(S) == size<2>(D)); // MMA_K
-  // There's no case where !Clear_OOB_K && Clear_OOB_MN
-  static_assert(!(Clear_OOB_MN && !Clear_OOB_K));
-  auto has_with_bool = cute::is_valid(
-      [](auto t) -> void_t<decltype(declval<typename decltype(t)::Traits>()
-                                        .with(true))> {},
-      copy_atom);
-#pragma unroll
-  for (int m = 0; m < size<1>(S); ++m) {
-    bool predicate_mn =
-        Is_even_MN || get<0>(identity_MN(_0{}, m, _0{})) < max_MN;
-    if constexpr (Is_even_MN || !Clear_OOB_MN) {
-      if (Is_even_MN || predicate_mn) {
-#pragma unroll
-        for (int k = 0; k < size<2>(S); ++k) {
-          if constexpr (Is_even_K || !Clear_OOB_K) {
-            if (Is_even_K || predicate_K(k)) {
-              cute::copy(copy_atom, S(_, m, k), D(_, m, k));
-            }
-          } else { // Clear_OOB_K == true && Is_even_K == false
-            // If copy traits can be transformed with a predicate value, do
-            // it, otherwise branch here
-            if constexpr (has_with_bool) {
-              cute::copy(
-                  copy_atom.with(predicate_K(k)), S(_, m, k), D(_, m, k));
-            } else {
-              if (predicate_K(k)) {
-                cute::copy(copy_atom, S(_, m, k), D(_, m, k));
-              } else {
-                cute::clear(D(_, m, k));
-              }
-            }
-          }
-        }
-      }
-    } else { // Clear_OOB_MN == true && Is_even_MN == false, also implies
-             // Clear_OOB_K == true
-      if constexpr (!has_with_bool) {
-        if (predicate_mn) {
-#pragma unroll
-          for (int k = 0; k < size<2>(S); ++k) {
-            if (Is_even_K || predicate_K(k)) {
-              cute::copy(copy_atom, S(_, m, k), D(_, m, k));
-            } else if (Clear_OOB_K) {
-              cute::clear(D(_, m, k));
-            }
-          }
-        } else {
-          cute::clear(D(_, m, _));
-        }
-      } else { // combine the mn predicate with the k predicate
-#pragma unroll
-        for (int k = 0; k < size<2>(S); ++k) {
-          cute::copy(
-              copy_atom.with(predicate_mn && (Is_even_K || predicate_K(k))),
-              S(_, m, k),
-              D(_, m, k));
-        }
-      }
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-// Byte permute and shuffle to match register layout of
-// (FP8 downcasted) accumulator of GEMM-I to FP8 operand A of GEMM-II.
-template <typename Fragment>
-CUTLASS_DEVICE void permute_Aregs_fp8(Fragment& frag) {
-  // frag has shape ((4, 2, 2), MMA_M, MMA_N), each element is 8 bits
-  static_assert(decltype(size<0, 0>(frag))::value == 4);
-  static_assert(decltype(size<0, 1>(frag))::value == 2);
-  static_assert(decltype(stride<0, 0>(frag))::value == 1);
-  static_assert(decltype(stride<0, 1>(frag))::value == 4);
-  static_assert(sizeof(typename Fragment::value_type) == 1);
-
-  auto quad_idx = threadIdx.x % 4;
-  bool lane_03 = quad_idx == 0 || quad_idx == 3;
-  int selector_upper = lane_03 ? 0x5410 : 0x1054;
-  int selector_lower = lane_03 ? 0x7632 : 0x3276;
-
-  static constexpr int upper_map[4] = {0, 3, 1, 2};
-  // static constexpr int lower_map[4] = {1, 2, 0, 3};
-
-  Tensor frag_64b = recast<uint2>(frag); // ((1, 1, 2), MMA_M, MMA_N)
-#pragma unroll
-  for (int i = 0; i < size(frag_64b); ++i) {
-    uint32_t upper = frag_64b[i].x;
-    uint32_t lower = frag_64b[i].y;
-    uint32_t upper0 = lane_03 ? upper : lower;
-    uint32_t lower0 = lane_03 ? lower : upper;
-    upper0 = __shfl_sync(uint32_t(-1), upper0, upper_map[quad_idx], 4);
-    // lower0 = __shfl_sync(uint32_t(-1), lower0, lower_map[quad_idx], 4);
-    lower0 = __shfl_sync(uint32_t(-1), lower0, upper_map[quad_idx] ^ 1, 4);
-    frag_64b[i].x = __byte_perm(upper0, lower0, selector_upper);
-    frag_64b[i].y = __byte_perm(upper0, lower0, selector_lower);
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Fragment>
-CUTLASS_DEVICE void permute_Cregs_fp8(Fragment& frag) {
-  // frag has shape ((2, 2, N / 8), MMA_M, MMA_N), each element is 32 bits
-  static_assert(decltype(size<0, 0>(frag))::value == 2);
-  static_assert(decltype(size<0, 1>(frag))::value == 2);
-  static_assert(decltype(size<0, 2>(frag))::value % 2 == 0);
-  static_assert(decltype(stride<0, 0>(frag))::value == 1);
-  static_assert(sizeof(typename Fragment::value_type) == 4);
-  Tensor frag_64b =
-      group_modes<1, 3>(recast<uint2>(frag)); // ((1, 2, N / 8), (MMA_M, MMA_N))
-#pragma unroll
-  for (int mi = 0; mi < size<1>(frag_64b); ++mi) {
-#pragma unroll
-    for (int i = 0; i < size<0, 2>(frag_64b) / 2; ++i) {
-      cutlass::swap(
-          frag_64b(make_coord(_0{}, _1{}, 2 * i), mi),
-          frag_64b(make_coord(_0{}, _0{}, 2 * i + 1), mi));
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Fragment>
-CUTLASS_DEVICE void permute_output_fp8(Fragment& out) {
-  // out has shape ((2, 2, N / 8), MMA_M, MMA_N), each element is 32 bits
-  static_assert(decltype(size<0, 0>(out))::value == 2);
-  static_assert(decltype(size<0, 1>(out))::value == 2);
-  static_assert(decltype(size<0, 2>(out))::value % 2 == 0);
-  static_assert(decltype(stride<0, 0>(out))::value == 1);
-  static_assert(sizeof(typename Fragment::value_type) == 4);
-  Tensor frag = group_modes<1, 3>(out); // ((2, 2, N / 8), (MMA_M, MMA_N))
-#pragma unroll
-  for (int mi = 0; mi < size<1>(frag); ++mi) {
-#pragma unroll
-    for (int j = 0; j < size<0, 1>(frag); ++j) {
-#pragma unroll
-      for (int i = 0; i < size<0, 2>(frag) / 2; ++i) {
-        cutlass::swap(
-            frag(make_coord(_1{}, j, 2 * i), mi),
-            frag(make_coord(_0{}, j, 2 * i + 1), mi));
-      }
-    }
-  }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Fragment>
-CUTLASS_DEVICE void permute_output_fp8_Vcolmajor(Fragment& frag) {
-  // frag has shape ((2, 2, N / 8), MMA_M, MMA_N), each element is 16 bits
-  static_assert(decltype(size<0, 0>(frag))::value == 2);
-  static_assert(decltype(size<0, 1>(frag))::value == 2);
-  static_assert(decltype(stride<0, 0>(frag))::value == 1);
-  static_assert(
-      sizeof(typename Fragment::value_type) == 2 ||
-      sizeof(typename Fragment::value_type) == 4);
-
-  auto quad_idx = threadIdx.x % 4;
-  bool lane_03 = quad_idx == 0 || quad_idx == 3;
-
-  static constexpr int upper_map[4] = {0, 2, 3, 1};
-  // static constexpr int lower_map[4] = {2, 0, 1, 3};
-
-  // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(frag); }
-  using type2 = std::conditional_t<
-      sizeof(typename Fragment::value_type) == 2,
-      uint32_t,
-      uint64_t>;
-  Tensor frag_2 =
-      group_modes<1, 3>(recast<type2>(frag)); // ((1, 2, N / 8), (MMA_M, MMA_N))
-// if (blockIdx.x == 0 && threadIdx.x == 128) { print(frag); printf("\n");
-// print(frag_2); }
-#pragma unroll
-  for (int mi = 0; mi < size<1>(frag_2); ++mi) {
-#pragma unroll
-    for (int j = 0; j < size<0, 1>(frag_2); ++j) {
-#pragma unroll
-      for (int i = 0; i < size<0, 2>(frag_2) / 2; ++i) {
-        type2 upper = frag_2(make_coord(_0{}, j, 2 * i), mi);
-        type2 lower = frag_2(make_coord(_0{}, j, 2 * i + 1), mi);
-        type2 upper0 = lane_03 ? upper : lower;
-        type2 lower0 = lane_03 ? lower : upper;
-        upper0 = __shfl_sync(uint32_t(-1), upper0, upper_map[quad_idx], 4);
-        // lower0 = __shfl_sync(uint32_t(-1), lower0, lower_map[quad_idx], 4);
-        lower0 = __shfl_sync(uint32_t(-1), lower0, upper_map[quad_idx] ^ 2, 4);
-        frag_2(make_coord(_0{}, j, 2 * i), mi) = lane_03 ? upper0 : lower0;
-        frag_2(make_coord(_0{}, j, 2 * i + 1), mi) = lane_03 ? lower0 : upper0;
-      }
-    }
-  }
-  // if (blockIdx.x == 0 && threadIdx.x == 128) { print_tensor(frag); }
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-template <typename Engine, typename Layout>
-CUTLASS_DEVICE void apply_softcap(
-    Tensor<Engine, Layout>& tensor,
-    float const softcap) {
-#pragma unroll
-  for (int i = 0; i < size(tensor); ++i) {
-    tensor(i) = cutlass::fast_tanh(tensor(i) * softcap);
-  }
-}
-
-template <typename Engine, typename Layout>
-CUTLASS_DEVICE auto calculate_dtanh(Tensor<Engine, Layout>& tensor) {
-  Tensor out = make_fragment_like<float>(tensor);
-#pragma unroll
-  for (int i = 0; i < size(tensor); ++i) {
-    out(i) = 1.f - (tensor(i) * tensor(i));
-  }
-  return out;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-CUTLASS_DEVICE
-int canonical_warp_group_idx_nosync() {
-  return threadIdx.x / cutlass::NumThreadsPerWarpGroup;
-}
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt b/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt
deleted file mode 100644
index 04d34e1a3..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/hstu_attention/version.txt
+++ /dev/null
@@ -1 +0,0 @@
-5231d95
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp
deleted file mode 100644
index d3a2e9421..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cpp
+++ /dev/null
@@ -1,130 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/extension.h>
-#include <torch/library.h>
-
-#include "fbgemm_gpu/sparse_ops.h" // @manual
-
-namespace hstu {
-
-template <typename index_t, typename val_t>
-void _jagged_transpose_1d_cpu_kernel(
-    int32_t size1,
-    int32_t size2,
-    int32_t max_len,
-    const at::TensorAccessor<index_t, 1>& offsets,
-    const at::TensorAccessor<val_t, 1>& values,
-    const at::TensorAccessor<index_t, 1>& lengths,
-    const at::TensorAccessor<index_t, 1>& trans_offsets,
-    at::TensorAccessor<val_t, 1> trans_values) {
-  for (auto i : c10::irange(size1)) {
-    for (auto j : c10::irange(size2)) {
-      auto src_idx = i * size2 + j;
-      auto dst_idx = j * size1 + i;
-      auto src_offset = offsets[src_idx];
-      auto src_length = lengths[src_idx];
-      auto dst_offset = trans_offsets[dst_idx];
-
-      for (auto k = 0; k < src_length; ++k) {
-        trans_values[dst_offset + k] = values[src_offset + k];
-      }
-    }
-  }
-}
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cpu(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const at::Tensor& lengths,
-    const int64_t max_len,
-    const int64_t size1,
-    const int64_t size2) {
-  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(lengths.device().type() == at::DeviceType::CPU);
-  TORCH_CHECK(offsets.size(0) == size1 * size2 + 1);
-  TORCH_CHECK(lengths.size(0) == size1 * size2);
-
-  auto trans_lengths =
-      lengths.view({size1, size2}).transpose(0, 1).contiguous().view({-1});
-  auto trans_offsets =
-      fbgemm_gpu::asynchronous_complete_cumsum_cpu(trans_lengths);
-  auto L_out = trans_offsets[-1].item<int64_t>();
-  auto trans_values = at::empty({L_out}, values.options());
-
-  if (L_out == 0) {
-    return std::make_tuple(trans_values, trans_offsets, trans_lengths);
-  }
-
-  AT_DISPATCH_INTEGRAL_TYPES(
-      lengths.scalar_type(), "jagged_transpose_1d_cpu_kernel_input1", [&] {
-        using index_t = scalar_t;
-        AT_DISPATCH_ALL_TYPES_AND2(
-            at::ScalarType::BFloat16,
-            at::ScalarType::Half,
-            values.scalar_type(),
-            "jagged_transpose_1d_cpu_kernel_input2",
-            [&] {
-              using val_t = scalar_t;
-              _jagged_transpose_1d_cpu_kernel<index_t, val_t>(
-                  size1,
-                  size2,
-                  max_len,
-                  offsets.accessor<index_t, 1>(),
-                  values.accessor<val_t, 1>(),
-                  lengths.accessor<index_t, 1>(),
-                  trans_offsets.accessor<index_t, 1>(),
-                  trans_values.accessor<val_t, 1>());
-            });
-      });
-
-  return std::make_tuple(trans_values, trans_offsets, trans_lengths);
-}
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_meta(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const at::Tensor& lengths,
-    const int64_t max_len,
-    const int64_t size1,
-    const int64_t size2) {
-  auto trans_lengths =
-      lengths.view({size1, size2}).transpose(0, 1).contiguous().view({-1});
-  auto L_out = trans_lengths.sum().item<int64_t>();
-
-  auto trans_values = at::native::empty_meta_symint(
-      {L_out},
-      /*dtype=*/::std::make_optional(values.scalar_type()),
-      /*layout=*/::std::make_optional(values.layout()),
-      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
-      /*pin_memory=*/::std::nullopt);
-
-  auto trans_offsets = at::native::empty_meta_symint(
-      {size1 * size2 + 1},
-      /*dtype=*/::std::make_optional(lengths.scalar_type()),
-      /*layout=*/::std::make_optional(lengths.layout()),
-      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
-      /*pin_memory=*/::std::nullopt);
-
-  return std::make_tuple(trans_values, trans_offsets, trans_lengths);
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu b/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu
deleted file mode 100644
index 380100962..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/jagged_transpose_1d.cu
+++ /dev/null
@@ -1,127 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/library.h>
-
-#include "common.h"
-#include "fbgemm_gpu/sparse_ops.h" // @manual
-#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
-
-namespace hstu {
-
-static constexpr int32_t kMaxThreads = 1024;
-
-template <typename index_t, typename val_t>
-__global__ __launch_bounds__(kMaxThreads) void _jagged_transpose_1d_cuda_kernel(
-    int32_t size1,
-    int32_t size2,
-    int32_t max_len,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> offsets,
-    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits> lengths,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        trans_offsets,
-    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> trans_values) {
-  for (auto idx = blockIdx.x * blockDim.y + threadIdx.y;
-       idx < static_cast<uint32_t>(size1 * size2);
-       idx += gridDim.x * blockDim.y) {
-    auto i = idx / size2;
-    auto j = idx % size2;
-    auto src_idx = i * size2 + j;
-    auto dst_idx = j * size1 + i;
-    auto src_offset = offsets[src_idx];
-    auto src_length = lengths[src_idx];
-    auto dst_offset = trans_offsets[dst_idx];
-
-    for (auto k = threadIdx.x; k < static_cast<uint32_t>(src_length);
-         k += blockDim.x) {
-      trans_values[dst_offset + k] = values[src_offset + k];
-    }
-  }
-}
-
-std::tuple<at::Tensor, at::Tensor, at::Tensor> jagged_transpose_1d_cuda(
-    const at::Tensor& values,
-    const at::Tensor& offsets,
-    const at::Tensor& lengths,
-    const int64_t max_len,
-    const int64_t size1,
-    const int64_t size2) {
-  at::cuda::OptionalCUDAGuard device_guard;
-  device_guard.set_index(values.get_device());
-  TORCH_INTERNAL_ASSERT(values.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(offsets.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(lengths.device().type() == at::DeviceType::CUDA);
-  TORCH_CHECK(offsets.size(0) == size1 * size2 + 1);
-  TORCH_CHECK(lengths.size(0) == size1 * size2);
-  TORCH_CHECK(values.get_device() == offsets.get_device());
-  TORCH_CHECK(values.get_device() == lengths.get_device());
-
-  auto trans_lengths =
-      lengths.view({size1, size2}).transpose(0, 1).contiguous().view({-1});
-  auto trans_offsets =
-      fbgemm_gpu::asynchronous_complete_cumsum_gpu(trans_lengths);
-  auto L_out = trans_offsets[-1].item<int64_t>();
-  TORCH_CHECK(L_out < std::numeric_limits<int32_t>::max());
-  auto trans_values = at::empty({L_out}, values.options());
-
-  if (L_out == 0) {
-    return std::make_tuple(trans_values, trans_offsets, trans_lengths);
-  }
-
-  // Optimized thread block configuration based on benchmark results
-  uint32_t B_blocks = 4;
-  dim3 threads(256, B_blocks);
-  auto blocks = div_round_up(size1 * size2, B_blocks);
-
-  AT_DISPATCH_INTEGRAL_TYPES(
-      lengths.scalar_type(), "jagged_transpose_1d_cuda_kernel_input1", [&] {
-        using index_t = scalar_t;
-        AT_DISPATCH_ALL_TYPES_AND2(
-            at::ScalarType::BFloat16,
-            at::ScalarType::Half,
-            values.scalar_type(),
-            "jagged_transpose_1d_cuda_kernel_input2",
-            [&] {
-              using val_t = scalar_t;
-              _jagged_transpose_1d_cuda_kernel<index_t, val_t><<<
-                  blocks,
-                  threads,
-                  0,
-                  at::cuda::getCurrentCUDAStream()>>>(
-                  size1,
-                  size2,
-                  max_len,
-                  offsets
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  values.packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
-                  lengths
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  trans_offsets
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  trans_values
-                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>());
-            });
-      });
-
-  return std::make_tuple(trans_values, trans_offsets, trans_lengths);
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp
deleted file mode 100644
index fa8eaac09..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cpp
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/extension.h>
-#include <torch/library.h>
-
-#include "fbgemm_gpu/sparse_ops.h" // @manual
-
-namespace hstu {
-
-template <typename index_t, typename val_t>
-void _replace_last_n_with_jagged_cpu_kernel(
-    int32_t B,
-    const at::TensorAccessor<index_t, 1>& lengths_left,
-    const at::TensorAccessor<index_t, 1>& offsets_left,
-    const at::TensorAccessor<val_t, 2>& values_left,
-    const at::TensorAccessor<index_t, 1>& lengths_right,
-    const at::TensorAccessor<index_t, 1>& offsets_right,
-    const at::TensorAccessor<val_t, 2>& values_right,
-    const at::TensorAccessor<index_t, 1>& output_offsets,
-    at::TensorAccessor<val_t, 2> output) {
-  for (auto b : c10::irange(B)) {
-    auto left_start = offsets_left[b];
-    auto left_len = lengths_left[b];
-    auto right_start = offsets_right[b];
-    auto right_len = lengths_right[b];
-    auto output_start = output_offsets[b];
-
-    auto keep_len = left_len - right_len;
-
-    for (auto i = 0; i < left_len; ++i) {
-      for (auto d = 0; d < values_left.size(1); ++d) {
-        if (i < keep_len) {
-          output[output_start + i][d] = values_left[left_start + i][d];
-        } else {
-          auto right_idx = i - keep_len;
-          if (right_idx < right_len) {
-            output[output_start + i][d] =
-                values_right[right_start + right_idx][d];
-          }
-        }
-      }
-    }
-  }
-}
-
-at::Tensor replace_last_n_with_jagged_cpu(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right) {
-  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CPU);
-  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
-  TORCH_CHECK(values_left.size(1) == values_right.size(1));
-
-  auto B = lengths_left.size(0);
-  auto D = values_left.size(1);
-
-  auto L_out = lengths_left.sum().item<int64_t>();
-
-  auto output = at::empty({L_out, D}, values_left.options());
-
-  if (L_out == 0) {
-    return output;
-  }
-
-  const auto offsets_left =
-      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_left.view({-1}));
-  const auto offsets_right =
-      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_right.view({-1}));
-  const auto output_offsets = offsets_left;
-
-  AT_DISPATCH_INTEGRAL_TYPES(
-      lengths_left.scalar_type(),
-      "replace_last_n_with_jagged_cpu_kernel_input1",
-      [&] {
-        using index_t = scalar_t;
-        AT_DISPATCH_ALL_TYPES_AND2(
-            at::ScalarType::BFloat16,
-            at::ScalarType::Half,
-            values_left.scalar_type(),
-            "replace_last_n_with_jagged_cpu_kernel_input2",
-            [&] {
-              using val_t = scalar_t;
-              _replace_last_n_with_jagged_cpu_kernel<index_t, val_t>(
-                  B,
-                  lengths_left.accessor<index_t, 1>(),
-                  offsets_left.accessor<index_t, 1>(),
-                  values_left.accessor<val_t, 2>(),
-                  lengths_right.accessor<index_t, 1>(),
-                  offsets_right.accessor<index_t, 1>(),
-                  values_right.accessor<val_t, 2>(),
-                  output_offsets.accessor<index_t, 1>(),
-                  output.accessor<val_t, 2>());
-            });
-      });
-
-  return output;
-}
-
-at::Tensor replace_last_n_with_jagged_meta(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right) {
-  auto L_out = lengths_left.sum().item<int64_t>();
-  auto D = values_left.size(1);
-
-  auto output = at::native::empty_meta_symint(
-      {L_out, D},
-      /*dtype=*/::std::make_optional(values_left.scalar_type()),
-      /*layout=*/::std::make_optional(values_left.layout()),
-      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
-      /*pin_memory=*/::std::nullopt);
-
-  return output;
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu b/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu
deleted file mode 100644
index 00a589eb9..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/replace_last_n_with_jagged.cu
+++ /dev/null
@@ -1,156 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/library.h>
-
-#include "common.h"
-#include "fbgemm_gpu/sparse_ops.h" // @manual
-#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
-
-namespace hstu {
-
-static constexpr int32_t kMaxThreads = 1024;
-
-template <typename index_t, typename val_t>
-__global__
-__launch_bounds__(kMaxThreads) void _replace_last_n_with_jagged_cuda_kernel(
-    int32_t B,
-    int32_t D,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        lengths_left,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        offsets_left,
-    const at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits>
-        values_left,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        lengths_right,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        offsets_right,
-    const at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits>
-        values_right,
-    at::PackedTensorAccessor32<val_t, 2, at::RestrictPtrTraits> output) {
-  for (auto b = blockIdx.x * blockDim.y + threadIdx.y;
-       b < static_cast<uint32_t>(B);
-       b += gridDim.x * blockDim.y) {
-    auto left_start = offsets_left[b];
-    auto left_len = lengths_left[b];
-    auto right_start = offsets_right[b];
-    auto right_len = lengths_right[b];
-    auto output_start = offsets_left[b];
-    auto keep_len = left_len - right_len;
-
-    for (auto i = threadIdx.x; i < static_cast<uint32_t>(left_len * D);
-         i += blockDim.x) {
-      auto seq_pos = i / D;
-      auto dim_pos = i % D;
-      if (seq_pos < static_cast<uint32_t>(keep_len)) {
-        output[output_start + seq_pos][dim_pos] =
-            values_left[left_start + seq_pos][dim_pos];
-      } else {
-        auto right_idx = seq_pos - keep_len;
-        if (right_idx < static_cast<uint32_t>(right_len)) {
-          output[output_start + seq_pos][dim_pos] =
-              values_right[right_start + right_idx][dim_pos];
-        }
-      }
-    }
-  }
-}
-
-at::Tensor replace_last_n_with_jagged_cuda(
-    const at::Tensor& lengths_left,
-    const at::Tensor& values_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& values_right) {
-  at::cuda::OptionalCUDAGuard device_guard;
-  device_guard.set_index(values_left.get_device());
-  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(values_left.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(values_right.device().type() == at::DeviceType::CUDA);
-  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
-  TORCH_CHECK(values_left.size(1) == values_right.size(1));
-
-  auto B = lengths_left.size(0);
-  auto D = values_left.size(1);
-  auto L_out = lengths_left.sum().item<int64_t>();
-  TORCH_CHECK(L_out < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(values_left.get_device() == lengths_left.get_device());
-  TORCH_CHECK(values_left.get_device() == lengths_right.get_device());
-  TORCH_CHECK(values_left.get_device() == values_right.get_device());
-
-  auto output = at::empty({L_out, D}, values_left.options());
-
-  if (L_out == 0) {
-    return output;
-  }
-
-  const auto offsets_left =
-      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_left.view({-1}));
-  const auto offsets_right =
-      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_right.view({-1}));
-
-  // Optimized thread block configuration based on benchmark results
-  uint32_t B_blocks, threads_x;
-  B_blocks = 4;
-  threads_x = 256;
-
-  dim3 threads(threads_x, B_blocks);
-  auto blocks = div_round_up(B, B_blocks);
-
-  AT_DISPATCH_INTEGRAL_TYPES(
-      lengths_left.scalar_type(),
-      "replace_last_n_with_jagged_cuda_kernel_input1",
-      [&] {
-        using index_t = scalar_t;
-        AT_DISPATCH_ALL_TYPES_AND2(
-            at::ScalarType::BFloat16,
-            at::ScalarType::Half,
-            values_left.scalar_type(),
-            "replace_last_n_with_jagged_cuda_kernel_input2",
-            [&] {
-              using val_t = scalar_t;
-              _replace_last_n_with_jagged_cuda_kernel<index_t, val_t><<<
-                  blocks,
-                  threads,
-                  0,
-                  at::cuda::getCurrentCUDAStream()>>>(
-                  B,
-                  D,
-                  lengths_left
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  offsets_left
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  values_left
-                      .packed_accessor32<val_t, 2, at::RestrictPtrTraits>(),
-                  lengths_right
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  offsets_right
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  values_right
-                      .packed_accessor32<val_t, 2, at::RestrictPtrTraits>(),
-                  output.packed_accessor32<val_t, 2, at::RestrictPtrTraits>());
-            });
-      });
-
-  return output;
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/setup.py b/recommendation_v4/generative_recommenders/ops/cpp/setup.py
deleted file mode 100644
index 2a06a9d05..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/setup.py
+++ /dev/null
@@ -1,487 +0,0 @@
-# pyre-unsafe
-"""
-Modified from https://github.com/Dao-AILab/flash-attention/blob/main/hopper/setup.py
-"""
-
-import itertools
-import os
-import platform
-import subprocess
-import sys
-import sysconfig
-import warnings
-from pathlib import Path
-
-import torch
-from packaging.version import parse, Version
-from setuptools import find_packages, setup
-from torch.utils.cpp_extension import BuildExtension, CUDA_HOME, CUDAExtension
-
-# ninja build does not work unless include_dirs are abs path
-this_dir = os.path.dirname(os.path.abspath(__file__))
-PACKAGE_NAME = "hstu"
-# FORCE_BUILD: Force a fresh build locally, instead of attempting to find prebuilt wheels
-# SKIP_CUDA_BUILD: Intended to allow CI to use a simple `python setup.py sdist` run to copy over raw files, without any cuda compilation
-FORCE_BUILD = os.getenv("FLASH_ATTENTION_FORCE_BUILD", "FALSE") == "TRUE"
-SKIP_CUDA_BUILD = os.getenv("FLASH_ATTENTION_SKIP_CUDA_BUILD", "FALSE") == "TRUE"
-# For CI, we want the option to build with C++11 ABI since the nvcr images use C++11 ABI
-FORCE_CXX11_ABI = os.getenv("FLASH_ATTENTION_FORCE_CXX11_ABI", "FALSE") == "TRUE"
-
-# HACK: we monkey patch pytorch's _write_ninja_file to pass
-# "-gencode arch=compute_sm90a,code=sm_90a" to files ending in '_sm90.cu',
-# and pass "-gencode arch=compute_sm80,code=sm_80" to files ending in '_sm80.cu'
-from torch.utils.cpp_extension import (
-    _is_cuda_file,
-    _join_cuda_home,
-    _join_rocm_home,
-    _maybe_write,
-    COMMON_HIP_FLAGS,
-    get_cxx_compiler,
-    IS_HIP_EXTENSION,
-    IS_WINDOWS,
-    SUBPROCESS_DECODE_ARGS,
-)
-
-DISABLE_BACKWARD = os.getenv("FLASH_ATTENTION_DISABLE_BACKWARD", "FALSE") == "TRUE"
-DISABLE_FP16 = os.getenv("FLASH_ATTENTION_DISABLE_FP16", "TRUE") == "TRUE"
-DISABLE_HDIM64 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM64", "TRUE") == "TRUE"
-DISABLE_HDIM96 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM96", "TRUE") == "TRUE"
-DISABLE_HDIM128 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM128", "FALSE") == "TRUE"
-DISABLE_HDIM192 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM192", "TRUE") == "TRUE"
-DISABLE_HDIM256 = os.getenv("FLASH_ATTENTION_DISABLE_HDIM256", "TRUE") == "TRUE"
-DISABLE_SM8x = os.getenv("FLASH_ATTENTION_DISABLE_SM80", "TRUE") == "TRUE"
-
-
-def _write_ninja_file(
-    path,
-    cflags,
-    post_cflags,
-    cuda_cflags,
-    cuda_post_cflags,
-    cuda_dlink_post_cflags,
-    sources,
-    objects,
-    ldflags,
-    library_target,
-    with_cuda,
-    **kwargs,  # kwargs (ignored) to absorb new flags in torch.utils.cpp_extension
-) -> None:
-    r"""Write a ninja file that does the desired compiling and linking.
-
-    `path`: Where to write this file
-    `cflags`: list of flags to pass to $cxx. Can be None.
-    `post_cflags`: list of flags to append to the $cxx invocation. Can be None.
-    `cuda_cflags`: list of flags to pass to $nvcc. Can be None.
-    `cuda_postflags`: list of flags to append to the $nvcc invocation. Can be None.
-    `sources`: list of paths to source files
-    `objects`: list of desired paths to objects, one per source.
-    `ldflags`: list of flags to pass to linker. Can be None.
-    `library_target`: Name of the output library. Can be None; in that case,
-                      we do no linking.
-    `with_cuda`: If we should be compiling with CUDA.
-    """
-
-    def sanitize_flags(flags):
-        if flags is None:
-            return []
-        else:
-            return [flag.strip() for flag in flags]
-
-    cflags = sanitize_flags(cflags)
-    post_cflags = sanitize_flags(post_cflags)
-    cuda_cflags = sanitize_flags(cuda_cflags)
-    cuda_post_cflags = sanitize_flags(cuda_post_cflags)
-    cuda_dlink_post_cflags = sanitize_flags(cuda_dlink_post_cflags)
-    ldflags = sanitize_flags(ldflags)
-
-    # Sanity checks...
-    assert len(sources) == len(objects)
-    assert len(sources) > 0
-
-    compiler = get_cxx_compiler()
-
-    # Version 1.3 is required for the `deps` directive.
-    config = ["ninja_required_version = 1.3"]
-    config.append(f"cxx = {compiler}")
-    if with_cuda or cuda_dlink_post_cflags:
-        if IS_HIP_EXTENSION:
-            nvcc = _join_rocm_home("bin", "hipcc")
-        else:
-            nvcc = _join_cuda_home("bin", "nvcc")
-        if "PYTORCH_NVCC" in os.environ:
-            nvcc_from_env = os.getenv(
-                "PYTORCH_NVCC"
-            )  # user can set nvcc compiler with ccache using the environment variable here
-        else:
-            nvcc_from_env = nvcc
-        config.append(f"nvcc_from_env = {nvcc_from_env}")
-        config.append(f"nvcc = {nvcc}")
-
-    if IS_HIP_EXTENSION:
-        post_cflags = COMMON_HIP_FLAGS + post_cflags
-    flags = [f"cflags = {' '.join(cflags)}"]
-    flags.append(f"post_cflags = {' '.join(post_cflags)}")
-    if with_cuda:
-        flags.append(f"cuda_cflags = {' '.join(cuda_cflags)}")
-        flags.append(f"cuda_post_cflags = {' '.join(cuda_post_cflags)}")
-        cuda_post_cflags_sm80 = [
-            s if s != "arch=compute_90a,code=sm_90a" else "arch=compute_80,code=sm_80"
-            for s in cuda_post_cflags
-        ]
-        flags.append(f"cuda_post_cflags_sm80 = {' '.join(cuda_post_cflags_sm80)}")
-        cuda_post_cflags_sm80_sm90 = cuda_post_cflags + [
-            "-gencode",
-            "arch=compute_80,code=sm_80",
-        ]
-        flags.append(
-            f"cuda_post_cflags_sm80_sm90 = {' '.join(cuda_post_cflags_sm80_sm90)}"
-        )
-        cuda_post_cflags_sm100 = [
-            s
-            if s != "arch=compute_90a,code=sm_90a"
-            else "arch=compute_100a,code=sm_100a"
-            for s in cuda_post_cflags
-        ]
-        flags.append(f"cuda_post_cflags_sm100 = {' '.join(cuda_post_cflags_sm100)}")
-    flags.append(f"cuda_dlink_post_cflags = {' '.join(cuda_dlink_post_cflags)}")
-    flags.append(f"ldflags = {' '.join(ldflags)}")
-
-    # Turn into absolute paths so we can emit them into the ninja build
-    # file wherever it is.
-    sources = [os.path.abspath(file) for file in sources]
-
-    # See https://ninja-build.org/build.ninja.html for reference.
-    compile_rule = ["rule compile"]
-    if IS_WINDOWS:
-        compile_rule.append(
-            "  command = cl /showIncludes $cflags -c $in /Fo$out $post_cflags"
-        )
-        compile_rule.append("  deps = msvc")
-    else:
-        compile_rule.append(
-            "  command = $cxx -MMD -MF $out.d $cflags -c $in -o $out $post_cflags"
-        )
-        compile_rule.append("  depfile = $out.d")
-        compile_rule.append("  deps = gcc")
-
-    if with_cuda:
-        cuda_compile_rule = ["rule cuda_compile"]
-        nvcc_gendeps = ""
-        # --generate-dependencies-with-compile is not supported by ROCm
-        # Nvcc flag `--generate-dependencies-with-compile` is not supported by sccache, which may increase build time.
-        if (
-            torch.version.cuda is not None
-            and os.getenv("TORCH_EXTENSION_SKIP_NVCC_GEN_DEPENDENCIES", "0") != "1"
-        ):
-            cuda_compile_rule.append("  depfile = $out.d")
-            cuda_compile_rule.append("  deps = gcc")
-            # Note: non-system deps with nvcc are only supported
-            # on Linux so use --generate-dependencies-with-compile
-            # to make this work on Windows too.
-            nvcc_gendeps = (
-                "--generate-dependencies-with-compile --dependency-output $out.d"
-            )
-        cuda_compile_rule_sm80 = (
-            ["rule cuda_compile_sm80"]
-            + cuda_compile_rule[1:]
-            + [
-                f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags_sm80"
-            ]
-        )
-        cuda_compile_rule_sm80_sm90 = (
-            ["rule cuda_compile_sm80_sm90"]
-            + cuda_compile_rule[1:]
-            + [
-                f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags_sm80_sm90"
-            ]
-        )
-        cuda_compile_rule_sm100 = (
-            ["rule cuda_compile_sm100"]
-            + cuda_compile_rule[1:]
-            + [
-                f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags_sm100"
-            ]
-        )
-        cuda_compile_rule.append(
-            f"  command = $nvcc_from_env {nvcc_gendeps} $cuda_cflags -c $in -o $out $cuda_post_cflags"
-        )
-
-    # Emit one build rule per source to enable incremental build.
-    build = []
-    for source_file, object_file in zip(sources, objects):
-        is_cuda_source = _is_cuda_file(source_file) and with_cuda
-        if is_cuda_source:
-            if source_file.endswith("_sm90.cu"):
-                rule = "cuda_compile"
-            elif source_file.endswith("_sm80.cu"):
-                rule = "cuda_compile_sm80"
-            elif source_file.endswith("_sm100.cu"):
-                rule = "cuda_compile_sm100"
-            else:
-                rule = "cuda_compile_sm80_sm90"
-        else:
-            rule = "compile"
-        if IS_WINDOWS:
-            source_file = source_file.replace(":", "$:")
-            object_file = object_file.replace(":", "$:")
-        source_file = source_file.replace(" ", "$ ")
-        object_file = object_file.replace(" ", "$ ")
-        build.append(f"build {object_file}: {rule} {source_file}")
-
-    if cuda_dlink_post_cflags:
-        devlink_out = os.path.join(os.path.dirname(objects[0]), "dlink.o")
-        devlink_rule = ["rule cuda_devlink"]
-        devlink_rule.append("  command = $nvcc $in -o $out $cuda_dlink_post_cflags")
-        devlink = [f"build {devlink_out}: cuda_devlink {' '.join(objects)}"]
-        objects += [devlink_out]
-    else:
-        devlink_rule, devlink = [], []
-
-    if library_target is not None:
-        link_rule = ["rule link"]
-        if IS_WINDOWS:
-            cl_paths = (
-                subprocess.check_output(["where", "cl"])
-                .decode(*SUBPROCESS_DECODE_ARGS)
-                .split("\r\n")
-            )
-            if len(cl_paths) >= 1:
-                cl_path = os.path.dirname(cl_paths[0]).replace(":", "$:")
-            else:
-                raise RuntimeError("MSVC is required to load C++ extensions")
-            link_rule.append(
-                f'  command = "{cl_path}/link.exe" $in /nologo $ldflags /out:$out'
-            )
-        else:
-            link_rule.append("  command = $cxx $in $ldflags -o $out")
-
-        link = [f"build {library_target}: link {' '.join(objects)}"]
-
-        default = [f"default {library_target}"]
-    else:
-        link_rule, link, default = [], [], []
-
-    # 'Blocks' should be separated by newlines, for visual benefit.
-    blocks = [config, flags, compile_rule]
-    if with_cuda:
-        blocks.append(cuda_compile_rule)  # type: ignore[possibly-undefined]
-        blocks.append(cuda_compile_rule_sm80)  # type: ignore[possibly-undefined]
-        blocks.append(cuda_compile_rule_sm80_sm90)  # type: ignore[possibly-undefined]
-        blocks.append(cuda_compile_rule_sm100)  # type: ignore[possibly-undefined]
-    blocks += [devlink_rule, link_rule, build, devlink, link, default]
-    content = "\n\n".join("\n".join(b) for b in blocks)
-    # Ninja requires a new lines at the end of the .ninja file
-    content += "\n"
-    _maybe_write(path, content)
-
-
-# Monkey patching
-torch.utils.cpp_extension._write_ninja_file = _write_ninja_file
-
-
-def get_platform():
-    """
-    Returns the platform name as used in wheel filenames.
-    """
-    if sys.platform.startswith("linux"):
-        return "linux_x86_64"
-    elif sys.platform == "darwin":
-        mac_version = ".".join(platform.mac_ver()[0].split(".")[:2])
-        return f"macosx_{mac_version}_x86_64"
-    elif sys.platform == "win32":
-        return "win_amd64"
-    else:
-        raise ValueError("Unsupported platform: {}".format(sys.platform))
-
-
-def get_cuda_bare_metal_version(cuda_dir):
-    raw_output = subprocess.check_output(
-        [cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True
-    )
-    output = raw_output.split()
-    release_idx = output.index("release") + 1
-    bare_metal_version = parse(output[release_idx].split(",")[0])
-
-    return raw_output, bare_metal_version
-
-
-def check_if_cuda_home_none(global_option: str) -> None:
-    if CUDA_HOME is not None:
-        return
-    # warn instead of error because user could be downloading prebuilt wheels, so nvcc won't be necessary
-    # in that case.
-    warnings.warn(
-        f"{global_option} was requested, but nvcc was not found.  Are you sure your environment has nvcc available?  "
-        "If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, "
-        "only images whose names contain 'devel' will provide nvcc."
-    )
-
-
-def nvcc_threads_args():
-    nvcc_threads = os.getenv("NVCC_THREADS") or "4"
-    return ["--threads", nvcc_threads]
-
-
-exe_extension = sysconfig.get_config_var("EXE")
-
-
-cmdclass = {}
-ext_modules = []
-
-# We want this even if SKIP_CUDA_BUILD because when we run python setup.py sdist we want the .hpp
-# files included in the source distribution, in case the user compiles from source.
-subprocess.run(["git", "submodule", "update", "--init", "cutlass"])
-
-if not SKIP_CUDA_BUILD:
-    print("\n\ntorch.__version__  = {}\n\n".format(torch.__version__))
-    TORCH_MAJOR = int(torch.__version__.split(".")[0])
-    TORCH_MINOR = int(torch.__version__.split(".")[1])
-
-    check_if_cuda_home_none(PACKAGE_NAME)
-    _, bare_metal_version = get_cuda_bare_metal_version(CUDA_HOME)
-    if bare_metal_version < Version("12.3"):
-        raise RuntimeError(
-            f"FlashAttention-3 is only supported on CUDA 12.3 and above, get {bare_metal_version} from {CUDA_HOME}"
-        )
-
-    cc_flag = []
-    cc_flag.append("-gencode")
-    cc_flag.append("arch=compute_90a,code=sm_90a")
-
-    # HACK: The compiler flag -D_GLIBCXX_USE_CXX11_ABI is set to be the same as
-    # torch._C._GLIBCXX_USE_CXX11_ABI
-    # https://github.com/pytorch/pytorch/blob/8472c24e3b5b60150096486616d98b7bea01500b/torch/utils/cpp_extension.py#L920
-    if FORCE_CXX11_ABI:
-        torch._C._GLIBCXX_USE_CXX11_ABI = True
-    repo_dir = Path(this_dir).parent
-    cutlass_dir = repo_dir / "cpp" / "cutlass"
-
-    feature_args = (
-        []
-        + ["-DOSS_ENV"]
-        + (["-DFLASHATTENTION_DISABLE_BACKWARD"] if DISABLE_BACKWARD else [])
-        + (["-DFLASHATTENTION_DISABLE_FP16"] if DISABLE_FP16 else [])
-        + ["-DFLASHATTENTION_DISABLE_FP8"]
-        + (["-DFLASHATTENTION_DISABLE_HDIM64"] if DISABLE_HDIM64 else [])
-        + (["-DFLASHATTENTION_DISABLE_HDIM96"] if DISABLE_HDIM96 else [])
-        + (["-DFLASHATTENTION_DISABLE_HDIM128"] if DISABLE_HDIM128 else [])
-        + (["-DFLASHATTENTION_DISABLE_HDIM192"] if DISABLE_HDIM192 else [])
-        + (["-DFLASHATTENTION_DISABLE_HDIM256"] if DISABLE_HDIM256 else [])
-        + (["-DFLASHATTENTION_DISABLE_SM8x"] if DISABLE_SM8x else [])
-    )
-
-    DTYPE = ["bf16"] + (["fp16"] if not DISABLE_FP16 else [])
-    HEAD_DIMENSIONS = (
-        []
-        + ([64] if not DISABLE_HDIM64 else [])
-        + ([96] if not DISABLE_HDIM96 else [])
-        + ([128] if not DISABLE_HDIM128 else [])
-        + ([192] if not DISABLE_HDIM192 else [])
-        + ([256] if not DISABLE_HDIM256 else [])
-    )
-    sources_fwd_sm80 = [
-        f"hstu_attention/instantiations/flash_fwd_hdim{hdim}_{dtype}_sm80.cu"
-        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
-    ]
-    sources_bwd_sm80 = [
-        f"hstu_attention/instantiations/flash_bwd_hdim{hdim}_{dtype}_sm80.cu"
-        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
-    ]
-    sources_fwd_sm90 = [
-        f"hstu_attention/instantiations/flash_fwd_hdim{hdim}_{dtype}_sm90.cu"
-        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
-    ]
-    sources_bwd_sm90 = [
-        f"hstu_attention/instantiations/flash_bwd_hdim{hdim}_{dtype}_sm90.cu"
-        for hdim, dtype in itertools.product(HEAD_DIMENSIONS, DTYPE)
-    ]
-    if DISABLE_BACKWARD:
-        sources_bwd_sm90 = []
-        sources_bwd_sm80 = []
-    sources = (
-        [
-            "hstu_attention/flash_api.cpp",
-            "hstu_attention/flash_common.cpp",
-            "hstu_attention/flash_cpu_dummy.cpp",
-            "hstu_attention/flash_meta.cpp",
-        ]
-        + (sources_fwd_sm80 if not DISABLE_SM8x else [])
-        + sources_fwd_sm90
-        + (sources_bwd_sm80 if not DISABLE_SM8x else [])
-        + sources_bwd_sm90
-    )
-    nvcc_flags = [
-        "-O3",
-        "-std=c++17",
-        "--ftemplate-backtrace-limit=0",  # To debug template code
-        "--use_fast_math",
-        # "--keep",
-        # "--ptxas-options=--verbose,--register-usage-level=5,--warn-on-local-memory-usage",  # printing out number of registers
-        "--resource-usage",  # printing out number of registers
-        # f"--split-compile={os.getenv('NVCC_THREADS', '4')}",  # split-compile is faster
-        "-lineinfo",  # TODO: disable this for release to reduce binary size
-        "-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED",  # Necessary for the WGMMA shapes that we use
-        "-DCUTLASS_ENABLE_GDC_FOR_SM90",  # For PDL
-        "-DCUTLASS_DEBUG_TRACE_LEVEL=0",  # Can toggle for debugging
-        "-DNDEBUG",  # Important, otherwise performance is severely impacted
-        "-Xfatbin",  # compress all binary sections
-        "-compress-all",
-    ]
-    if get_platform() == "win_amd64":
-        nvcc_flags.extend(
-            [
-                "-D_USE_MATH_DEFINES",  # for M_LN2
-                "-Xcompiler=/Zc:__cplusplus",  # sets __cplusplus correctly, CUTLASS_CONSTEXPR_IF_CXX17 needed for cutlass::gcd
-            ]
-        )
-    include_dirs = [
-        Path(this_dir),
-        cutlass_dir / "include",
-    ]
-
-    ext_modules.append(
-        CUDAExtension(
-            name=f"{PACKAGE_NAME}._C",
-            sources=sources,
-            extra_compile_args={
-                "cxx": ["-O3", "-std=c++17", "-DPy_LIMITED_API=0x03090000"]
-                + feature_args,
-                "nvcc": nvcc_threads_args() + nvcc_flags + cc_flag + feature_args,
-            },
-            include_dirs=include_dirs,
-            py_limited_api=True,
-        )
-    )
-
-
-setup(
-    name=PACKAGE_NAME,
-    version="0.1.0",
-    packages=find_packages(
-        exclude=(
-            "build",
-            "csrc",
-            "include",
-            "tests",
-            "dist",
-            "docs",
-            "benchmarks",
-        )
-    ),
-    py_modules=["cuda_hstu_attention"],
-    description="FlashAttention HSTU",
-    classifiers=[
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: Apache Software License",
-        "Operating System :: Unix",
-    ],
-    ext_modules=ext_modules,
-    cmdclass={"build_ext": BuildExtension},
-    python_requires=">=3.8",
-    install_requires=[
-        "torch",
-        "einops",
-        "packaging",
-        "ninja==1.11.1.1",
-    ],
-)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp
deleted file mode 100644
index 3925aefd7..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda.cpp
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "common.h"
-#include "sort_kv_pairs_cuda_kernels_template.h"
-
-namespace hstu {
-
-DLL_PUBLIC std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_cuda(
-    const at::Tensor& keys,
-    const at::Tensor& values,
-    const std::optional<int64_t>& end_bit,
-    const bool descending = false) {
-  at::cuda::OptionalCUDAGuard device_guard;
-  device_guard.set_index(keys.get_device());
-  TORCH_CHECK(
-      keys.dtype() == at::kInt || keys.dtype() == at::kLong ||
-      keys.dtype() == at::kByte || keys.dtype() == at::kShort);
-  TORCH_CHECK(keys.numel() < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(keys.dim() == 1);
-  TORCH_CHECK(values.dim() == 1);
-  at::Tensor sorted_keys;
-  at::Tensor sorted_values;
-
-  AT_DISPATCH_INTEGRAL_TYPES(keys.scalar_type(), "sort_pairs_cuda_input1", [&] {
-    using key_t = scalar_t;
-    AT_DISPATCH_ALL_TYPES_AND2(
-        at::ScalarType::Half,
-        at::ScalarType::BFloat16,
-        values.scalar_type(),
-        "sort_pairs_cuda_input2",
-        [&] {
-          using val_t = scalar_t;
-          std::tie(sorted_keys, sorted_values) =
-              sort_kv_pairs_cuda_dispatched<key_t, val_t>(
-                  keys, values, end_bit, descending);
-        });
-  });
-
-  return {std::move(sorted_keys), std::move(sorted_values)};
-}
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu
deleted file mode 100644
index 8cd175c71..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.cu
+++ /dev/null
@@ -1,82 +0,0 @@
-#include <generative_recommenders/ops/cpp/common.h>
-#include <generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h>
-
-#include <cub/device/device_radix_sort.cuh>
-
-namespace hstu {
-
-template <>
-DLL_PUBLIC std::tuple<at::Tensor, at::Tensor>
-sort_kv_pairs_cuda_dispatched<SUB_KEY_T, SUB_VALUE_T>(
-    const at::Tensor& keys,
-    const at::Tensor& values,
-    const std::optional<int64_t>& end_bit,
-    const bool descending) {
-  size_t temp_storage_bytes = 0;
-  auto keys_contig = keys.contiguous();
-  auto values_contig = values.contiguous();
-  auto sorted_keys = at::empty_like(keys_contig);
-  auto sorted_values = at::empty_like(values_contig);
-
-  if (descending) {
-    AT_CUDA_CHECK(
-        cub::DeviceRadixSort::SortPairsDescending(
-            nullptr,
-            temp_storage_bytes,
-            keys_contig.data_ptr<SUB_KEY_T>(),
-            sorted_keys.data_ptr<SUB_KEY_T>(),
-            values_contig.data_ptr<SUB_VALUE_T>(),
-            sorted_values.data_ptr<SUB_VALUE_T>(),
-            keys_contig.numel(),
-            0,
-            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
-            at::cuda::getCurrentCUDAStream()));
-    auto temp_storage = at::empty(
-        {static_cast<int64_t>(temp_storage_bytes)},
-        keys_contig.options().dtype(at::kByte));
-    AT_CUDA_CHECK(
-        cub::DeviceRadixSort::SortPairsDescending(
-            temp_storage.data_ptr(),
-            temp_storage_bytes,
-            keys_contig.data_ptr<SUB_KEY_T>(),
-            sorted_keys.data_ptr<SUB_KEY_T>(),
-            values_contig.data_ptr<SUB_VALUE_T>(),
-            sorted_values.data_ptr<SUB_VALUE_T>(),
-            keys_contig.numel(),
-            0,
-            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
-            at::cuda::getCurrentCUDAStream()));
-  } else {
-    AT_CUDA_CHECK(
-        cub::DeviceRadixSort::SortPairs(
-            nullptr,
-            temp_storage_bytes,
-            keys_contig.data_ptr<SUB_KEY_T>(),
-            sorted_keys.data_ptr<SUB_KEY_T>(),
-            values_contig.data_ptr<SUB_VALUE_T>(),
-            sorted_values.data_ptr<SUB_VALUE_T>(),
-            keys_contig.numel(),
-            0,
-            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
-            at::cuda::getCurrentCUDAStream()));
-    auto temp_storage = at::empty(
-        {static_cast<int64_t>(temp_storage_bytes)},
-        keys_contig.options().dtype(at::kByte));
-    AT_CUDA_CHECK(
-        cub::DeviceRadixSort::SortPairs(
-            temp_storage.data_ptr(),
-            temp_storage_bytes,
-            keys_contig.data_ptr<SUB_KEY_T>(),
-            sorted_keys.data_ptr<SUB_KEY_T>(),
-            values_contig.data_ptr<SUB_VALUE_T>(),
-            sorted_values.data_ptr<SUB_VALUE_T>(),
-            keys_contig.numel(),
-            0,
-            end_bit.has_value() ? end_bit.value() : sizeof(SUB_KEY_T) * 8,
-            at::cuda::getCurrentCUDAStream()));
-  }
-
-  return {std::move(sorted_keys), std::move(sorted_values)};
-}
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h b/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h
deleted file mode 100644
index e599eccb0..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/sort_kv_pairs_cuda_kernels_template.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#pragma once
-
-#include <ATen/ATen.h>
-#include <optional>
-
-namespace hstu {
-
-template <typename key_t, typename val_t>
-std::tuple<at::Tensor, at::Tensor> sort_kv_pairs_cuda_dispatched(
-    const at::Tensor& keys_contig,
-    const at::Tensor& values_contig,
-    const std::optional<int64_t>& end_bit,
-    const bool descending);
-
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp
deleted file mode 100644
index c361488fa..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/extension.h>
-#include <torch/library.h>
-
-#include "fbgemm_gpu/sparse_ops.h" // @manual
-
-namespace hstu {
-
-template <typename index_t, typename val_t>
-void _split_1d_jagged_jagged_cpu_kernel(
-    int32_t B,
-    const at::TensorAccessor<index_t, 1>& combined_offsets,
-    const at::TensorAccessor<val_t, 1>& combined_values,
-    const at::TensorAccessor<index_t, 1>& lengths_left,
-    const at::TensorAccessor<index_t, 1>& offsets_left,
-    const at::TensorAccessor<index_t, 1>& offsets_right,
-    at::TensorAccessor<val_t, 1> values_left,
-    at::TensorAccessor<val_t, 1> values_right) {
-  for (auto b : c10::irange(B)) {
-    auto combined_start = combined_offsets[b];
-    auto left_len = lengths_left[b];
-    auto left_start = offsets_left[b];
-    auto right_start = offsets_right[b];
-
-    for (auto i = 0; i < left_len; ++i) {
-      values_left[left_start + i] = combined_values[combined_start + i];
-    }
-
-    auto right_len = combined_offsets[b + 1] - combined_offsets[b] - left_len;
-    for (auto i = 0; i < right_len; ++i) {
-      values_right[right_start + i] =
-          combined_values[combined_start + left_len + i];
-    }
-  }
-}
-
-std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cpu(
-    const at::Tensor& lengths_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& combined_values) {
-  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CPU);
-  TORCH_INTERNAL_ASSERT(combined_values.device().type() == at::DeviceType::CPU);
-  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
-  auto B = lengths_left.size(0);
-
-  auto L_left = lengths_left.sum().item<int64_t>();
-  auto L_right = lengths_right.sum().item<int64_t>();
-  TORCH_CHECK(L_left + L_right == combined_values.numel());
-
-  auto values_left = at::empty({L_left}, combined_values.options());
-  auto values_right = at::empty({L_right}, combined_values.options());
-
-  if (L_left == 0 && L_right == 0) {
-    return std::make_tuple(values_left, values_right);
-  }
-
-  const auto combined_lengths = lengths_left + lengths_right;
-  const auto combined_offsets =
-      fbgemm_gpu::asynchronous_complete_cumsum_cpu(combined_lengths.view({-1}));
-  const auto offsets_left =
-      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_left.view({-1}));
-  const auto offsets_right =
-      fbgemm_gpu::asynchronous_complete_cumsum_cpu(lengths_right.view({-1}));
-
-  AT_DISPATCH_INTEGRAL_TYPES(
-      lengths_left.scalar_type(),
-      "split_1d_jagged_jagged_values_cpu_kernel_input1",
-      [&] {
-        using index_t = scalar_t;
-        AT_DISPATCH_ALL_TYPES_AND2(
-            at::ScalarType::BFloat16,
-            at::ScalarType::Half,
-            combined_values.scalar_type(),
-            "split_1d_jagged_jagged_values_cpu_kernel_input2",
-            [&] {
-              using val_t = scalar_t;
-              _split_1d_jagged_jagged_cpu_kernel<index_t, val_t>(
-                  B,
-                  combined_offsets.accessor<index_t, 1>(),
-                  combined_values.accessor<val_t, 1>(),
-                  lengths_left.accessor<index_t, 1>(),
-                  offsets_left.accessor<index_t, 1>(),
-                  offsets_right.accessor<index_t, 1>(),
-                  values_left.accessor<val_t, 1>(),
-                  values_right.accessor<val_t, 1>());
-            });
-      });
-
-  return std::make_tuple(values_left, values_right);
-}
-
-std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_meta(
-    const at::Tensor& lengths_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& combined_values) {
-  auto L_left = lengths_left.sum().item<int64_t>();
-  auto L_right = lengths_right.sum().item<int64_t>();
-
-  auto values_left = at::native::empty_meta_symint(
-      {L_left},
-      /*dtype=*/::std::make_optional(combined_values.scalar_type()),
-      /*layout=*/::std::make_optional(combined_values.layout()),
-      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
-      /*pin_memory=*/::std::nullopt);
-
-  auto values_right = at::native::empty_meta_symint(
-      {L_right},
-      /*dtype=*/::std::make_optional(combined_values.scalar_type()),
-      /*layout=*/::std::make_optional(combined_values.layout()),
-      /*device=*/::std::make_optional(c10::Device(c10::kMeta)),
-      /*pin_memory=*/::std::nullopt);
-
-  return std::make_tuple(values_left, values_right);
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu b/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu
deleted file mode 100644
index 181489bae..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/split_1d_jagged_jagged.cu
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright (c) Meta Platforms, Inc. and affiliates.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <ATen/ATen.h>
-#include <ATen/Dispatch.h>
-#include <ATen/core/op_registration/op_registration.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/Exceptions.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/library.h>
-
-#include "common.h"
-#include "fbgemm_gpu/sparse_ops.h" // @manual
-#include "fbgemm_gpu/utils/fixed_divisor.cuh" // @manual
-
-namespace hstu {
-
-static constexpr int32_t kMaxThreads = 1024;
-
-template <typename index_t, typename val_t>
-__global__
-__launch_bounds__(kMaxThreads) void _split_1d_jagged_jagged_cuda_kernel(
-    int32_t B,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        combined_offsets,
-    const at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits>
-        combined_values,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        lengths_left,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        offsets_left,
-    const at::PackedTensorAccessor32<index_t, 1, at::RestrictPtrTraits>
-        offsets_right,
-    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values_left,
-    at::PackedTensorAccessor32<val_t, 1, at::RestrictPtrTraits> values_right) {
-  for (auto b = blockIdx.x * blockDim.y + threadIdx.y;
-       b < static_cast<uint32_t>(B);
-       b += gridDim.x * blockDim.y) {
-    auto combined_start = combined_offsets[b];
-    auto left_len = lengths_left[b];
-    auto right_len = combined_offsets[b + 1] - combined_offsets[b] - left_len;
-    auto left_start = offsets_left[b];
-    auto right_start = offsets_right[b];
-
-    for (auto i = threadIdx.x; i < static_cast<uint32_t>(left_len + right_len);
-         i += blockDim.x) {
-      if (i < static_cast<uint32_t>(left_len)) {
-        values_left[left_start + i] = combined_values[combined_start + i];
-      } else {
-        values_right[right_start + i - left_len] =
-            combined_values[combined_start + i];
-      }
-    }
-  }
-}
-
-std::tuple<at::Tensor, at::Tensor> split_1d_jagged_jagged_cuda(
-    const at::Tensor& lengths_left,
-    const at::Tensor& lengths_right,
-    const at::Tensor& combined_values) {
-  at::cuda::OptionalCUDAGuard device_guard;
-  device_guard.set_index(combined_values.get_device());
-  TORCH_INTERNAL_ASSERT(lengths_left.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(lengths_right.device().type() == at::DeviceType::CUDA);
-  TORCH_INTERNAL_ASSERT(
-      combined_values.device().type() == at::DeviceType::CUDA);
-  TORCH_CHECK(lengths_left.size(0) == lengths_right.size(0));
-
-  auto B = lengths_left.size(0);
-  auto L_left = lengths_left.sum().item<int64_t>();
-  auto L_right = lengths_right.sum().item<int64_t>();
-  TORCH_CHECK(L_left + L_right == combined_values.numel());
-  TORCH_CHECK(L_left < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(L_right < std::numeric_limits<int32_t>::max());
-  TORCH_CHECK(combined_values.get_device() == lengths_left.get_device());
-  TORCH_CHECK(combined_values.get_device() == lengths_right.get_device());
-
-  auto values_left = at::empty({L_left}, combined_values.options());
-  auto values_right = at::empty({L_right}, combined_values.options());
-
-  if (L_left == 0 && L_right == 0) {
-    return std::make_tuple(values_left, values_right);
-  }
-
-  const auto combined_lengths = lengths_left + lengths_right;
-  const auto combined_offsets =
-      fbgemm_gpu::asynchronous_complete_cumsum_gpu(combined_lengths.view({-1}));
-  const auto offsets_left =
-      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_left.view({-1}));
-  const auto offsets_right =
-      fbgemm_gpu::asynchronous_complete_cumsum_gpu(lengths_right.view({-1}));
-
-  // Optimized thread block configuration based on benchmark results
-  uint32_t B_blocks = 4;
-  dim3 threads(256, B_blocks);
-  auto blocks = div_round_up(B, B_blocks);
-
-  AT_DISPATCH_INTEGRAL_TYPES(
-      lengths_left.scalar_type(),
-      "split_1d_jagged_jagged_values_cuda_kernel_input1",
-      [&] {
-        using index_t = scalar_t;
-        AT_DISPATCH_ALL_TYPES_AND2(
-            at::ScalarType::BFloat16,
-            at::ScalarType::Half,
-            combined_values.scalar_type(),
-            "split_1d_jagged_jagged_values_cuda_kernel_input2",
-            [&] {
-              using val_t = scalar_t;
-              _split_1d_jagged_jagged_cuda_kernel<index_t, val_t><<<
-                  blocks,
-                  threads,
-                  0,
-                  at::cuda::getCurrentCUDAStream()>>>(
-                  B,
-                  combined_offsets
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  combined_values
-                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
-                  lengths_left
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  offsets_left
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  offsets_right
-                      .packed_accessor32<index_t, 1, at::RestrictPtrTraits>(),
-                  values_left
-                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>(),
-                  values_right
-                      .packed_accessor32<val_t, 1, at::RestrictPtrTraits>());
-            });
-      });
-
-  return std::make_tuple(values_left, values_right);
-}
-} // namespace hstu
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py
deleted file mode 100644
index 8c27a787b..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/tests/concat_1d_jagged_jagged_test.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env python3
-
-# pyre-strict
-
-import unittest
-
-import torch
-from generative_recommenders.common import gpu_unavailable
-from hammer.ops.jagged import concat_1D_jagged_jagged
-from hypothesis import given, settings, strategies as st, Verbosity
-
-# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:concat_1d_jagged_jagged_test
-
-torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-
-
-class OpsTest(unittest.TestCase):
-    @unittest.skipIf(*gpu_unavailable)
-    # pyre-ignore
-    @given(
-        batch_size=st.integers(10, 500),
-        max_seq_len_left=st.integers(10, 1000),
-        max_seq_len_right=st.integers(10, 1000),
-        val_dtype=st.sampled_from([torch.float32, torch.float16, torch.bfloat16]),
-    )
-    @settings(
-        verbosity=Verbosity.verbose,
-        max_examples=100,
-        deadline=None,
-    )
-    def test_concat_1d_jagged_jagged(
-        self,
-        batch_size: int,
-        max_seq_len_left: int,
-        max_seq_len_right: int,
-        val_dtype: torch.dtype,
-    ) -> None:
-        batch_size = 3
-        max_seq_len_left = 4
-        max_seq_len_right = 2
-        lengths_left = torch.randint(
-            0, max_seq_len_left + 1, (batch_size,), device="cpu"
-        )
-        values_left = torch.rand(
-            (int(torch.sum(lengths_left).cpu().item()),), dtype=val_dtype, device="cpu"
-        )
-        offsets_left = torch.zeros(
-            (batch_size + 1,),
-            dtype=lengths_left.dtype,
-            device=lengths_left.device,
-        )
-        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
-        lengths_right = torch.randint(
-            0, max_seq_len_right + 1, (batch_size,), device="cpu"
-        )
-        values_right = torch.rand(
-            (int(torch.sum(lengths_right).cpu().item()),), dtype=val_dtype, device="cpu"
-        )
-        offsets_right = torch.zeros(
-            (batch_size + 1,),
-            dtype=lengths_right.dtype,
-            device=lengths_right.device,
-        )
-        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
-        custom_cpu_result = torch.ops.hstu.concat_1d_jagged_jagged(
-            lengths_left=lengths_left,
-            values_left=values_left,
-            lengths_right=lengths_right,
-            values_right=values_right,
-        )
-
-        custom_cuda_result = torch.ops.hstu.concat_1d_jagged_jagged(
-            lengths_left=lengths_left.cuda(),
-            values_left=values_left.cuda(),
-            lengths_right=lengths_right.cuda(),
-            values_right=values_right.cuda(),
-        )
-        torch.testing.assert_close(custom_cuda_result.cpu(), custom_cpu_result)
-
-    @unittest.skipIf(*gpu_unavailable)
-    def test_concat_1d_jagged_jagged_vs_hammer(self) -> None:
-        torch.manual_seed(42)
-        batch_size = 8
-        max_seq_len_left = 50
-        max_seq_len_right = 30
-
-        lengths_left = torch.randint(
-            0, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
-        )
-        lengths_right = torch.randint(
-            0, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
-        )
-
-        total_left = int(lengths_left.sum().item())
-        total_right = int(lengths_right.sum().item())
-
-        values_left = (
-            torch.randn(total_left, dtype=torch.float32)
-            if total_left > 0
-            else torch.empty(0, dtype=torch.float32)
-        )
-        values_right = (
-            torch.randn(total_right, dtype=torch.float32)
-            if total_right > 0
-            else torch.empty(0, dtype=torch.float32)
-        )
-
-        offsets_left = torch.zeros(
-            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
-        )
-        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
-        offsets_right = torch.zeros(
-            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
-        )
-        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
-
-        combined_values_ref = concat_1D_jagged_jagged(
-            max_seq_len_left=max_seq_len_left,
-            offsets_left=offsets_left,
-            values_left=values_left,
-            max_seq_len_right=max_seq_len_right,
-            offsets_right=offsets_right,
-            values_right=values_right,
-        )
-
-        custom_cuda_result = torch.ops.hstu.concat_1d_jagged_jagged(
-            lengths_left=lengths_left.cuda(),
-            values_left=values_left.cuda(),
-            lengths_right=lengths_right.cuda(),
-            values_right=values_right.cuda(),
-        )
-
-        torch.testing.assert_close(custom_cuda_result.cpu(), combined_values_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py
deleted file mode 100644
index cb787ea61..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/tests/hstu_mha_cpu_test.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-
-# pyre-strict
-
-# cmd: buck2 run @//mode/opt -c fbcode.enable_gpu_sections=true -c fbcode.platform010_cuda_version=12.8 -c fbcode.nvcc_arch=b200a  //generative_recommenders/ops/cpp/tests:hstu_mha_cpu_test
-
-import unittest
-
-import torch
-
-torch.ops.load_library(
-    "//generative_recommenders/ops/cpp/hstu_attention:hstu_flash_attention"
-)
-
-
-class TestHstuMhaFwd(unittest.TestCase):
-    def test_hstu_mha_fwd(self) -> None:
-        q: torch.Tensor = torch.randn([100, 4, 64], dtype=torch.bfloat16, device="cpu")
-        k: torch.Tensor = torch.randn([100, 4, 64], dtype=torch.bfloat16, device="cpu")
-        v: torch.Tensor = torch.randn([100, 4, 64], dtype=torch.bfloat16, device="cpu")
-        res = torch.ops.hstu.hstu_mha_fwd(
-            10,
-            0.25,
-            q,
-            k,
-            v,
-            torch.empty([0], dtype=torch.int32, device="cpu"),
-            True,  # causal
-            None,
-            None,
-            0,
-            0,
-            0,
-            None,  # q_descale
-            None,  # k_descale
-            None,  # v_descale
-            0,  # sm_margin
-        )
-        self.assertIsNotNone(res)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py
deleted file mode 100644
index 6a5f5997b..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/tests/jagged_transpose_1d_test.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env python3
-
-# pyre-strict
-
-import unittest
-
-import torch
-from generative_recommenders.common import gpu_unavailable
-from hammer.ops.jagged import jagged_transpose_1D
-from hypothesis import given, settings, strategies as st, Verbosity
-
-# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:jagged_transpose_1d_test
-
-torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-
-
-class OpsTest(unittest.TestCase):
-    @unittest.skipIf(*gpu_unavailable)
-    # pyre-ignore
-    @given(
-        size1=st.integers(2, 10),
-        size2=st.integers(2, 10),
-        max_len=st.integers(5, 50),
-        val_dtype=st.sampled_from([torch.float32, torch.float16, torch.bfloat16]),
-    )
-    @settings(
-        verbosity=Verbosity.verbose,
-        max_examples=100,
-        deadline=None,
-    )
-    def test_jagged_transpose_1d(
-        self,
-        size1: int,
-        size2: int,
-        max_len: int,
-        val_dtype: torch.dtype,
-    ) -> None:
-        lengths = torch.randint(
-            0, max_len + 1, (size1 * size2,), dtype=torch.int32, device="cpu"
-        )
-        offsets = torch.zeros(
-            (size1 * size2 + 1,), dtype=lengths.dtype, device=lengths.device
-        )
-        offsets[1:] = torch.cumsum(lengths.view(-1), dim=0)
-
-        values = torch.randn(int(offsets[-1].item()), dtype=val_dtype, device="cpu")
-
-        (
-            custom_cpu_values,
-            custom_cpu_offsets,
-            custom_cpu_lengths,
-        ) = torch.ops.hstu.jagged_transpose_1d(
-            values=values,
-            offsets=offsets,
-            lengths=lengths,
-            max_len=max_len,
-            size1=size1,
-            size2=size2,
-        )
-
-        (
-            custom_cuda_values,
-            custom_cuda_offsets,
-            custom_cuda_lengths,
-        ) = torch.ops.hstu.jagged_transpose_1d(
-            values=values.cuda(),
-            offsets=offsets.cuda(),
-            lengths=lengths.cuda(),
-            max_len=max_len,
-            size1=size1,
-            size2=size2,
-        )
-
-        torch.testing.assert_close(custom_cuda_values.cpu(), custom_cpu_values)
-        torch.testing.assert_close(custom_cuda_offsets.cpu(), custom_cpu_offsets)
-        torch.testing.assert_close(custom_cuda_lengths.cpu(), custom_cpu_lengths)
-
-    @unittest.skipIf(*gpu_unavailable)
-    # pyre-ignore
-    @given(
-        size1=st.integers(2, 10),
-        size2=st.integers(2, 10),
-        max_len=st.integers(5, 50),
-        val_dtype=st.sampled_from([torch.float32, torch.float16, torch.bfloat16]),
-    )
-    @settings(
-        verbosity=Verbosity.verbose,
-        max_examples=100,
-        deadline=None,
-    )
-    def test_jagged_transpose_1d_vs_hammer(
-        self,
-        size1: int,
-        size2: int,
-        max_len: int,
-        val_dtype: torch.dtype,
-    ) -> None:
-        lengths = torch.randint(0, max_len + 1, (size1 * size2,), dtype=torch.int32)
-        offsets = torch.zeros(
-            (size1 * size2 + 1,), dtype=lengths.dtype, device=lengths.device
-        )
-        offsets[1:] = torch.cumsum(lengths.view(-1), dim=0)
-
-        values = torch.randn(int(offsets[-1].item()), dtype=val_dtype)
-
-        values_ref, offsets_ref, lengths_ref = jagged_transpose_1D(
-            values=values,
-            offsets=offsets,
-            lengths=lengths,
-            max_len=max_len,
-            size1=size1,
-            size2=size2,
-        )
-
-        (
-            custom_cuda_values,
-            custom_cuda_offsets,
-            custom_cuda_lengths,
-        ) = torch.ops.hstu.jagged_transpose_1d(
-            values=values.cuda(),
-            offsets=offsets.cuda(),
-            lengths=lengths.cuda(),
-            max_len=max_len,
-            size1=size1,
-            size2=size2,
-        )
-
-        torch.testing.assert_close(custom_cuda_values.cpu(), values_ref)
-        torch.testing.assert_close(custom_cuda_offsets.cpu(), offsets_ref)
-        torch.testing.assert_close(custom_cuda_lengths.cpu(), lengths_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py
deleted file mode 100644
index 9826f199d..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/tests/replace_last_n_with_jagged_test.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/usr/bin/env python3
-
-# pyre-strict
-
-import unittest
-
-import torch
-from generative_recommenders.common import gpu_unavailable
-from hammer.ops.jagged import replace_last_n_with_jagged
-
-# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:replace_last_n_with_jagged_test
-
-torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-
-
-class OpsTest(unittest.TestCase):
-    @unittest.skipIf(*gpu_unavailable)
-    def test_replace_last_n_with_jagged(self) -> None:
-        torch.manual_seed(42)
-        batch_size = 8
-        embedding_dim = 64
-        max_seq_len_left = 25
-        max_seq_len_right = 10
-
-        lengths_left = torch.randint(
-            max_seq_len_right, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
-        )
-        lengths_right = torch.randint(
-            1, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
-        )
-
-        lengths_right = torch.min(lengths_right, lengths_left)
-
-        total_left = int(lengths_left.sum().item())
-        total_right = int(lengths_right.sum().item())
-
-        values_left = torch.randn(total_left, embedding_dim, dtype=torch.float32)
-        values_right = torch.randn(total_right, embedding_dim, dtype=torch.float32)
-
-        custom_cpu_result = torch.ops.hstu.replace_last_n_with_jagged(
-            lengths_left=lengths_left,
-            values_left=values_left,
-            lengths_right=lengths_right,
-            values_right=values_right,
-        )
-
-        custom_cuda_result = torch.ops.hstu.replace_last_n_with_jagged(
-            lengths_left=lengths_left.cuda(),
-            values_left=values_left.cuda(),
-            lengths_right=lengths_right.cuda(),
-            values_right=values_right.cuda(),
-        )
-
-        torch.testing.assert_close(custom_cuda_result.cpu(), custom_cpu_result)
-
-    @unittest.skipIf(*gpu_unavailable)
-    def test_replace_last_n_with_jagged_vs_hammer(self) -> None:
-        torch.manual_seed(42)
-        batch_size = 8
-        embedding_dim = 32
-        max_seq_len_left = 20
-        max_seq_len_right = 8
-
-        lengths_left = torch.randint(
-            max_seq_len_right, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
-        )
-        lengths_right = torch.randint(
-            1, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
-        )
-
-        lengths_right = torch.min(lengths_right, lengths_left)
-
-        total_left = int(lengths_left.sum().item())
-        total_right = int(lengths_right.sum().item())
-
-        values_left = torch.randn(total_left, embedding_dim, dtype=torch.float32)
-        values_right = torch.randn(total_right, embedding_dim, dtype=torch.float32)
-
-        offsets_left = torch.zeros(
-            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
-        )
-        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
-        offsets_right = torch.zeros(
-            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
-        )
-        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
-
-        result_ref = replace_last_n_with_jagged(
-            max_seq_len_left=max_seq_len_left,
-            offsets_left=offsets_left,
-            values_left=values_left,
-            offsets_right=offsets_right,
-            values_right=values_right,
-        )
-
-        custom_cuda_result = torch.ops.hstu.replace_last_n_with_jagged(
-            lengths_left=lengths_left.cuda(),
-            values_left=values_left.cuda(),
-            lengths_right=lengths_right.cuda(),
-            values_right=values_right.cuda(),
-        )
-
-        torch.testing.assert_close(custom_cuda_result.cpu(), result_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py b/recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py
deleted file mode 100644
index 24f12c4a2..000000000
--- a/recommendation_v4/generative_recommenders/ops/cpp/tests/split_1d_jagged_jagged_test.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env python3
-
-# pyre-strict
-
-import unittest
-
-import torch
-from generative_recommenders.common import gpu_unavailable
-from hammer.ops.jagged import split_1D_jagged_jagged
-
-# buck2 test @mode/opt -c fbcode.nvcc_arch=h100 fbcode//generative_recommenders/ops/cpp/tests:split_1d_jagged_jagged_test
-
-torch.ops.load_library("//generative_recommenders/ops/cpp:cpp_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops")
-torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:sparse_ops_cpu")
-
-
-class OpsTest(unittest.TestCase):
-    @unittest.skipIf(*gpu_unavailable)
-    def test_split_1d_jagged_jagged(self) -> None:
-        torch.manual_seed(42)
-        batch_size = 8
-        max_seq_len_left = 25
-        max_seq_len_right = 20
-
-        lengths_left = torch.randint(
-            0, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
-        )
-        lengths_right = torch.randint(
-            0, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
-        )
-
-        combined_lengths = lengths_left + lengths_right
-        combined_offsets = torch.zeros(
-            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
-        )
-        combined_offsets[1:] = torch.cumsum(combined_lengths.view(-1), dim=0)
-
-        combined_values = torch.randn(
-            int(combined_offsets[-1].item()), dtype=torch.float32
-        )
-
-        custom_cpu_left, custom_cpu_right = torch.ops.hstu.split_1d_jagged_jagged(
-            lengths_left=lengths_left,
-            lengths_right=lengths_right,
-            combined_values=combined_values,
-        )
-
-        custom_cuda_left, custom_cuda_right = torch.ops.hstu.split_1d_jagged_jagged(
-            lengths_left=lengths_left.cuda(),
-            lengths_right=lengths_right.cuda(),
-            combined_values=combined_values.cuda(),
-        )
-
-        torch.testing.assert_close(custom_cuda_left.cpu(), custom_cpu_left)
-        torch.testing.assert_close(custom_cuda_right.cpu(), custom_cpu_right)
-
-    @unittest.skipIf(*gpu_unavailable)
-    def test_split_1d_jagged_jagged_vs_hammer(self) -> None:
-        torch.manual_seed(42)
-        batch_size = 8
-        max_seq_len_left = 25
-        max_seq_len_right = 20
-
-        lengths_left = torch.randint(
-            0, max_seq_len_left + 1, (batch_size,), dtype=torch.int32
-        )
-        lengths_right = torch.randint(
-            0, max_seq_len_right + 1, (batch_size,), dtype=torch.int32
-        )
-
-        offsets_left = torch.zeros(
-            (batch_size + 1,), dtype=lengths_left.dtype, device=lengths_left.device
-        )
-        offsets_left[1:] = torch.cumsum(lengths_left.view(-1), dim=0)
-        offsets_right = torch.zeros(
-            (batch_size + 1,), dtype=lengths_right.dtype, device=lengths_right.device
-        )
-        offsets_right[1:] = torch.cumsum(lengths_right.view(-1), dim=0)
-
-        combined_offsets = offsets_left + offsets_right
-        combined_values = torch.randn(
-            int(combined_offsets[-1].item()), dtype=torch.float32
-        )
-
-        left_ref, right_ref = split_1D_jagged_jagged(
-            max_seq_len=max_seq_len_left + max_seq_len_right,
-            values=combined_values,
-            offsets_left=offsets_left,
-            offsets_right=offsets_right,
-        )
-
-        custom_cuda_left, custom_cuda_right = torch.ops.hstu.split_1d_jagged_jagged(
-            lengths_left=lengths_left.cuda(),
-            lengths_right=lengths_right.cuda(),
-            combined_values=combined_values.cuda(),
-        )
-
-        torch.testing.assert_close(custom_cuda_left.cpu(), left_ref)
-        torch.testing.assert_close(custom_cuda_right.cpu(), right_ref)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/README.md b/recommendation_v4/generative_recommenders/ops/triton_aot/README.md
deleted file mode 100644
index 2b0b1a834..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/README.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# Local Triton AOT Support
-
-This package is a minimal local copy of the Triton AOT pieces needed by the
-DLRM v3 HSTU inference end-to-end test. It avoids depending on the standalone
-`fbcode/triton_aot` package while preserving the compile, transform, and
-runtime-loading flow used by `generative_recommenders`.
-
-This is not intended to be a full fork of `fbcode/triton_aot`. Keep changes
-scoped to the GR inference use case unless a broader migration plan exists.
-
-## Code Structure
-
-- `types.py`: local `TritonAOT` registration object and `triton_aot` helper used
-  by GR AOT wrapper modules.
-- `preprocess.py`: FX graph preprocessing helpers, including wrapper-node
-  unwrapping before compile/transform.
-- `triton_*.py`: GR kernel-specific AOT wrapper modules for addmm, jagged
-  concat/split, layer norm variants, HSTU attention, and timestamp position
-  embeddings.
-- `compile/`: compile-time state, Triton signature/spec processing, generated
-  C++ codegen, and the `TritonAOTCompile` context manager.
-- `transform/`: FX graph transformation and generated Python wrapper code that
-  swaps Python AOT wrappers for `torch.ops.triton_aot.*` calls backed by built
-  shared libraries.
-- `build/`: extension builders and CUBIN embedding utilities used to create
-  loadable kernel libraries from compiled Triton artifacts.
-- `templates/`: C++ template files used by the compile/codegen path for kernel
-  entry points, embedded CUBIN data, and Torch operator registration.
-- `shared/`: compatibility helpers and type/spec conversion utilities shared by
-  compile and transform code.
-
-## Runtime Flow
-
-1. GR `triton_*.py` wrappers expose Triton kernels through local `triton_aot`
-   descriptors.
-2. `TritonAOTCompile` runs representative CUDA inputs, records kernel specs, and
-   compiles the collected Triton kernels into shared libraries.
-3. `transform_kernels` rewrites the FX graph so wrapper calls dispatch through
-   `torch.ops.triton_aot.*`.
-4. The e2e test copies the generated libraries into its workdir and passes them
-   to the C++ runner before executing the scripted sparse/dense modules.
-
-
-## Authors
-
-- Chang Pan <changpan@meta.com>
-- Zhiyong Wang (MRS) <zhiywang@meta.com>
-- Chenzhi Yu <chenzhi@meta.com>
-- Runming Lu <runming@meta.com>
-- Chun-Wei Chen <jcwchen@meta.com>
-- Michael He <michaelhe@meta.com>
-- Linjian Ma <linjianma@meta.com>
-- Xing Liu <xingl@meta.com>
-- Zhuoran Zhao <zhuoran@meta.com>
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py
deleted file mode 100644
index bc5963674..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/arg_descriptor.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-
-# pyre-strict
-
-"""ArgDescriptor — per-arg codegen descriptor for AOT-T.
-
-Centralises arg classification (pointer / scalar / constant) so every
-``gen_*`` function in ``codegen.py`` iterates descriptors instead of
-doing its own dict lookups into ``OpsUnit`` fields.
-
-Also provides type-mapping helpers that convert ``ArgDescriptor``
-metadata into context-specific C++ / TorchScript type strings.
-"""
-
-from __future__ import annotations
-
-from dataclasses import dataclass
-from typing import Any
-
-from generative_recommenders.ops.triton_aot.compile.spec_processing import OpsUnit
-from triton.runtime.jit import JITFunction
-
-
-# ---------------------------------------------------------------------------
-# Type-mapping helpers
-# ---------------------------------------------------------------------------
-
-CONSTANT_SELECTOR_CTYPE: dict[type[Any], str] = {
-    bool: "bool",
-    int: "int",
-    str: "const std::string&",
-}
-
-CONSTANT_CPP_OP_CTYPE: dict[type[Any], str] = {
-    bool: "bool",
-    int: "int64_t",
-    str: "const std::string&",
-}
-
-CONSTANT_TORCH_SCHEMA: dict[type[Any], str] = {
-    bool: "bool",
-    int: "int",
-    str: "str",
-}
-
-
-def scalar_cpp_op_ctype(triton_dtype: str) -> str:
-    """Triton scalar dtype → widened C++ type for cpp_op / torch_op params."""
-    if triton_dtype.startswith("i"):
-        return "int64_t"
-    if triton_dtype.startswith("f"):
-        return "double"
-    if triton_dtype == "bool":
-        return "bool"
-    raise ValueError(f"Unsupported scalar dtype for cpp_op: {triton_dtype}")
-
-
-def scalar_torch_schema(triton_dtype: str) -> str:
-    """Triton scalar dtype → TorchScript schema type string."""
-    if triton_dtype.startswith("i"):
-        return "int"
-    if triton_dtype.startswith("f"):
-        return "float"
-    if triton_dtype == "bool":
-        return "bool"
-    raise ValueError(f"Unsupported scalar dtype for torch schema: {triton_dtype}")
-
-
-# ---------------------------------------------------------------------------
-# ArgDescriptor hierarchy
-# ---------------------------------------------------------------------------
-
-
-@dataclass(frozen=True)
-class ArgDescriptor:
-    """Base class for per-arg codegen descriptors.
-
-    Built once by ``build_arg_descriptors`` and consumed by all ``gen_*``
-    functions.  Use ``isinstance`` to dispatch on arg kind:
-
-    - ``PointerArg`` — tensor pointer (required or optional)
-    - ``ScalarArg`` — non-pointer signature arg with a Triton dtype
-    - ``ConstantArg`` — compile-time constant with a Python type
-    """
-
-    name: str
-    index: int
-
-
-@dataclass(frozen=True)
-class PointerArg(ArgDescriptor):
-    """Tensor pointer arg (required or optional)."""
-
-    is_optional: bool
-
-
-@dataclass(frozen=True)
-class ScalarArg(ArgDescriptor):
-    """Non-pointer signature arg with a Triton dtype (e.g., ``"i32"``, ``"fp32"``).
-
-    ``triton_dtype`` is the **widest** type across all specs for this
-    position, computed by ``_compute_invariants`` via ``_wider_type``.
-    Individual specs may use a narrower type (e.g., ``"i32"`` when
-    ``triton_dtype`` is ``"i64"``); codegen adds ``fits_i32`` guards
-    and ``static_cast`` for narrowing.
-    """
-
-    triton_dtype: str
-
-
-@dataclass(frozen=True)
-class ConstantArg(ArgDescriptor):
-    """Compile-time constant arg with a Python type (``int``, ``str``, ``bool``)."""
-
-    python_type: type[Any]
-
-
-def build_arg_descriptors(
-    func: JITFunction[list[Any]],
-    unit: OpsUnit,
-) -> list[ArgDescriptor]:
-    """Build ordered arg descriptors from func arg names + OpsUnit invariants.
-
-    Single source of truth for arg classification.  Called once in
-    ``compile_to_cpp`` and passed to all downstream codegen functions.
-    """
-    result: list[ArgDescriptor] = []
-    for i, name in enumerate(func.arg_names):
-        if i in unit.pointer_args:
-            result.append(
-                PointerArg(name=name, index=i, is_optional=i in unit.optional)
-            )
-        elif i in unit.scalar_dtypes:
-            result.append(
-                ScalarArg(name=name, index=i, triton_dtype=unit.scalar_dtypes[i])
-            )
-        elif i in unit.constant_types:
-            result.append(
-                ConstantArg(name=name, index=i, python_type=unit.constant_types[i])
-            )
-        else:
-            raise ValueError(
-                f"Arg {name} (index {i}) not classified as pointer, scalar, "
-                f"or constant in OpsUnit"
-            )
-    return result
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py
deleted file mode 100644
index f1b03848f..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/codegen.py
+++ /dev/null
@@ -1,780 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-
-# pyre-strict
-
-"""C++ and Python code generation for AOT-T compiled kernels.
-
-Generates:
-  - kernel.h      (header with gridDims, tuner meta, selector proto)
-  - kernel.cpp    (cubin externs, loaders, launchers, selector)
-  - _torch_op.cpp (torch op registration)
-  - _meta.py      (Python autotuner meta function)
-"""
-
-import textwrap
-from collections import Counter
-from typing import Any
-
-# @manual=//triton:triton
-import triton
-from generative_recommenders.ops.triton_aot.compile.arg_descriptor import (
-    ArgDescriptor,
-    CONSTANT_CPP_OP_CTYPE,
-    CONSTANT_SELECTOR_CTYPE,
-    CONSTANT_TORCH_SCHEMA,
-    ConstantArg,
-    PointerArg,
-    scalar_cpp_op_ctype,
-    scalar_torch_schema,
-    ScalarArg,
-)
-from generative_recommenders.ops.triton_aot.compile.spec_processing import (
-    KernelSpec,
-    OpsUnit,
-)
-from generative_recommenders.ops.triton_aot.compile.stable_types import (
-    PY_TYPES_TO_CPP_TYPES,
-    SCALAR_TYPES,
-)
-from generative_recommenders.ops.triton_aot.compile.utils import (
-    hash_kernel_name,
-    unwrap_heuristic,
-)
-from generative_recommenders.ops.triton_aot.shared.compat import get_scratch_parameters
-from generative_recommenders.ops.triton_aot.shared.types import AUTOTUNE_ATTRs, CTYPES
-from generative_recommenders.ops.triton_aot.templates.template_utils import (
-    load_template,
-    render_template,
-)
-from triton.runtime.jit import JITFunction
-
-
-# ---------------------------------------------------------------------------
-# Kernel naming and binary generation
-# ---------------------------------------------------------------------------
-
-
-def gen_kernel_name(
-    fn: Any,
-    spec: KernelSpec,
-    cc: int | str,
-) -> str:
-    name = fn.__name__
-    sig = "_".join([p.replace("*", "p") for p in spec.signature.values()])
-    const = "_".join(map(str, spec.constants.values()))
-    cc_str = f"sm{cc}"
-    autotune_configs = []
-    autotune_configs.append(f"w{spec.num_warps}")
-    autotune_configs.append(f"s{spec.num_stages}")
-    # AMD only
-    autotune_configs.append(f"matrix{spec.matrix_instr_nonkdim}")
-    autotune_configs.append(f"wave{spec.waves_per_eu}")
-    autotune_configs.append(f"kpack{spec.kpack}")
-    # See kernel_suffix in triton/compiler/code_generator.py
-    suffix = ""
-    for i, _ in enumerate(spec.signature):
-        suffix += str(i)
-        if i in spec.divisible_by_16:
-            suffix += "d"
-        if i in spec.divisible_by_8:
-            suffix += "e"
-    return "_".join([name, cc_str, sig, const] + autotune_configs + [suffix])
-
-
-def gen_cubin(kernel_name: str, kernel: Any, install_dir: str, backend: str) -> str:
-    """Generate kernel binary file (.cubin or .hsaco) and return extern declaration.
-
-    Args:
-        kernel_name: Full kernel name including specialization suffix.
-        kernel: Compiled Triton kernel object containing binary in kernel.asm.
-        install_dir: Directory to write binary file.
-        backend: GPU backend ("cuda" or "hip").
-
-    Returns:
-        C++ extern declaration for the kernel binary array.
-    """
-    hashed = hash_kernel_name(kernel_name)
-    if backend == "hip":
-        binary_file = f"{install_dir}/{hashed}.hsaco"
-        with open(binary_file, "wb") as hsaco:
-            hsaco.write(kernel.asm["hsaco"])
-        target_symbol_name = f"{kernel_name}_cubin"
-    else:
-        binary_file = f"{install_dir}/{hashed}.cubin"
-        with open(binary_file, "wb") as cubin:
-            cubin.write(kernel.asm["cubin"])
-        target_symbol_name = f"{kernel_name}_cubin"
-
-    # We return extern declarations for both the array and its pointer.
-    # The pointer is used by gen_loader() to generate R_X86_64_64 relocations
-    # instead of R_X86_64_32, which allows the .triton section to be placed
-    # beyond the 4GB address limit in large binaries.
-    # Note: The pointer is volatile to prevent optimizer constant-propagation.
-    return f'extern "C" {{ extern unsigned char {target_symbol_name}[]; extern const void* volatile {target_symbol_name}_ptr; }}'
-
-
-def gen_loader(kernel_name: str, cubin_name: str, shared: int) -> str:
-    # TODO(changpan): Extract inline cuModuleLoadData/cuModuleGetFunction error
-    # handling into a shared helper to reduce generated code size.
-    return textwrap.dedent(
-        f"""
-        CUfunction load_{kernel_name}(void)
-        {{
-            thread_local std::unordered_map<int32_t, CUfunction> cache;
-            auto idx = torch::stable::accelerator::getCurrentDeviceIndex();
-            auto res = cache.find(idx);
-            if (res != cache.end()) {{
-                return res->second;
-            }}
-            CUfunction func;
-            CUmodule mod_ptr;
-            CUresult err;
-            // Use pointer to cubin data to generate R_X86_64_64 relocation
-            // instead of R_X86_64_32, allowing cubin data to be placed beyond 4GB
-            const void *image = {kernel_name}_cubin_ptr;
-
-            err = cuModuleLoadData(&mod_ptr, image);
-            if (err != 0) {{
-                const char* errStr;
-                cuGetErrorString(err, &errStr);
-                throw std::runtime_error("cuModuleLoadData failed for {kernel_name}: error " + std::to_string(err) + " (" + (errStr ? errStr : "unknown") + ")");
-            }}
-
-            err = cuModuleGetFunction(&func, mod_ptr, "{cubin_name}");
-            if (err != 0) {{
-                const char* errStr;
-                cuGetErrorString(err, &errStr);
-                throw std::runtime_error("cuModuleGetFunction failed for {kernel_name}: error " + std::to_string(err) + " (" + (errStr ? errStr : "unknown") + ")");
-            }}
-
-            check_errors({shared}, func);
-            cache.emplace(idx, func);
-            return func;
-        }}
-    """
-    )
-
-
-# ---------------------------------------------------------------------------
-# Launcher codegen (per-spec)
-# ---------------------------------------------------------------------------
-
-
-def gen_launcher_params(
-    descriptors: list[ArgDescriptor],
-    signature: dict[int, str],
-) -> str:
-    args = ["gridDims grid"]
-    for d in descriptors:
-        if d.index in signature:
-            if isinstance(d, PointerArg):
-                ctype = "void*"
-            else:
-                ctype = CTYPES[signature[d.index]]
-            args.append(f"{ctype} {d.name}")
-    return ", ".join(args)
-
-
-def gen_launch_args(
-    func: JITFunction[list[Any]],
-    spec: KernelSpec,
-) -> list[str]:
-    """Generate kernel launch argument list (pointers to non-constant arguments)."""
-    args = []
-    for i, arg in enumerate(func.arg_names):
-        if i in spec.constants:
-            continue
-        assert i in spec.signature, f"Argument {i} ({arg}) does not appear in signature"
-        args.append(f"&{arg}")
-    return args
-
-
-def gen_launcher(
-    kernel_name: str,
-    func: JITFunction[list[Any]],
-    kernel: Any,
-    shared: int,
-    warp_size: int,
-    spec: KernelSpec,
-    descriptors: list[ArgDescriptor],
-) -> str:
-    params = gen_launcher_params(descriptors, spec.signature)
-    args = gen_launch_args(func, spec)
-
-    scratch_declarations, scratch_args = get_scratch_parameters(kernel)
-    args.extend(scratch_args)
-
-    args_str = ", ".join(args)
-
-    return textwrap.dedent(
-        f"""
-        void {kernel_name}({params}) {{
-            CUfunction func = load_{kernel_name}();
-            cudaStream_t stream = grid.stream ? grid.stream : triton_aot_get_current_stream();
-            {scratch_declarations}
-            void *args[] = {{ {args_str} }};
-            auto res = cuLaunchKernel(func, grid.x, grid.y, grid.z, {warp_size} * {spec.num_warps}, 1, 1, {shared}, stream, args, NULL);
-            TRITON_AOT_CU_CHECK(res);
-        }}
-    """
-    )
-
-
-# ---------------------------------------------------------------------------
-# Selector codegen (invariant)
-# ---------------------------------------------------------------------------
-
-
-def gen_selector_params(
-    descriptors: list[ArgDescriptor],
-) -> str:
-    """Generate C++ selector function parameter list."""
-    args = ["gridDims grid"]
-    for d in descriptors:
-        if isinstance(d, PointerArg):
-            args.append(f"const std::optional<torch::stable::Tensor>& {d.name}")
-        elif isinstance(d, ScalarArg):
-            args.append(f"{CTYPES[d.triton_dtype]} {d.name}")
-        elif isinstance(d, ConstantArg):
-            args.append(f"{CONSTANT_SELECTOR_CTYPE[d.python_type]} {d.name}")
-
-    for name, value in AUTOTUNE_ATTRs.items():
-        args.append(f"{type(value).__name__} {name}")
-    return ", ".join(args)
-
-
-def gen_launcher_call_args(
-    descriptors: list[ArgDescriptor],
-    signature: dict[int, str],
-) -> str:
-    args = ["grid"]
-    for d in descriptors:
-        if d.index in signature:
-            if isinstance(d, PointerArg):
-                args.append(f"{d.name}.value().data_ptr()")
-            elif isinstance(d, ScalarArg) and signature[d.index] != d.triton_dtype:
-                args.append(f"static_cast<{CTYPES[signature[d.index]]}>({d.name})")
-            else:
-                args.append(d.name)
-    return ", ".join(args)
-
-
-def gen_guarded_calls(  # noqa: C901
-    func: JITFunction[list[Any]],
-    unit: OpsUnit,
-    descriptors: list[ArgDescriptor],
-) -> str:
-    desc_by_idx: dict[int, ArgDescriptor] = {d.index: d for d in descriptors}
-    calls = []
-    for spec in unit.specs:
-        kernel_name = gen_kernel_name(func, spec, unit.cc)
-        args = gen_launcher_call_args(descriptors, spec.signature)
-        guards = ""
-
-        # Guard on tensor dtypes (per-spec: different specs may have different dtypes)
-        for i, ttype in spec.signature.items():
-            d = desc_by_idx[i]
-            if not isinstance(d, PointerArg):
-                continue
-            arg = d.name
-            atype = SCALAR_TYPES[ttype]
-            guards += f"if ({arg}.has_value()) "
-            guards += f"if ({arg}.value().scalar_type() == {atype}) "
-
-        # Guard on int range (spec uses narrower type than selector)
-        for i, dtype in spec.signature.items():
-            d = desc_by_idx[i]
-            if isinstance(d, ScalarArg) and dtype != d.triton_dtype:
-                if dtype == "i32":
-                    guards += f"if (fits_i32({d.name})) "
-
-        # Guard on constant values.
-        for i, val in spec.constants.items():
-            arg = desc_by_idx[i].name
-            if isinstance(val, bool):
-                guards += f"if ({arg}) " if val else f"if (!({arg})) "
-            elif isinstance(val, str):
-                guards += f'if ({arg} == "{val}") '
-            elif val is None:
-                guards += f"if (!{arg}.has_value()) "
-            else:
-                guards += f"if ({arg} == {val}) "
-
-        # Guard on special constants
-        for name in AUTOTUNE_ATTRs.keys():
-            guards += f"if ({name} == {getattr(spec, name)}) "
-
-        # Guard on divisible_by_16
-        for i in spec.divisible_by_16:
-            arg = desc_by_idx[i].name
-            if i in spec.signature:
-                ttype = spec.signature[i]
-                if ttype.startswith("*"):
-                    guards += f"if ((((uintptr_t){arg}.value().data_ptr()) % 16) == 0) "
-                else:
-                    guards += f"if (({arg} % 16) == 0) "
-            elif i in spec.constants:
-                assert (spec.constants[i] % 16) == 0
-
-        # Guard on divisible_by_8
-        for i in spec.divisible_by_8:
-            arg = desc_by_idx[i].name
-            if i in spec.signature:
-                ttype = spec.signature[i]
-                # divisible_by_8 is only applied to int
-                if not ttype.startswith("*"):
-                    guards += f"if (({arg} % 8) == 0) "
-            elif i in spec.constants:
-                assert (spec.constants[i] % 8) == 0
-
-        # Call the specialization.
-        calls.append(f"{guards}return {kernel_name}({args});\n")
-    return "".join(calls)
-
-
-def gen_selector_proto(
-    descriptors: list[ArgDescriptor],
-    func_name: str,
-) -> str:
-    params = gen_selector_params(descriptors)
-    # Add Triton's default values for num warps/stages, etc
-    for name, value in AUTOTUNE_ATTRs.items():
-        params = params.replace(name, f"{name}={value}")
-    return f"void {func_name}({params});"
-
-
-def gen_failure_msg(
-    descriptors: list[ArgDescriptor],
-) -> str:
-    """Generate C++ ``<<``-chain for the dispatch-failure error message.
-
-    Groups parameters by category (Tensors / Scalars / Constants /
-    Autotune / Device).  Tensor entries include aligned16 status.
-    """
-    tensors: list[str] = []
-    scalars: list[str] = []
-    constants: list[str] = []
-
-    for d in descriptors:
-        if isinstance(d, PointerArg):
-            dtype_expr = (
-                f"({d.name}.has_value()"
-                f" ? c10::toString({d.name}.value().scalar_type())"
-                f' : "nullptr")'
-            )
-            align_expr = (
-                f"(({d.name}.has_value()"
-                f" && (((uintptr_t){d.name}.value().data_ptr()) % 16) == 0)"
-                f' ? "true" : "false")'
-            )
-            tensors.append(
-                f'" {d.name}=" << {dtype_expr} << "(aligned16=" << {align_expr} << ")"'
-            )
-        elif isinstance(d, ScalarArg):
-            scalars.append(f'" {d.name}=" << {d.name}')
-        elif isinstance(d, ConstantArg):
-            constants.append(f'" {d.name}=" << {d.name}')
-
-    autotune: list[str] = [f'" {n}=" << {n}' for n in AUTOTUNE_ATTRs]
-
-    sections: list[str] = []
-    if tensors:
-        sections.append('"\\n  Tensors:" << ' + " << ".join(tensors))
-    if scalars:
-        sections.append('"\\n  Scalars:" << ' + " << ".join(scalars))
-    if constants:
-        sections.append('"\\n  Constants:" << ' + " << ".join(constants))
-    sections.append('"\\n  Autotune:" << ' + " << ".join(autotune))
-    sections.append('"\\n  Device: cc=" << cc')
-
-    return " << ".join(sections)
-
-
-def gen_selector(
-    func: JITFunction[list[Any]],
-    unit: OpsUnit,
-    descriptors: list[ArgDescriptor],
-) -> str:
-    params = gen_selector_params(descriptors)
-    guarded_calls = gen_guarded_calls(func, unit, descriptors)
-    failure_msg = gen_failure_msg(descriptors)
-    return f"""
-        void {func.__name__}({params}) {{
-            auto cc = compute_capability();
-            if (grid.x * grid.y * grid.z > 0) {{
-                {guarded_calls}
-                std::stringstream ss;
-                ss << "[TritonAOT] No implementation found for {func.__name__}" << {failure_msg};
-                throw std::runtime_error(ss.str());
-            }}
-        }}
-    """
-
-
-# ---------------------------------------------------------------------------
-# Torch op codegen (invariant)
-# ---------------------------------------------------------------------------
-
-
-def gen_cpp_op_params(
-    descriptors: list[ArgDescriptor],
-) -> str:
-    args = []
-    for d in descriptors:
-        if isinstance(d, PointerArg):
-            args.append(f"std::optional<torch::stable::Tensor> {d.name}")
-        elif isinstance(d, ScalarArg):
-            args.append(f"{scalar_cpp_op_ctype(d.triton_dtype)} {d.name}")
-        elif isinstance(d, ConstantArg):
-            args.append(f"{CONSTANT_CPP_OP_CTYPE[d.python_type]} {d.name}")
-    for name, value in AUTOTUNE_ATTRs.items():
-        args.append(f"{PY_TYPES_TO_CPP_TYPES[type(value)]} {name}")
-    return ", ".join(args)
-
-
-def gen_torch_op_params(
-    descriptors: list[ArgDescriptor],
-    default_values: dict[str, Any],
-) -> str:
-    args = []
-
-    def gen_str_wrap(value: Any) -> Any:
-        return f'\\"{value}\\"' if isinstance(value, str) else value
-
-    def gen_default_str(arg: str) -> str:
-        return (
-            f" = {gen_str_wrap(default_values[arg])}" if arg in default_values else ""
-        )
-
-    for d in descriptors:
-        df_str = gen_default_str(d.name)
-        if isinstance(d, PointerArg):
-            t = chr(ord("a") + d.index)
-            args.append(f"Tensor({t}!)? {d.name}")
-        elif isinstance(d, ScalarArg):
-            args.append(f"{scalar_torch_schema(d.triton_dtype)} {d.name}{df_str}")
-        elif isinstance(d, ConstantArg):
-            args.append(f"{CONSTANT_TORCH_SCHEMA[d.python_type]} {d.name}{df_str}")
-    for name, value in AUTOTUNE_ATTRs.items():
-        args.append(f"{type(value).__name__} {name}={value}")
-    return ", ".join(args)
-
-
-def gen_torch_op(
-    func: JITFunction[list[Any]],
-    descriptors: list[ArgDescriptor],
-    default_values: dict[str, Any],
-) -> str:
-    cpp_params = gen_cpp_op_params(descriptors)
-    torch_params = gen_torch_op_params(descriptors, default_values)
-    arg_names = list(func.arg_names) + list(AUTOTUNE_ATTRs.keys())
-    args = ", ".join(arg_names)
-
-    # Generate a comment noting which tensor params are non-optional but
-    # promoted to Tensor? for TorchScript compatibility.
-    promoted = [
-        d.name for d in descriptors if isinstance(d, PointerArg) and not d.is_optional
-    ]
-    type_comment = ""
-    if promoted:
-        type_comment = (
-            f"// Note: {', '.join(promoted)} are non-optional but use Tensor? "
-            "for TorchScript compatibility.\n"
-            "// Dispatch uses HAS_XXX constexpr ints, not tensor presence.\n"
-        )
-    return textwrap.dedent(
-        f"""
-        namespace {{
-        triton::aot::gridDims dims_from_vec(
-            const std::vector<int64_t>& grid
-        ) {{
-          return triton::aot::gridDims(
-              grid.size() > 0 ? grid[0] : 1,
-              grid.size() > 1 ? grid[1] : 1,
-              grid.size() > 2 ? grid[2] : 1
-          );
-        }}
-
-        {type_comment}void {func.__name__}_op(
-            std::vector<int64_t> grid,
-            {cpp_params}
-        ) {{
-            triton::aot::{func.__name__}(
-                dims_from_vec(grid),
-                {args}
-            );
-        }}
-
-        void {func.__name__}_dummy_op(
-            std::vector<int64_t> grid,
-            {cpp_params}
-        ) {{
-            // Do nothing.  The op is a dummy for model transform,
-            // processing, and splitting services.
-        }}
-        }}
-
-        STABLE_TORCH_LIBRARY_FRAGMENT(triton_aot, m) {{
-          m.def("{func.__name__}(int[] grid, {torch_params}) -> ()");
-        }}
-        STABLE_TORCH_LIBRARY_IMPL(triton_aot, CUDA, m) {{
-          m.impl("{func.__name__}", TORCH_BOX(&{func.__name__}_op));
-        }}
-
-        STABLE_TORCH_LIBRARY_IMPL(triton_aot, CPU, m) {{
-          m.impl("{func.__name__}", TORCH_BOX(&{func.__name__}_dummy_op));
-        }}
-
-        STABLE_TORCH_LIBRARY_IMPL(triton_aot, Meta, m) {{
-          m.impl("{func.__name__}", TORCH_BOX(&{func.__name__}_dummy_op));
-        }}
-        """
-    )
-
-
-# ---------------------------------------------------------------------------
-# Tuner meta codegen
-# ---------------------------------------------------------------------------
-
-
-def key_names_and_idx(func: Any) -> tuple[list[str], list[int]]:
-    if hasattr(func, "key_idx"):
-        arg_names = [func.arg_names[idx] for idx in func.key_idx]
-        key_idx = func.key_idx
-    else:
-        arg_names = func.keys
-        key_idx = [func.arg_names.index(arg) for arg in arg_names]
-    return arg_names, key_idx
-
-
-def is_non_empty_mapping_of_type(obj: object, value_type: type[Any]) -> bool:
-    """Check if object is a non-empty dict with all values of specific type"""
-    if not obj or not isinstance(obj, dict):
-        return False
-
-    return all(isinstance(value, value_type) for value in obj.values())
-
-
-_LAUNCH_PARAM_NAMES: list[str] = ["num_warps", "num_stages"]
-
-
-def gen_tuner_meta_py(
-    func: Any,
-    tuner_fallback: bool,
-    unit: OpsUnit,
-) -> str:
-    vals = []
-
-    guard_list = []
-
-    # Use custom meta generation function if available
-    if hasattr(func, "gen_autotune_select_meta_src"):
-        return func.gen_autotune_select_meta_src(unit.constant_types)
-
-    if hasattr(func, "cache") and is_non_empty_mapping_of_type(
-        func.cache, triton.runtime.autotuner.Config
-    ):
-        # auto tuned configs
-        arg_names, key_idx = key_names_and_idx(func)
-
-        in_args = ", ".join(
-            [
-                f"{name}: {unit.constant_types[idx].__name__ if idx in unit.constant_types else 'int'}"
-                for idx, name in zip(key_idx, arg_names)
-            ]
-        )
-
-        cfg_first = next(iter(func.cache.values()))
-        return_names = list(cfg_first.kwargs.keys()) + _LAUNCH_PARAM_NAMES
-
-        for key, cfg in func.cache.items():
-            val = list(cfg.kwargs.values()) + [cfg.num_warps, cfg.num_stages]
-            val = tuple(val)
-            vals.append(val)
-            equations = []
-            for arg, value in zip(arg_names, key):
-                if isinstance(value, str):
-                    equations.append(f"{arg} == '{value}'")
-                elif isinstance(value, bool):
-                    equations.append(f"{arg} == {int(value)}")
-                else:
-                    equations.append(f"{arg} == {value}")
-            guard_list.append(f"if {' and '.join(equations)}: return {val}")
-
-    else:
-        # default configs — single spec, use specs[0]
-        in_args = ""
-        arg_names = list(_LAUNCH_PARAM_NAMES)
-        return_names = list(_LAUNCH_PARAM_NAMES)
-        val = unit.specs[0].num_warps, unit.specs[0].num_stages
-        vals.append(val)
-
-    name = unwrap_heuristic(func, JITFunction).__name__
-    meta = name + "_meta"
-
-    guards = "\n        ".join(guard_list)
-
-    fmt_args = ", ".join([f"{{{arg_name}}}" for arg_name in arg_names])
-
-    raise_runtime_error_str = (
-        f"""raise RuntimeError(f"No autotuning config found for {name}({fmt_args})")"""
-    )
-    fallback_str = f"""return {Counter(vals).most_common(1)[0][0]}"""
-
-    returns_comment = f"# Returns: ({', '.join(return_names)})"
-
-    return textwrap.dedent(
-        f"""
-    def {meta}({in_args}):
-        {returns_comment}
-        {guards}
-        {fallback_str if tuner_fallback else raise_runtime_error_str}
-    """
-    )
-
-
-def gen_tuner_meta_cpp(
-    func: Any,
-    tuner_fallback: bool,
-    constant_types: dict[int, type[Any]],
-) -> str:
-    # TODO(changpan): This C++ inline _meta is currently dead code — no C++ caller
-    # invokes it.  The Python _meta.py (gen_tuner_meta_py) is the only consumer.
-    # Double check, try remove this and the TUNER_META_CPP template region.
-    def infer_arg_type(idx: int) -> str:
-        if idx in constant_types:
-            return PY_TYPES_TO_CPP_TYPES[constant_types[idx]]
-        else:
-            return "int64_t"
-
-    arg_names, key_idx = key_names_and_idx(func)
-
-    in_args = ", ".join(
-        [f"{infer_arg_type(idx)} {name}" for idx, name in zip(key_idx, arg_names)]
-    )
-
-    vals = []
-    guard_list = []
-    for key, cfg in func.cache.items():
-        val = list(cfg.kwargs.values()) + [cfg.num_warps, cfg.num_stages]
-        val = tuple(val)
-        vals.append(val)
-        equations = []
-        for arg, value in zip(arg_names, key):
-            if isinstance(value, str):
-                equations.append(f'{arg} == "{value}"')
-            elif isinstance(value, bool):
-                equations.append(f"{arg} == {int(value)}")
-            else:
-                equations.append(f"{arg} == {value}")
-        guard_list.append(f"if ({' && '.join(equations)}) return std::make_tuple{val};")
-    guards = "\n        ".join(guard_list)
-    name = unwrap_heuristic(func, JITFunction).__name__
-    meta = name + "_meta"
-    fmt_args = ", ".join([f"{arg_name}" for arg_name in arg_names])
-    raise_runtime_error_str = f"""throw std::runtime_error("No autotuning config found for {name}({fmt_args})");"""
-    fallback_str = f"""return std::make_tuple{Counter(vals).most_common(1)[0][0]};"""
-    # Infer the return type from the actual values
-    return_type = _infer_return_type(vals[0])
-    return textwrap.dedent(
-        f"""
-    inline std::tuple<{return_type}> {meta}({in_args}) {{
-        {guards}
-        {fallback_str if tuner_fallback else raise_runtime_error_str}
-    }}
-    """
-    )
-
-
-def _infer_return_type(vals: tuple[Any, ...]) -> str:
-    types = [PY_TYPES_TO_CPP_TYPES.get(type(val)) for val in vals]
-    try:
-        # pyre-fixme[6]: For 1st argument expected
-        #  `Iterable[typing_extensions.LiteralString]` but got `List[Optional[str]]`.
-        return ", ".join(types)
-    except TypeError:  # one of the types cannot be inferred, e.g. `None`
-        raise ValueError("Cannot infer return type from `vals`")
-
-
-# ---------------------------------------------------------------------------
-# Top-level codegen entry points
-# ---------------------------------------------------------------------------
-
-
-def generate_header_content(
-    tuned_func: triton.runtime.autotuner.Autotuner | None,
-    func: JITFunction[list[Any]],
-    unit: OpsUnit,
-    descriptors: list[ArgDescriptor],
-    tuner_fallback: bool,
-) -> str:
-    """Generate the content of the .h header file."""
-    h_template = load_template("kernel.h")
-    tuner_meta_cpp = (
-        gen_tuner_meta_cpp(tuned_func, tuner_fallback, unit.constant_types)
-        if tuned_func
-        else ""
-    )
-    selector_proto = gen_selector_proto(descriptors, func.__name__)
-    return render_template(
-        h_template,
-        {
-            "TUNER_META_CPP": tuner_meta_cpp,
-            "SELECTOR_PROTO": selector_proto,
-        },
-    )
-
-
-def generate_kernel_cpp_content(
-    func: JITFunction[list[Any]],
-    unit: OpsUnit,
-    descriptors: list[ArgDescriptor],
-    prefix: str,
-    generated_specs: list[str],
-    backend: str,
-) -> str:
-    """Generate the content of the kernel .cpp file.
-
-    All tensor params use Tensor? for TorchScript compatibility.
-    Dispatch relies on HAS_XXX constexpr ints, not tensor presence.
-    """
-    cpp_template = load_template("kernel.cpp")
-    kernel_specs = "\n".join(generated_specs)
-    selector = gen_selector(func, unit, descriptors)
-
-    # On AMD, apply hipification to generated code (KERNEL_SPECS, SELECTOR)
-    # Templates are already hipified at load time (from hip/ subdirectory)
-    if backend == "hip":
-        from torch._inductor.codegen.aoti_hipify_utils import maybe_hipify_code_wrapper
-
-        kernel_specs = maybe_hipify_code_wrapper(kernel_specs, force_hipify=True)
-        selector = maybe_hipify_code_wrapper(selector, force_hipify=True)
-
-    cpp_content = render_template(
-        cpp_template,
-        {
-            "HEADER_INCLUDE": f'#include "{prefix}.h"\n',
-            "KERNEL_SPECS": kernel_specs,
-            "SELECTOR": selector,
-        },
-    )
-    return cpp_content
-
-
-def generate_torch_op_content(
-    func: JITFunction[list[Any]],
-    descriptors: list[ArgDescriptor],
-    prefix: str,
-    default_values: dict[str, Any],
-) -> str:
-    """Generate the content of the torch_op .cpp file."""
-    torch_template = load_template("torch_op.cpp")
-    torch_op_content = gen_torch_op(func, descriptors, default_values)
-    torch_content = render_template(
-        torch_template,
-        {
-            "HEADER_INCLUDE": f'#include "{prefix}.h"\n',
-            "TORCH_OP": torch_op_content,
-        },
-    )
-    return torch_content
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py
deleted file mode 100644
index 231cb52f1..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/compile_state.py
+++ /dev/null
@@ -1,409 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# pyre-strict
-
-#!/usr/bin/env python3
-
-from __future__ import annotations
-
-import hashlib
-import json
-import os
-import tempfile
-from inspect import getcallargs, Parameter, signature
-from typing import Any, Callable, Dict, List, Optional, Set
-
-import torch
-
-# @manual=//triton:triton
-import triton.language as tl
-from generative_recommenders.ops.triton_aot.compile.stable_types import SCALAR_TYPES
-from generative_recommenders.ops.triton_aot.compile.utils import is_autotuner
-from generative_recommenders.ops.triton_aot.types import (
-    Annotation,
-    AnnotationHint,
-    TritonAOT,
-)
-
-# @manual=//triton:triton
-from triton.runtime.jit import KernelInterface, mangle_type
-
-
-class CustomEncoder(json.JSONEncoder):
-    # pyre-ignore[14]: Inconsistent override
-    def default(self, obj: object) -> Any:
-        if isinstance(obj, set):
-            return {"__set__": True, "items": sorted(obj)}
-        # Handle other non-serializable types
-        return super().default(obj)
-
-
-def hash_spec(spec: Dict[str, Any]) -> str:
-    serialized_dict = json.dumps(spec, cls=CustomEncoder, sort_keys=True)
-    return hashlib.sha256(serialized_dict.encode("utf-8")).hexdigest()
-
-
-class AOTTCompileState:
-    """
-    Singleton state container for Triton AOT compilation.
-
-    Description:
-    This singleton pattern enables state sharing between code loaded via
-    torch.package (which creates isolated module namespaces) and the regular
-    Python import system. Without this pattern, the packaged module would have
-    its own copy of global state, leading to inconsistencies.
-
-    Usage:
-        # Normal usage - get the singleton instance
-        state = AOTTCompileState.get_instance()
-
-        # For torch.package integration - inject shared instance into packaged module
-        packaged_module = package_importer.import_module("triton_aot.compile.compile_state")
-        packaged_module.AOTTCompileState.set_instance(AOTTCompileState.get_instance())
-    """
-
-    _instance: Optional["AOTTCompileState"] = None
-
-    kernel_specs: Dict[KernelInterface[List[Any]], List[Dict[str, List[Any]]]] = {}
-    specs_hashset: Dict[KernelInterface[List[Any]], Set[str]] = {}
-    enable_aott_compile: bool = False
-    compile_base_dir: str = ""
-    compile_path: str = ""
-
-    def __new__(cls) -> "AOTTCompileState":
-        if cls._instance is None:
-            instance = super().__new__(cls)
-            instance._initialize()
-            cls._instance = instance
-        return cls._instance
-
-    def _initialize(self) -> None:
-        """Initialize the singleton state. Called only once."""
-        self.kernel_specs: Dict[
-            KernelInterface[List[Any]], List[Dict[str, List[Any]]]
-        ] = {}
-        self.specs_hashset: Dict[KernelInterface[List[Any]], Set[str]] = {}
-        self.enable_aott_compile: bool = False
-        self.compile_base_dir: str = os.getenv("TRITON_AOT_PATH_PREFIX", "/var/tmp")
-        self.compile_path: str = tempfile.mkdtemp(
-            dir=self.compile_base_dir, prefix="triton_aot_compile_"
-        )
-
-    @classmethod
-    def get_instance(cls) -> "AOTTCompileState":
-        """Get the singleton instance, creating it if necessary."""
-        if cls._instance is None:
-            cls._instance = cls()
-        return cls._instance
-
-    @classmethod
-    def set_instance(cls, instance: "AOTTCompileState") -> None:
-        """
-        Set the singleton instance. Used for torch.package integration.
-
-        When code is loaded via torch.package, it creates a separate module
-        namespace with its own class objects. This method allows injecting
-        a shared instance from the main module into the packaged module.
-        """
-        cls._instance = instance
-
-    def reset(self) -> None:
-        """Reset all state to initial values."""
-        self.kernel_specs = {}
-        self.specs_hashset = {}
-        self.disable()
-        self.compile_base_dir = os.getenv("TRITON_AOT_PATH_PREFIX", "/var/tmp")
-        self.compile_path = tempfile.mkdtemp(
-            dir=self.compile_base_dir, prefix="triton_aot_compile_"
-        )
-
-    def add_kernel_spec(
-        self,
-        fn: KernelInterface[List[Any]],
-        spec: Dict[str, List[Any]],
-        hashed_spec: str,
-    ) -> None:
-        """Add a kernel spec if not already present (based on hash).
-        If the same Triton kernel is used at multiple locations in a model:
-            - All calls share one spec list under the same kernel function key
-            - Specs with identical signatures (same dtypes, shapes) are deduplicated via hash
-            - Specs with different signatures (e.g., fp32 vs bf16) are recorded separately
-
-        Example:
-            # Two call sites using the same kernel:
-            my_kernel[grid](tensor_fp32, ...)  # Records spec with "*fp32"
-            my_kernel[grid](tensor_bf16, ...)  # Records spec with "*bf16"
-            my_kernel[grid](tensor_fp32, ...)  # Deduplicated, same hash as first call
-
-            # Result: kernel_specs[my_kernel] = [fp32_spec, bf16_spec]
-        """
-        if fn not in self.kernel_specs:
-            self.kernel_specs[fn] = []
-            self.specs_hashset[fn] = set()
-        if hashed_spec not in self.specs_hashset[fn]:
-            self.kernel_specs[fn].append(spec)
-            self.specs_hashset[fn].add(hashed_spec)
-
-    def _collect_spec(
-        self,
-        fn: KernelInterface[List[Any]],
-        annotations: Dict[str, Annotation],
-        *args: Any,
-        **kwargs: Any,
-    ) -> None:
-        """Spec collection callback registered on TritonAOT during compile.
-
-        Always collects the annotated spec (which equals the inferred spec
-        when no annotations are present).  Also collects the inferred spec
-        when it differs and either:
-        - annotations conflict with sample (fallback for safety), or
-        - inferred has perf hints the annotation lacks (perf variant).
-        """
-        spec = infer_spec(fn, annotations, *args, **kwargs)
-        annotated_hash = hash_spec(spec)
-        self.add_kernel_spec(fn, spec, annotated_hash)
-
-        if annotations:
-            inferred = infer_spec(fn, {}, *args, **kwargs)
-            inferred_hash = hash_spec(inferred)
-            if inferred_hash == annotated_hash:
-                return
-            if _annotation_conflicts_with_sample(
-                fn, annotations, *args, **kwargs
-            ) or _inferred_has_perf_advantage(spec, inferred):
-                self.add_kernel_spec(fn, inferred, inferred_hash)
-
-    def enable(self) -> None:
-        """Enable AOT compile and register the spec collection hook."""
-        self.enable_aott_compile = True
-        TritonAOT.set_spec_collector(self._collect_spec)
-
-    def disable(self) -> None:
-        """Disable AOT compile and unregister the spec collection hook."""
-        self.enable_aott_compile = False
-        TritonAOT.set_spec_collector(None)
-
-
-def get_aott_compile_state() -> AOTTCompileState:
-    """Get the current AOTTCompileState singleton.
-
-    Uses get_instance() so injected instances (via set_instance() for
-    torch.package integration) are respected.
-    """
-    return AOTTCompileState.get_instance()
-
-
-########
-# Module-level global accessors that delegate to singleton
-########
-
-
-def get_triton_aot_kernel_specs() -> Dict[
-    KernelInterface[List[Any]], List[Dict[str, List[Any]]]
-]:
-    return get_aott_compile_state().kernel_specs
-
-
-def get_triton_aot_specs_hashset() -> Dict[KernelInterface[List[Any]], Set[str]]:
-    return get_aott_compile_state().specs_hashset
-
-
-def get_aott_compile_path() -> str:
-    return get_aott_compile_state().compile_path
-
-
-def add_kernel_spec(
-    fn: KernelInterface[List[Any]], spec: Dict[str, List[Any]], hashed_spec: str
-) -> None:
-    get_aott_compile_state().add_kernel_spec(fn, spec, hashed_spec)
-
-
-def _unwrap_triton_fn(
-    fn: KernelInterface[List[Any]],
-) -> Callable[..., Any]:
-    while isinstance(fn, KernelInterface):
-        # pyre-ignore[16]: KernelInterface has `fn` attribute at runtime
-        fn = fn.fn
-    return fn
-
-
-def _inferred_has_perf_advantage(
-    annotated_spec: Dict[str, List[Any]],
-    inferred_spec: Dict[str, List[Any]],
-) -> bool:
-    """True if inferred spec has alignment/divisibility hints the annotated lacks.
-
-    A tuple element ``(type, N)`` carries alignment or divisibility info
-    that a bare string does not.  When inference adds such hints (e.g.,
-    tensor alignment from ``data_ptr() % 16 == 0``), the inferred spec
-    produces a more optimized cubin worth keeping as a perf variant.
-    """
-    for ann_elem, inf_elem in zip(
-        annotated_spec["signature"], inferred_spec["signature"]
-    ):
-        if isinstance(inf_elem, tuple) and not isinstance(ann_elem, tuple):
-            return True
-    return False
-
-
-# Triton-internal kwargs injected by KernelInterface.__getitem__
-# (triton/runtime/jit.py).  These are not kernel parameters and must
-# be stripped before getcallargs.
-_TRITON_INTERNAL_KWARGS: frozenset[str] = frozenset({"warmup", "grid"})
-
-
-def _resolve_call_args(
-    fn: KernelInterface[List[Any]],
-    *args: Any,
-    **kwargs: Any,
-) -> tuple[Callable[..., Any], dict[str, Any]]:
-    """Unwrap kernel and resolve call args with autotune placeholder fill."""
-    triton_fn = _unwrap_triton_fn(fn)
-    # Filter Triton-internal kwargs injected by KernelInterface.__getitem__
-    # (triton/runtime/jit.py) — not part of the kernel signature.
-    clean_kwargs = {k: v for k, v in kwargs.items() if k not in _TRITON_INTERNAL_KWARGS}
-    if is_autotuner(fn):
-        # pyre-ignore[16]: Attributes checked by is_autotuner
-        for arg_name in fn.configs[0].kwargs.keys():
-            if arg_name not in clean_kwargs:
-                clean_kwargs[arg_name] = -1
-    return triton_fn, getcallargs(triton_fn, *args, **clean_kwargs)
-
-
-_I32_MIN: int = -(2**31)
-_I32_MAX: int = 2**31 - 1
-
-
-def _sample_satisfies_int_type(sample: int, ann_type: str) -> bool:
-    """True if sample int fits the annotated type range."""
-    if ann_type == "i32":
-        return _I32_MIN <= sample <= _I32_MAX
-    return True
-
-
-def _sample_satisfies_annotation(sample: Any, ann: Annotation) -> bool:
-    """True if a single sample value satisfies its annotation constraint."""
-    if isinstance(ann, AnnotationHint):
-        if isinstance(sample, torch.Tensor):
-            return sample.data_ptr() % ann.hint == 0
-        if isinstance(sample, int):
-            if ann.hint == 1:
-                return sample == 1
-            if not _sample_satisfies_int_type(sample, ann.dtype):
-                return False
-            if ann.hint > 1:
-                return sample % ann.hint == 0
-        return True
-    if isinstance(ann, str) and not ann.startswith("*") and isinstance(sample, int):
-        return _sample_satisfies_int_type(sample, ann)
-    return True
-
-
-def _annotation_conflicts_with_sample(
-    fn: KernelInterface[List[Any]],
-    annotations: Dict[str, Annotation],
-    *args: Any,
-    **kwargs: Any,
-) -> bool:
-    """True if any annotated param's sample value doesn't satisfy the annotation.
-
-    Used by ``_collect_spec`` to decide whether to generate an inferred
-    fallback spec.  When the sample satisfies all annotations, only the
-    annotated spec is needed (the user's constraints hold for this input).
-    """
-    _, sample_args = _resolve_call_args(fn, *args, **kwargs)
-
-    for param_name, ann in annotations.items():
-        sample = sample_args.get(param_name)
-        if sample is None:
-            continue
-        if not _sample_satisfies_annotation(sample, ann):
-            return True
-
-    return False
-
-
-def _infer_spec_entry(
-    arg_name: str,
-    arg: Any,
-    arg_annotation: Any,
-    annotations: Dict[str, Annotation],
-) -> Any:
-    if arg_annotation != Parameter.empty:
-        if arg_annotation == tl.constexpr:
-            return arg
-        raise RuntimeError(
-            f"TritonAOT: unsupported scalar annotation {arg_annotation}."
-        )
-
-    if arg_name in annotations:
-        ann = annotations[arg_name]
-        # Convert to tuple for raw spec format (shared/spec_conversion
-        # processes plain tuples).
-        return ann.to_tuple() if isinstance(ann, AnnotationHint) else ann
-
-    if arg is None:
-        return None
-
-    if isinstance(arg, torch.Tensor):
-        # Reject dtypes SCALAR_TYPES can't render (e.g. *u1, *u16, *fp8e5)
-        # so codegen doesn't KeyError downstream.
-        type_str = mangle_type(arg)
-        if type_str not in SCALAR_TYPES:
-            raise RuntimeError(
-                f"TritonAOT: unsupported tensor type for {arg_name}: "
-                f"{arg.dtype} (Triton mangled to {type_str!r}). "
-                f"Supported tensor dtypes: {sorted(SCALAR_TYPES.keys())}."
-            )
-        return (type_str, 16) if arg.data_ptr() % 16 == 0 else type_str
-
-    if isinstance(arg, bool):
-        # bool is subclass of int; must check before int.
-        # Non-constexpr bools have no CTYPES entry for codegen.
-        raise RuntimeError(
-            f"TritonAOT: parameter {arg_name} is a bool without "
-            f"tl.constexpr annotation.  Add `{arg_name}: tl.constexpr` "
-            f"to the kernel signature."
-        )
-
-    if isinstance(arg, int):
-        # Always i64 for safety; users annotate "i32" for narrower variant via
-        # annotation-as-variant.
-        if not -(2**63) <= arg <= 2**63 - 1:
-            raise RuntimeError(
-                f"TritonAOT: unsupported int value for {arg_name}: "
-                f"value exceeds i64 range. Use a smaller value or tl.constexpr."
-            )
-        return "i64"
-
-    if isinstance(arg, float):
-        return "fp32"
-
-    raise RuntimeError(f"TritonAOT: parameter {arg_name} needs annotation.")
-
-
-def infer_spec(
-    fn: KernelInterface[List[Any]],
-    annotations: Dict[str, Annotation],
-    *args: Any,
-    **kwargs: Any,
-) -> Dict[str, List[Any]]:
-    """Infer kernel spec from sample args.
-
-    Tensor dtype: ``mangle_type``, alignment: ``data_ptr() % 16``.
-    Scalar int: always ``"i64"`` (safe default; user can annotate ``"i32"``
-    to get a narrower variant via annotation-as-variant).
-    Float: ``mangle_type`` → fp32.
-    """
-    triton_fn, call_args = _resolve_call_args(fn, *args, **kwargs)
-    fn_sig = signature(triton_fn)
-    arg_annotations = {
-        name: param.annotation for name, param in fn_sig.parameters.items()
-    }
-    spec = []
-
-    for arg_name in fn_sig.parameters.keys():
-        arg = call_args[arg_name]
-        spec.append(
-            _infer_spec_entry(arg_name, arg, arg_annotations[arg_name], annotations)
-        )
-    return {"signature": spec}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py
deleted file mode 100644
index cdf4d1821..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/pipeline.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-
-# pyre-strict
-
-"""AOT-T compilation pipeline.
-
-Orchestrates: spec processing → Triton native compile → C++ / Python codegen.
-"""
-
-from __future__ import annotations
-
-import logging
-import multiprocessing as mp
-import os
-import signal
-import threading
-from concurrent.futures import ThreadPoolExecutor
-from types import FrameType, ModuleType
-from typing import Any, Callable
-
-# @manual=//triton:triton
-import triton
-import triton.compiler
-from generative_recommenders.ops.triton_aot.compile.arg_descriptor import (
-    ArgDescriptor,
-    build_arg_descriptors,
-)
-from generative_recommenders.ops.triton_aot.compile.codegen import (
-    gen_cubin,
-    gen_kernel_name,
-    gen_launcher,
-    gen_loader,
-    gen_tuner_meta_py,
-    generate_header_content,
-    generate_kernel_cpp_content,
-    generate_torch_op_content,
-)
-from generative_recommenders.ops.triton_aot.compile.spec_processing import (
-    gen_compile_arg,
-    KernelSpec,
-    OpsUnit,
-    RawKernelSpec,
-)
-from generative_recommenders.ops.triton_aot.compile.utils import (
-    is_autotuner,
-    unwrap_heuristic,
-)
-from generative_recommenders.ops.triton_aot.shared.types import AUTOTUNE_ATTRs
-from triton.backends.compiler import GPUTarget
-from triton.runtime.jit import JITFunction, KernelInterface
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-def compile_specs_parallel(
-    specs: list[KernelSpec],
-    install_dir: str,
-    module: str,
-    name: str,
-    gpu_target: GPUTarget,
-    import_module: Callable[[str], ModuleType],
-    descriptors: list[ArgDescriptor],
-) -> list[str]:
-    """Compile kernel specs in parallel using multiprocessing.
-
-    When TRITON_AOT_DEBUG=1 is set, compiles sequentially for easier debugging.
-
-    Args:
-        specs: List of kernel specifications to compile
-        install_dir: Directory to install generated files
-        module: The module name of the function
-        name: The function name
-        gpu_target: GPU target for compilation
-        import_module: Function to import modules (e.g., importlib.import_module or PackageImporter.import_module)
-
-    Returns:
-        List of generated code strings for each spec (cubin, loader, launcher)
-    """
-
-    debug = os.environ.get("TRITON_AOT_DEBUG", "0") == "1"
-    if debug:
-        outputs = [
-            spec_gen(
-                install_dir,
-                spec,
-                module,
-                name,
-                gpu_target,
-                import_module,
-                descriptors,
-            )
-            for spec in specs
-        ]
-    else:
-        max_workers = mp.cpu_count() // 2 + 1
-        with ThreadPoolExecutor(max_workers=min(len(specs), max_workers)) as executor:
-            outputs = list(
-                executor.map(
-                    lambda spec: spec_gen(
-                        install_dir,
-                        spec,
-                        module,
-                        name,
-                        gpu_target,
-                        import_module,
-                        descriptors,
-                    ),
-                    specs,
-                )
-            )
-    return outputs
-
-
-# For each spec, generate a kernel:
-# - cubin
-# - loader
-# - launcher
-def spec_gen(
-    install_dir: str,
-    spec: KernelSpec,
-    module: str,
-    name: str,
-    gpu_target: GPUTarget,
-    import_module: Callable[[str], ModuleType],
-    descriptors: list[ArgDescriptor],
-) -> str:
-    # To run this function with multiprocessing, we need to import the function by name,
-    # since JITFunction cannot be pickled.
-    # we have the case where the func name is injected with a suffix, like "_cuda" or "_amd",
-    # we should use the original name to import the func in such case
-    original_name = name
-    splits = name.split("_")
-    end_idx = len(splits)
-
-    while end_idx > 0:
-        original_name = "_".join(splits[:end_idx])
-        if hasattr(import_module(module), original_name):
-            break
-        end_idx -= 1
-    func = unwrap_heuristic(getattr(import_module(module), original_name), JITFunction)
-    func.__name__ = name
-
-    # Generate cubin.
-    kernel_name = gen_kernel_name(func, spec, gpu_target.arch)
-
-    compile_arg = gen_compile_arg(spec, func)
-    options = {name: getattr(spec, name) for name in AUTOTUNE_ATTRs.keys()}
-    compile_kwargs = {
-        "target": gpu_target,
-        "options": options,
-    }
-    kernel = triton.compiler.compile(*compile_arg, **compile_kwargs)
-    if getattr(kernel.metadata, "global_scratch_size", 0) > 0:
-        raise RuntimeError(f"{kernel_name=} with global scratch is not supported.")
-
-    metadata_name = kernel.metadata.name
-    metadata_shared = kernel.metadata.shared
-
-    cubin = gen_cubin(kernel_name, kernel, install_dir, gpu_target.backend)
-    out = [
-        cubin,
-        # Generate loader.
-        gen_loader(kernel_name, metadata_name, metadata_shared),
-        # Generate launcher.
-        gen_launcher(
-            kernel_name,
-            func,
-            kernel,
-            metadata_shared,
-            gpu_target.warp_size,
-            spec,
-            descriptors,
-        ),
-    ]
-    return "".join(out)
-
-
-def sigchld_handler(signum: int, frame: FrameType | None) -> None:
-    sketchy_signals = map(int, [signal.SIGSEGV, signal.SIGABRT, signal.SIGBUS])
-    try:
-        # Consume all pending SIGCHLDs, looking for unexpected failures
-        while True:
-            pid, status = os.waitpid(-1, os.WNOHANG)
-            if pid == 0:
-                break
-            if os.WIFSIGNALED(status) and os.WTERMSIG(status) in sketchy_signals:
-                logger.error(
-                    f"Child process {pid} exited catastrophically with signal {os.WTERMSIG(status)}, terminating!"
-                )
-
-                # Avoid triggering atexit etc which can get stuck and behave improperly
-                # because multiprocessing sets up an atexit handler to join workers
-                # (sigh).  We want to exit, now, so use os._exit instead of sys.exit.
-                os._exit(1)
-    except ChildProcessError:
-        pass
-
-
-def compile_to_cpp(
-    func: KernelInterface[list[Any]] | triton.runtime.autotuner.Autotuner,
-    base_specs: list[RawKernelSpec],
-    install_dir: str,
-    prefix: str,
-    *,
-    gpu_target: GPUTarget,
-    import_module: Callable[[str], ModuleType],
-    default_values: dict[str, Any] | None = None,
-    tuner_fallback: bool = False,
-) -> None:
-    """Compile a Triton kernel into .cpp, .h, _torch_op.cpp, _meta.py files.
-
-    Args:
-        func: Triton JITFunction or Autotuner to compile.
-        base_specs: List of kernel specialization specs.
-        install_dir: Directory to output generated files.
-        prefix: Kernel name prefix, e.g., "_addmm_fwd".
-        gpu_target: GPU target for compilation.
-        import_module: torch.package importer for loading kernels source code.
-        default_values: Default values for kernel arguments.
-        tuner_fallback: If True, generate fallback tuner code.
-    """
-    tuned_func = func if is_autotuner(func) else None
-    # pyre-ignore[6]: Attributes verified by is_autotuner
-    unit = OpsUnit.from_raw_specs(base_specs, gpu_target, tuned_func)
-    default_values = {} if default_values is None else default_values
-
-    func_unwrapped = unwrap_heuristic(func, JITFunction)
-    descriptors = build_arg_descriptors(func_unwrapped, unit)
-
-    # Python's multiprocessing.Pool class is not great at handling unexpected child
-    # failures such as segfaults.  Account for this by temporarily installing a signal
-    # handler that considers such signals a catastrophic compilation failure.  If not
-    # for this, the Pool will deadlock.
-    if threading.current_thread() is threading.main_thread():
-        previous_child_handler = signal.signal(signal.SIGCHLD, sigchld_handler)
-    else:
-        previous_child_handler = None
-
-    func = func_unwrapped
-
-    # sanity check to make sure args with default values are always at the end
-    has_default_value_arg = False
-    for name in func.arg_names:
-        if name in default_values:
-            has_default_value_arg = True
-        elif has_default_value_arg:
-            raise RuntimeError(
-                f"default values must be at the end of the argument list. {func.arg_names=} {default_values=}"
-            )
-
-    h_out = f"{install_dir}/{prefix}.h"
-    cu_out = f"{install_dir}/{prefix}.cpp"
-    torch_out = f"{install_dir}/{prefix}_torch_op.cpp"
-    py_out = f"{install_dir}/{prefix}_meta.py"
-
-    # Generate kernel.h file
-    h_content = generate_header_content(
-        tuned_func,  # pyre-ignore[6]: Autotuner when set (verified by is_autotuner)
-        func,
-        unit,
-        descriptors,
-        tuner_fallback,
-    )
-
-    with open(h_out, "w") as fp:
-        fp.write(h_content)
-
-    generated_specs = compile_specs_parallel(
-        unit.specs,
-        install_dir,
-        func.__module__,
-        func.__name__,
-        gpu_target,
-        import_module,
-        descriptors,
-    )
-    # Generate kernel.cpp file
-    cu_content = generate_kernel_cpp_content(
-        func, unit, descriptors, prefix, generated_specs, gpu_target.backend
-    )
-    with open(cu_out, "w") as fp:
-        fp.write(cu_content)
-
-    # Generate torch_op.cpp file
-    torch_op_content = generate_torch_op_content(
-        func, descriptors, prefix, default_values
-    )
-
-    with open(torch_out, "w") as fp:
-        fp.write(torch_op_content)
-
-    if tuned_func:
-        with open(py_out, "w") as fp:
-            fp.write(gen_tuner_meta_py(tuned_func, tuner_fallback, unit))
-    else:
-        with open(py_out, "w") as fp:
-            fp.write(gen_tuner_meta_py(func, tuner_fallback, unit))
-
-    if previous_child_handler is not None:
-        signal.signal(signal.SIGCHLD, previous_child_handler)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py
deleted file mode 100644
index e8c181121..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/spec_processing.py
+++ /dev/null
@@ -1,593 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-
-# pyre-strict
-
-"""Kernel spec processing for AOT-T compilation.
-
-Transforms raw kernel specs (from infer_spec) into compiled specs ready
-for Triton native compile and C++ codegen.
-"""
-
-from __future__ import annotations
-
-import copy
-import dataclasses
-import logging
-from dataclasses import dataclass
-from typing import Any, cast
-
-# @manual=//triton:triton
-import triton
-from generative_recommenders.ops.triton_aot.compile.compile_state import hash_spec
-from generative_recommenders.ops.triton_aot.shared.spec_conversion import (
-    collect_constraints,
-    extract_constants,
-    get_fp8_replacement_signature_for_amd,
-    get_fp8_replacement_signature_for_sm80,
-    signature_list_to_dict,
-    SignatureElement,
-)
-from generative_recommenders.ops.triton_aot.shared.types import AUTOTUNE_ATTRs
-from triton.backends.compiler import BaseBackend, GPUTarget
-from triton.compiler.compiler import ASTSource
-from triton.runtime.jit import JITFunction
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-TRITON_VERSION: str = triton.__version__
-
-# A raw kernel spec produced by infer_spec.  The only key is "signature".
-RawKernelSpec = dict[str, list[SignatureElement]]
-
-
-@dataclass
-class KernelSpec:
-    """A single compilation variant for a kernel.
-
-    Each variant represents one combination of dtypes, constant values,
-    alignment constraints, and autotune configuration.  Multiple variants
-    are grouped together in an ``OpsUnit``.
-
-    Attributes:
-        signature: Non-constant arg index → dtype string (e.g., ``{0: "*fp32", 4: "i32"}``).
-        constants: Arg index → compile-time constant value.  Includes bare
-            literals (128, ``"leaky_relu"``), absent optional tensors (None),
-            and equal-to-1 specializations (stride=1 → constexpr folding).
-        divisible_by_16: Indices of args whose values are divisible by 16.
-            For pointers this means the address is 16-byte aligned;
-            for scalars it means the value itself is a multiple of 16.
-        divisible_by_8: Indices of args whose values are divisible by 8.
-            Only meaningful for scalars (pointer alignment is always ≥16).
-        num_warps: Number of warps per block.
-        num_stages: Number of pipeline stages.
-        matrix_instr_nonkdim: AMD matrix instruction non-K dimension.
-        waves_per_eu: AMD waves per execution unit.
-        kpack: AMD kpack factor.
-    """
-
-    signature: dict[int, str]
-    constants: dict[int, Any]
-    divisible_by_16: set[int]
-    divisible_by_8: set[int]
-    num_warps: int = 4
-    num_stages: int = 3
-    matrix_instr_nonkdim: int = 0
-    waves_per_eu: int = 1
-    kpack: int = 1
-
-
-@dataclass
-class OpsUnit:
-    """All compilation variants for a single kernel op.
-
-    Groups per-kernel invariants with the list of ``KernelSpec`` variants.
-    Use ``OpsUnit.from_raw_specs()`` to build — it performs the complete
-    spec processing pipeline (convert → detect optional → validate →
-    autotune → dedup → compute invariants).
-
-    Attributes:
-        cc: Compute capability (int for NVIDIA, str for AMD).
-        optional: Indices of optional tensor args (unified across all call sites).
-        pointer_args: Indices of all tensor pointer args (required + optional).
-            Invariant across specs — a pointer arg never becomes a non-pointer.
-        scalar_dtypes: Non-pointer signature arg index → widest dtype string
-            across all specs (e.g., ``"i32"``, ``"i64"``, ``"fp32"``).
-            Computed by ``_wider_type`` — individual specs may use narrower types.
-        constant_types: Python type per constant arg position (e.g., ``{15: int, 19: bool}``).
-            Excludes optional tensor args (None constants).
-            Invariant across specs — same Python type for each position.
-        specs: Per-variant compilation specs.
-    """
-
-    cc: int | str
-    optional: set[int]
-    pointer_args: set[int]
-    scalar_dtypes: dict[int, str]
-    constant_types: dict[int, type[Any]]
-    specs: list[KernelSpec]
-
-    @classmethod
-    def from_raw_specs(
-        cls,
-        base_specs: list[RawKernelSpec],
-        gpu_target: GPUTarget,
-        tuned_func: triton.runtime.autotuner.Autotuner | None = None,
-    ) -> OpsUnit:
-        """Build an OpsUnit from raw kernel specs.
-
-        Performs the complete spec processing pipeline:
-        1. Convert raw specs to KernelSpecs
-        2. Detect optional tensor args (cross-spec + 3-tuple)
-        3. Validate consistency across converted specs
-        4. Apply autotuning (if tuned_func provided)
-        5. Deduplicate specs
-        6. Compute shared invariants (pointer_args, scalar_dtypes, constant_types)
-        """
-        # Validate raw specs upfront, before any rewriting.
-        num_params = _check_uniform_signature_length(base_specs)
-        specs, three_tuple_optional = _convert_raw_specs(base_specs, gpu_target)
-        optional = _detect_optional_args(specs) | three_tuple_optional
-
-        _validate_converted_specs(specs, optional, num_params)
-
-        # Plain @triton.jit kernels (no @triton.autotune) skip config expansion.
-        if tuned_func is not None:
-            specs = _autotune_specs(tuned_func, gpu_target, specs)
-
-        specs = _dedup_specs(specs)
-
-        pointer_args, scalar_dtypes, constant_types = _compute_invariants(
-            specs, optional
-        )
-
-        return cls(
-            cc=gpu_target.arch,
-            optional=optional,
-            pointer_args=pointer_args,
-            scalar_dtypes=scalar_dtypes,
-            constant_types=constant_types,
-            specs=specs,
-        )
-
-
-# ---------------------------------------------------------------------------
-# Public helpers (used outside spec processing)
-# ---------------------------------------------------------------------------
-
-
-def gen_compile_arg(
-    spec: KernelSpec,
-    func: JITFunction[list[Any]],
-) -> tuple[ASTSource]:
-    # ASTSource expects tuple-keyed dicts: {(idx,): value} for constants,
-    # {(idx,): [[attr_name, attr_val], ...]} for attrs.  Tuple keys support
-    # nested paths into structured types (asserted by ASTSource.__init__).
-    new_signature = {}
-    new_constants = {}
-    param_names = list(func.signature.parameters.keys())
-    for idx, param in enumerate(param_names):
-        if idx in spec.signature:
-            new_signature[param] = spec.signature[idx]
-        if idx in spec.constants:
-            new_constants[(idx,)] = spec.constants[idx]
-            new_signature[param] = "constexpr"
-
-    # parse_attr("D") returns a fresh [["tt.divisibility", 16]] each call.
-    new_attrs = {(idx,): BaseBackend.parse_attr("D") for idx in spec.divisible_by_16}
-
-    return (
-        ASTSource(
-            func,
-            new_signature,
-            constexprs=new_constants,
-            attrs=new_attrs,
-        ),
-    )
-
-
-# ---------------------------------------------------------------------------
-# Int width helpers
-# ---------------------------------------------------------------------------
-
-_INT_WIDTH_RANK: dict[str, int] = {"i32": 0, "i64": 1}
-
-
-def _wider_type(t1: str, t2: str) -> str:
-    """Return the wider of two scalar dtypes.
-
-    Only i32/i64 widening is supported.  All other types must match exactly.
-    """
-    if t1 == t2:
-        return t1
-    r1 = _INT_WIDTH_RANK.get(t1)
-    r2 = _INT_WIDTH_RANK.get(t2)
-    if r1 is not None and r2 is not None:
-        return t1 if r1 >= r2 else t2
-    raise ValueError(f"Cannot widen incompatible types: {t1!r} vs {t2!r}")
-
-
-# ---------------------------------------------------------------------------
-# Private helpers — called by OpsUnit.from_raw_specs
-# ---------------------------------------------------------------------------
-
-
-def _detect_optional_args(specs: list[KernelSpec]) -> set[int]:
-    """Detect optional tensor args by cross-spec comparison.
-
-    An arg at index ``i`` is optional if:
-    - Some specs have ``i`` in ``signature`` as a pointer type (``*...``)
-    - Other specs have ``constants[i] = None``
-
-    Single-spec None args (always-absent tensors) are NOT detected here
-    but are handled by ``_compute_invariants`` which adds any
-    ``constants[i] = None`` to ``pointer_args``.
-    """
-    if len(specs) <= 1:
-        return set()
-    optional: set[int] = set()
-    all_indices: set[int] = set()
-    for spec in specs:
-        all_indices |= spec.signature.keys()
-        all_indices |= spec.constants.keys()
-    for i in all_indices:
-        has_pointer = any(
-            i in s.signature and s.signature[i].startswith("*") for s in specs
-        )
-        has_none_const = any(i in s.constants and s.constants[i] is None for s in specs)
-        if has_pointer and has_none_const:
-            optional.add(i)
-    return optional
-
-
-def _check_uniform_signature_length(base_specs: list[RawKernelSpec]) -> int:
-    """All raw specs must declare the same param count; return that count.
-
-    Each raw spec is one ``infer_spec`` call site for the same kernel,
-    so all should have ``len(fn.signature.parameters)`` entries.  Differing
-    lengths means upstream bug (mixed kernels, truncated spec, etc.) and
-    would surface later as silent IndexError or wrong bound checks.
-    """
-    if not base_specs:
-        return 0
-    sig_lens = {len(spec["signature"]) for spec in base_specs}
-    if len(sig_lens) != 1:
-        raise ValueError(
-            f"Raw specs declare inconsistent signature lengths: "
-            f"{sorted(sig_lens)}.  All specs for the same kernel must have "
-            f"one entry per declared param."
-        )
-    return sig_lens.pop()
-
-
-def _check_arg_indices_in_range(
-    specs: list[KernelSpec],
-    num_params: int,
-) -> None:
-    """Every spec arg index must be in ``[0, num_params)``.
-
-    Out-of-range indices would silently drop in ``gen_compile_arg``'s
-    ``enumerate(param_names)`` loop.  ``num_params <= 0`` disables the check.
-    """
-    if num_params <= 0:
-        return
-    for idx, spec in enumerate(specs):
-        all_indices = (
-            spec.signature.keys()
-            | spec.constants.keys()
-            | spec.divisible_by_16
-            | spec.divisible_by_8
-        )
-        for i in all_indices:
-            if not 0 <= i < num_params:
-                raise ValueError(
-                    f"Spec {idx}: arg index {i} out of range "
-                    f"[0, {num_params}) — kernel has {num_params} declared params"
-                )
-
-
-def _collect_pointer_args(
-    specs: list[KernelSpec],
-    optional: set[int],
-) -> set[int]:
-    """Collect all tensor pointer indices across all specs.
-
-    Includes optional args (from _detect_optional_args) AND any arg
-    whose constant value is None (single-spec optional tensor case
-    where _detect_optional_args didn't fire).
-    """
-    pointer_args: set[int] = set(optional)
-    for spec in specs:
-        for i, dtype in spec.signature.items():
-            if dtype.startswith("*"):
-                pointer_args.add(i)
-        for i, val in spec.constants.items():
-            if val is None:
-                pointer_args.add(i)
-    return pointer_args
-
-
-def _collect_scalar_dtypes(
-    specs: list[KernelSpec],
-    pointer_args: set[int],
-) -> dict[int, str]:
-    """Collect non-pointer signature arg dtypes, widening compatible int types.
-
-    Invariant across specs (validated by _validate_converted_specs).
-    """
-    scalar_dtypes: dict[int, str] = {}
-    for spec in specs:
-        for i, dtype in spec.signature.items():
-            if i not in pointer_args:
-                if i in scalar_dtypes:
-                    scalar_dtypes[i] = _wider_type(scalar_dtypes[i], dtype)
-                else:
-                    scalar_dtypes[i] = dtype
-    return scalar_dtypes
-
-
-def _collect_constant_types(
-    specs: list[KernelSpec],
-) -> dict[int, type[Any]]:
-    """Collect Python type per constant position.
-
-    Excludes None constants (optional tensor args — already in pointer_args).
-    """
-    constant_types: dict[int, type[Any]] = {}
-    for spec in specs:
-        for i, val in spec.constants.items():
-            if val is not None and i not in constant_types:
-                constant_types[i] = type(val)
-    return constant_types
-
-
-def _compute_invariants(
-    specs: list[KernelSpec],
-    optional: set[int],
-) -> tuple[set[int], dict[int, str], dict[int, type[Any]]]:
-    """Compute shared invariants from processed specs.
-
-    Returns (pointer_args, scalar_dtypes, constant_types).
-
-    When annotation-as-variant produces mixed partitions (arg in
-    ``signature`` in some specs, ``constants`` in others), the arg
-    appears in both ``scalar_dtypes`` and ``constant_types``.  The
-    selector must receive it as a runtime parameter for dispatch,
-    so ``scalar_dtypes`` wins and the arg is removed from
-    ``constant_types``.
-    """
-    pointer_args = _collect_pointer_args(specs, optional)
-    scalar_dtypes = _collect_scalar_dtypes(specs, pointer_args)
-    constant_types = _collect_constant_types(specs)
-
-    # Resolve overlap: if any spec has the arg in signature (scalar),
-    # the selector needs it as a runtime parameter → not a constant.
-    for i in scalar_dtypes:
-        constant_types.pop(i, None)
-
-    return pointer_args, scalar_dtypes, constant_types
-
-
-def _validate_converted_specs(
-    specs: list[KernelSpec],
-    optional: set[int],
-    num_params: int = 0,
-) -> None:
-    """Validate that converted specs are consistent before further processing.
-
-    Checks that all specs produce identical C++ function signatures:
-    - All arg indices are in ``[0, num_params)`` (when ``num_params > 0``)
-    - Optional args: each spec has either a pointer in signature or None in constants
-    - Non-optional scalar args: same dtype (or compatible int widths)
-    - Non-optional constant args: same Python type
-
-    Called after _convert_raw_specs + _detect_optional_args, before autotuning.
-    """
-    _check_arg_indices_in_range(specs, num_params)
-    if len(specs) <= 1:
-        return
-    ref = specs[0]
-    for idx, spec in enumerate(specs[1:], 1):
-        _check_optional_consistency(ref, spec, idx, optional)
-        _check_signature_consistency(ref, spec, idx, optional)
-        _check_constants_consistency(ref, spec, idx, optional)
-
-
-def _check_optional_consistency(
-    ref: KernelSpec,
-    spec: KernelSpec,
-    idx: int,
-    optional: set[int],
-) -> None:
-    """Optional positions must be pointer-in-signature or None-in-constants.
-
-    Validates that optional tensor args are not misclassified as scalars
-    or non-None constants, which would produce incompatible C++ types.
-    """
-    for i in optional:
-        for label, s in [("spec 0", ref), (f"spec {idx}", spec)]:
-            if i in s.signature:
-                if not s.signature[i].startswith("*"):
-                    raise ValueError(
-                        f"Arg {i}: optional position has non-pointer type "
-                        f"'{s.signature[i]}' in {label}"
-                    )
-            elif i in s.constants:
-                if s.constants[i] is not None:
-                    raise ValueError(
-                        f"Arg {i}: optional position has non-None constant "
-                        f"{s.constants[i]!r} in {label}"
-                    )
-
-
-def _check_signature_consistency(
-    ref: KernelSpec,
-    spec: KernelSpec,
-    idx: int,
-    optional: set[int],
-) -> None:
-    """Non-optional, non-pointer scalar args must have compatible dtypes.
-
-    Pointer args are skipped (different tensor dtypes are dispatched by
-    the dtype guard in ``gen_guarded_calls``).  Compatible int widths
-    (i32/i64) are allowed — handled by ``_wider_type`` and int range guards.
-    Optional positions are validated by ``_check_optional_consistency``.
-
-    Partition differences are allowed: an arg may be in ``signature`` in
-    one spec and in ``constants`` in another (e.g., annotation-as-variant
-    where stride=1 is constexpr in one spec but a runtime parameter in
-    another).  The per-spec codegen handles this correctly.
-    """
-    for i in ref.signature.keys() | spec.signature.keys():
-        if i in optional:
-            continue
-        if (i in ref.signature and ref.signature[i].startswith("*")) or (
-            i in spec.signature and spec.signature[i].startswith("*")
-        ):
-            continue
-        # Allow partition differences: arg in signature in one spec,
-        # in constants in another (annotation-as-variant pattern).
-        if i not in ref.signature or i not in spec.signature:
-            continue
-        if ref.signature[i] != spec.signature[i]:
-            r1 = _INT_WIDTH_RANK.get(ref.signature[i])
-            r2 = _INT_WIDTH_RANK.get(spec.signature[i])
-            if r1 is not None and r2 is not None:
-                continue
-            raise ValueError(
-                f"Arg {i}: dtype mismatch '{ref.signature[i]}' vs "
-                f"'{spec.signature[i]}' (spec 0 vs spec {idx})"
-            )
-
-
-def _check_constants_consistency(
-    ref: KernelSpec,
-    spec: KernelSpec,
-    idx: int,
-    optional: set[int],
-) -> None:
-    """Non-optional constant args must have the same Python type across specs.
-
-    C++ codegen uses one type per constant arg position (``PY_TYPES_TO_CPP_TYPES``),
-    so ``BLOCK_M=64`` (int) and ``BLOCK_M=64.0`` (float) would produce
-    incompatible launchers.  Optional positions are validated separately
-    by ``_check_optional_consistency``.
-    """
-    for i in ref.constants.keys() | spec.constants.keys():
-        if i in optional:
-            continue
-        if ref.constants.get(i) is None or spec.constants.get(i) is None:
-            continue
-        if type(ref.constants[i]) is not type(spec.constants[i]):
-            raise ValueError(
-                f"Arg {i}: constant type mismatch "
-                f"{type(ref.constants[i]).__name__} vs "
-                f"{type(spec.constants[i]).__name__} (spec 0 vs spec {idx})"
-            )
-
-
-def _convert_raw_specs(
-    base_specs: list[RawKernelSpec],
-    gpu_target: GPUTarget,
-) -> tuple[list[KernelSpec], set[int]]:
-    """Convert raw specs to KernelSpecs.
-
-    Returns (specs, three_tuple_optional) where three_tuple_optional is the
-    union of optional_args detected from 3-tuple signature elements across
-    all specs (backward compat with ``collect_constraints``).
-    """
-    raw_specs = cast(list[dict[str, Any]], copy.deepcopy(base_specs))
-    is_amd = gpu_target.backend == "hip"
-
-    result: list[KernelSpec] = []
-    three_tuple_optional: set[int] = set()
-    for raw_spec in raw_specs:
-        constraints = collect_constraints(raw_spec["signature"])
-        constants = extract_constants(raw_spec["signature"], constraints)
-        signature: dict[int, str] = signature_list_to_dict(
-            raw_spec["signature"], constants
-        )
-        three_tuple_optional |= constraints.optional_args
-
-        spec = KernelSpec(
-            signature=signature,
-            constants=constants,
-            divisible_by_16=constraints.divisible_by_16,
-            divisible_by_8=constraints.divisible_by_8,
-        )
-
-        if constraints.has_fp8:
-            if is_amd:
-                spec.signature = get_fp8_replacement_signature_for_amd(
-                    {"signature": spec.signature}, {str(gpu_target.arch)}
-                )
-            elif gpu_target.arch == 80:
-                spec.signature = get_fp8_replacement_signature_for_sm80(
-                    {"signature": spec.signature}
-                )
-
-        result.append(spec)
-
-    return result, three_tuple_optional
-
-
-def _autotune_specs(
-    func: triton.runtime.autotuner.Autotuner,
-    target: GPUTarget,
-    specs: list[KernelSpec],
-) -> list[KernelSpec]:
-    tuned_specs: list[KernelSpec] = []
-    for spec in specs:
-        for cfg in func.cache.values():
-            constants = spec.constants.copy()
-            for arg_name, arg_val in cfg.kwargs.items():
-                if arg_name in AUTOTUNE_ATTRs:
-                    continue
-                arg_idx = func.arg_names.index(arg_name)
-                if constants.get(arg_idx, -1) == -1:
-                    constants[arg_idx] = arg_val
-
-            autotune_values: dict[str, int] = {}
-            for name, default in AUTOTUNE_ATTRs.items():
-                if name in cfg.kwargs:
-                    autotune_values[name] = cfg.kwargs[name]
-                else:
-                    autotune_values[name] = getattr(cfg, name, default)
-                # AMD has changed their software pipeliner in Triton
-                # It now expects num_stages == 2 instead of 0
-                # see: https://github.com/pytorch/pytorch/pull/139881
-                # if we see someone try to set num_stages == 0, set it to the default (2) instead
-                # We can't use the Triton hook to get the default value because it requires the AMD runtime to be loaded
-                if (
-                    target.backend == "hip"
-                    and name == "num_stages"
-                    and autotune_values[name] == 0
-                    and TRITON_VERSION >= "3.2.0"
-                ):
-                    autotune_values[name] = 2
-
-            tuned_spec = dataclasses.replace(
-                spec,
-                constants=constants,
-                # pyrefly: ignore [bad-argument-type]
-                **autotune_values,
-            )
-            tuned_specs.append(tuned_spec)
-    return tuned_specs
-
-
-def _dedup_specs(specs: list[KernelSpec]) -> list[KernelSpec]:
-    deduped_specs: list[KernelSpec] = []
-    duplicated_specs: list[KernelSpec] = []
-    hash_spec_ids: set[str] = set()
-    for spec in specs:
-        id = hash_spec(dataclasses.asdict(spec))
-        if id in hash_spec_ids:
-            duplicated_specs.append(spec)
-        else:
-            hash_spec_ids.add(id)
-            deduped_specs.append(spec)
-
-    logger.debug(
-        f"[TritonAOT Dedup] {len(specs)=} {len(deduped_specs)=} {len(duplicated_specs)=}"
-    )
-    return deduped_specs
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py
deleted file mode 100644
index 33038a534..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/stable_types.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# pyre-strict
-
-"""AOTT-local type mappings for stable ABI codegen.
-
-These replace ``shared.types.ATYPES`` and ``shared.types.PY_TYPES_TO_CPP_TYPES``
-with versions that have zero link dependency on ATen.  The shared dicts are kept
-unchanged so TritonCC is not affected.
-"""
-
-from typing import Any
-
-# Stable ABI scalar type mapping: Triton pointer dtype → c10::ScalarType enum.
-# Uses c10::ScalarType:: (from torch/headeronly/core/ScalarType.h) instead of
-# at::kFloat aliases (which require ATen headers).
-SCALAR_TYPES: dict[str, str] = {
-    "*i1": "c10::ScalarType::Bool",
-    "*u8": "c10::ScalarType::Byte",
-    "*i8": "c10::ScalarType::Char",
-    "*i16": "c10::ScalarType::Short",
-    "*i32": "c10::ScalarType::Int",
-    "*i64": "c10::ScalarType::Long",
-    "*fp16": "c10::ScalarType::Half",
-    "*fp32": "c10::ScalarType::Float",
-    "*fp64": "c10::ScalarType::Double",
-    "*bf16": "c10::ScalarType::BFloat16",
-    "*fp8e4nv": "c10::ScalarType::Float8_e4m3fn",
-    "*fp8e4b8": "c10::ScalarType::Float8_e4m3fnuz",
-}
-
-# Stable ABI override: str → "std::string" instead of "at::string".
-PY_TYPES_TO_CPP_TYPES: dict[type[Any], str] = {
-    int: "int64_t",
-    str: "std::string",
-    float: "double",
-}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py
deleted file mode 100644
index 74179435a..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/triton_aot_compile.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# pyre-strict
-
-import importlib
-import logging
-import os
-import pickle
-from types import ModuleType, TracebackType
-from typing import Any, Callable, Optional, Type
-
-from generative_recommenders.ops.triton_aot.build.extension_builder import (
-    build_triton_aot_extension,
-)
-from generative_recommenders.ops.triton_aot.compile.codegen import (
-    is_non_empty_mapping_of_type,
-)
-from generative_recommenders.ops.triton_aot.compile.compile_state import (
-    get_aott_compile_path,
-    get_aott_compile_state,
-    get_triton_aot_kernel_specs,
-)
-from generative_recommenders.ops.triton_aot.compile.pipeline import compile_to_cpp
-from generative_recommenders.ops.triton_aot.compile.utils import unwrap_heuristic
-from torch import package
-from triton.backends.compiler import GPUTarget
-from triton.runtime import driver, JITFunction
-
-# @manual=//triton:triton
-from triton.runtime.autotuner import Config
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-class TritonAOTCompile:
-    """
-    Context manager to compile Triton kernels to C++ and build a shared library.
-    The compiled kernels are cached in a temporary directory.
-
-    - package_importer:
-        torch.package importer for loading kernels source code (aott/ops).
-        If not provided, the default importlib is used (for local use cases)
-    - gpu_target:
-        GPU target to compile for (default: active GPU target, determined by Triton driver)
-    This local copy intentionally omits Manifold autotune-cache overrides. The
-    HSTU e2e path only needs representative-input autotuning captured during
-    the compile context.
-    """
-
-    def __init__(
-        self,
-        package_importer: Optional[package.PackageImporter] = None,
-        gpu_target: Optional[GPUTarget] = None,
-        auto_tune_cache_override_path: Optional[str] = None,
-    ) -> None:
-        self._import_module: Callable[[str], ModuleType] = (
-            package_importer.import_module
-            if package_importer is not None
-            else importlib.import_module
-        )
-        self.gpu_target: GPUTarget = gpu_target or driver.active.get_current_target()
-        self.auto_tune_cache_override_path: Optional[str] = (
-            auto_tune_cache_override_path
-        )
-
-    def _load_autotune_cache_overrides(
-        self,
-    ) -> dict[str, Any]:
-        if self.auto_tune_cache_override_path is None:
-            return {}
-        raise NotImplementedError(
-            "Local generative_recommenders AOT-T compile does not support "
-            "auto_tune_cache_override_path."
-        )
-
-    def __enter__(self) -> None:
-        state = get_aott_compile_state()
-        state.reset()
-        state.enable()
-        logger.info(
-            f"Start AOTT compile, output dir: {get_aott_compile_path()}, gpu_target: {self.gpu_target}"
-        )
-
-    def _resolve_autotune_cache(
-        self,
-        fn: Any,
-        fn_name: str,
-        fn_dir: str,
-        overrides: dict[str, Any],
-    ) -> None:
-        """Apply override (if matched) and dump the autotune cache to fn_dir."""
-        override = overrides.get(fn_name)
-        if override is not None:
-            logger.info(
-                f"[AOTT]: Overriding autotune cache for {fn_name} "
-                f"from {self.auto_tune_cache_override_path}"
-            )
-            fn.cache = override
-
-        # cache are dumped just for testing
-        if hasattr(fn, "cache") and is_non_empty_mapping_of_type(fn.cache, Config):
-            with open(f"{fn_dir}/{fn_name}_autotune_cache", "wb") as data:
-                # @lint-ignore PYTHONPICKLEISBAD
-                pickle.dump(fn.cache, data)
-
-    def __exit__(
-        self,
-        exc_type: Optional[Type[BaseException]],
-        exc_value: Optional[BaseException],
-        traceback: Optional[TracebackType],
-    ) -> None:
-        compile_path = get_aott_compile_path()
-        if not os.path.exists(compile_path):
-            os.makedirs(compile_path)
-
-        kernel_specs = get_triton_aot_kernel_specs()
-        auto_tune_overrides = self._load_autotune_cache_overrides()
-
-        logger.info(f"[AOTT]: compiling {len(kernel_specs)} kernels")
-
-        for fn, specs in kernel_specs.items():
-            jit_fn = unwrap_heuristic(fn, JITFunction)
-            fn_name = jit_fn.__name__
-
-            logger.info(f"[AOTT]: compiling {fn_name} with specs: {specs}")
-
-            module_suffix = jit_fn.__module__.rsplit(".", 1)[-1]
-            fn_dir = f"{compile_path}/{module_suffix}_{fn_name}"
-            if not os.path.exists(fn_dir):
-                os.makedirs(fn_dir)
-
-            self._resolve_autotune_cache(fn, fn_name, fn_dir, auto_tune_overrides)
-
-            compile_to_cpp(
-                func=fn,
-                base_specs=specs,
-                install_dir=f"{fn_dir}",
-                prefix=f"{fn_name}",
-                gpu_target=self.gpu_target,
-                tuner_fallback=True,
-                import_module=self._import_module,
-            )
-
-            build_triton_aot_extension(
-                source_dir=fn_dir,
-                kernel_name=fn_name,
-                output_dir=fn_dir,
-            )
-
-        get_aott_compile_state().disable()
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py b/recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py
deleted file mode 100644
index e3848fd2e..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/compile/utils.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-
-# pyre-strict
-
-import hashlib
-from typing import Any, Type, TypeVar
-
-T = TypeVar("T")
-
-
-def unwrap_heuristic(func: Any, return_type: Type[T]) -> T:
-    while not isinstance(func, return_type):
-        func = func.fn
-        if not hasattr(func, "fn"):
-            # pyre-fixme[7]: Incompatible return type [7]: Expected `Variable[T]` but got `None`.
-            return None
-    return func
-
-
-def is_autotuner(obj: Any) -> bool:
-    """Check whether *obj* is a Triton Autotuner using duck typing.
-
-    In Buck builds the ``Autotuner`` class can be loaded from multiple module
-    paths (e.g. via ``torch.package`` re-imports), causing ``isinstance`` to
-    return ``False`` for genuine Autotuner instances.  We combine a class-name
-    check with duck-typing on the attributes that callers actually need
-    (``cache``, ``configs``, ``arg_names``), making detection robust against
-    module-path aliasing.
-    """
-    return "Autotuner" in type(obj).__name__ and all(
-        hasattr(obj, attr) for attr in ("cache", "configs", "arg_names")
-    )
-
-
-def hash_kernel_name(kernel_name: str) -> str:
-    """Hash kernel name to create shorter, filesystem-safe names.
-
-    Args:
-        kernel_name: Full kernel name (can be very long with specialization suffixes).
-            e.g., "_addmm_fwd_sm80_pfp32_pfp32_pfp32_pfp32_i32_..."
-
-    Returns:
-        Hashed name in format "kernel_<sha256_hex>".
-            e.g., "kernel_a1b2c3d4e5f6..."
-
-    """
-    return "kernel_" + hashlib.sha256(kernel_name.encode("utf-8")).hexdigest()
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py b/recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py
deleted file mode 100644
index ce2e63d43..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/preprocess.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-
-# pyre-strict
-
-"""
-Preprocessing utilities for triton_aot models before AOT compilation.
-"""
-
-import logging
-
-from tgif.fx.tgif_tracer import TGIFTracer
-from torch.fx import GraphModule
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
-# AOT-T triton kernel wrapper functions. This is required for
-# AOT-T backend to recognize and trace correctly for ops transformation.
-AOTT_WRAPPER_PREFIX: str = "aot_triton_kernel_wrapper_"
-
-
-def unwrap_aott_wrapper_nodes(fx_m: GraphModule, tracer: TGIFTracer) -> GraphModule:
-    """Mark ``aot_triton_kernel_wrapper_*`` FX nodes as unwrapped and re-trace.
-
-    In the traced FX graph, outer wrapper functions (prefixed with
-    ``aot_triton_kernel_wrapper_``) are ``@torch.fx.wrap`` leaves.
-    Setting ``node.meta["is_wrapped"] = False`` causes a subsequent
-    ``symbolic_trace`` to trace *through* them, exposing the inner
-    ``@torch.fx.wrap`` functions (e.g., ``_triton_aot_grouped_gemm``)
-    that contain the actual kernel calls.
-
-    Any ``_body_transformer`` hook (e.g. one registered by
-    ``early_return_fx_code_transform``) is temporarily removed before
-    re-tracing to avoid injecting un-traceable control flow
-    (``if Proxy: …``) into the generated ``forward``.  After re-trace
-    the hook is restored on the new module.  See P2266562545.
-
-    Args:
-        fx_m: The FX GraphModule to modify **in-place** before re-trace.
-        tracer: Tracer instance used for the re-trace step.
-
-    Returns:
-        The re-traced ``GraphModule`` with AOTT wrappers expanded.
-    """
-    logger.info("Re-trace to get the AOTT node exposed.")
-
-    # Save and clear the body transformer so that re-trace does not hit
-    # ``if Proxy:`` from code-level hooks like early_return_fx_code_transform.
-    saved_body_transformer = fx_m.graph._codegen._body_transformer
-    fx_m.graph._codegen._body_transformer = None
-
-    unwrap_count = 0
-    for node in fx_m.graph.nodes:
-        if node.op == "call_function":
-            target = node.target
-            if hasattr(target, "__name__") and target.__name__.startswith(
-                AOTT_WRAPPER_PREFIX
-            ):
-                logger.info(f"[AOTT] Found inference wrapper node: {node=}")
-                node.meta["is_wrapped"] = False
-                unwrap_count += 1
-
-    if unwrap_count > 0:
-        logger.info(f"[AOTT] Found {unwrap_count} inference wrapper nodes.")
-        fx_m.recompile()
-    else:
-        logger.warning("[AOTT] No inference wrapper node found. Skip re-compile.")
-
-    result = tracer.symbolic_trace(fx_m)
-
-    # Restore the body transformer on the new module.
-    if saved_body_transformer is not None:
-        result.graph._codegen._body_transformer = saved_body_transformer
-        result.recompile()
-
-    return result
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py
deleted file mode 100644
index be6235701..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/compat.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# pyre-strict
-"""
-This module provides shared utilities that handle differences between
-Triton versions.
-"""
-
-from typing import Any
-
-# @manual=//triton:triton
-import triton
-from packaging.version import Version
-from triton.runtime.jit import JITFunction
-
-TRITON_VERSION: str = triton.__version__
-
-
-def version_gte(version: str, target: str) -> bool:
-    """
-    Check if version >= target using semantic version comparison.
-    Simple string comparison fails for versions like "3.10" vs "3.5"
-    """
-    return Version(version) >= Version(target)
-
-
-def get_kernel_name(jit_fn: JITFunction[Any]) -> str:
-    """
-    Get the simple kernel name from a JITFunction.
-
-    In Triton 3.5+, JITFunction._fn_name returns the full qualified name
-    (e.g., "generative_recommenders.ops.triton_aot.triton_addmm._addmm_fwd").
-    In older versions, it returns just the simple name (e.g., "_addmm_fwd").
-
-    This function normalizes the behavior to always return the simple name.
-
-    Args:
-        jit_fn: A Triton JITFunction
-
-    Returns:
-        The simple kernel name (e.g., "_addmm_fwd")
-    """
-    fn_name = jit_fn._fn_name
-    if version_gte(TRITON_VERSION, "3.5"):
-        # Triton 3.5+ uses get_full_name(fn) which returns qualified name
-        return fn_name.rsplit(".", 1)[-1]
-    else:
-        # Older versions use fn.__name__ which is already simple
-        return fn_name
-
-
-def get_scratch_parameters(kernel: Any) -> tuple[str, list[str]]:
-    """
-    Get scratch parameter declarations and argument pointers for the kernel launcher.
-
-    Scratch parameters are backend and version-specific features for profiling
-    and global memory management.
-
-    Detection Strategy:
-        1. Check metadata first for each parameter
-        2. Fall back to version-based detection if metadata unavailable
-
-    Version Requirements (fallback):
-        - v3.4+: both global_scratch and profile_scratch
-        - v3.3: only global_scratch
-        - v3.2 and earlier: no scratch parameters
-
-    Args:
-        kernel: Compiled Triton kernel with metadata attribute
-
-    Returns:
-        Tuple of (declarations, arg_pointers):
-        - declarations: C++ variable declarations for scratch parameters
-        - arg_pointers: List of argument pointers to append to kernel args
-    """
-    declarations = []
-    arg_pointers = []
-
-    if hasattr(kernel.metadata, "global_scratch_size"):
-        declarations.append("CUdeviceptr global_scratch = 0;")
-        arg_pointers.append("&global_scratch")
-    elif version_gte(TRITON_VERSION, "3.3"):
-        declarations.append("CUdeviceptr global_scratch = 0;")
-        arg_pointers.append("&global_scratch")
-
-    if hasattr(kernel.metadata, "profile_scratch_size"):
-        declarations.append("CUdeviceptr profile_scratch = 0;")
-        arg_pointers.append("&profile_scratch")
-    elif version_gte(TRITON_VERSION, "3.4"):
-        declarations.append("CUdeviceptr profile_scratch = 0;")
-        arg_pointers.append("&profile_scratch")
-
-    return ("\n            ".join(declarations), arg_pointers)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py
deleted file mode 100644
index 4a1ebb133..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/spec_conversion.py
+++ /dev/null
@@ -1,389 +0,0 @@
-# pyre-strict
-
-"""Functions for converting kernel specs to architecture-specific formats.
-
-A "spec" (specification) describes how to compile a Triton kernel for a specific
-set of input shapes and types. Users provide "base specs" in a human-friendly
-format that describes kernel arguments:
-
-    {"signature": [("*fp32", 16), ("*bf16", 16), ("i32", None), 128]}
-
-This format encodes dtypes, alignment hints, and constant values together.
-Before compilation, base specs must be converted to "compiled specs" that
-separate this information into distinct fields the compiler understands:
-
-    {"signature": {0: "*fp32", 1: "*bf16"},
-     "constants": {2: None, 3: 128},
-     "configs": (instance_descriptor(...),),
-     "cc": 80}
-
-This module provides the functionality to perform this transformation,
-extracting constraints, identifying constants, and preparing specs for each
-target GPU architecture.
-"""
-
-from collections import namedtuple
-from dataclasses import dataclass
-from typing import Any, TypeAlias
-
-from generative_recommenders.ops.triton_aot.shared.types import CTYPES
-
-# Compile-time constant values that can appear in signatures or be returned
-# by constexpr(). These are values the compiler can fold into generated code.
-ConstantValue: TypeAlias = str | int | float | bool | None
-
-# A single element in a kernel signature list.
-# Can be: dtype string, (dtype, alignment) tuple, (dtype, alignment, has_value)
-# triple for optional args, or a bare literal constant.
-SignatureElement: TypeAlias = (
-    ConstantValue | tuple[str, int | None] | tuple[str, int | None, bool]
-)
-
-
-instance_descriptor = namedtuple(
-    "instance_descriptor",
-    [
-        "divisible_by_16",
-        "equal_to_1",
-        "ids_of_folded_args",
-        "divisible_by_8",
-    ],
-)
-
-
-def constexpr(s: SignatureElement) -> ConstantValue:
-    """Identify compile-time constant expressions in signature elements.
-
-    Args:
-        s: A signature element.
-
-    Returns:
-        The constant value if s is a compile-time constant, None otherwise.
-        Constants are: int, float, bool, or strings that aren't dtype names.
-    """
-    expr = s[0] if isinstance(s, tuple) and len(s) > 1 else s
-
-    if expr is None:
-        return expr
-
-    try:
-        ret = int(expr)
-        return ret
-    except (ValueError, TypeError):
-        pass
-    try:
-        ret = float(expr)
-        return ret
-    except (ValueError, TypeError):
-        pass
-
-    if isinstance(expr, bool):
-        return expr
-    if isinstance(expr, str) and expr not in CTYPES and not expr.startswith("*"):
-        return expr
-    return None
-
-
-@dataclass
-class SignatureConstraints:
-    """Constraints extracted from parsing a kernel signature.
-
-    When compiling a Triton kernel, the compiler can generate more efficient
-    code if it knows certain properties about the arguments:
-
-    - Pointer alignment: If a pointer is always 16-byte aligned, the compiler
-      can use faster aligned memory operations.
-    - Constant values: Arguments known at compile time can be folded into the
-      generated code, eliminating runtime checks.
-    - FP8 dtypes: Some GPU architectures require dtype substitutions for FP8
-      types (e.g., gfx942 needs fp8e4b8 instead of fp8e4nv).
-
-    This dataclass collects all these constraints from a single pass over the
-    signature, so downstream code can use them without re-parsing.
-
-    Attributes:
-        divisible_by_16: Indices of args with values divisible by 16.
-        divisible_by_8: Indices of args with values divisible by 8.
-        equal_to_1: Indices of args with value equal to 1.
-        none_args: Indices of args that are None (not provided).
-        optional_args: Indices of optional arguments.
-        has_fp8: Whether any argument has an FP8 dtype.
-    """
-
-    divisible_by_16: set[int]
-    divisible_by_8: set[int]
-    equal_to_1: set[int]
-    none_args: set[int]
-    optional_args: set[int]
-    has_fp8: bool
-
-
-def collect_constraints(signature: list[SignatureElement]) -> SignatureConstraints:
-    """Collect divisibility and type constraints from a signature list.
-
-    Iterates through signature elements and identifies:
-    - Arguments divisible by 16 or 8 (for memory alignment)
-    - Arguments equal to 1 (for optimization)
-    - Optional arguments and those not provided (None)
-    - Whether any FP8 dtypes are present
-
-    Args:
-        signature: List of signature elements. The input format is unfortunately
-            variable; each element can be one of several types:
-
-            1. Plain string (dtype only, no alignment info):
-               "*fp32"           - A float32 pointer
-               "i32"             - A 32-bit integer scalar
-
-            2. Tuple of (dtype, value) where value indicates alignment or constness:
-               ("*fp32", 16)     - Float32 pointer, 16-byte aligned
-               ("i32", None)     - Integer arg not provided (becomes constant None)
-               ("*bf16", 1)      - Pointer with value=1 (folded as constant)
-
-            3. Triple of (dtype, value, has_value) for optional arguments:
-               ("*fp32", 16, True)  - Optional arg that IS provided, 16-byte aligned
-               ("*fp32", 16, False) - Optional arg NOT provided (becomes None)
-
-            4. Bare literals (become compile-time constants):
-               128               - Integer constant
-               "leaky_relu"      - String constant (e.g., activation name)
-
-    Returns:
-        SignatureConstraints with all constraint sets populated.
-
-    Example:
-        >>> sig = [("*fp32", 16), ("i32", None), ("*fp8e4nv", 8)]
-        >>> c = collect_constraints(sig)
-        >>> 0 in c.divisible_by_16
-        True
-        >>> c.has_fp8
-        True
-    """
-    divisible_by_16: set[int] = set()
-    divisible_by_8: set[int] = set()
-    equal_to_1: set[int] = set()
-    none_args: set[int] = set()
-    optional_args: set[int] = set()
-    has_fp8: bool = False
-
-    for i, s in enumerate(signature):
-        # Handle optional tensor case: tuple with 3 elements where s[2] indicates
-        # whether the optional arg has a value
-        if isinstance(s, tuple) and len(s) > 2:
-            optional_args.add(i)
-            # pyrefly: ignore [bad-index]
-            if not s[2]:  # has_value is False
-                none_args.add(i)
-                continue
-
-        # Extract dtype
-        dtype = s[0] if isinstance(s, tuple) else s
-
-        # Check for FP8 types
-        if isinstance(dtype, str) and ("fp8e4nv" in dtype or "fp8e4b8" in dtype):
-            has_fp8 = True
-
-        # Extract value (alignment or constant)
-        value = s[1] if isinstance(s, tuple) else s
-
-        # Check divisibility and equality constraints
-        if isinstance(value, int):
-            if value % 16 == 0:
-                divisible_by_16.add(i)
-            if value % 8 == 0:
-                divisible_by_8.add(i)
-            if value == 1:
-                equal_to_1.add(i)
-
-        if value is None:
-            none_args.add(i)
-
-    return SignatureConstraints(
-        divisible_by_16=divisible_by_16,
-        divisible_by_8=divisible_by_8,
-        equal_to_1=equal_to_1,
-        none_args=none_args,
-        optional_args=optional_args,
-        has_fp8=has_fp8,
-    )
-
-
-def make_instance_descriptor(
-    constraints: SignatureConstraints,
-) -> tuple[instance_descriptor]:
-    """Create an instance_descriptor tuple from constraints.
-
-    Args:
-        constraints: The collected signature constraints.
-
-    Returns:
-        A tuple containing a single instance_descriptor namedtuple with
-        divisible_by_16, equal_to_1, ids_of_folded_args, and divisible_by_8.
-    """
-    ids_of_folded_args = constraints.equal_to_1 | constraints.none_args
-    return (
-        instance_descriptor(
-            divisible_by_16=constraints.divisible_by_16,
-            equal_to_1=constraints.equal_to_1,
-            ids_of_folded_args=ids_of_folded_args,
-            divisible_by_8=constraints.divisible_by_8,
-        ),
-    )
-
-
-def extract_constants(
-    signature: list[SignatureElement],
-    constraints: SignatureConstraints,
-) -> dict[int, ConstantValue]:
-    """Extract compile-time constant values from signature elements.
-
-    Identifies arguments that can be folded into generated code at compile time.
-    Constants come from three sources:
-
-    1. Bare literals in the signature (e.g., 128 for block size, "leaky_relu"
-       for activation type, True for a boolean flag)
-    2. Arguments with value=1 (tracked in constraints.equal_to_1)
-    3. Arguments not provided (tracked in constraints.none_args)
-
-    Args:
-        signature: List of signature elements in input format.
-        constraints: The collected signature constraints.
-
-    Returns:
-        Dict mapping argument indices to their constant values.
-    """
-    # Use constexpr to identify constant expressions
-    constexprs = {i: constexpr(s) for i, s in enumerate(signature)}
-    constants: dict[int, ConstantValue] = {
-        k: v for k, v in constexprs.items() if v is not None
-    }
-
-    # Add equal_to_1 args with value 1
-    for k in constraints.equal_to_1:
-        constants[k] = 1
-
-    # Add none_args with value None
-    for k in constraints.none_args:
-        constants[k] = None
-
-    return constants
-
-
-def signature_list_to_dict(
-    signature: list[SignatureElement],
-    constants: dict[int, ConstantValue],
-) -> dict[int, str]:
-    """Convert signature from list format to dict format.
-
-    Transforms the input signature list into a dict mapping argument
-    indices to dtype strings. Arguments that are constants are excluded
-    since they don't need runtime type information.
-
-    Args:
-        signature: List of signature elements in input format.
-        constants: Dict of constant argument indices to exclude.
-
-    Returns:
-        Dict mapping non-constant argument indices to their dtype strings.
-    """
-    result: dict[int, str] = {}
-    for i, s in enumerate(signature):
-        if i in constants:
-            continue
-        # After filtering out constants, remaining elements are dtype declarations.
-        # For tuples like ("*fp32", 16), s[0] is the dtype string.
-        # For plain strings like "*fp32", the element itself is the dtype.
-        if isinstance(s, tuple) and len(s) > 1:
-            dtype = s[0]
-        else:
-            dtype = s
-        assert isinstance(dtype, str)
-        result[i] = dtype
-    return result
-
-
-# CC (compute capability) to AMD GPU architecture mapping
-# CC is a 2-digit shorthand: 94 -> gfx942, 95 -> gfx950
-HIP_CC_TO_ARCH_INFO: dict[int, str] = {
-    90: "gfx90a",
-    94: "gfx942",
-    95: "gfx950",
-}
-
-# Reverse mapping: architecture string -> CC string
-HIP_ARCH_TO_CC: dict[str, str] = {v: str(k) for k, v in HIP_CC_TO_ARCH_INFO.items()}
-
-HIP_CC_MI350X: str = "95"  # CC string for gfx950 (MI350X/MI355X)
-
-
-def _normalize_cc(cc: set[str]) -> set[str]:
-    """Normalize CC values to 2-digit format for internal comparison.
-
-    Accepts both tritoncc format ("94", "95") and Triton driver format
-    ("gfx942", "gfx950"). Returns 2-digit CC strings.
-    """
-    return {HIP_ARCH_TO_CC.get(c, c) for c in cc}
-
-
-def get_fp8_replacement_signature_for_amd(
-    spec: dict[str, Any], cc: set[str]
-) -> dict[int, str]:
-    """Replace FP8 dtypes in signature for AMD architectures.
-
-    Args:
-        spec: Compiled spec dict with 'signature' in dict format.
-        cc: Set of CC strings in either format:
-            - 2-digit tritoncc format: {"94"} for gfx942
-            - Triton driver format: {"gfx942"}
-            See HIP_CC_TO_ARCH_INFO.
-
-    Returns:
-        Dict mapping argument indices to dtype strings with FP8 types replaced.
-    """
-    normalized_cc: set[str] = _normalize_cc(cc)
-
-    def replace_fp8_type(dtype_str: str) -> str:
-        if "fp8e4nv" in dtype_str:
-            if HIP_CC_MI350X not in normalized_cc:
-                return dtype_str.replace("fp8e4nv", "fp8e4b8")
-        elif "fp8e4b8" in dtype_str and HIP_CC_MI350X in normalized_cc:
-            return dtype_str.replace("fp8e4b8", "fp8e4nv")
-        return dtype_str
-
-    replace_fp8_signatures: dict[int, str] = {}
-    for key, value in spec["signature"].items():
-        if isinstance(value, str):
-            replace_fp8_signatures[key] = replace_fp8_type(value)
-        else:
-            replace_fp8_signatures[key] = value
-
-    return replace_fp8_signatures
-
-
-def get_fp8_replacement_signature_for_sm80(
-    spec: dict[str, Any],
-) -> dict[int, Any]:
-    """Replace FP8 dtypes with bf16 for SM80 (A100) which lacks native FP8 support.
-
-    Args:
-        spec: Compiled spec dict with 'signature' in dict format.
-
-    Returns:
-        Dict mapping argument indices to dtype strings with FP8 types replaced by bf16.
-    """
-
-    def replace_fp8_type(dtype_str: str) -> str:
-        if "fp8e4nv" in dtype_str:
-            return dtype_str.replace("fp8e4nv", "bf16")
-        return dtype_str
-
-    replace_fp8_signatures: dict[int, Any] = {}
-    for key, value in spec["signature"].items():
-        if isinstance(value, tuple) and isinstance(value[0], str):
-            replace_fp8_signatures[key] = (replace_fp8_type(value[0]), value[1])
-        elif isinstance(value, str):
-            replace_fp8_signatures[key] = replace_fp8_type(value)
-        else:
-            replace_fp8_signatures[key] = value
-
-    return replace_fp8_signatures
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py b/recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py
deleted file mode 100644
index 6a870fc4d..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/shared/types.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# pyre-strict
-
-"""Shared type definitions for AOTT and Triton CC.
-
-This module contains fundamental type mappings used across the compiler.
-"""
-
-from typing import Any
-
-# Mapping from Triton dtype names to C type names
-CTYPES: dict[str, str] = {
-    "i1": "bool",
-    "u8": "uint8_t",
-    "i8": "int8_t",
-    "i16": "int16_t",
-    "i32": "int32_t",
-    "i64": "int64_t",
-    "fp16": "half",
-    "fp32": "float",
-    "fp64": "double",
-    "bf16": "__nv_bfloat16",
-    "fp8e4nv": "__nv_fp8_e4m3",
-    "fp8e4b8": "__hip_fp8_e4m3_fnuz",
-}
-
-# Mapping from Triton pointer dtype names to ATen scalar types
-ATYPES: dict[str, str] = {
-    "*i1": "at::kBool",
-    "*u8": "at::kByte",
-    "*i8": "at::kChar",
-    "*i16": "at::kShort",
-    "*i32": "at::kInt",
-    "*i64": "at::kLong",
-    "*fp16": "at::kHalf",
-    "*fp32": "at::kFloat",
-    "*fp64": "at::kDouble",
-    "*bf16": "at::kBFloat16",
-    "*fp8e4nv": "at::kFloat8_e4m3fn",
-    "*fp8e4b8": "at::kFloat8_e4m3fnuz",
-}
-
-# Mapping from Python types to C++ type names
-PY_TYPES_TO_CPP_TYPES: dict[type[Any], str] = {
-    int: "int64_t",
-    str: "at::string",
-    float: "double",
-}
-
-# Default values for autotuning attributes.
-# These are used as default kernel launch parameters.
-AUTOTUNE_ATTRs: dict[str, int] = {
-    "num_warps": 4,
-    "num_stages": 3,
-    # AMD only
-    "matrix_instr_nonkdim": 0,
-    "waves_per_eu": 1,
-    "kpack": 1,
-}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp
deleted file mode 100644
index d269c3555..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/embedded_cubins.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <cstddef>
-
-extern "C" {
-// __TRITON_AOT_GENERATE_BEGIN__ CUBIN_ARRAYS
-// placeholder
-// __TRITON_AOT_GENERATE_END__ CUBIN_ARRAYS
-}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp
deleted file mode 100644
index 7f882f11f..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-// __TRITON_AOT_GENERATE_BEGIN__ HEADER_INCLUDE
-#include "kernel.h"
-// __TRITON_AOT_GENERATE_END__ HEADER_INCLUDE
-// These headers are used by code generated at runtime in KERNEL_SPECS blocks
-#include <torch/csrc/stable/accelerator.h>
-#include <torch/headeronly/core/ScalarType.h> // NOLINT(facebook-unused-include-check)
-
-inline void triton_aot_cu_check(CUresult err, const char* file, int line) {
-  if (err != CUDA_SUCCESS) {
-    const char* err_str;
-    cuGetErrorString(err, &err_str);
-    throw std::runtime_error(
-        std::string(file) + ":" + std::to_string(line) +
-        " CUDA driver error: " + (err_str ? err_str : "unknown"));
-  }
-}
-#define TRITON_AOT_CU_CHECK(EXPR) triton_aot_cu_check(EXPR, __FILE__, __LINE__)
-
-// NOLINTNEXTLINE(facebook-hte-NullableReturn): error path throws
-inline cudaStream_t triton_aot_get_current_stream() {
-  auto device_idx = torch::stable::accelerator::getCurrentDeviceIndex();
-  void* stream_ptr = nullptr;
-  // TODO: No torch::stable op provides the same functionality
-  // today. Revisit if torch exposes a proper stable::accelerator stream API.
-  if (aoti_torch_get_current_cuda_stream(device_idx, &stream_ptr) != 0) {
-    throw std::runtime_error("Failed to get current CUDA stream");
-  }
-  return reinterpret_cast<cudaStream_t>(stream_ptr);
-}
-
-namespace triton {
-namespace aot {
-
-namespace {
-[[maybe_unused]] int compute_capability() {
-  // Cached: AOTT hosts use homogeneous GPUs.
-  static int cc = 0;
-  if (cc == 0) {
-    CUdevice device;
-    TRITON_AOT_CU_CHECK(cuCtxGetDevice(&device));
-    int major, minor;
-    TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
-        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
-    TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
-        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
-    cc = major * 10 + minor;
-  }
-  return cc;
-}
-} // namespace
-
-namespace {
-#ifdef USE_ROCM
-[[maybe_unused]] void check_errors(int shared, hipFunction_t func) {
-  // HIP doesn't need the same shared memory configuration as CUDA
-  return;
-}
-#else
-[[maybe_unused]] void check_errors(int shared, CUfunction func) {
-  int shared_optin;
-  int device = 0;
-  TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
-      &shared_optin,
-      CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
-      device));
-  if (shared > 49152 && shared_optin > 49152) {
-    // If requested/shared_optin exceed 48 KB, it switches cache to prefer
-    // shared memory and sets the max dynamic shared memory so the kernel can
-    // allocate the larger amount needed.
-    TRITON_AOT_CU_CHECK(
-        cuFuncSetCacheConfig(func, CU_FUNC_CACHE_PREFER_SHARED));
-    int shared_total, shared_static;
-    TRITON_AOT_CU_CHECK(cuDeviceGetAttribute(
-        &shared_total,
-        CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
-        device));
-    TRITON_AOT_CU_CHECK(cuFuncGetAttribute(
-        &shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, func));
-    TRITON_AOT_CU_CHECK(cuFuncSetAttribute(
-        func,
-        CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
-        shared_optin - shared_static));
-  }
-}
-#endif
-} // namespace
-
-// __TRITON_AOT_GENERATE_BEGIN__ KERNEL_SPECS
-// __TRITON_AOT_GENERATE_END__ KERNEL_SPECS
-
-// __TRITON_AOT_GENERATE_BEGIN__ SELECTOR
-// __TRITON_AOT_GENERATE_END__ SELECTOR
-
-} // namespace aot
-} // namespace triton
-
-// Anchor: keeps the inline `triton_aot_get_current_stream` (and its reference
-// to `aoti_torch_get_current_cuda_stream`) from being dead-stripped at
-// buck-build time, where KERNEL_SPECS is empty. `weak` dedups the symbol
-// across the per-op .so files generated by the runtime template substitution.
-extern "C" __attribute__((weak, visibility("default"))) cudaStream_t
-__triton_aot_anchor_get_stream() {
-  return triton_aot_get_current_stream();
-}
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h
deleted file mode 100644
index 6b6f76e17..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/kernel.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#pragma once
-
-#include <torch/csrc/stable/tensor.h>
-
-#include <cuda.h>
-#include <cuda_runtime.h>
-
-namespace triton {
-namespace aot {
-
-#ifndef GRID_DIM_DEFINED_MACRO
-struct gridDims {
-  int x = 1;
-  int y = 1;
-  int z = 1;
-  cudaStream_t stream = nullptr;
-  gridDims(int _x = 1, int _y = 1, int _z = 1, cudaStream_t _stream = nullptr)
-      : x(_x), y(_y), z(_z), stream(_stream) {}
-};
-#define GRID_DIM_DEFINED_MACRO
-#endif
-
-#ifndef FITS_I32_DEFINED_MACRO
-constexpr bool fits_i32(int64_t v) {
-  return v >= INT32_MIN && v <= INT32_MAX;
-}
-#define FITS_I32_DEFINED_MACRO
-#endif
-
-// __TRITON_AOT_GENERATE_BEGIN__ TUNER_META_CPP
-// __TRITON_AOT_GENERATE_END__ TUNER_META_CPP
-// __TRITON_AOT_GENERATE_BEGIN__ SELECTOR_PROTO
-// __TRITON_AOT_GENERATE_END__ SELECTOR_PROTO
-
-} // namespace aot
-} // namespace triton
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py
deleted file mode 100644
index d89bbe8b6..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/template_utils.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-
-# pyre-strict
-
-"""
-Common utilities for template loading and rendering.
-
-This module provides functions to load template files from the templates
-directory and render them by replacing marker blocks with actual values.
-"""
-
-import re
-from collections import Counter
-from importlib import resources
-
-
-def load_template(name: str) -> str:
-    """Load template file content from Buck resources.
-
-    Templates are loaded from the resources bundled with this package via Buck's
-    select_accelerator mechanism:
-    - AMD builds: hipified templates with HIP APIs
-    - NVIDIA builds: original templates with CUDA APIs
-
-    Args:
-        name: Template filename (e.g., 'kernel.cpp', 'embedded_cubins.cpp').
-
-    Returns:
-        The template file content as a string.
-    """
-    return resources.files(__package__).joinpath(name).read_text()
-
-
-def render_template(template: str, replacements: dict[str, str]) -> str:
-    """Replace block markers in template with actual values.
-
-    Replaces content between "// __TRITON_AOT_GENERATE_BEGIN__ NAME"
-    and "// __TRITON_AOT_GENERATE_END__ NAME" with the value for key "NAME".
-    Each key must have exactly one BEGIN/END pair in the template.
-    The markers are preserved for easier debugging.
-
-    Args:
-        template: Template string containing marker blocks.
-        replacements: Dict mapping marker names to replacement values.
-
-    Returns:
-        Rendered template with all marker blocks replaced.
-
-    Raises:
-        AssertionError: If markers are duplicated, mismatched, or keys don't match.
-    """
-    BEGIN_PREFIX = "// __TRITON_AOT_GENERATE_BEGIN__ "
-    END_PREFIX = "// __TRITON_AOT_GENERATE_END__ "
-
-    begin_keys = re.findall(r"// __TRITON_AOT_GENERATE_BEGIN__ (\w+)", template)
-    end_keys = re.findall(r"// __TRITON_AOT_GENERATE_END__ (\w+)", template)
-
-    # Check for duplicate keys
-    begin_key_counts = Counter(begin_keys)
-    end_key_counts = Counter(end_keys)
-    for key, count in begin_key_counts.items():
-        assert count == 1, f"Duplicate BEGIN marker for key: {key}"
-    for key, count in end_key_counts.items():
-        assert count == 1, f"Duplicate END marker for key: {key}"
-
-    # Check BEGIN and END keys match
-    template_keys = set(begin_keys)
-    assert template_keys == set(end_keys), (
-        f"Mismatched BEGIN/END markers: BEGIN={template_keys}, END={set(end_keys)}"
-    )
-
-    # Validate keys match between template and replacements
-    replacement_keys = set(replacements.keys())
-    assert template_keys == replacement_keys, (
-        f"Keys mismatch: in template but not in replacements: {template_keys - replacement_keys}, "
-        f"in replacements but not in template: {replacement_keys - template_keys}"
-    )
-
-    # Do the replacements
-    result = template
-    for key, value in replacements.items():
-        begin_marker = f"{BEGIN_PREFIX}{key}"
-        end_marker = f"{END_PREFIX}{key}"
-
-        begin_idx = result.find(begin_marker)
-        newline_idx = result.find("\n", begin_idx)
-        assert newline_idx != -1, (
-            f"BEGIN marker for key '{key}' must be followed by newline"
-        )
-        content_start = newline_idx + 1
-        end_idx = result.find(end_marker, begin_idx)
-
-        result = result[:content_start] + value + result[end_idx:]
-
-    return result
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp b/recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp
deleted file mode 100644
index c2e5a4063..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/templates/torch_op.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// __TRITON_AOT_GENERATE_BEGIN__ HEADER_INCLUDE
-#include "kernel.h"
-// __TRITON_AOT_GENERATE_END__ HEADER_INCLUDE
-#include <torch/csrc/stable/library.h>
-#include <torch/csrc/stable/tensor.h> // NOLINT(facebook-unused-include-check)
-
-// __TRITON_AOT_GENERATE_BEGIN__ TORCH_OP
-namespace {
-// no-op, force link StableLibrary
-torch::stable::Tensor _triton_aot_placeholder_noop(
-    torch::stable::Tensor input) {
-  return input;
-}
-} // namespace
-
-STABLE_TORCH_LIBRARY_FRAGMENT(triton_aot, m) {
-  m.def("_placeholder_noop(Tensor input) -> Tensor");
-}
-STABLE_TORCH_LIBRARY_IMPL(triton_aot, CPU, m) {
-  m.impl("_placeholder_noop", TORCH_BOX(&_triton_aot_placeholder_noop));
-}
-// __TRITON_AOT_GENERATE_END__ TORCH_OP
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py
deleted file mode 100644
index e3c4f1955..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/import_utils.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# pyre-strict
-
-"""
-Import-header utilities for triton_aot codegen.
-"""
-
-import ast
-
-from torch import package
-
-
-def get_original_import_header(source_code: str) -> str:
-    """Extract all import statements from *source_code* as a single string."""
-    tree = ast.parse(source_code)
-    import_header = ""
-    for node in ast.walk(tree):
-        if isinstance(node, ast.ImportFrom):
-            import_header += ast.unparse(node) + "\n"
-        elif isinstance(node, ast.Import):
-            import_header += ast.unparse(node) + "\n"
-    return import_header
-
-
-def _is_extern_module(module_name: str, extern_modules: set[str]) -> bool:
-    """Return True if *module_name* (or a parent) is in the extern set."""
-    if module_name in extern_modules:
-        return True
-    parts = module_name.split(".")
-    for i in range(1, len(parts)):
-        if ".".join(parts[:i]) in extern_modules:
-            return True
-    return False
-
-
-def rewrite_package_imports(
-    import_header: str,
-    package_importer: package.PackageImporter,
-) -> str:
-    """Rewrite interned imports to use ``_package_importer``.
-
-    Extern modules (``torch``, ``typing``, …) keep regular ``import``
-    statements.  Interned modules (for example, local
-    ``generative_recommenders.*`` modules) are rewritten to::
-
-        _pkg_mod = _package_importer.import_module(
-            'generative_recommenders.ops.triton.triton_utils'
-        )
-        helper = _pkg_mod.helper
-
-    The ``_package_importer`` object is injected into the wrapper module's
-    namespace by ``replace_kernels`` before ``exec_module`` is called.
-    """
-    extern_modules = set(package_importer.extern_modules)
-    header_tree = ast.parse(import_header)
-
-    regular: list[str] = []
-    from_package: list[str] = []
-
-    for node in header_tree.body:
-        if isinstance(node, ast.Import):
-            for alias in node.names:
-                if _is_extern_module(alias.name, extern_modules):
-                    regular.append(ast.unparse(node))
-                else:
-                    local = alias.asname or alias.name
-                    from_package.append(
-                        f"{local} = _package_importer.import_module('{alias.name}')"
-                    )
-        elif isinstance(node, ast.ImportFrom):
-            mod = node.module or ""
-            if _is_extern_module(mod, extern_modules):
-                regular.append(ast.unparse(node))
-            else:
-                var = f"_pkg_{mod.replace('.', '_')}"
-                from_package.append(f"{var} = _package_importer.import_module('{mod}')")
-                for alias in node.names:
-                    local = alias.asname or alias.name
-                    from_package.append(f"{local} = {var}.{alias.name}")
-        else:
-            # Non-import statement (should not appear, but preserve if it does)
-            regular.append(ast.unparse(node))
-
-    parts: list[str] = []
-    if regular:
-        parts.append("\n".join(regular))
-    if from_package:
-        parts.append("# Imports resolved from torch package via _package_importer")
-        parts.append("\n".join(from_package))
-    return "\n".join(parts) + "\n" if parts else ""
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py
deleted file mode 100644
index 51d144f1b..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/kernel_wrapper_codegen.py
+++ /dev/null
@@ -1,500 +0,0 @@
-# pyre-strict
-import ast
-import inspect
-import os
-from typing import Any, Callable, Dict, List, Optional
-
-from generative_recommenders.ops.triton_aot.compile.compile_state import (
-    get_aott_compile_path,
-    get_triton_aot_kernel_specs,
-)
-from generative_recommenders.ops.triton_aot.compile.utils import unwrap_heuristic
-from generative_recommenders.ops.triton_aot.shared.compat import get_kernel_name
-from generative_recommenders.ops.triton_aot.transform.import_utils import (
-    get_original_import_header,
-    rewrite_package_imports,
-)
-from generative_recommenders.ops.triton_aot.types import TritonAOT
-from pyre_extensions import none_throws
-from torch import package
-from torch.fx import GraphModule
-
-# @manual=//triton:triton
-from triton.runtime.autotuner import Autotuner
-from triton.runtime.jit import JITFunction, KernelInterface
-
-
-def _is_torch_package_module(module_name: str) -> bool:
-    """Check if a module name is from torch.package namespace."""
-    return module_name.startswith("<torch_package")
-
-
-def _strip_torch_package_prefix(module_name: str) -> str:
-    """Strip the torch.package namespace prefix from a module name.
-
-    Example:
-        '<torch_package_0>.generative_recommenders.ops.triton_aot.triton_layer_norm'
-        -> 'generative_recommenders.ops.triton_aot.triton_layer_norm'
-    """
-    if _is_torch_package_module(module_name):
-        # Remove '<torch_package_N>.' prefix
-        return module_name.split(".", 1)[1]
-    return module_name
-
-
-def _get_clean_module_basename(module_name: str) -> str:
-    """Get the basename of a module, stripping torch.package prefix if present.
-
-    Example:
-        '<torch_package_0>.generative_recommenders.ops.triton_aot.triton_layer_norm'
-        -> 'triton_layer_norm'
-        'generative_recommenders.ops.triton_aot.triton_layer_norm'
-        -> 'triton_layer_norm'
-    """
-    clean_name = _strip_torch_package_prefix(module_name)
-    return clean_name.rsplit(".", 1)[-1]
-
-
-def _extract_function_source(module_source: str, fn_name: str) -> str:
-    """Extract a function's source code from module source.
-
-    Parses the module source and extracts just the function definition.
-    """
-    tree = ast.parse(module_source)
-    for node in ast.walk(tree):
-        if isinstance(node, ast.FunctionDef) and node.name == fn_name:
-            return ast.unparse(node)
-    raise ValueError(f"Function '{fn_name}' not found in module source")
-
-
-def _get_module_and_source(
-    target: Callable[..., Any],
-    package_importer: Optional[package.PackageImporter],
-) -> tuple[Any, str, str]:
-    """Get module, module source, and function source for a callable.
-
-    Handles both regular modules and torch.package loaded modules.
-
-    Args:
-        target: The callable (function) to get source for
-        package_importer: Optional PackageImporter for torch.package modules
-
-    Returns:
-        Tuple of (module, module_source, function_source)
-    """
-    module_name = target.__module__
-    fn_name = target.__name__
-
-    if _is_torch_package_module(module_name) and package_importer is not None:
-        # Handle torch.package namespace
-        real_module_name = _strip_torch_package_prefix(module_name)
-        assert real_module_name.startswith(
-            "generative_recommenders.ops.triton_aot"
-        ) or real_module_name.startswith("prime_perf_optimizer"), (
-            f"Expected module under 'generative_recommenders.ops.triton_aot' or 'prime_perf_optimizer', got: {real_module_name}"
-        )
-
-        # Get module source from package
-        module_source = package_importer.get_source(real_module_name)
-
-        # Import the module through the package importer
-        fn_module = package_importer.import_module(real_module_name)
-
-        # Extract function source from module source
-        fn_source = _extract_function_source(module_source, fn_name)
-
-        return fn_module, module_source, fn_source
-    else:
-        # Standard module handling
-        fn_module = inspect.getmodule(target)
-        module_source = inspect.getsource(none_throws(fn_module))
-        fn_source = inspect.getsource(target)
-
-        return fn_module, module_source, fn_source
-
-
-def _calls_triton_aot_kernel(node: ast.FunctionDef, kernel_name: str) -> bool:
-    """
-    kernel_name is the JIT function name (e.g. "_weighted_layer_norm_fwd"),
-    which may differ from the wrapper function name (e.g.
-    "_triton_aot_swish_layer_norm").  We match by looking for a
-    Subscript-call ``kernel_name[grid](...)`` inside the function body.
-    """
-    for child in ast.walk(node):
-        if (
-            isinstance(child, ast.Call)
-            and isinstance(child.func, ast.Subscript)
-            and isinstance(child.func.value, ast.Name)
-            and child.func.value.id == kernel_name
-        ):
-            return True
-    return False
-
-
-def _is_torch_jit_unused(d: ast.expr) -> bool:
-    """Check if a decorator AST node represents @torch.jit.unused."""
-    return (
-        isinstance(d, ast.Attribute)
-        and d.attr == "unused"
-        and isinstance(d.value, ast.Attribute)
-        and d.value.attr == "jit"
-        and isinstance(d.value.value, ast.Name)
-        and d.value.value.id == "torch"
-    )
-
-
-def strip_jit_unused_decorator(
-    node: ast.FunctionDef, kernel_name: str
-) -> ast.FunctionDef:
-    """Strip @torch.jit.unused if the function body calls ``kernel_name[grid](...)``.
-
-    kernel_name is the TritonAOT kernel's JIT function name (e.g.
-    ``_weighted_layer_norm_fwd``), not the wrapper function name.  This avoids
-    relying on a naming convention on the wrapper function itself.
-    """
-    if _calls_triton_aot_kernel(node, kernel_name):
-        node.decorator_list = [
-            d for d in node.decorator_list if not _is_torch_jit_unused(d)
-        ]
-    return node
-
-
-class TritonAOTOperatorTransform(ast.NodeTransformer):
-    def __init__(self, kernel: Any) -> None:
-        super().__init__()
-        self._kernel: Any = kernel
-        self._kernel_jit_fn: JITFunction[List[Any]] = unwrap_heuristic(
-            kernel, return_type=JITFunction
-        )
-        self._kernel_autotuner: Optional[Autotuner] = unwrap_heuristic(
-            kernel, return_type=Autotuner
-        )
-        self._kernel_name: str = get_kernel_name(self._kernel_jit_fn)
-        # Only transform the function body
-        self._autotune_params: List[str] = (
-            list(list(self._kernel_autotuner.cache.values())[0].kwargs.keys())
-            if self._kernel_autotuner is not None
-            else []
-        )
-        self._autotune_params += ["num_warps", "num_stages"]
-
-        self._lambda_arg_name: Optional[str] = None
-        self._grid_name: Optional[str] = None
-        self._autotune_key_id: Optional[Dict[str, int]] = None
-        self._autotune_key_map: Optional[Dict[str, ast.expr]] = None
-        self._kernel_meta: Optional[ast.Assign] = None
-
-        if self._kernel_autotuner is not None:
-            autotune_key_id: Dict[str, int] = {}
-            self._autotune_key_id = autotune_key_id
-            # pyre-ignore[16]: JITFunction has arg_names at runtime
-            for key in self._kernel_autotuner.keys:
-                autotune_key_id[key] = self._kernel_jit_fn.arg_names.index(key)
-
-    def generate_function_meta(self) -> None:
-        targets = [
-            ast.Name(id=param, ctx=ast.Store()) for param in self._autotune_params
-        ]
-        autotune_key_map = self._autotune_key_map
-        kernel_autotuner = self._kernel_autotuner
-        call = ast.Call(
-            func=ast.Name(id=f"{self._kernel_name}_meta", ctx=ast.Load()),
-            args=[
-                none_throws(autotune_key_map)[key]
-                for key in none_throws(kernel_autotuner).keys
-            ]
-            if kernel_autotuner is not None
-            else [],
-            keywords=[],
-        )
-        self._kernel_meta = ast.Assign(
-            # pyre-ignore[6]: ast.Assign targets type
-            targets=[ast.Tuple(elts=targets, ctx=ast.Store())],
-            value=call,
-        )
-
-    def visit_FunctionDef(self, node: ast.FunctionDef) -> ast.FunctionDef:
-        strip_jit_unused_decorator(node, self._kernel_name)
-
-        new_body: List[ast.stmt] = []
-        stmts = node.body
-        for stmt in stmts:
-            if isinstance(stmt, ast.Assign):
-                for target in stmt.targets:
-                    if isinstance(target, ast.Name) and target.id == self._grid_name:
-                        assert self._kernel_meta is not None
-                        self._kernel_meta.lineno = stmt.lineno
-                        new_body.append(self._kernel_meta)
-            new_body.append(self.visit(stmt))
-        node.body = new_body
-        return node
-
-    def visit_Assign(self, node: ast.Assign) -> ast.Assign:
-        for target in node.targets:
-            if isinstance(target, ast.Name) and isinstance(node.value, ast.Lambda):
-                lambda_node = node.value
-                self._lambda_arg_name = lambda_node.args.args[0].arg
-                lambda_body = lambda_node.body
-                assert isinstance(lambda_body, ast.Tuple)
-                new_elts: List[ast.expr] = []
-                for elt in lambda_body.elts:
-                    new_elts.append(self.visit(elt))
-                node.value = ast.Tuple(elts=new_elts, ctx=ast.Load())
-                self._lambda_arg_name = None
-        return node
-
-    def visit_Subscript(self, node: ast.Subscript) -> ast.expr:
-        if isinstance(node.value, ast.Name) and node.value.id == self._lambda_arg_name:
-            assert isinstance(node.slice, ast.Constant)
-            assert isinstance(node.slice.value, str)
-            var_name = node.slice.value
-            # pyre-ignore
-            node = ast.Name(id=var_name, ctx=ast.Load())
-        return node
-
-    def visit_Expr(self, node: ast.Expr) -> ast.Expr:
-        if isinstance(node.value, ast.Call):
-            call = node.value
-            if (
-                isinstance(call.func, ast.Subscript)
-                and isinstance(call.func.value, ast.Name)
-                and call.func.value.id == self._kernel_name
-            ):
-                grid_arg = call.func.slice
-                new_func = ast.Attribute(
-                    value=ast.Attribute(
-                        value=ast.Attribute(
-                            value=ast.Name(id="torch", ctx=ast.Load()),
-                            attr="ops",
-                            ctx=ast.Load(),
-                        ),
-                        attr="triton_aot",
-                        ctx=ast.Load(),
-                    ),
-                    attr=self._kernel_name,
-                    ctx=ast.Load(),
-                )
-                new_args = [grid_arg] + call.args
-                new_keywords = call.keywords + [
-                    ast.keyword(arg=param, value=ast.Name(id=param, ctx=ast.Load()))
-                    for param in self._autotune_params
-                ]
-                node.value = ast.Call(
-                    func=new_func,
-                    args=new_args,
-                    keywords=new_keywords,
-                )
-        return node
-
-    def contains_triton_call(self, node: ast.AST) -> bool:
-        for child in ast.walk(node):
-            if (
-                isinstance(child, ast.Call)
-                and isinstance(child.func, ast.Subscript)
-                # pyre-ignore[16]: ast.expr may have `id` attribute at runtime
-                and child.func.value.id == self._kernel_name
-            ):
-                # pyrefly: ignore [missing-attribute]
-                self._grid_name = child.func.slice.id
-
-                if self._kernel_autotuner is not None:
-                    autotune_key_map: Dict[str, ast.expr] = {}
-                    self._autotune_key_map = autotune_key_map
-                    # pyre-ignore[16]: Autotuner has keys at runtime
-                    for key in self._kernel_autotuner.keys:
-                        found_key = False
-                        for keyword in child.keywords:
-                            if keyword.arg == key:
-                                autotune_key_map[key] = keyword.value
-                                found_key = True
-                                break
-
-                        if not found_key:
-                            autotune_key_id = self._autotune_key_id
-                            assert autotune_key_id is not None
-                            assert key in autotune_key_id
-                            key_id = autotune_key_id[key]
-                            autotune_key_map[key] = child.args[key_id]
-
-                self.generate_function_meta()
-                return True
-        return False
-
-    def contains_lambda(self, node: ast.AST) -> bool:
-        for child in ast.walk(node):
-            if isinstance(child, ast.Lambda):
-                return True
-        return False
-
-    def _get_grid_name(self, node: ast.AST) -> Optional[str]:
-        for child in ast.walk(node):
-            if (
-                isinstance(child, ast.Call)
-                and isinstance(child.func, ast.Subscript)
-                # pyre-ignore[16]: ast.expr may have `id` attribute at runtime
-                and child.func.value.id == self._kernel_name
-            ):
-                # pyrefly: ignore [missing-attribute]
-                return child.func.slice.id
-        return None
-
-    def generate_so_loading_code(
-        self,
-        node: ast.AST,
-        abs_triton_aot_path: str,
-    ) -> str:
-        """Return auto-generated code to load the compiled kernel at runtime.
-
-        If *node* contains a call to this transformer's kernel, returns
-        ``import importlib.util`` + meta-module loading + ``torch.ops.load_library``
-        code.  Otherwise returns an empty string.
-
-        This method also sets up internal transformer state (grid name,
-        autotune key map, etc.) via ``contains_triton_call`` as a side effect.
-
-        Example for _addmm_fwd kernel:
-            kernel_dir = "triton_addmm__addmm_fwd"
-            meta_module_path = "/path/to/triton_aot_compile/triton_addmm__addmm_fwd/_addmm_fwd_meta.py"
-            so_path = "/path/to/triton_aot_compile/triton_addmm__addmm_fwd/addmm_fwd.so"
-        """
-        if not self.contains_triton_call(node):
-            return ""
-
-        kernel_dir = f"{_get_clean_module_basename(self._kernel_jit_fn.__module__)}_{self._kernel_name}"
-
-        meta_module_path = os.path.join(
-            abs_triton_aot_path, kernel_dir, f"{self._kernel_name}_meta.py"
-        )
-
-        so_path = os.path.join(
-            abs_triton_aot_path,
-            kernel_dir,
-            f"{self._kernel_name.lstrip('_')}.so",
-        )
-
-        return f"""
-# Auto-generated by triton_aot.kernel_wrapper_codegen
-import importlib.util
-_meta_spec = importlib.util.spec_from_file_location("{self._kernel_name}_meta", "{meta_module_path}")
-_meta_module = importlib.util.module_from_spec(_meta_spec)
-_meta_spec.loader.exec_module(_meta_module)
-{self._kernel_name}_meta = _meta_module.{self._kernel_name}_meta
-
-torch.ops.load_library("{so_path}")
-"""
-
-
-def _find_triton_aot_kernel(
-    node_target: Any,
-    kernel_specs: Dict[KernelInterface[List[Any]], List[Dict[str, List[Any]]]],
-) -> Optional[TritonAOT]:
-    """Find the single TritonAOT kernel referenced in a node target's globals.
-
-    Scans ``node_target.__globals__`` for ``TritonAOT`` instances, validates
-    that every instance appears in *kernel_specs*, and asserts at most one
-    kernel is present (per the one-kernel-per-wrapper invariant).
-
-    Returns the kernel, or ``None`` if the function references no kernels.
-    """
-    kernels: set[TritonAOT] = set()
-    for _, var in node_target.__globals__.items():
-        if isinstance(var, TritonAOT):
-            if var.fn in kernel_specs:
-                kernels.add(var)
-            else:
-                raise RuntimeError(
-                    f"Cannot find TritonAOT kernel {var.fn} in TRITON_AOT_KERNEL_SPECS"
-                )
-
-    if len(kernels) == 0:
-        return None
-
-    fn_name = node_target.__name__
-    assert len(kernels) == 1, (
-        f"Expected exactly 1 kernel per wrapper function '{fn_name}', "
-        f"got {len(kernels)}"
-    )
-    (kernel_obj,) = kernels
-    return kernel_obj
-
-
-def _generate_wrapper_files(
-    node_target: Any,
-    kernel: TritonAOT,
-    compile_path: str,
-    package_importer: Optional[package.PackageImporter],
-) -> None:
-    """Generate ``_original.py`` and ``_wrapper.py`` for a single kernel.
-
-    Creates a per-kernel subdirectory under *compile_path*, writes the
-    original function source, then AST-transforms the wrapper to replace
-    ``kernel[grid](...)`` with ``torch.ops.triton_aot.*`` calls.
-    """
-    fn_name = node_target.__name__
-
-    jit_fn = none_throws(
-        unwrap_heuristic(kernel, return_type=JITFunction),
-        f"Failed to unwrap kernel to JITFunction: {kernel}",
-    )
-    kernel_dir = (
-        f"{_get_clean_module_basename(jit_fn.__module__)}_{get_kernel_name(jit_fn)}"
-    )
-    output_dir = os.path.join(compile_path, kernel_dir)
-    os.makedirs(output_dir, exist_ok=True)
-
-    _, module_code, wrapper_code = _get_module_and_source(node_target, package_importer)
-    import_header = get_original_import_header(module_code)
-
-    with open(os.path.join(output_dir, f"{fn_name}_original.py"), "w") as f:
-        f.write(import_header)
-        f.write(wrapper_code)
-
-    # When source comes from a torch package, rewrite interned
-    # imports to use _package_importer, which
-    # is injected by replace_kernels at load time.  Must happen
-    # before auto-generated code is appended (so stdlib imports
-    # like ``import importlib.util`` are not touched).
-    if package_importer is not None:
-        import_header = rewrite_package_imports(import_header, package_importer)
-
-    tree = ast.parse(wrapper_code)
-    transformer = TritonAOTOperatorTransform(kernel=kernel)
-    import_header += transformer.generate_so_loading_code(tree, compile_path)
-    tree = transformer.visit(tree)
-
-    new_source_code = ast.unparse(tree)
-
-    with open(os.path.join(output_dir, f"{fn_name}_wrapper.py"), "w") as f:
-        f.write(import_header)
-        f.write(new_source_code)
-
-
-def kernel_wrapper_codegen(
-    module: GraphModule, packageImporter: package.PackageImporter | None = None
-) -> None:
-    """
-    Generate wrapper files for TritonAOT kernels.
-    Requirement: under wrapper.py, @triton.jit kernel/func is imported without 'as' alias.
-
-    For each function containing TritonAOT kernels, generates:
-    - {fn_name}_original.py: Original source code with imports
-    - {fn_name}_wrapper.py: Transformed wrapper that uses torch.ops.triton_aot
-    """
-    compile_path = get_aott_compile_path()
-    if not os.path.exists(compile_path):
-        os.makedirs(compile_path)
-
-    transformed_ops: set[Callable[..., Any]] = set()
-    kernel_specs = get_triton_aot_kernel_specs()
-    for node in module.graph.nodes:
-        if node.op == "call_function" and hasattr(node.target, "__globals__"):
-            if node.target not in transformed_ops:
-                transformed_ops.add(node.target)
-            else:
-                continue
-
-            kernel = _find_triton_aot_kernel(node.target, kernel_specs)
-            if kernel is not None:
-                _generate_wrapper_files(
-                    node.target, kernel, compile_path, packageImporter
-                )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py
deleted file mode 100644
index 53e5d83e9..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/replace_kernels.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files
-# (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, sublicense, and/or sell copies of the Software,
-# and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-#!/usr/bin/env python3
-
-# pyre-strict
-
-import importlib.util
-import logging
-import os
-import sys
-from typing import Any, Dict, Optional
-
-from generative_recommenders.ops.triton_aot.compile.compile_state import (
-    get_aott_compile_path,
-)
-from torch import package
-from torch.fx import GraphModule
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-def _find_wrapper_files(
-    compile_path: str,
-) -> list[tuple[str, str, str]]:
-    """Find all ``*_wrapper.py`` files under *compile_path*.
-
-    Walks one level deep into kernel subdirectories and returns a list of
-    ``(wrapper_name, fn_name, wrapper_path)`` tuples.
-    """
-    results: list[tuple[str, str, str]] = []
-    for dirpath, dirnames, filenames in os.walk(compile_path):
-        if dirpath != compile_path:
-            dirnames.clear()  # only recurse one level into kernel subdirs
-        for item in filenames:
-            if item.endswith("_wrapper.py"):
-                wrapper_name = item.removesuffix(".py")
-                fn_name = wrapper_name.removesuffix("_wrapper")
-                wrapper_path = os.path.join(dirpath, item)
-                results.append((wrapper_name, fn_name, wrapper_path))
-    return results
-
-
-def _load_wrapper_module(
-    wrapper_name: str,
-    fn_name: str,
-    wrapper_path: str,
-    package_importer: Optional[package.PackageImporter],
-) -> Optional[Any]:
-    """Dynamically import a single ``*_wrapper.py`` and return its wrapper callable.
-
-    Returns ``None`` if the module does not expose a function named *fn_name*.
-    """
-    spec = importlib.util.spec_from_file_location(wrapper_name, wrapper_path)
-    assert spec is not None, f"Failed to create spec for {wrapper_path}"
-    assert spec.loader is not None, f"Spec has no loader for {wrapper_path}"
-
-    loader = spec.loader
-    wrapper_module = importlib.util.module_from_spec(spec)
-
-    sys.modules[wrapper_name] = wrapper_module
-
-    if package_importer is not None:
-        wrapper_module._package_importer = package_importer  # type: ignore[attr-defined]
-
-    loader.exec_module(wrapper_module)
-
-    if hasattr(wrapper_module, fn_name):
-        return getattr(wrapper_module, fn_name)
-    return None
-
-
-def replace_kernels(
-    fx_m: GraphModule,
-    eager: bool = False,
-    package_importer: Optional[package.PackageImporter] = None,
-) -> GraphModule:
-    if eager:
-        raise NotImplementedError(
-            "Local generative_recommenders AOT-T transform does not support "
-            "eager replacement."
-        )
-
-    compile_path = get_aott_compile_path()
-    assert os.path.exists(compile_path), "triton_aot_compile dir does not exist"
-
-    wrapper_dict: Dict[str, Any] = {}
-    for wrapper_name, fn_name, wrapper_path in _find_wrapper_files(compile_path):
-        wrapper_fn = _load_wrapper_module(
-            wrapper_name, fn_name, wrapper_path, package_importer
-        )
-        if wrapper_fn is not None:
-            wrapper_dict[fn_name] = wrapper_fn
-
-    logger.info(f"replace_kernels: {wrapper_dict=}")
-
-    # Phase 2: Replace FX graph nodes
-    #   Walk the FX graph, find call_function nodes whose target name
-    #   matches a loaded wrapper, and swap the target so that
-    #   kernel[grid](...) calls become torch.ops.triton_aot.* calls.
-    replaced_count = 0
-    for nodes in fx_m.graph.nodes:
-        if nodes.op == "call_function" and nodes.target.__name__ in wrapper_dict.keys():
-            logger.info(
-                f"Replaced node: {nodes.op} {nodes.target} -> {wrapper_dict[nodes.target.__name__]} {nodes.meta}"
-            )
-            nodes.target = wrapper_dict[nodes.target.__name__]
-            replaced_count += 1
-
-    assert replaced_count > 0, (
-        f"No ops were replaced with triton_aot wrappers. "
-        f"wrapper_dict={wrapper_dict}, compile_path={compile_path}"
-    )
-    logger.info(
-        f"Successfully replaced {replaced_count} op(s) with triton_aot wrappers."
-    )
-
-    fx_m.recompile()
-    return fx_m
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py b/recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py
deleted file mode 100644
index c2be78989..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/transform/transform_kernels.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# pyre-strict
-
-from typing import Optional
-
-from generative_recommenders.ops.triton_aot.transform.kernel_wrapper_codegen import (
-    kernel_wrapper_codegen,
-)
-from generative_recommenders.ops.triton_aot.transform.replace_kernels import (
-    replace_kernels,
-)
-from torch import package
-from torch.fx import GraphModule
-
-
-def transform_kernels(
-    fx_m: GraphModule,
-    eager: bool = False,
-    package_importer: Optional[package.PackageImporter] = None,
-) -> GraphModule:
-    """Generate AOT wrappers and replace FX graph nodes in one step.
-
-    1. kernel_wrapper_codegen: AST-transforms wrapper functions,
-       rewrites kernel[grid](...) -> torch.ops.triton_aot.kernel(...),
-       writes {fn}_wrapper.py
-    2. replace_kernels: loads wrappers and replaces graph node targets
-    """
-    kernel_wrapper_codegen(fx_m, package_importer)
-    return replace_kernels(fx_m, eager=eager, package_importer=package_importer)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py
deleted file mode 100644
index b71ec8144..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_addmm.py
+++ /dev/null
@@ -1,347 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files
-# (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, sublicense, and/or sell copies of the Software,
-# and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-# pyre-strict
-# pyre-ignore-all-errors[2]: Triton has its own type system on func's input
-
-#!/usr/bin/env python3
-
-
-from typing import Any, List, Tuple
-
-import torch
-
-# @manual=//triton:triton
-import triton
-
-# @manual=//triton:triton
-import triton.language as tl
-from generative_recommenders.common import (
-    BACKEND_ALLOW_TF32,
-    cdiv,
-    should_trigger_eager_impl,
-)
-from generative_recommenders.ops.triton_aot.types import triton_aot
-
-
-def get_mm_configs() -> List[triton.Config]:
-    return [
-        triton.Config(
-            {
-                "BLOCK_M": 32,
-                "BLOCK_N": 64,
-                "BLOCK_K": 32,
-                "GROUP_M": 8,
-            },
-            num_stages=5,
-            num_warps=2,
-        ),
-        triton.Config(
-            {
-                "BLOCK_M": 128,
-                "BLOCK_N": 256,
-                "BLOCK_K": 64,
-                "GROUP_M": 8,
-            },
-            num_stages=3,
-            num_warps=8,
-        ),
-        triton.Config(
-            {
-                "BLOCK_M": 64,
-                "BLOCK_N": 256,
-                "BLOCK_K": 32,
-                "GROUP_M": 8,
-            },
-            num_stages=4,
-            num_warps=4,
-        ),
-        triton.Config(
-            {
-                "BLOCK_M": 128,
-                "BLOCK_N": 128,
-                "BLOCK_K": 32,
-                "GROUP_M": 8,
-            },
-            num_stages=4,
-            num_warps=4,
-        ),
-        triton.Config(
-            {
-                "BLOCK_M": 128,
-                "BLOCK_N": 64,
-                "BLOCK_K": 32,
-                "GROUP_M": 8,
-            },
-            num_stages=4,
-            num_warps=4,
-        ),
-        triton.Config(
-            {
-                "BLOCK_M": 64,
-                "BLOCK_N": 128,
-                "BLOCK_K": 32,
-                "GROUP_M": 8,
-            },
-            num_stages=4,
-            num_warps=4,
-        ),
-        triton.Config(
-            {
-                "BLOCK_M": 128,
-                "BLOCK_N": 32,
-                "BLOCK_K": 32,
-                "GROUP_M": 8,
-            },
-            num_stages=4,
-            num_warps=4,
-        ),
-        triton.Config(
-            {
-                "BLOCK_M": 64,
-                "BLOCK_N": 32,
-                "BLOCK_K": 32,
-                "GROUP_M": 8,
-            },
-            num_stages=5,
-            num_warps=2,
-        ),
-    ]
-
-
-@triton_aot(
-    annotations={
-        "M": "i32",
-        "N": ("i32", 16),
-        "K": ("i32", 16),
-        "stride_xm": ("i32", 16),
-        "stride_xk": ("i32", 1),
-        "stride_wk": ("i32", 16),
-        "stride_wn": ("i32", 1),
-        "stride_ym": ("i32", 16),
-        "stride_yn": ("i32", 1),
-        "stride_zm": ("i32", 16),
-        "stride_zn": ("i32", 1),
-    },
-)
-# pyre-ignore[56]: Pyre cannot infer triton.autotune decorator type
-@triton.autotune(
-    configs=get_mm_configs(),
-    key=["N", "K"],
-)
-@triton.jit
-def _addmm_fwd(
-    x_ptr,
-    w_ptr,
-    y_ptr,
-    z_ptr,
-    M,
-    N,
-    K,
-    stride_xm,
-    stride_xk,
-    stride_wk,
-    stride_wn,
-    stride_ym,
-    stride_yn,
-    stride_zm,
-    stride_zn,
-    BLOCK_M: tl.constexpr,
-    BLOCK_N: tl.constexpr,
-    BLOCK_K: tl.constexpr,
-    GROUP_M: tl.constexpr,
-    ALLOW_TF32: tl.constexpr,
-    BROADCAST_Y: tl.constexpr,
-) -> None:
-    pid_0, pid_1 = tl.program_id(axis=0), tl.program_id(axis=1)
-    pid = pid_0 * tl.num_programs(axis=1) + pid_1
-    num_pid_m = tl.cdiv(M, BLOCK_M)
-    num_pid_n = tl.cdiv(N, BLOCK_N)
-    num_pid_in_group = GROUP_M * num_pid_n
-    group_id = pid // num_pid_in_group
-    first_pid_m = group_id * GROUP_M
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_M)
-    pid_m = first_pid_m + (pid % group_size_m)
-    pid_n = (pid % num_pid_in_group) // group_size_m
-
-    offs_m = tl.arange(0, BLOCK_M)
-    offs_k = tl.arange(0, BLOCK_K)
-    offs_n = tl.arange(0, BLOCK_N)
-    mask_m = (pid_m * BLOCK_M + offs_m)[:, None] < M
-    mask_n = (pid_n * BLOCK_N + offs_n)[None, :] < N
-    x_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_xm
-    x_ptrs = x_ptr + (offs_m[:, None] * stride_xm + offs_k[None, :] * stride_xk)
-    w_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_wn
-    w_ptrs = w_ptr + (offs_k[:, None] * stride_wk + offs_n[None, :] * stride_wn)
-    accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
-    for k in range(0, tl.cdiv(K, BLOCK_K)):
-        mask_k = offs_k[None, :] < K - k * BLOCK_K
-        x = tl.load(x_ptrs, mask=mask_k & mask_m, other=0.0)
-        mask_k = offs_k[:, None] < K - k * BLOCK_K
-        w = tl.load(w_ptrs, mask=mask_k & mask_n, other=0.0)
-        accumulator += tl.dot(x, w, allow_tf32=ALLOW_TF32)
-        x_ptrs += BLOCK_K * stride_xk
-        w_ptrs += BLOCK_K * stride_wk
-
-    z_mask = mask_m & mask_n
-    if BROADCAST_Y:
-        # y is a vector, broadcast to add to z
-        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
-        y_ptrs = y_ptr + stride_yn * offs_n[None, :]
-        y = tl.load(y_ptrs, mask=mask_n)
-    else:
-        y_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_ym
-        y_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_yn
-        y_ptrs = y_ptr + stride_ym * offs_m[:, None] + stride_yn * offs_n[None, :]
-        y = tl.load(y_ptrs, mask=z_mask)
-    z = (accumulator + y.to(tl.float32)).to(z_ptr.dtype.element_ty)
-    z_ptr += pid_m.to(tl.int64) * BLOCK_M * stride_zm
-    z_ptr += pid_n.to(tl.int64) * BLOCK_N * stride_zn
-    z_ptrs = z_ptr + stride_zm * offs_m[:, None] + stride_zn * offs_n[None, :]
-    tl.store(z_ptrs, z, mask=z_mask)
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_addmm_fwd(
-    x: torch.Tensor,
-    w: torch.Tensor,
-    y: torch.Tensor,
-    allow_tf32: bool = BACKEND_ALLOW_TF32,
-) -> torch.Tensor:
-    M, K = x.shape
-    KB, N = w.shape
-    assert K == KB, f"incompatible dimensions {K}, {KB}"
-
-    is_y_1d = y.dim() == 1
-    NY = y.shape[0] if is_y_1d else y.shape[1]
-    assert N == NY, f"incompatible dimensions {N}, {NY}"
-
-    # Allocate output
-    z = torch.empty((M, N), device=x.device, dtype=x.dtype)
-    if M == 0 or N == 0:
-        return z
-
-    grid = lambda meta: (  # noqa E731
-        cdiv(M, meta["BLOCK_M"]),
-        cdiv(N, meta["BLOCK_N"]),
-    )
-
-    _addmm_fwd[grid](
-        x,
-        w,
-        y,
-        z,
-        M,
-        N,
-        K,
-        x.stride(0),
-        x.stride(1),
-        w.stride(0),
-        w.stride(1),
-        y.stride(0) if not is_y_1d else 0,
-        y.stride(1) if not is_y_1d else y.stride(0),
-        z.stride(0),
-        z.stride(1),
-        ALLOW_TF32=allow_tf32,
-        BROADCAST_Y=is_y_1d,
-    )
-    return z
-
-
-def _triton_aot_addmm_fwd_eager(
-    x: torch.Tensor,
-    w: torch.Tensor,
-    y: torch.Tensor,
-) -> torch.Tensor:
-    return torch.addmm(y, x, w)
-
-
-@torch.fx.wrap
-def _triton_aot_addmm_fwd_maybe_eager(
-    x: torch.Tensor,
-    w: torch.Tensor,
-    y: torch.Tensor,
-) -> torch.Tensor:
-    if torch.jit.is_scripting():
-        # call eager
-        return torch.addmm(y, x, w)
-    else:
-        return _triton_aot_addmm_fwd(x, w, y)
-
-
-def triton_addmm_bwd(
-    x: torch.Tensor,
-    w: torch.Tensor,
-    dz: torch.Tensor,
-    is_y_1d: bool,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    if is_y_1d:
-        dy = torch.sum(dz, dim=0)
-    else:
-        dy = dz
-    dw = torch.mm(x.t(), dz)
-    dx = torch.mm(dz, w.t())
-
-    return dx, dw, dy
-
-
-class _AddMmFunction(torch.autograd.Function):
-    @staticmethod
-    # pyre-ignore[14]: autograd.Function signature override
-    def forward(
-        ctx: Any,
-        x: torch.Tensor,
-        w: torch.Tensor,
-        y: torch.Tensor,
-    ) -> torch.Tensor:
-        ctx.save_for_backward(x, w)
-        ctx.is_y_1d = y.dim() == 1
-        return _triton_aot_addmm_fwd(x, w, y)
-
-    @staticmethod
-    # pyre-ignore[14]: autograd.Function signature override
-    def backward(
-        ctx: Any, dz: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        (x, w) = ctx.saved_tensors
-        return triton_addmm_bwd(x, w, dz, ctx.is_y_1d)
-
-
-def triton_addmm(
-    input: torch.Tensor,
-    mat1: torch.Tensor,
-    mat2: torch.Tensor,
-) -> torch.Tensor:
-    return _AddMmFunction.apply(mat1, mat2, input)
-
-
-@torch.fx.wrap
-def aot_triton_kernel_wrapper_addmm(
-    input: torch.Tensor,
-    mat1: torch.Tensor,
-    mat2: torch.Tensor,
-    allow_tf32: bool = BACKEND_ALLOW_TF32,
-) -> torch.Tensor:
-    if should_trigger_eager_impl():
-        return torch.addmm(input, mat1, mat2)
-    else:
-        return _triton_aot_addmm_fwd(mat1, mat2, input, allow_tf32=allow_tf32)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py
deleted file mode 100644
index 8afc3c18e..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_concat_2d_jagged.py
+++ /dev/null
@@ -1,183 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files
-# (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, sublicense, and/or sell copies of the Software,
-# and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-# pyre-strict
-
-from typing import Optional
-
-import torch
-from generative_recommenders.common import (
-    fx_unwrap_optional_tensor,
-    next_power_of_2,
-    should_trigger_eager_impl,
-)
-from generative_recommenders.ops.pytorch.pt_jagged import (
-    pytorch_replace_last_n_with_jagged,
-)
-from generative_recommenders.ops.pytorch.pt_jagged_tensors import (
-    pytorch_concat_2D_jagged,
-)
-from generative_recommenders.ops.triton.triton_jagged import concat_2D_jagged
-from generative_recommenders.ops.triton_aot.types import triton_aot
-
-
-concat_2D_jagged = triton_aot(
-    annotations={
-        "DenseSize": "i32",
-        "D": "i32",
-        "stride_ad": "i32",
-        "stride_bd": "i32",
-        "stride_dense_batch": "i32",
-        "stride_od": "i32",
-    },
-    # pyrefly: ignore [bad-argument-type]
-)(concat_2D_jagged)
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_concat_2D_jagged(
-    max_seq_len: int,
-    values_a: torch.Tensor,
-    values_b: torch.Tensor,
-    offsets_a: Optional[torch.Tensor] = None,
-    offsets_b: Optional[torch.Tensor] = None,
-    is_replace: bool = False,
-) -> torch.Tensor:
-    is_dense_a = offsets_a is None
-    is_dense_b = offsets_b is None
-
-    dense_size: int = 0
-    if is_dense_a:
-        B, dense_size, D = values_a.size()
-        offsets_b = fx_unwrap_optional_tensor(offsets_b)
-        jagged_seq_len, _ = values_b.shape
-        values_out = torch.empty(
-            (dense_size * B + jagged_seq_len, D),
-            device=values_b.device,
-            dtype=values_b.dtype,
-        )
-        offsets_a = offsets_b.new_empty(0)
-        stride_dense_batch = values_a.stride(0)
-    elif is_dense_b:
-        B, dense_size, D = values_b.size()
-        offsets_a = fx_unwrap_optional_tensor(offsets_a)
-        jagged_seq_len, _ = values_a.shape
-        values_out = torch.empty(
-            (jagged_seq_len + dense_size * B, D),
-            device=values_a.device,
-            dtype=values_a.dtype,
-        )
-        offsets_b = offsets_a.new_empty(0)
-        stride_dense_batch = values_b.stride(0)
-    else:
-        offsets_a = fx_unwrap_optional_tensor(offsets_a)
-        offsets_b = fx_unwrap_optional_tensor(offsets_b)
-        B = offsets_a.size(0) - 1
-        seq_len_a, D = values_a.shape
-        seq_len_b, _ = values_b.shape
-        if is_replace:
-            values_out = torch.empty_like(values_a)
-        else:
-            values_out = torch.empty(
-                (seq_len_a + seq_len_b, D), device=values_a.device, dtype=values_a.dtype
-            )
-        stride_dense_batch = 0
-
-    # Make sure offsets are alignted on 16-byte to match AOTT spec
-    if (
-        offsets_a is not None
-        and (offsets_a.storage_offset() * offsets_a.element_size()) % 16 != 0
-    ):
-        offsets_a = offsets_a.clone()
-    if (
-        offsets_b is not None
-        and (offsets_b.storage_offset() * offsets_b.element_size()) % 16 != 0
-    ):
-        offsets_b = offsets_b.clone()
-
-    BLOCK_D = next_power_of_2(D)
-
-    grid = (max_seq_len, B)
-    # pyrefly: ignore [not-callable]
-    concat_2D_jagged[grid](
-        OffsetsA=offsets_a,
-        ValuesA=values_a,
-        OffsetsB=offsets_b,
-        ValuesB=values_b,
-        DenseSize=dense_size,
-        Out=values_out,
-        D=D,
-        stride_ad=(values_a.stride(1) if is_dense_a else values_a.stride(0)),
-        stride_bd=(values_b.stride(1) if is_dense_b else values_b.stride(0)),
-        stride_dense_batch=stride_dense_batch,
-        stride_od=values_out.stride(0),
-        # pyrefly: ignore [bad-argument-type]
-        IS_DENSE_A=is_dense_a,
-        # pyrefly: ignore [bad-argument-type]
-        IS_DENSE_B=is_dense_b,
-        # pyrefly: ignore [bad-argument-type]
-        BLOCK_D=BLOCK_D,
-        # pyrefly: ignore [bad-argument-type]
-        IS_REPLACE=is_replace,
-    )
-    return values_out
-
-
-@torch.fx.wrap
-# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
-# AOT-T triton kernel wrapper functions. This is required for
-# AOT-T backend to recognize and trace correctly for ops transformation.
-def aot_triton_kernel_wrapper_concat_2D_jagged(
-    max_seq_len: int,
-    values_a: torch.Tensor,
-    values_b: torch.Tensor,
-    offsets_a: Optional[torch.Tensor] = None,
-    offsets_b: Optional[torch.Tensor] = None,
-    is_replace: bool = False,
-) -> torch.Tensor:
-    if should_trigger_eager_impl():
-        if is_replace:
-            assert offsets_a is not None and offsets_b is not None
-            return pytorch_replace_last_n_with_jagged(
-                max_seq_len_left=max_seq_len,
-                offsets_left=offsets_a,
-                values_left=values_a,
-                offsets_right=offsets_b,
-                values_right=values_b,
-            )
-        return pytorch_concat_2D_jagged(
-            values_left=values_a,
-            values_right=values_b,
-            max_len_left=max_seq_len if offsets_a is None else None,
-            max_len_right=max_seq_len if offsets_b is None else None,
-            offsets_left=offsets_a,
-            offsets_right=offsets_b,
-        )
-    else:
-        return _triton_aot_concat_2D_jagged(
-            max_seq_len=max_seq_len,
-            values_a=values_a,
-            values_b=values_b,
-            offsets_a=offsets_a,
-            offsets_b=offsets_b,
-            is_replace=is_replace,
-        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py
deleted file mode 100644
index 15c609a3c..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_group_norm_mul_dropout.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# pyre-strict
-
-import torch
-from generative_recommenders.common import next_power_of_2, should_trigger_eager_impl
-from generative_recommenders.ops.pytorch.pt_hstu_linear import pytorch_norm_mul_dropout
-from generative_recommenders.ops.triton.triton_hstu_linear import (
-    _group_norm_mul_dropout_fwd,
-)
-from generative_recommenders.ops.triton_aot.types import triton_aot
-
-_group_norm_mul_dropout_fwd = triton_aot(
-    annotations={
-        "D": ("i32", 16),
-        "eps": "fp32",
-        "seed": "i64",
-        "dropout_ratio": "fp32",
-        "stride_x": ("i32", 16),
-        "stride_u": ("i32", 16),
-        "stride_y": ("i32", 16),
-    },
-    # pyrefly: ignore [bad-argument-type]
-)(_group_norm_mul_dropout_fwd)
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_group_norm_mul_dropout(
-    x: torch.Tensor,
-    u: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float,
-    silu_u: bool,
-    concat_ux: bool,
-    num_heads: int,
-    linear_dim: int,
-) -> torch.Tensor:
-    x = x.contiguous()
-    u = u.contiguous()
-    N, _ = x.shape
-    if concat_ux:
-        y = torch.empty((N, 3 * num_heads * linear_dim), dtype=x.dtype, device=x.device)
-    else:
-        y = torch.empty((N, num_heads * linear_dim), dtype=x.dtype, device=x.device)
-    mean = torch.empty((N * num_heads,), dtype=x.dtype, device=x.device)
-    rstd = torch.empty((N * num_heads,), dtype=x.dtype, device=x.device)
-
-    BLOCK_D = next_power_of_2(linear_dim)
-    BLOCK_H = next_power_of_2(num_heads)
-
-    seed = 0
-    dropout_ratio = 0.0
-
-    grid = (N,)
-    # pyrefly: ignore [not-callable]
-    _group_norm_mul_dropout_fwd[grid](
-        x,  # X
-        u,  # U
-        y,  # Y
-        weight,  # W
-        bias,  # B
-        mean,  # Mean
-        rstd,  # Rstd
-        linear_dim,  # D
-        num_heads,  # Heads
-        eps,  # eps
-        seed,  # seed
-        dropout_ratio,  # dropout_ratio
-        x.stride(0),  # stride_x
-        u.stride(0),  # stride_u
-        y.stride(0),  # stride_y
-        # pyrefly: ignore [bad-argument-type]
-        SILU_U=silu_u,
-        # pyrefly: ignore [bad-argument-type]
-        BLOCK_D=BLOCK_D,
-        # pyrefly: ignore [bad-argument-type]
-        BLOCK_H=BLOCK_H,
-        # pyrefly: ignore [bad-argument-type]
-        TRAINING=False,
-        # pyrefly: ignore [bad-argument-type]
-        CONCAT_UX=concat_ux,
-    )
-    return y
-
-
-@torch.fx.wrap
-def aot_triton_kernel_wrapper_group_norm_mul_dropout(
-    x: torch.Tensor,
-    u: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float,
-    silu_u: bool,
-    concat_ux: bool,
-    num_heads: int,
-    linear_dim: int,
-) -> torch.Tensor:
-    if should_trigger_eager_impl():
-        return pytorch_norm_mul_dropout(
-            x=x,
-            u=u,
-            weight=weight,
-            bias=bias,
-            eps=eps,
-            dropout_ratio=0.0,
-            training=False,
-            silu_u=silu_u,
-            concat_u=concat_ux,
-            concat_x=concat_ux,
-            group_norm=True,
-            num_heads=num_heads,
-            linear_dim=linear_dim,
-        )
-    return _triton_aot_group_norm_mul_dropout(
-        x=x,
-        u=u,
-        weight=weight,
-        bias=bias,
-        eps=eps,
-        silu_u=silu_u,
-        concat_ux=concat_ux,
-        num_heads=num_heads,
-        linear_dim=linear_dim,
-    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py
deleted file mode 100644
index c5339033f..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files
-# (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, sublicense, and/or sell copies of the Software,
-# and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-# pyre-strict
-
-#!/usr/bin/env python3
-
-import torch
-from generative_recommenders.common import (
-    cdiv,
-    next_power_of_2,
-    should_trigger_eager_impl,
-    switch_to_contiguous_if_needed,
-)
-from generative_recommenders.ops.pytorch.pt_layer_norm import (
-    pytorch_layer_norm,
-    pytorch_swish_layer_norm,
-)
-from generative_recommenders.ops.triton.triton_layer_norm import (
-    _weighted_layer_norm_fwd,
-)
-from generative_recommenders.ops.triton_aot.types import triton_aot
-
-
-_weighted_layer_norm_fwd = triton_aot(
-    annotations={
-        "N": "i32",
-        "D": ("i32", 16),
-        "stride_x": ("i32", 16),
-        "stride_y": ("i32", 16),
-    },
-)(_weighted_layer_norm_fwd)
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_swish_layer_norm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float,
-    is_swish: bool,
-) -> torch.Tensor:
-    assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
-    x = switch_to_contiguous_if_needed(x)
-    N, D = x.shape
-
-    assert weight.dim() == 1
-    assert bias.dim() == 1
-    assert weight.numel() == D
-    assert bias.numel() == D
-
-    y = torch.empty_like(x)
-
-    BLOCK_D = next_power_of_2(D)
-
-    grid = lambda meta: (  # noqa E731
-        cdiv(N, meta["BLOCK_N"]),
-    )
-    # pyrefly: ignore [not-callable]
-    _weighted_layer_norm_fwd[grid](
-        x,
-        y,
-        weight,
-        bias,
-        torch.empty(0, dtype=torch.float32),
-        torch.empty(0, dtype=torch.float32),
-        N,
-        D,
-        eps,
-        stride_x=x.stride(0),
-        stride_y=y.stride(0),
-        IS_SWISH=is_swish,
-        TRAINING=False,
-        BLOCK_D=BLOCK_D,
-        COMPUTE_MEAN_AND_RSTD=True,
-    )
-
-    return y
-
-
-@torch.fx.wrap
-# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
-# AOT-T triton kernel wrapper functions. This is required for
-# AOT-T backend to recognize and trace correctly for ops transformation.
-def aot_triton_kernel_wrapper_swish_layer_norm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float,
-    is_swish: bool,
-) -> torch.Tensor:
-    if should_trigger_eager_impl():
-        if is_swish:
-            return pytorch_swish_layer_norm(x, [x.shape[1]], weight, bias, eps).to(
-                x.dtype
-            )
-        else:
-            return pytorch_layer_norm(x, [x.shape[1]], weight, bias, eps).to(x.dtype)
-    else:
-        return _triton_aot_swish_layer_norm(x, weight, bias, eps, is_swish)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py
deleted file mode 100644
index 7f7a6c743..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_layer_norm_mul_dropout.py
+++ /dev/null
@@ -1,162 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files
-# (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, sublicense, and/or sell copies of the Software,
-# and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-# pyre-strict
-
-#!/usr/bin/env python3
-
-
-import torch
-from generative_recommenders.common import next_power_of_2, should_trigger_eager_impl
-from generative_recommenders.ops.pytorch.pt_hstu_linear import pytorch_norm_mul_dropout
-from generative_recommenders.ops.triton.triton_hstu_linear import _ln_mul_dropout_fwd
-from generative_recommenders.ops.triton_aot.types import triton_aot
-
-_ln_mul_dropout_fwd = triton_aot(
-    annotations={
-        "D": ("i32", 16),
-        "stride_x": ("i32", 16),
-        "stride_u": ("i32", 16),
-        "stride_y": ("i32", 16),
-    },
-    # pyrefly: ignore [bad-argument-type]
-)(_ln_mul_dropout_fwd)
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_layer_norm_mul_dropout(
-    x: torch.Tensor,
-    u: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float,
-    dropout_ratio: float,
-    training: bool,
-    silu_u: bool,
-    concat_ux: bool,
-    mul_u_activation_type: str,
-) -> torch.Tensor:
-    assert x.dim() == 2
-    if x.stride(1) != 1:
-        x = x.contiguous()
-    N, D = x.shape
-    assert weight.dim() == 1
-    assert bias.dim() == 1
-    assert weight.numel() == D
-    assert bias.numel() == D
-
-    if concat_ux:
-        y = torch.empty((N, 3 * D), dtype=x.dtype, device=x.device)
-    else:
-        y = torch.empty_like(x)
-    if N == 0:
-        return y
-    mean = x.new_empty((N,))
-    rstd = x.new_empty((N,))
-
-    # Less than 64KB per feature: enqueue fused kernel
-    MAX_FUSED_SIZE = 65536 // x.element_size()
-    BLOCK_D = min(MAX_FUSED_SIZE, next_power_of_2(D))
-    if D > BLOCK_D:
-        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
-
-    seed = 0
-    # num_warps = min(max(BLOCK_D // 256, 1), 8)
-    grid = (N,)
-    # pyrefly: ignore [not-callable]
-    _ln_mul_dropout_fwd[grid](
-        x,
-        u,
-        y,
-        weight,
-        bias,
-        mean,
-        rstd,
-        D,
-        eps,
-        seed,
-        dropout_ratio,
-        x.stride(0),
-        u.stride(0),
-        y.stride(0),
-        # pyrefly: ignore [bad-argument-type]
-        SILU_U=silu_u,
-        # pyrefly: ignore [bad-argument-type]
-        BLOCK_D=BLOCK_D,
-        # pyrefly: ignore [bad-argument-type]
-        TRAINING=training,
-        # pyrefly: ignore [bad-argument-type]
-        CONCAT_U=concat_ux,
-        # pyrefly: ignore [bad-argument-type]
-        CONCAT_X=concat_ux,
-        # pyrefly: ignore [bad-argument-type]
-        MUL_U_ACTIVATION_TYPE=mul_u_activation_type,
-        # pyrefly: ignore [bad-argument-type]
-        FAST_DROPOUT=False,
-    )
-    return y
-
-
-@torch.fx.wrap
-# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
-# AOT-T triton kernel wrapper functions. This is required for
-# AOT-T backend to recognize and trace correctly for ops transformation.
-def aot_triton_kernel_wrapper_layer_norm_mul_dropout(
-    x: torch.Tensor,
-    u: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float,
-    dropout_ratio: float,
-    training: bool,
-    silu_u: bool,
-    concat_ux: bool,
-    mul_u_activation_type: str,
-) -> torch.Tensor:
-    if should_trigger_eager_impl():
-        return pytorch_norm_mul_dropout(
-            x=x,
-            u=u,
-            weight=weight,
-            bias=bias,
-            eps=eps,
-            dropout_ratio=dropout_ratio,
-            training=training,
-            silu_u=silu_u,
-            concat_u=concat_ux,
-            concat_x=concat_ux,
-            mul_u_activation_type=mul_u_activation_type,
-            group_norm=False,
-        )
-    else:
-        return _triton_aot_layer_norm_mul_dropout(
-            x=x,
-            u=u,
-            weight=weight,
-            bias=bias,
-            eps=eps,
-            dropout_ratio=dropout_ratio,
-            training=training,
-            silu_u=silu_u,
-            concat_ux=concat_ux,
-            mul_u_activation_type=mul_u_activation_type,
-        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py
deleted file mode 100644
index 828a4f4d4..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_position.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-
-# pyre-strict
-
-from typing import Optional
-
-import torch
-from generative_recommenders.common import (
-    cdiv,
-    fx_unwrap_optional_tensor,
-    next_power_of_2,
-    prev_power_of_2,
-    should_trigger_eager_impl,
-)
-from generative_recommenders.ops.pytorch.pt_position import (
-    pytorch_add_timestamp_positional_embeddings,
-)
-from generative_recommenders.ops.triton.triton_position import (
-    _add_timestamp_position_embeddings_kernel,
-)
-from generative_recommenders.ops.triton_aot.types import triton_aot
-
-
-_add_timestamp_position_embeddings_kernel = triton_aot(
-    annotations={
-        "SeqEmb": ("*bf16", 16),
-        "Offsets": ("*i64", 16),
-        "Lengths": ("*i64", 16),
-        "PosEmb": ("*fp32", 16),
-        "TsEmb": ("*fp32", 16),
-        "Out": ("*bf16", 16),
-        "TS": ("*i64", 16),
-        "PosInds": ("*i32", 16),
-        "TsInds": ("*i32", 16),
-        "NumTargets": ("*i64", 16),
-        "AUTOTUNE_MAX_SEQ_LEN": "i32",
-        "D": "i32",
-        "num_time_buckets": "i32",
-        "time_bucket_increments": "fp32",
-        "time_bucket_scale": "fp32",
-        "time_delta": "i32",
-        "max_contextual_seq_len": "i32",
-        "max_pos_ind": "i32",
-        "stride_sn": ("i32", 16),
-        "stride_pn": ("i32", 16),
-        "stride_tn": ("i32", 16),
-        "stride_on": ("i32", 16),
-    },
-)(_add_timestamp_position_embeddings_kernel)
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_position(
-    seq_embeddings: torch.Tensor,
-    seq_offsets: torch.Tensor,
-    pos_embeddings: torch.Tensor,
-    ts_embeddings: torch.Tensor,
-    timestamps: torch.Tensor,
-    max_seq_len: int,
-    max_contextual_seq_len: int,
-    seq_lengths: torch.Tensor,
-    num_targets: Optional[torch.Tensor],
-    interleave_targets: bool,
-    time_bucket_fn: str,
-) -> torch.Tensor:
-    has_multiple_targets = num_targets is not None
-    if not has_multiple_targets:
-        num_targets_resolved = torch.empty(
-            0, dtype=torch.int64, device=seq_embeddings.device
-        )
-    else:
-        num_targets_resolved = fx_unwrap_optional_tensor(num_targets).to(torch.int64)
-
-    seq_embeddings = seq_embeddings.contiguous()
-    pos_embeddings = pos_embeddings.contiguous()
-    ts_embeddings = ts_embeddings.contiguous()
-
-    max_pos_ind = pos_embeddings.shape[0]
-    B = seq_lengths.shape[0]
-
-    N, D = seq_embeddings.shape
-    out = torch.empty_like(seq_embeddings)
-
-    timestamps = timestamps.contiguous()
-    ts_inds = torch.empty((N,), device=timestamps.device, dtype=torch.int32)
-    pos_inds = torch.empty((N,), device=timestamps.device, dtype=torch.int32)
-
-    autotune_max_seq_len = prev_power_of_2(max_seq_len)
-    BLOCK_D = next_power_of_2(D) if D < 64 else 64
-
-    grid = lambda meta: (  # noqa E731
-        B,
-        cdiv(max_seq_len, meta["BLOCK_N"]),
-    )
-    # pyrefly: ignore [not-callable]
-    _add_timestamp_position_embeddings_kernel[grid](
-        SeqEmb=seq_embeddings,
-        Offsets=seq_offsets,
-        Lengths=seq_lengths,
-        PosEmb=pos_embeddings,
-        TsEmb=ts_embeddings,
-        Out=out,
-        TS=timestamps,
-        PosInds=pos_inds,
-        TsInds=ts_inds,
-        NumTargets=num_targets_resolved,
-        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len,
-        D=D,
-        num_time_buckets=2048,
-        time_bucket_increments=60.0,
-        time_bucket_scale=1.0,
-        time_delta=0,
-        max_contextual_seq_len=max_contextual_seq_len,
-        max_pos_ind=max_pos_ind,
-        stride_sn=seq_embeddings.stride(0),
-        stride_pn=pos_embeddings.stride(0),
-        stride_tn=ts_embeddings.stride(0),
-        stride_on=out.stride(0),
-        TRAINING=False,
-        HAS_MULTIPLE_TARGETS=has_multiple_targets,
-        INTERLEAVE_TARGETS=interleave_targets,
-        TIME_BUCKET_FN=time_bucket_fn,
-        BLOCK_D=BLOCK_D,
-    )
-
-    return out
-
-
-@torch.fx.wrap
-# "aot_triton_kernel_wrapper_" is a pre-defined prefix for
-# AOT-T triton kernel wrapper functions. This is required for
-# AOT-T backend to recognize and trace correctly for ops transformation.
-def aot_triton_kernel_wrapper_position(
-    alpha: float,
-    max_seq_len: int,
-    max_contextual_seq_len: int,
-    position_embeddings_weight: torch.Tensor,
-    timestamp_embeddings_weight: torch.Tensor,
-    seq_offsets: torch.Tensor,
-    seq_lengths: torch.Tensor,
-    seq_embeddings: torch.Tensor,
-    timestamps: torch.Tensor,
-    num_targets: Optional[torch.Tensor],
-    interleave_targets: bool,
-    time_bucket_fn: str,
-) -> torch.Tensor:
-    seq_embeddings = seq_embeddings * alpha
-    if should_trigger_eager_impl():
-        return pytorch_add_timestamp_positional_embeddings(
-            seq_embeddings=seq_embeddings,
-            seq_offsets=seq_offsets,
-            pos_embeddings=position_embeddings_weight,
-            ts_embeddings=timestamp_embeddings_weight,
-            timestamps=timestamps,
-            max_seq_len=max_seq_len,
-            max_contextual_seq_len=max_contextual_seq_len,
-            seq_lengths=seq_lengths,
-            num_targets=num_targets,
-            interleave_targets=interleave_targets,
-            time_bucket_fn=time_bucket_fn,
-        )
-    else:
-        return _triton_aot_position(
-            seq_embeddings=seq_embeddings,
-            seq_offsets=seq_offsets,
-            pos_embeddings=position_embeddings_weight,
-            ts_embeddings=timestamp_embeddings_weight,
-            timestamps=timestamps,
-            max_seq_len=max_seq_len,
-            max_contextual_seq_len=max_contextual_seq_len,
-            seq_lengths=seq_lengths,
-            num_targets=num_targets,
-            interleave_targets=interleave_targets,
-            time_bucket_fn=time_bucket_fn,
-        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py
deleted file mode 100644
index 4fa11dddc..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_ragged_hstu_attention.py
+++ /dev/null
@@ -1,366 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files
-# (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, sublicense, and/or sell copies of the Software,
-# and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-# pyre-strict
-
-#!/usr/bin/env python3
-
-from typing import Optional
-
-import torch
-from generative_recommenders.common import (
-    autotune_max_seq_len,
-    BACKEND_ALLOW_TF32,
-    cdiv,
-    prev_power_of_2,
-    should_trigger_eager_impl,
-)
-from generative_recommenders.ops.pytorch.pt_hstu_attention import (
-    pytorch_cached_hstu_mha,
-    pytorch_hstu_mha,
-)
-from generative_recommenders.ops.triton.triton_hstu_attention import _hstu_attn_fwd
-from generative_recommenders.ops.triton_aot.types import triton_aot
-
-
-for _config in _hstu_attn_fwd.configs:
-    if isinstance(_config.kwargs.get("USE_TLX"), bool):
-        _config.kwargs["USE_TLX"] = int(_config.kwargs["USE_TLX"])
-
-
-_hstu_attn_fwd = triton_aot(
-    annotations={
-        "stride_qm": ("i32", 16),
-        "stride_qh": ("i32", 16),
-        "stride_kn": ("i32", 16),
-        "stride_kh": ("i32", 16),
-        "stride_vn": ("i32", 16),
-        "stride_vh": ("i32", 16),
-        "stride_om": ("i32", 16),
-        "stride_oh": ("i32", 16),
-        "contextual_seq_len": "i32",
-        "max_attn_len": "i32",
-        "Z": "i32",
-        "AUTOTUNE_Z": "i32",
-        "H": "i32",
-        "MAX_SEQ_LEN": "i32",
-        "AUTOTUNE_MAX_SEQ_LEN": "i32",
-        "DimQ": "i32",
-        "DimV": "i32",
-        "DeltaSize": "i32",
-        "workspace_ptr": "*i8",
-        "sort_by_length_indices": "*i64",
-    }
-)(_hstu_attn_fwd)
-
-
-def _check_common_args(
-    invalid_attn_mask_type: str,
-    attn_scale: Optional[torch.Tensor],
-    full_attn_size: int,
-    num_softmax_heads: int,
-) -> None:
-    assert invalid_attn_mask_type in ("causal", "lower_triangular"), (
-        f"unsupported invalid_attn_mask_type: {invalid_attn_mask_type}"
-    )
-    assert attn_scale is None, "attn_scale is not implemented for AOT-T HSTU MHA"
-    assert full_attn_size == 0, "full_attn_size is not implemented for AOT-T HSTU MHA"
-    assert num_softmax_heads == 0, (
-        "num_softmax_heads is not implemented for AOT-T HSTU MHA"
-    )
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_ragged_hstu_mha(
-    N: int,
-    alpha: float,
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    seq_offsets: torch.Tensor,
-    invalid_attn_mask_type: str,
-    num_targets: Optional[torch.Tensor],
-    attn_scale: Optional[torch.Tensor],
-    max_attn_len: int,
-    contextual_seq_len: int,
-    full_attn_size: int,
-    num_softmax_heads: int = 0,
-    allow_tf32: bool = BACKEND_ALLOW_TF32,
-) -> torch.Tensor:
-    assert invalid_attn_mask_type in ("causal", "lower_triangular"), (
-        f"unsupported invalid_attn_mask_type: {invalid_attn_mask_type}"
-    )
-    assert attn_scale is None, "attn_scale is not implemented for AOT-T HSTU MHA"
-    assert full_attn_size == 0, "full_attn_size is not implemented for AOT-T HSTU MHA"
-    assert num_softmax_heads == 0, (
-        "num_softmax_heads is not implemented for AOT-T HSTU MHA"
-    )
-    Z = seq_offsets.numel() - 1
-    L, H, DimQ = q.shape
-    DimV = v.shape[2]
-
-    q = q.contiguous()
-    k = k.contiguous()
-    v = v.contiguous()
-    seq_offsets = seq_offsets.contiguous()
-
-    out = torch.empty_like(v)
-    if L == 0:
-        return out
-    workspace = torch.empty(0, dtype=torch.int8, device=q.device)
-    sort_by_length_indices = torch.empty(
-        0, dtype=torch.int64, device=seq_offsets.device
-    )
-
-    grid = lambda meta: (  # noqa E731
-        cdiv(N, meta["BLOCK_M"]),
-        Z * H,
-    )
-    # pyrefly: ignore [not-callable]
-    _hstu_attn_fwd[grid](
-        Q=q,
-        K=k,
-        V=v,
-        workspace_ptr=workspace,
-        sort_by_length_indices=sort_by_length_indices,
-        seq_offsets=seq_offsets,
-        num_targets=num_targets,
-        Out=out,
-        stride_qm=q.stride(0),
-        stride_qh=q.stride(1),
-        stride_kn=k.stride(0),
-        stride_kh=k.stride(1),
-        stride_vn=v.stride(0),
-        stride_vh=v.stride(1),
-        stride_om=out.stride(0),
-        stride_oh=out.stride(1),
-        alpha=alpha,
-        contextual_seq_len=contextual_seq_len,
-        max_attn_len=max_attn_len,
-        Z=Z,
-        AUTOTUNE_Z=prev_power_of_2(Z),
-        H=H,
-        MAX_SEQ_LEN=N,
-        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
-        DimQ=DimQ,
-        DimV=DimV,
-        DeltaSize=0,
-        HAS_MULTIPLE_TARGETS=num_targets is not None,
-        IS_DELTA_Q=False,
-        ALLOW_TF32=allow_tf32,
-        BLOCK_D_Q=DimQ,
-        BLOCK_D_V=DimV,
-        HAS_CONTEXTUAL_SEQ_LEN=contextual_seq_len > 0,
-        HAS_MAX_ATTN_LEN=max_attn_len > 0,
-        HAS_SORT_BY_LENGTH_INDICES=False,
-        ENABLE_TMA=False,
-        TMA_DESC_SIZE=128,
-    )
-    return out
-
-
-@torch.fx.wrap
-def aot_triton_kernel_wrapper_ragged_hstu_mha(
-    N: int,
-    alpha: float,
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    seq_offsets: torch.Tensor,
-    invalid_attn_mask_type: str,
-    num_targets: Optional[torch.Tensor],
-    attn_scale: Optional[torch.Tensor],
-    max_attn_len: int,
-    contextual_seq_len: int,
-    full_attn_size: int,
-    num_softmax_heads: int,
-    allow_tf32: bool = BACKEND_ALLOW_TF32,
-) -> torch.Tensor:
-    _check_common_args(
-        invalid_attn_mask_type=invalid_attn_mask_type,
-        attn_scale=attn_scale,
-        full_attn_size=full_attn_size,
-        num_softmax_heads=num_softmax_heads,
-    )
-    if should_trigger_eager_impl():
-        return pytorch_hstu_mha(
-            max_seq_len=N,
-            alpha=alpha,
-            q=q,
-            k=k,
-            v=v,
-            seq_offsets=seq_offsets,
-            causal=True,
-            dropout_pr=0.0,
-            training=False,
-            num_targets=num_targets,
-            attn_scale=attn_scale,
-            max_attn_len=max_attn_len,
-            contextual_seq_len=contextual_seq_len,
-            min_full_attn_seq_len=full_attn_size,
-        )
-    return _triton_aot_ragged_hstu_mha(
-        N=N,
-        alpha=alpha,
-        q=q,
-        k=k,
-        v=v,
-        seq_offsets=seq_offsets,
-        invalid_attn_mask_type=invalid_attn_mask_type,
-        num_targets=num_targets,
-        attn_scale=attn_scale,
-        max_attn_len=max_attn_len,
-        contextual_seq_len=contextual_seq_len,
-        full_attn_size=full_attn_size,
-        num_softmax_heads=num_softmax_heads,
-        allow_tf32=allow_tf32,
-    )
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_cached_hstu_mha(
-    N: int,
-    alpha: float,
-    delta_q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    delta_x_offsets: torch.Tensor,
-    seq_offsets: torch.Tensor,
-    num_targets: Optional[torch.Tensor],
-    attn_scale: Optional[torch.Tensor],
-    max_attn_len: int,
-    full_attn_size: int,
-    allow_tf32: bool = BACKEND_ALLOW_TF32,
-) -> torch.Tensor:
-    assert attn_scale is None, "attn_scale is not implemented for AOT-T HSTU MHA"
-    assert full_attn_size == 0, "full_attn_size is not implemented for AOT-T HSTU MHA"
-    Z = seq_offsets.size(0) - 1
-    DELTA_L, H, DimQ = delta_q.shape
-    DeltaSize = DELTA_L // Z
-    DimV = v.shape[2]
-
-    delta_q = delta_q.contiguous()
-    k = k.contiguous()
-    v = v.contiguous()
-    seq_offsets = seq_offsets.contiguous()
-
-    out = torch.empty((DELTA_L, H, DimV), dtype=delta_q.dtype, device=delta_q.device)
-    if DELTA_L == 0:
-        return out
-    workspace = torch.empty(0, dtype=torch.int8, device=delta_q.device)
-    sort_by_length_indices = torch.empty(
-        0, dtype=torch.int64, device=seq_offsets.device
-    )
-
-    grid = lambda meta: (  # noqa E731
-        cdiv(DeltaSize, meta["BLOCK_M"]),
-        Z * H,
-    )
-    # pyrefly: ignore [not-callable]
-    _hstu_attn_fwd[grid](
-        Q=delta_q,
-        K=k,
-        V=v,
-        workspace_ptr=workspace,
-        sort_by_length_indices=sort_by_length_indices,
-        seq_offsets=seq_offsets,
-        num_targets=num_targets,
-        Out=out,
-        stride_qm=delta_q.stride(0),
-        stride_qh=delta_q.stride(1),
-        stride_kn=k.stride(0),
-        stride_kh=k.stride(1),
-        stride_vn=v.stride(0),
-        stride_vh=v.stride(1),
-        stride_om=out.stride(0),
-        stride_oh=out.stride(1),
-        alpha=alpha,
-        contextual_seq_len=0,
-        max_attn_len=max_attn_len,
-        Z=Z,
-        AUTOTUNE_Z=prev_power_of_2(Z),
-        H=H,
-        MAX_SEQ_LEN=N,
-        AUTOTUNE_MAX_SEQ_LEN=autotune_max_seq_len(N),
-        DimQ=DimQ,
-        DimV=DimV,
-        DeltaSize=DeltaSize,
-        HAS_MULTIPLE_TARGETS=num_targets is not None,
-        IS_DELTA_Q=True,
-        ALLOW_TF32=allow_tf32,
-        BLOCK_D_Q=DimQ,
-        BLOCK_D_V=DimV,
-        HAS_CONTEXTUAL_SEQ_LEN=False,
-        HAS_MAX_ATTN_LEN=max_attn_len > 0,
-        HAS_SORT_BY_LENGTH_INDICES=False,
-        ENABLE_TMA=False,
-        TMA_DESC_SIZE=128,
-    )
-    return out
-
-
-@torch.fx.wrap
-def aot_triton_kernel_wrapper_cached_hstu_mha(
-    N: int,
-    alpha: float,
-    delta_q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    delta_x_offsets: torch.Tensor,
-    seq_offsets: torch.Tensor,
-    num_targets: Optional[torch.Tensor],
-    attn_scale: Optional[torch.Tensor],
-    max_attn_len: int,
-    full_attn_size: int,
-) -> torch.Tensor:
-    _check_common_args(
-        invalid_attn_mask_type="causal",
-        attn_scale=attn_scale,
-        full_attn_size=full_attn_size,
-        num_softmax_heads=0,
-    )
-    if should_trigger_eager_impl():
-        return pytorch_cached_hstu_mha(
-            max_seq_len=N,
-            alpha=alpha,
-            delta_q=delta_q,
-            k=k,
-            v=v,
-            seq_offsets=seq_offsets,
-            num_targets=num_targets,
-            max_attn_len=max_attn_len,
-            contextual_seq_len=0,
-        )
-    return _triton_aot_cached_hstu_mha(
-        N=N,
-        alpha=alpha,
-        delta_q=delta_q,
-        k=k,
-        v=v,
-        delta_x_offsets=delta_x_offsets,
-        seq_offsets=seq_offsets,
-        num_targets=num_targets,
-        attn_scale=attn_scale,
-        max_attn_len=max_attn_len,
-        full_attn_size=full_attn_size,
-    )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py
deleted file mode 100644
index e5d9e093e..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_rms_norm.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files
-# (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge,
-# publish, distribute, sublicense, and/or sell copies of the Software,
-# and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be
-# included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-# IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-# pyre-strict
-
-#!/usr/bin/env python3
-
-import torch
-from generative_recommenders.common import (
-    cdiv,
-    next_power_of_2,
-    should_trigger_eager_impl,
-    switch_to_contiguous_if_needed,
-)
-from generative_recommenders.ops.pytorch.pt_layer_norm import pytorch_rms_norm
-from generative_recommenders.ops.triton.triton_layer_norm import _weighted_rms_norm_fwd
-from generative_recommenders.ops.triton_aot.types import triton_aot
-
-_weighted_rms_norm_fwd = triton_aot(
-    annotations={
-        "N": "i32",
-        "D": ("i32", 16),
-        "stride_x": ("i32", 16),
-        "stride_y": ("i32", 16),
-    },
-)(_weighted_rms_norm_fwd)
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_rms_norm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    eps: float,
-    silu: bool,
-) -> torch.Tensor:
-    """Internal AOTT kernel function for RMS norm."""
-    assert x.dim() == 2, f"x.dim() == {x.dim()}, expected 2"
-    x = switch_to_contiguous_if_needed(x)
-    N, D = x.shape
-
-    assert weight.dim() == 1
-    assert weight.numel() == D
-
-    y = torch.empty_like(x)
-    rstd = torch.empty(N, dtype=torch.float32, device=x.device)
-
-    BLOCK_D = next_power_of_2(D)
-
-    grid = lambda meta: (  # noqa E731
-        cdiv(N, meta["BLOCK_N"]),
-    )
-    # pyrefly: ignore [not-callable]
-    _weighted_rms_norm_fwd[grid](
-        x,
-        y,
-        weight,
-        rstd,
-        N,
-        D,
-        eps,
-        stride_x=x.stride(0),
-        stride_y=y.stride(0),
-        SILU=silu,
-        BLOCK_D=BLOCK_D,
-    )
-
-    return y
-
-
-def _pytorch_rms_norm_fallback(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    eps: float,
-    silu: bool,
-) -> torch.Tensor:
-    """PyTorch fallback for RMS norm in eager mode."""
-
-    return pytorch_rms_norm(x, [x.shape[-1]], weight, eps, silu)
-
-
-@torch.fx.wrap
-def aot_triton_kernel_wrapper_rms_norm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    eps: float,
-    silu: bool,
-) -> torch.Tensor:
-    """AOT-T wrapper for RMS norm.
-
-    Routes between PyTorch fallback (for tracing/serialization) and AOTT kernel path.
-    """
-    if should_trigger_eager_impl():
-        return _pytorch_rms_norm_fallback(x, weight, eps, silu)
-    else:
-        return _triton_aot_rms_norm(x, weight, eps, silu)
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py b/recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py
deleted file mode 100644
index 9aa0c655b..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/triton_split_2d_jagged.py
+++ /dev/null
@@ -1,138 +0,0 @@
-# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
-
-# pyre-strict
-
-from typing import Optional, Tuple
-
-import torch
-from generative_recommenders.common import (
-    fx_unwrap_optional_tensor,
-    next_power_of_2,
-    should_trigger_eager_impl,
-)
-from generative_recommenders.ops.pytorch.pt_jagged_tensors import (
-    pytorch_split_2D_jagged,
-)
-from generative_recommenders.ops.triton.triton_jagged import split_2D_jagged
-from generative_recommenders.ops.triton_aot.types import triton_aot
-
-
-split_2D_jagged = triton_aot(
-    annotations={
-        "DenseSize": "i32",
-        "D": ("i32", 16),
-        "stride_id": ("i32", 16),
-        "stride_ad": ("i32", 16),
-        "stride_bd": ("i32", 16),
-    },
-    # pyrefly: ignore [bad-argument-type]
-)(split_2D_jagged)
-
-
-@torch.jit.unused
-@torch.fx.wrap
-def _triton_aot_split_2D_jagged(
-    values: torch.Tensor,
-    max_seq_len: int,
-    offsets_a: torch.Tensor,
-    offsets_b: torch.Tensor,
-    dense_size: int = 0,
-    is_dense_a: bool = False,
-    is_dense_b: bool = False,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    _, D = values.shape
-    BLOCK_D = next_power_of_2(D)
-
-    if is_dense_a:
-        L, _ = values.shape
-        B = offsets_b.size(0) - 1
-        seq_len_a = dense_size * B
-        seq_len_b = L - seq_len_a
-    elif is_dense_b:
-        L, _ = values.shape
-        B = offsets_a.size(0) - 1
-        seq_len_b = dense_size * B
-        seq_len_a = L - seq_len_b
-    else:
-        B = offsets_a.size(0) - 1
-        seq_len_a = int(offsets_a[-1].item())
-        seq_len_b = int(offsets_b[-1].item())
-
-    values_a = torch.empty((seq_len_a, D), device=values.device, dtype=values.dtype)
-    values_b = torch.empty((seq_len_b, D), device=values.device, dtype=values.dtype)
-
-    grid = (max_seq_len, B)
-    # pyre-ignore[29]: TritonAOT.__getitem__ is callable at runtime
-    split_2D_jagged[grid](
-        JaggedIn=values,
-        DenseSize=dense_size,
-        OffsetsA=offsets_a,
-        OffsetsB=offsets_b,
-        OutA=values_a,
-        OutB=values_b,
-        D=D,
-        stride_id=values.stride(0),
-        stride_ad=values_a.stride(0),
-        stride_bd=values_b.stride(0),
-        # pyrefly: ignore [bad-argument-type]
-        IS_DENSE_A=is_dense_a,
-        # pyrefly: ignore [bad-argument-type]
-        IS_DENSE_B=is_dense_b,
-        # pyrefly: ignore [bad-argument-type]
-        BLOCK_D=BLOCK_D,
-        # pyrefly: ignore [bad-argument-type]
-        IS_REPLACE=False,
-    )
-
-    if is_dense_a:
-        values_a = values_a.reshape(B, dense_size, D)
-    if is_dense_b:
-        values_b = values_b.reshape(B, dense_size, D)
-
-    return values_a, values_b
-
-
-@torch.fx.wrap
-def aot_triton_kernel_wrapper_split_2D_jagged(
-    values: torch.Tensor,
-    max_seq_len: int,
-    offsets_a: Optional[torch.Tensor] = None,
-    offsets_b: Optional[torch.Tensor] = None,
-    dense_size: int = 0,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    if should_trigger_eager_impl():
-        assert offsets_a is not None and offsets_b is not None, (
-            "Eager fallback requires both offsets_a and offsets_b"
-        )
-        return pytorch_split_2D_jagged(
-            max_seq_len=max_seq_len,
-            values=values,
-            max_len_left=None,
-            max_len_right=None,
-            offsets_left=offsets_a,
-            offsets_right=offsets_b,
-        )
-    else:
-        is_dense_a: bool = offsets_a is None
-        is_dense_b: bool = offsets_b is None
-        resolved_offsets_a: torch.Tensor = values.new_empty(0)
-        resolved_offsets_b: torch.Tensor = values.new_empty(0)
-        if is_dense_a:
-            resolved_offsets_b = fx_unwrap_optional_tensor(offsets_b)
-            resolved_offsets_a = resolved_offsets_b.new_empty(0)
-        elif is_dense_b:
-            resolved_offsets_a = fx_unwrap_optional_tensor(offsets_a)
-            resolved_offsets_b = resolved_offsets_a.new_empty(0)
-        else:
-            resolved_offsets_a = fx_unwrap_optional_tensor(offsets_a)
-            resolved_offsets_b = fx_unwrap_optional_tensor(offsets_b)
-
-        return _triton_aot_split_2D_jagged(
-            values=values,
-            max_seq_len=max_seq_len,
-            offsets_a=resolved_offsets_a,
-            offsets_b=resolved_offsets_b,
-            dense_size=dense_size,
-            is_dense_a=is_dense_a,
-            is_dense_b=is_dense_b,
-        )
diff --git a/recommendation_v4/generative_recommenders/ops/triton_aot/types.py b/recommendation_v4/generative_recommenders/ops/triton_aot/types.py
deleted file mode 100644
index bd7b2a1ed..000000000
--- a/recommendation_v4/generative_recommenders/ops/triton_aot/types.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-
-# pyre-strict
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass
-from typing import Any, Callable, ClassVar, Dict, List, Optional, Protocol, Union
-
-from generative_recommenders.ops.triton_aot.compile.utils import is_autotuner
-
-# @manual=//triton:triton
-from triton.runtime.jit import KernelInterface
-# triton.fb.triton_util depends on torch
-# @dep=//caffe2:_torch
-
-
-_VALID_HINTS: frozenset[int] = frozenset({1, 8, 16})
-_VALID_POINTER_HINTS: frozenset[int] = frozenset({16})
-
-
-@dataclass(frozen=True)
-class AnnotationHint:
-    """Annotation with a value hint (dtype + divisibility/alignment).
-
-    Valid hints: 16 (divisible_by_16), 8 (divisible_by_8), 1 (equal_to_1).
-    For pointers (dtype starts with ``*``), only 16 is valid — other values
-    would cause incorrect codegen (e.g. alignment=1 folds the pointer as a
-    constexpr constant, causing a segfault at launch).
-    """
-
-    dtype: str
-    hint: int
-
-    def __post_init__(self) -> None:
-        if self.hint not in _VALID_HINTS:
-            raise RuntimeError(
-                f"TritonAOT: invalid annotation hint {self.hint!r} for "
-                f"dtype {self.dtype!r}. Valid hints: {sorted(_VALID_HINTS)}."
-            )
-        if self.dtype.startswith("*") and self.hint not in _VALID_POINTER_HINTS:
-            raise RuntimeError(
-                f"TritonAOT: invalid pointer alignment {self.hint!r} for "
-                f"dtype {self.dtype!r}. Pointer annotations only support "
-                f"alignment={sorted(_VALID_POINTER_HINTS)}."
-            )
-
-    def to_tuple(self) -> tuple[str, int]:
-        """Convert to plain tuple for raw spec format."""
-        return (self.dtype, self.hint)
-
-
-# Internal annotation type (after normalization).
-Annotation = Union[str, AnnotationHint]
-
-# User-facing input type (also accepts raw tuples).
-AnnotationInput = Union[str, tuple[str, int], AnnotationHint]
-
-
-def _normalize_annotation(ann: AnnotationInput) -> Annotation:
-    """Convert a raw tuple to AnnotationHint (triggers validation)."""
-    if isinstance(ann, AnnotationHint):
-        return ann
-    if isinstance(ann, tuple):
-        return AnnotationHint(ann[0], ann[1])
-    return ann
-
-
-class SpecCollector(Protocol):
-    """Callback invoked by TritonAOT.run() to collect kernel specs during AOT compile."""
-
-    def __call__(
-        self,
-        fn: KernelInterface[List[Any]],
-        annotations: Dict[str, Annotation],
-        *args: Any,
-        **kwargs: Any,
-    ) -> None: ...
-
-
-logger: logging.Logger = logging.getLogger(__name__)
-
-
-class TritonAOTMeta(type):
-    # TODO consider merge with AOTTCompileState
-    def __init__(cls, name, bases, attrs):  # pyre-ignore [2,3]
-        super().__init__(name, bases, attrs)
-        # Initialize an empty list for each new class created
-        cls._instances: List["TritonAOT"] = []
-
-    def __call__(cls, *args, **kwargs):  # pyre-ignore [2,3]
-        # Create the instance using the default behavior
-        instance = super().__call__(*args, **kwargs)
-        # Store the instance in the class-specific list
-        cls._instances.append(instance)
-        return instance
-
-    def get_instances(cls) -> List["TritonAOT"]:
-        return cls._instances
-
-
-class TritonAOT(KernelInterface[List[Any]], metaclass=TritonAOTMeta):
-    """Wraps a Triton kernel for ahead-of-time compilation.
-
-    Annotations specify dtype and optional value hints for kernel parameters:
-
-    - Scalar:  ``"i32"``, ``"fp32"``, or ``AnnotationHint("i32", 16)``
-      where 16 means the runtime value is divisible by 16.
-    - Pointer: ``AnnotationHint("*fp32", 16)`` for 16-byte aligned tensors.
-      Only alignment=16 is valid for pointers.
-    - Tensor:  typically inferred from runtime ``torch.Tensor.dtype``.
-    - Optional tensor:  auto-detected when the same kernel is called
-      with a tensor at one site and ``None`` at another.
-    """
-
-    _spec_collector: ClassVar[Optional[SpecCollector]] = None
-
-    def __init__(
-        self,
-        fn: KernelInterface[List[Any]],
-        annotations: Dict[str, AnnotationInput],
-    ) -> None:
-        self.fn: KernelInterface[List[Any]] = fn
-        self.annotations: Dict[str, Annotation] = {
-            k: _normalize_annotation(v) for k, v in annotations.items()
-        }
-
-    @classmethod
-    def set_spec_collector(cls, collector: Optional[SpecCollector]) -> None:
-        """Register or unregister the spec collection callback.
-
-        When a collector is registered (not None), TritonAOT.run() will call
-        it to collect kernel specs for AOT compilation.  When None, run()
-        simply delegates to the underlying Triton kernel (normal JIT path).
-        """
-        cls._spec_collector = collector
-
-    # pyrefly: ignore [bad-override]
-    def run(self, *args: Any, **kwargs: Any) -> Any:
-        if self._spec_collector is not None:
-            self._spec_collector(self.fn, self.annotations, *args, **kwargs)
-        # pyre-ignore[29]: KernelInterface.run is callable at runtime
-        return self.fn.run(*args, **kwargs)
-
-
-def triton_aot(
-    annotations: Dict[str, AnnotationInput],
-) -> Callable[[KernelInterface[List[Any]]], TritonAOT]:
-    def decorator(fn: KernelInterface[List[Any]]) -> TritonAOT:
-        return TritonAOT(fn, annotations)
-
-    return decorator
-
-
-def get_all_triton_aot_instances() -> List[TritonAOT]:
-    """Return all triton aot function instances (e.g. decorated with @triton_aot)."""
-    return TritonAOT.get_instances()
-
-
-def reset_all_triton_aot_autotune_cache() -> bool:
-    """Reset triton autotune cache for all triton aot kernels.
-
-    If triton aot compile is not enabled, this function is no op. Return True if any
-    kernel's autotune cache is reset. Else return False.
-
-    """
-    if TritonAOT._spec_collector is None:
-        return False
-
-    reset = False
-    for triton_aot_kernel in get_all_triton_aot_instances():
-        if is_autotuner(triton_aot_kernel.fn):
-            autotune_fn = triton_aot_kernel.fn
-            autotune_fn.cache.clear()  # pyre-ignore [16]
-            logger.info(
-                f"Reset autotune cache for triton kernel {autotune_fn.fn.__name__}"  # pyre-ignore [16]
-            )
-            reset = True
-
-    return reset
diff --git a/recommendation_v4/generative_recommenders/research/data/dataset.py b/recommendation_v4/generative_recommenders/research/data/dataset.py
deleted file mode 100644
index 09a18ae01..000000000
--- a/recommendation_v4/generative_recommenders/research/data/dataset.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import csv
-import linecache
-from typing import Dict, List, Optional, Tuple
-
-import numpy as np
-import pandas as pd
-import torch
-
-
-class DatasetV2(torch.utils.data.Dataset):
-    """In reverse chronological order."""
-
-    def __init__(
-        self,
-        ratings_file: str,
-        padding_length: int,
-        ignore_last_n: int,  # used for creating train/valid/test sets
-        shift_id_by: int = 0,
-        chronological: bool = False,
-        sample_ratio: float = 1.0,
-    ) -> None:
-        """
-        Args:
-            csv_file (string): Path to the csv file.
-        """
-        super().__init__()
-
-        self.ratings_frame: pd.DataFrame = pd.read_csv(
-            ratings_file,
-            delimiter=",",
-            # iterator=True,
-        )
-        self._padding_length: int = padding_length
-        self._ignore_last_n: int = ignore_last_n
-        self._cache: Dict[int, Dict[str, torch.Tensor]] = dict()
-        self._shift_id_by: int = shift_id_by
-        self._chronological: bool = chronological
-        self._sample_ratio: float = sample_ratio
-
-    def __len__(self) -> int:
-        return len(self.ratings_frame)
-
-    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
-        if idx in self._cache.keys():
-            return self._cache[idx]
-        data = self.ratings_frame.iloc[idx]
-        sample = self.load_item(data)
-        self._cache[idx] = sample
-        return sample
-
-    def load_item(self, data) -> Dict[str, torch.Tensor]:
-        user_id = data.user_id
-
-        def eval_as_list(x: str, ignore_last_n: int) -> List[int]:
-            y = eval(x)
-            y_list = [y] if type(y) == int else list(y)
-            if ignore_last_n > 0:
-                # for training data creation
-                y_list = y_list[:-ignore_last_n]
-            return y_list
-
-        def eval_int_list(
-            x: str,
-            target_len: int,
-            ignore_last_n: int,
-            shift_id_by: int,
-            sampling_kept_mask: Optional[List[bool]],
-        ) -> Tuple[List[int], int]:
-            y = eval_as_list(x, ignore_last_n=ignore_last_n)
-            if sampling_kept_mask is not None:
-                y = [x for x, kept in zip(y, sampling_kept_mask) if kept]
-            y_len = len(y)
-            y.reverse()
-            if shift_id_by > 0:
-                y = [x + shift_id_by for x in y]
-            return y, y_len
-
-        if self._sample_ratio < 1.0:
-            raw_length = len(eval_as_list(data.sequence_item_ids, self._ignore_last_n))
-            sampling_kept_mask = (
-                torch.rand((raw_length,), dtype=torch.float32) < self._sample_ratio
-            ).tolist()
-        else:
-            sampling_kept_mask = None
-
-        movie_history, movie_history_len = eval_int_list(
-            data.sequence_item_ids,
-            self._padding_length,
-            self._ignore_last_n,
-            shift_id_by=self._shift_id_by,
-            sampling_kept_mask=sampling_kept_mask,
-        )
-        movie_history_ratings, ratings_len = eval_int_list(
-            data.sequence_ratings,
-            self._padding_length,
-            self._ignore_last_n,
-            0,
-            sampling_kept_mask=sampling_kept_mask,
-        )
-        movie_timestamps, timestamps_len = eval_int_list(
-            data.sequence_timestamps,
-            self._padding_length,
-            self._ignore_last_n,
-            0,
-            sampling_kept_mask=sampling_kept_mask,
-        )
-        assert movie_history_len == timestamps_len, (
-            f"history len {movie_history_len} differs from timestamp len {timestamps_len}."
-        )
-        assert movie_history_len == ratings_len, (
-            f"history len {movie_history_len} differs from ratings len {ratings_len}."
-        )
-
-        def _truncate_or_pad_seq(
-            y: List[int], target_len: int, chronological: bool
-        ) -> List[int]:
-            y_len = len(y)
-            if y_len < target_len:
-                y = y + [0] * (target_len - y_len)
-            else:
-                if not chronological:
-                    y = y[:target_len]
-                else:
-                    y = y[-target_len:]
-            assert len(y) == target_len
-            return y
-
-        historical_ids = movie_history[1:]
-        historical_ratings = movie_history_ratings[1:]
-        historical_timestamps = movie_timestamps[1:]
-        target_ids = movie_history[0]
-        target_ratings = movie_history_ratings[0]
-        target_timestamps = movie_timestamps[0]
-        if self._chronological:
-            historical_ids.reverse()
-            historical_ratings.reverse()
-            historical_timestamps.reverse()
-
-        max_seq_len = self._padding_length - 1
-        history_length = min(len(historical_ids), max_seq_len)
-        historical_ids = _truncate_or_pad_seq(
-            historical_ids,
-            max_seq_len,
-            self._chronological,
-        )
-        historical_ratings = _truncate_or_pad_seq(
-            historical_ratings,
-            max_seq_len,
-            self._chronological,
-        )
-        historical_timestamps = _truncate_or_pad_seq(
-            historical_timestamps,
-            max_seq_len,
-            self._chronological,
-        )
-        # moved to features.py
-        # if self._chronological:
-        #     historical_ids.append(0)
-        #     historical_ratings.append(0)
-        #     historical_timestamps.append(0)
-        # print(historical_ids, historical_ratings, historical_timestamps, target_ids, target_ratings, target_timestamps)
-        ret = {
-            "user_id": user_id,
-            "historical_ids": torch.tensor(historical_ids, dtype=torch.int64),
-            "historical_ratings": torch.tensor(historical_ratings, dtype=torch.int64),
-            "historical_timestamps": torch.tensor(
-                historical_timestamps, dtype=torch.int64
-            ),
-            "history_lengths": history_length,
-            "target_ids": target_ids,
-            "target_ratings": target_ratings,
-            "target_timestamps": target_timestamps,
-        }
-        return ret
-
-
-class MultiFileDatasetV2(DatasetV2, torch.utils.data.Dataset):
-    def __init__(
-        self,
-        file_prefix: str,
-        num_files: int,
-        padding_length: int,
-        ignore_last_n: int,  # used for creating train/valid/test sets
-        shift_id_by: int = 0,
-        chronological: bool = False,
-        sample_ratio: float = 1.0,
-    ) -> None:
-        torch.utils.data.Dataset().__init__()
-        self._file_prefix: str = file_prefix
-        self._num_files: int = num_files
-        with open(f"{file_prefix}_users.csv", "r") as file:
-            reader = csv.reader(file)
-            self.users_cumsum: List[int] = np.cumsum(
-                [int(row[1]) for row in reader]
-            ).tolist()
-        self._padding_length: int = padding_length
-        self._ignore_last_n: int = ignore_last_n
-        self._shift_id_by: int = shift_id_by
-        self._chronological: bool = chronological
-        self._sample_ratio: float = sample_ratio
-
-    def __len__(self) -> int:
-        return self.users_cumsum[-1]
-
-    def _process_line(self, line: str) -> pd.Series:
-        reader = csv.reader([line])
-        parsed_line = next(reader)
-        user_id = int(parsed_line[0])
-        sequence_item_ids = parsed_line[1]
-        sequence_ratings = parsed_line[2]
-        return pd.Series(
-            data={
-                "user_id": user_id,
-                "sequence_item_ids": sequence_item_ids,
-                "sequence_ratings": sequence_ratings,
-                "sequence_timestamps": sequence_item_ids,  # placeholder
-            }
-        )
-
-    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
-        assert idx < self.users_cumsum[-1]
-        file_idx: int = 0
-        while self.users_cumsum[file_idx] <= idx:
-            file_idx += 1
-        if file_idx == 0:
-            local_idx = idx
-        else:
-            local_idx = idx - self.users_cumsum[file_idx - 1]
-        line = linecache.getline(f"{self._file_prefix}_{file_idx}.csv", local_idx + 1)
-        data = self._process_line(line)
-        sample = self.load_item(data)
-        return sample
diff --git a/recommendation_v4/generative_recommenders/research/data/eval.py b/recommendation_v4/generative_recommenders/research/data/eval.py
deleted file mode 100644
index 16026e5c7..000000000
--- a/recommendation_v4/generative_recommenders/research/data/eval.py
+++ /dev/null
@@ -1,263 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import logging
-import sys
-from dataclasses import dataclass
-from typing import Callable, Dict, List, Optional, Set, Union
-
-import torch
-import torch.distributed as dist
-from generative_recommenders.research.indexing.candidate_index import (
-    CandidateIndex,
-    TopKModule,
-)
-from generative_recommenders.research.modeling.sequential.features import (
-    SequentialFeatures,
-)
-from generative_recommenders.research.rails.similarities.module import SimilarityModule
-from torch.utils.tensorboard import SummaryWriter
-
-
-logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-
-
-@dataclass
-class EvalState:
-    all_item_ids: Set[int]
-    candidate_index: CandidateIndex
-    top_k_module: TopKModule
-
-
-def get_eval_state(
-    model: SimilarityModule,
-    all_item_ids: List[int],  # [X]
-    negatives_sampler: torch.nn.Module,
-    top_k_module_fn: Callable[[torch.Tensor, torch.Tensor], TopKModule],
-    device: int,
-    float_dtype: Optional[torch.dtype] = None,
-) -> EvalState:
-    # Exhaustively eval all items (incl. seen ids).
-    eval_negatives_ids = torch.as_tensor(all_item_ids).to(device).unsqueeze(0)  # [1, X]
-    # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
-    eval_negative_embeddings = negatives_sampler.normalize_embeddings(
-        # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
-        model.get_item_embeddings(eval_negatives_ids)
-    )
-    if float_dtype is not None:
-        eval_negative_embeddings = eval_negative_embeddings.to(float_dtype)
-    candidates = CandidateIndex(
-        ids=eval_negatives_ids,
-        embeddings=eval_negative_embeddings,
-    )
-    return EvalState(
-        all_item_ids=set(all_item_ids),
-        candidate_index=candidates,
-        top_k_module=top_k_module_fn(eval_negative_embeddings, eval_negatives_ids),
-    )
-
-
-@torch.inference_mode  # pyre-ignore [56]
-def eval_metrics_v2_from_tensors(
-    eval_state: EvalState,
-    model: SimilarityModule,
-    seq_features: SequentialFeatures,
-    target_ids: torch.Tensor,  # [B, 1]
-    min_positive_rating: int = 4,
-    target_ratings: Optional[torch.Tensor] = None,  # [B, 1]
-    epoch: Optional[str] = None,
-    filter_invalid_ids: bool = True,
-    user_max_batch_size: Optional[int] = None,
-    dtype: Optional[torch.dtype] = None,
-) -> Dict[str, Union[float, torch.Tensor]]:
-    """
-    Args:
-        eval_negatives_ids: Optional[Tensor]. If not present, defaults to eval over
-            the entire corpus (`num_items`) excluding all the items that users have
-            seen in the past (historical_ids, target_ids). This is consistent with
-            papers like SASRec and TDM but may not be fair in practice as retrieval
-            modules don't have access to read state during the initial fetch stage.
-        filter_invalid_ids: bool. If true, filters seen ids by default.
-    Returns:
-        keyed metric -> list of values for each example.
-    """
-    B, _ = target_ids.shape
-    device = target_ids.device
-
-    for target_id in target_ids:
-        target_id = int(target_id)
-        if target_id not in eval_state.all_item_ids:
-            print(f"missing target_id {target_id}")
-
-    # computes ro- part exactly once.
-    # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
-    shared_input_embeddings = model.encode(
-        past_lengths=seq_features.past_lengths,
-        past_ids=seq_features.past_ids,
-        # pyre-fixme[29]: `Union[Tensor, Module]` is not a function.
-        past_embeddings=model.get_item_embeddings(seq_features.past_ids),
-        past_payloads=seq_features.past_payloads,
-    )
-    if dtype is not None:
-        shared_input_embeddings = shared_input_embeddings.to(dtype)
-
-    MAX_K = 2500
-    k = min(MAX_K, eval_state.candidate_index.ids.size(1))
-    user_max_batch_size = user_max_batch_size or shared_input_embeddings.size(0)
-    num_batches = (
-        shared_input_embeddings.size(0) + user_max_batch_size - 1
-    ) // user_max_batch_size
-    eval_top_k_ids_all = []
-    eval_top_k_prs_all = []
-    for mb in range(num_batches):
-        eval_top_k_ids, eval_top_k_prs, _ = (
-            eval_state.candidate_index.get_top_k_outputs(
-                query_embeddings=shared_input_embeddings[
-                    mb * user_max_batch_size : (mb + 1) * user_max_batch_size, ...
-                ],
-                top_k_module=eval_state.top_k_module,
-                k=k,
-                invalid_ids=(
-                    seq_features.past_ids[
-                        mb * user_max_batch_size : (mb + 1) * user_max_batch_size, :
-                    ]
-                    if filter_invalid_ids
-                    else None
-                ),
-                return_embeddings=False,
-            )
-        )
-        eval_top_k_ids_all.append(eval_top_k_ids)
-        eval_top_k_prs_all.append(eval_top_k_prs)
-
-    if num_batches == 1:
-        eval_top_k_ids = eval_top_k_ids_all[0]
-        eval_top_k_prs = eval_top_k_prs_all[0]
-    else:
-        eval_top_k_ids = torch.cat(eval_top_k_ids_all, dim=0)
-        eval_top_k_prs = torch.cat(eval_top_k_prs_all, dim=0)
-
-    assert eval_top_k_ids.size(1) == k
-    _, eval_rank_indices = torch.max(
-        torch.cat(
-            [eval_top_k_ids, target_ids],
-            dim=1,
-        )
-        == target_ids,
-        dim=1,
-    )
-    eval_ranks = torch.where(eval_rank_indices == k, MAX_K + 1, eval_rank_indices + 1)
-
-    output = {
-        "ndcg@1": torch.where(
-            eval_ranks <= 1,
-            torch.div(1.0, torch.log2(eval_ranks + 1)),
-            torch.zeros(1, dtype=torch.float32, device=device),
-        ),
-        "ndcg@10": torch.where(
-            eval_ranks <= 10,
-            torch.div(1.0, torch.log2(eval_ranks + 1)),
-            torch.zeros(1, dtype=torch.float32, device=device),
-        ),
-        "ndcg@50": torch.where(
-            eval_ranks <= 50,
-            torch.div(1.0, torch.log2(eval_ranks + 1)),
-            torch.zeros(1, dtype=torch.float32, device=device),
-        ),
-        "ndcg@100": torch.where(
-            eval_ranks <= 100,
-            torch.div(1.0, torch.log2(eval_ranks + 1)),
-            torch.zeros(1, dtype=torch.float32, device=device),
-        ),
-        "ndcg@200": torch.where(
-            eval_ranks <= 200,
-            torch.div(1.0, torch.log2(eval_ranks + 1)),
-            torch.zeros(1, dtype=torch.float32, device=device),
-        ),
-        "hr@1": (eval_ranks <= 1),
-        "hr@10": (eval_ranks <= 10),
-        "hr@50": (eval_ranks <= 50),
-        "hr@100": (eval_ranks <= 100),
-        "hr@200": (eval_ranks <= 200),
-        "hr@500": (eval_ranks <= 500),
-        "hr@1000": (eval_ranks <= 1000),
-        "mrr": torch.div(1.0, eval_ranks),
-    }
-    if target_ratings is not None:
-        target_ratings = target_ratings.squeeze(1)  # [B]
-        output["ndcg@10_>=4"] = torch.where(
-            eval_ranks[target_ratings >= 4] <= 10,
-            torch.div(1.0, torch.log2(eval_ranks[target_ratings >= 4] + 1)),
-            torch.zeros(1, dtype=torch.float32, device=device),
-        )
-        output[f"hr@10_>={min_positive_rating}"] = (
-            eval_ranks[target_ratings >= min_positive_rating] <= 10
-        )
-        output[f"hr@50_>={min_positive_rating}"] = (
-            eval_ranks[target_ratings >= min_positive_rating] <= 50
-        )
-        output[f"mrr_>={min_positive_rating}"] = torch.div(
-            1.0, eval_ranks[target_ratings >= min_positive_rating]
-        )
-
-    return output  # pyre-ignore [7]
-
-
-def eval_recall_metrics_from_tensors(
-    eval_state: EvalState,
-    model: SimilarityModule,
-    seq_features: SequentialFeatures,
-    user_max_batch_size: Optional[int] = None,
-    dtype: Optional[torch.dtype] = None,
-) -> Dict[str, torch.Tensor]:
-    target_ids = seq_features.past_ids[:, -1].unsqueeze(1)
-    filtered_past_ids = seq_features.past_ids.detach().clone()
-    filtered_past_ids[:, -1] = torch.zeros_like(target_ids.squeeze(1))
-    return eval_metrics_v2_from_tensors(
-        eval_state=eval_state,
-        model=model,
-        seq_features=SequentialFeatures(
-            past_lengths=seq_features.past_lengths - 1,
-            past_ids=filtered_past_ids,
-            past_embeddings=seq_features.past_embeddings,
-            past_payloads=seq_features.past_payloads,
-        ),
-        target_ids=target_ids,
-        user_max_batch_size=user_max_batch_size,
-        dtype=dtype,
-    )
-
-
-def _avg(x: torch.Tensor, world_size: int) -> torch.Tensor:
-    _sum_and_numel = torch.tensor(
-        [x.sum(), x.numel()], dtype=torch.float32, device=x.device
-    )
-    if world_size > 1:
-        dist.all_reduce(_sum_and_numel, op=dist.ReduceOp.SUM)
-    return _sum_and_numel[0] / _sum_and_numel[1]
-
-
-def add_to_summary_writer(
-    writer: Optional[SummaryWriter],
-    batch_id: int,
-    metrics: Dict[str, torch.Tensor],
-    prefix: str,
-    world_size: int,
-) -> None:
-    for key, values in metrics.items():
-        avg_value = _avg(values, world_size)
-        if writer is not None:
-            writer.add_scalar(f"{prefix}/{key}", avg_value, batch_id)
diff --git a/recommendation_v4/generative_recommenders/research/data/item_features.py b/recommendation_v4/generative_recommenders/research/data/item_features.py
deleted file mode 100644
index 8ecb6ea6a..000000000
--- a/recommendation_v4/generative_recommenders/research/data/item_features.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-from dataclasses import dataclass
-from typing import List
-
-import torch
-
-
-@dataclass
-class ItemFeatures:
-    num_items: int
-    max_jagged_dimension: int
-    max_ind_range: List[int]  # [(,)] x num_features
-    lengths: List[torch.Tensor]  # [(num_items,)] x num_features
-    values: List[torch.Tensor]  # [(num_items, max_jagged_dimension)] x num_features
diff --git a/recommendation_v4/generative_recommenders/research/data/preprocessor.py b/recommendation_v4/generative_recommenders/research/data/preprocessor.py
deleted file mode 100644
index bf52f41da..000000000
--- a/recommendation_v4/generative_recommenders/research/data/preprocessor.py
+++ /dev/null
@@ -1,474 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import abc
-import logging
-import os
-import sys
-import tarfile
-from typing import Dict, Optional, Union
-from urllib.request import urlretrieve
-from zipfile import ZipFile
-
-import numpy as np
-import pandas as pd
-
-
-logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-
-
-class DataProcessor:
-    """
-    This preprocessor does not remap item_ids. This is intended so that we can easily join other
-    side-information based on item_ids later.
-    """
-
-    def __init__(
-        self,
-        prefix: str,
-        expected_num_unique_items: Optional[int],
-        expected_max_item_id: Optional[int],
-    ) -> None:
-        self._prefix: str = prefix
-        self._expected_num_unique_items = expected_num_unique_items
-        self._expected_max_item_id = expected_max_item_id
-
-    @abc.abstractmethod
-    def expected_num_unique_items(self) -> Optional[int]:
-        return self._expected_num_unique_items
-
-    @abc.abstractmethod
-    def expected_max_item_id(self) -> Optional[int]:
-        return self._expected_max_item_id
-
-    @abc.abstractmethod
-    def processed_item_csv(self) -> str:
-        pass
-
-    def output_format_csv(self) -> str:
-        return f"tmp/{self._prefix}/sasrec_format.csv"
-
-    def to_seq_data(
-        self,
-        ratings_data: pd.DataFrame,
-        user_data: Optional[pd.DataFrame] = None,
-    ) -> pd.DataFrame:
-        if user_data is not None:
-            ratings_data_transformed = ratings_data.join(
-                user_data.set_index("user_id"), on="user_id"
-            )
-        else:
-            ratings_data_transformed = ratings_data
-        ratings_data_transformed.item_ids = ratings_data_transformed.item_ids.apply(
-            lambda x: ",".join([str(v) for v in x])
-        )
-        ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply(
-            lambda x: ",".join([str(v) for v in x])
-        )
-        ratings_data_transformed.timestamps = ratings_data_transformed.timestamps.apply(
-            lambda x: ",".join([str(v) for v in x])
-        )
-        ratings_data_transformed.rename(
-            columns={
-                "item_ids": "sequence_item_ids",
-                "ratings": "sequence_ratings",
-                "timestamps": "sequence_timestamps",
-            },
-            inplace=True,
-        )
-        return ratings_data_transformed
-
-    def file_exists(self, name: str) -> bool:
-        return os.path.isfile("%s/%s" % (os.getcwd(), name))
-
-
-class MovielensSyntheticDataProcessor(DataProcessor):
-    def __init__(
-        self,
-        prefix: str,
-        expected_num_unique_items: Optional[int] = None,
-        expected_max_item_id: Optional[int] = None,
-    ) -> None:
-        super().__init__(prefix, expected_num_unique_items, expected_max_item_id)
-
-    def preprocess_rating(self) -> None:
-        return
-
-
-class MovielensDataProcessor(DataProcessor):
-    def __init__(
-        self,
-        download_path: str,
-        saved_name: str,
-        prefix: str,
-        convert_timestamp: bool,
-        expected_num_unique_items: Optional[int] = None,
-        expected_max_item_id: Optional[int] = None,
-    ) -> None:
-        super().__init__(prefix, expected_num_unique_items, expected_max_item_id)
-        self._download_path = download_path
-        self._saved_name = saved_name
-        self._convert_timestamp: bool = convert_timestamp
-
-    def download(self) -> None:
-        if not self.file_exists(self._saved_name):
-            urlretrieve(self._download_path, self._saved_name)
-        if self._saved_name[-4:] == ".zip":
-            ZipFile(self._saved_name, "r").extractall(path="tmp/")
-        else:
-            with tarfile.open(self._saved_name, "r:*") as tar_ref:
-                tar_ref.extractall("tmp/")
-
-    def processed_item_csv(self) -> str:
-        return f"tmp/processed/{self._prefix}/movies.csv"
-
-    def sasrec_format_csv_by_user_train(self) -> str:
-        return f"tmp/{self._prefix}/sasrec_format_by_user_train.csv"
-
-    def sasrec_format_csv_by_user_test(self) -> str:
-        return f"tmp/{self._prefix}/sasrec_format_by_user_test.csv"
-
-    def preprocess_rating(self) -> int:
-        self.download()
-
-        if self._prefix == "ml-1m":
-            users = pd.read_csv(
-                f"tmp/{self._prefix}/users.dat",
-                sep="::",
-                names=["user_id", "sex", "age_group", "occupation", "zip_code"],
-            )
-            ratings = pd.read_csv(
-                f"tmp/{self._prefix}/ratings.dat",
-                sep="::",
-                names=["user_id", "movie_id", "rating", "unix_timestamp"],
-            )
-            movies = pd.read_csv(
-                f"tmp/{self._prefix}/movies.dat",
-                sep="::",
-                names=["movie_id", "title", "genres"],
-                encoding="iso-8859-1",
-            )
-        elif self._prefix == "ml-20m":
-            # ml-20m
-            # ml-20m doesn't have user data.
-            users = None
-            # ratings: userId,movieId,rating,timestamp
-            ratings = pd.read_csv(
-                f"tmp/{self._prefix}/ratings.csv",
-                sep=",",
-            )
-            ratings.rename(
-                columns={
-                    "userId": "user_id",
-                    "movieId": "movie_id",
-                    "timestamp": "unix_timestamp",
-                },
-                inplace=True,
-            )
-            # movieId,title,genres
-            # 1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
-            # 2,Jumanji (1995),Adventure|Children|Fantasy
-            movies = pd.read_csv(
-                f"tmp/{self._prefix}/movies.csv",
-                sep=",",
-                encoding="iso-8859-1",
-            )
-            movies.rename(columns={"movieId": "movie_id"}, inplace=True)
-        else:
-            assert self._prefix == "ml-20mx16x32"
-            # ml-1b
-            user_ids = []
-            movie_ids = []
-            for i in range(16):
-                train_file = f"tmp/{self._prefix}/trainx16x32_{i}.npz"
-                with np.load(train_file) as data:
-                    user_ids.extend([x[0] for x in data["arr_0"]])
-                    movie_ids.extend([x[1] for x in data["arr_0"]])
-            ratings = pd.DataFrame(
-                data={
-                    "user_id": user_ids,
-                    "movie_id": movie_ids,
-                    "rating": user_ids,  # placeholder
-                    "unix_timestamp": movie_ids,  # placeholder
-                }
-            )
-            users = None
-            movies = None
-
-        if movies is not None:
-            # ML-1M and ML-20M only
-            movies["year"] = movies["title"].apply(lambda x: x[-5:-1])
-            movies["cleaned_title"] = movies["title"].apply(lambda x: x[:-7])
-            # movies.year = pd.Categorical(movies.year)
-            # movies["year"] = movies.year.cat.codes
-
-        if users is not None:
-            ## Users (ml-1m only)
-            users.sex = pd.Categorical(users.sex)
-            users["sex"] = users.sex.cat.codes
-
-            users.age_group = pd.Categorical(users.age_group)
-            users["age_group"] = users.age_group.cat.codes
-
-            users.occupation = pd.Categorical(users.occupation)
-            users["occupation"] = users.occupation.cat.codes
-
-            users.zip_code = pd.Categorical(users.zip_code)
-            users["zip_code"] = users.zip_code.cat.codes
-
-        # Normalize movie ids to speed up training
-        print(
-            f"{self._prefix} #item before normalize: {len(set(ratings['movie_id'].values))}"
-        )
-        print(
-            f"{self._prefix} max item id before normalize: {max(set(ratings['movie_id'].values))}"
-        )
-        # print(f"ratings.movie_id.cat.categories={ratings.movie_id.cat.categories}; {type(ratings.movie_id.cat.categories)}")
-        # print(f"ratings.movie_id.cat.codes={ratings.movie_id.cat.codes}; {type(ratings.movie_id.cat.codes)}")
-        # print(movie_id_to_cat)
-        # ratings["movie_id"] = ratings.movie_id.cat.codes
-        # print(f"{self._prefix} #item after normalize: {len(set(ratings['movie_id'].values))}")
-        # print(f"{self._prefix} max item id after normalize: {max(set(ratings['movie_id'].values))}")
-        # movies["remapped_id"] = movies["movie_id"].apply(lambda x: movie_id_to_cat[x])
-
-        if self._convert_timestamp:
-            ratings["unix_timestamp"] = pd.to_datetime(
-                ratings["unix_timestamp"], unit="s"
-            )
-
-        # Save primary csv's
-        if not os.path.exists(f"tmp/processed/{self._prefix}"):
-            os.makedirs(f"tmp/processed/{self._prefix}")
-        if users is not None:
-            users.to_csv(f"tmp/processed/{self._prefix}/users.csv", index=False)
-        if movies is not None:
-            movies.to_csv(f"tmp/processed/{self._prefix}/movies.csv", index=False)
-        ratings.to_csv(f"tmp/processed/{self._prefix}/ratings.csv", index=False)
-
-        num_unique_users = len(set(ratings["user_id"].values))
-        num_unique_items = len(set(ratings["movie_id"].values))
-
-        # SASRec version
-        ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")
-        seq_ratings_data = pd.DataFrame(
-            data={
-                "user_id": list(ratings_group.groups.keys()),
-                "item_ids": list(ratings_group.movie_id.apply(list)),
-                "ratings": list(ratings_group.rating.apply(list)),
-                "timestamps": list(ratings_group.unix_timestamp.apply(list)),
-            }
-        )
-
-        result = pd.DataFrame([[]])
-        for col in ["item_ids"]:
-            result[col + "_mean"] = seq_ratings_data[col].apply(len).mean()
-            result[col + "_min"] = seq_ratings_data[col].apply(len).min()
-            result[col + "_max"] = seq_ratings_data[col].apply(len).max()
-        print(self._prefix)
-        print(result)
-
-        seq_ratings_data = self.to_seq_data(seq_ratings_data, users)
-        seq_ratings_data.sample(frac=1).reset_index().to_csv(
-            self.output_format_csv(), index=False, sep=","
-        )
-
-        # Split by user ids (not tested yet)
-        user_id_split = int(num_unique_users * 0.9)
-        seq_ratings_data_train = seq_ratings_data[
-            seq_ratings_data["user_id"] <= user_id_split
-        ]
-        seq_ratings_data_train.sample(frac=1).reset_index().to_csv(
-            self.sasrec_format_csv_by_user_train(),
-            index=False,
-            sep=",",
-        )
-        seq_ratings_data_test = seq_ratings_data[
-            seq_ratings_data["user_id"] > user_id_split
-        ]
-        seq_ratings_data_test.sample(frac=1).reset_index().to_csv(
-            self.sasrec_format_csv_by_user_test(), index=False, sep=","
-        )
-        print(
-            f"{self._prefix}: train num user: {len(set(seq_ratings_data_train['user_id'].values))}"
-        )
-        print(
-            f"{self._prefix}: test num user: {len(set(seq_ratings_data_test['user_id'].values))}"
-        )
-
-        # print(seq_ratings_data)
-        if self.expected_num_unique_items() is not None:
-            assert self.expected_num_unique_items() == num_unique_items, (
-                f"Expected items: {self.expected_num_unique_items()}, got: {num_unique_items}"
-            )
-
-        return num_unique_items
-
-
-class AmazonDataProcessor(DataProcessor):
-    def __init__(
-        self,
-        download_path: str,
-        saved_name: str,
-        prefix: str,
-        expected_num_unique_items: Optional[int],
-    ) -> None:
-        super().__init__(
-            prefix,
-            expected_num_unique_items=expected_num_unique_items,
-            expected_max_item_id=None,
-        )
-        self._download_path = download_path
-        self._saved_name = saved_name
-        self._prefix = prefix
-
-    def download(self) -> None:
-        if not self.file_exists(self._saved_name):
-            urlretrieve(self._download_path, self._saved_name)
-
-    def preprocess_rating(self) -> int:
-        self.download()
-
-        ratings = pd.read_csv(
-            self._saved_name,
-            sep=",",
-            names=["user_id", "item_id", "rating", "timestamp"],
-        )
-        print(f"{self._prefix} #data points before filter: {ratings.shape[0]}")
-        print(
-            f"{self._prefix} #user before filter: {len(set(ratings['user_id'].values))}"
-        )
-        print(
-            f"{self._prefix} #item before filter: {len(set(ratings['item_id'].values))}"
-        )
-
-        # filter users and items with presence < 5
-        item_id_count = (
-            ratings["item_id"]
-            .value_counts()
-            .rename_axis("unique_values")
-            .reset_index(name="item_count")
-        )
-        user_id_count = (
-            ratings["user_id"]
-            .value_counts()
-            .rename_axis("unique_values")
-            .reset_index(name="user_count")
-        )
-        ratings = ratings.join(item_id_count.set_index("unique_values"), on="item_id")
-        ratings = ratings.join(user_id_count.set_index("unique_values"), on="user_id")
-        ratings = ratings[ratings["item_count"] >= 5]
-        ratings = ratings[ratings["user_count"] >= 5]
-        print(f"{self._prefix} #data points after filter: {ratings.shape[0]}")
-
-        # categorize user id and item id
-        ratings["item_id"] = pd.Categorical(ratings["item_id"])
-        ratings["item_id"] = ratings["item_id"].cat.codes
-        ratings["user_id"] = pd.Categorical(ratings["user_id"])
-        ratings["user_id"] = ratings["user_id"].cat.codes
-        print(
-            f"{self._prefix} #user after filter: {len(set(ratings['user_id'].values))}"
-        )
-        print(
-            f"{self._prefix} #item ater filter: {len(set(ratings['item_id'].values))}"
-        )
-
-        num_unique_items = len(set(ratings["item_id"].values))
-
-        # SASRec version
-        ratings_group = ratings.sort_values(by=["timestamp"]).groupby("user_id")
-
-        seq_ratings_data = pd.DataFrame(
-            data={
-                "user_id": list(ratings_group.groups.keys()),
-                "item_ids": list(ratings_group.item_id.apply(list)),
-                "ratings": list(ratings_group.rating.apply(list)),
-                "timestamps": list(ratings_group.timestamp.apply(list)),
-            }
-        )
-
-        seq_ratings_data = seq_ratings_data[
-            seq_ratings_data["item_ids"].apply(len) >= 5
-        ]
-
-        result = pd.DataFrame([[]])
-        for col in ["item_ids"]:
-            result[col + "_mean"] = seq_ratings_data[col].apply(len).mean()
-            result[col + "_min"] = seq_ratings_data[col].apply(len).min()
-            result[col + "_max"] = seq_ratings_data[col].apply(len).max()
-        print(self._prefix)
-        print(result)
-
-        if not os.path.exists(f"tmp/{self._prefix}"):
-            os.makedirs(f"tmp/{self._prefix}")
-
-        seq_ratings_data = self.to_seq_data(seq_ratings_data)
-        seq_ratings_data.sample(frac=1).reset_index().to_csv(
-            self.output_format_csv(), index=False, sep=","
-        )
-
-        if self.expected_num_unique_items() is not None:
-            assert self.expected_num_unique_items() == num_unique_items, (
-                f"expected: {self.expected_num_unique_items()}, actual: {num_unique_items}"
-            )
-            logging.info(f"{self.expected_num_unique_items()} unique items.")
-
-        return num_unique_items
-
-
-def get_common_preprocessors() -> Dict[
-    str,
-    Union[AmazonDataProcessor, MovielensDataProcessor, MovielensSyntheticDataProcessor],
-]:
-    ml_1m_dp = MovielensDataProcessor(  # pyre-ignore [45]
-        "http://files.grouplens.org/datasets/movielens/ml-1m.zip",
-        "tmp/movielens1m.zip",
-        prefix="ml-1m",
-        convert_timestamp=False,
-        expected_num_unique_items=3706,
-        expected_max_item_id=3952,
-    )
-    ml_20m_dp = MovielensDataProcessor(  # pyre-ignore [45]
-        "http://files.grouplens.org/datasets/movielens/ml-20m.zip",
-        "tmp/movielens20m.zip",
-        prefix="ml-20m",
-        convert_timestamp=False,
-        expected_num_unique_items=26744,
-        expected_max_item_id=131262,
-    )
-    ml_1b_dp = MovielensDataProcessor(  # pyre-ignore [45]
-        "https://files.grouplens.org/datasets/movielens/ml-20mx16x32.tar",
-        "tmp/movielens1b.tar",
-        prefix="ml-20mx16x32",
-        convert_timestamp=False,
-    )
-    ml_3b_dp = MovielensSyntheticDataProcessor(  # pyre-ignore [45]
-        prefix="ml-3b",
-        expected_num_unique_items=26743 * 32,
-        expected_max_item_id=26743 * 32,
-    )
-    amzn_books_dp = AmazonDataProcessor(  # pyre-ignore [45]
-        "http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/ratings_Books.csv",
-        "tmp/ratings_Books.csv",
-        prefix="amzn_books",
-        expected_num_unique_items=695762,
-    )
-    return {
-        "ml-1m": ml_1m_dp,
-        "ml-20m": ml_20m_dp,
-        "ml-1b": ml_1b_dp,
-        "ml-3b": ml_3b_dp,
-        "amzn-books": amzn_books_dp,
-    }
diff --git a/recommendation_v4/generative_recommenders/research/data/reco_dataset.py b/recommendation_v4/generative_recommenders/research/data/reco_dataset.py
deleted file mode 100644
index eedcdc08a..000000000
--- a/recommendation_v4/generative_recommenders/research/data/reco_dataset.py
+++ /dev/null
@@ -1,176 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-from dataclasses import dataclass
-from typing import List
-
-import pandas as pd
-import torch
-from generative_recommenders.research.data.dataset import DatasetV2, MultiFileDatasetV2
-from generative_recommenders.research.data.item_features import ItemFeatures
-from generative_recommenders.research.data.preprocessor import get_common_preprocessors
-
-
-@dataclass
-class RecoDataset:
-    max_sequence_length: int
-    num_unique_items: int
-    max_item_id: int
-    all_item_ids: List[int]
-    train_dataset: torch.utils.data.Dataset
-    eval_dataset: torch.utils.data.Dataset
-
-
-def get_reco_dataset(
-    dataset_name: str,
-    max_sequence_length: int,
-    chronological: bool,
-    positional_sampling_ratio: float = 1.0,
-) -> RecoDataset:
-    if dataset_name == "ml-1m":
-        dp = get_common_preprocessors()[dataset_name]
-        train_dataset = DatasetV2(
-            ratings_file=dp.output_format_csv(),
-            padding_length=max_sequence_length + 1,  # target
-            ignore_last_n=1,
-            chronological=chronological,
-            sample_ratio=positional_sampling_ratio,
-        )
-        eval_dataset = DatasetV2(
-            ratings_file=dp.output_format_csv(),
-            padding_length=max_sequence_length + 1,  # target
-            ignore_last_n=0,
-            chronological=chronological,
-            sample_ratio=1.0,  # do not sample
-        )
-    elif dataset_name == "ml-20m":
-        dp = get_common_preprocessors()[dataset_name]
-        train_dataset = DatasetV2(
-            ratings_file=dp.output_format_csv(),
-            padding_length=max_sequence_length + 1,  # target
-            ignore_last_n=1,
-            chronological=chronological,
-        )
-        eval_dataset = DatasetV2(
-            ratings_file=dp.output_format_csv(),
-            padding_length=max_sequence_length + 1,  # target
-            ignore_last_n=0,
-            chronological=chronological,
-        )
-    elif dataset_name == "ml-3b":
-        dp = get_common_preprocessors()[dataset_name]
-        train_dataset = MultiFileDatasetV2(
-            file_prefix="tmp/ml-3b/16x32",
-            num_files=16,
-            padding_length=max_sequence_length + 1,  # target
-            ignore_last_n=1,
-            chronological=chronological,
-        )
-        eval_dataset = MultiFileDatasetV2(
-            file_prefix="tmp/ml-3b/16x32",
-            num_files=16,
-            padding_length=max_sequence_length + 1,  # target
-            ignore_last_n=0,
-            chronological=chronological,
-        )
-    elif dataset_name == "amzn-books":
-        dp = get_common_preprocessors()[dataset_name]
-        train_dataset = DatasetV2(
-            ratings_file=dp.output_format_csv(),
-            padding_length=max_sequence_length + 1,  # target
-            ignore_last_n=1,
-            shift_id_by=1,  # [0..n-1] -> [1..n]
-            chronological=chronological,
-        )
-        eval_dataset = DatasetV2(
-            ratings_file=dp.output_format_csv(),
-            padding_length=max_sequence_length + 1,  # target
-            ignore_last_n=0,
-            shift_id_by=1,  # [0..n-1] -> [1..n]
-            chronological=chronological,
-        )
-    else:
-        raise ValueError(f"Unknown dataset {dataset_name}")
-
-    if dataset_name == "ml-1m" or dataset_name == "ml-20m":
-        items = pd.read_csv(dp.processed_item_csv(), delimiter=",")
-        max_jagged_dimension = 16
-        expected_max_item_id = dp.expected_max_item_id()
-        assert expected_max_item_id is not None
-        item_features: ItemFeatures = ItemFeatures(
-            max_ind_range=[63, 16383, 511],
-            num_items=expected_max_item_id + 1,
-            max_jagged_dimension=max_jagged_dimension,
-            lengths=[
-                torch.zeros((expected_max_item_id + 1,), dtype=torch.int64),
-                torch.zeros((expected_max_item_id + 1,), dtype=torch.int64),
-                torch.zeros((expected_max_item_id + 1,), dtype=torch.int64),
-            ],
-            values=[
-                torch.zeros(
-                    (expected_max_item_id + 1, max_jagged_dimension),
-                    dtype=torch.int64,
-                ),
-                torch.zeros(
-                    (expected_max_item_id + 1, max_jagged_dimension),
-                    dtype=torch.int64,
-                ),
-                torch.zeros(
-                    (expected_max_item_id + 1, max_jagged_dimension),
-                    dtype=torch.int64,
-                ),
-            ],
-        )
-        all_item_ids = []
-        for df_index, row in items.iterrows():
-            # print(f"index {df_index}: {row}")
-            movie_id = int(row["movie_id"])
-            genres = row["genres"].split("|")
-            titles = row["cleaned_title"].split(" ")
-            # print(f"{index}: genres{genres}, title{titles}")
-            genres_vector = [hash(x) % item_features.max_ind_range[0] for x in genres]
-            titles_vector = [hash(x) % item_features.max_ind_range[1] for x in titles]
-            years_vector = [hash(row["year"]) % item_features.max_ind_range[2]]
-            item_features.lengths[0][movie_id] = min(
-                len(genres_vector), max_jagged_dimension
-            )
-            item_features.lengths[1][movie_id] = min(
-                len(titles_vector), max_jagged_dimension
-            )
-            item_features.lengths[2][movie_id] = min(
-                len(years_vector), max_jagged_dimension
-            )
-            for f, f_values in enumerate([genres_vector, titles_vector, years_vector]):
-                for j in range(min(len(f_values), max_jagged_dimension)):
-                    item_features.values[f][movie_id][j] = f_values[j]
-            all_item_ids.append(movie_id)
-        max_item_id = dp.expected_max_item_id()
-        for x in all_item_ids:
-            assert x > 0, "x in all_item_ids should be positive"
-    else:
-        # expected_max_item_id and item_features are not set for Amazon datasets.
-        item_features = None
-        max_item_id = dp.expected_num_unique_items()
-        all_item_ids = [x + 1 for x in range(max_item_id)]  # pyre-ignore [6]
-
-    return RecoDataset(
-        max_sequence_length=max_sequence_length,
-        num_unique_items=dp.expected_num_unique_items(),  # pyre-ignore [6]
-        max_item_id=max_item_id,  # pyre-ignore [6]
-        all_item_ids=all_item_ids,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-    )
diff --git a/recommendation_v4/generative_recommenders/research/indexing/candidate_index.py b/recommendation_v4/generative_recommenders/research/indexing/candidate_index.py
deleted file mode 100644
index fee763eaa..000000000
--- a/recommendation_v4/generative_recommenders/research/indexing/candidate_index.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-from typing import Optional, Tuple
-
-import torch
-from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
-
-
-class CandidateIndex(object):
-    def __init__(
-        self,
-        ids: torch.Tensor,
-        embeddings: torch.Tensor,
-        invalid_ids: Optional[torch.Tensor] = None,
-        debug_path: Optional[str] = None,
-    ) -> None:
-        super().__init__()
-
-        self._ids: torch.Tensor = ids
-        self._embeddings: torch.Tensor = embeddings
-        self._invalid_ids: Optional[torch.Tensor] = invalid_ids
-        self._debug_path: Optional[str] = debug_path
-
-    @property
-    def ids(self) -> torch.Tensor:
-        """
-        Returns:
-            (1, X) or (B, X), where valid ids are positive integers.
-        """
-        return self._ids
-
-    @property
-    def num_objects(self) -> int:
-        return self._ids.size(1)
-
-    @property
-    def embeddings(self) -> torch.Tensor:
-        """
-        Returns:
-            (1, X, D) or (B, X, D) with the same shape as `ids'.
-        """
-        return self._embeddings
-
-    def filter_invalid_ids(
-        self,
-        invalid_ids: torch.Tensor,
-    ) -> "CandidateIndex":
-        """
-        Filters invalid_ids (batch dimension dependent) from the current index.
-
-        Args:
-            invalid_ids: (B, N) x int64.
-
-        Returns:
-            CandidateIndex with invalid_ids filtered.
-        """
-        X = self._ids.size(1)
-        if self._ids.size(0) == 1:
-            # ((1, X, 1) == (B, 1, N)) -> (B, X)
-            invalid_mask, _ = (self._ids.unsqueeze(2) == invalid_ids.unsqueeze(1)).max(
-                dim=2
-            )
-            lengths = (~invalid_mask).int().sum(-1)  # (B,)
-            valid_1d_mask = (~invalid_mask).view(-1)
-            B: int = lengths.size(0)
-            D: int = self._embeddings.size(-1)
-            jagged_ids = self._ids.expand(B, -1).reshape(-1)[valid_1d_mask]
-            jagged_embeddings = self._embeddings.expand(B, -1, -1).reshape(-1, D)[
-                valid_1d_mask
-            ]
-            X_prime: int = lengths.max(-1)[0].item()
-            jagged_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
-            return CandidateIndex(
-                ids=torch.ops.fbgemm.jagged_to_padded_dense(
-                    values=jagged_ids.unsqueeze(-1),
-                    offsets=[jagged_offsets],
-                    max_lengths=[X_prime],
-                    padding_value=0,
-                ).squeeze(-1),
-                embeddings=torch.ops.fbgemm.jagged_to_padded_dense(
-                    values=jagged_embeddings,
-                    offsets=[jagged_offsets],
-                    max_lengths=[X_prime],
-                    padding_value=0.0,
-                ),
-                debug_path=self._debug_path,
-            )
-        else:
-            assert self._invalid_ids == None
-            return CandidateIndex(
-                ids=self.ids,
-                embeddings=self.embeddings,
-                invalid_ids=invalid_ids,
-                debug_path=self._debug_path,
-            )
-
-    def get_top_k_outputs(
-        self,
-        query_embeddings: torch.Tensor,
-        k: int,
-        top_k_module: TopKModule,
-        invalid_ids: Optional[torch.Tensor],
-        r: int = 1,
-        return_embeddings: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
-        """
-        Gets top-k outputs specified by `policy_fn', while filtering out
-        invalid ids per row as specified by `invalid_ids'.
-
-        Args:
-            k: int. top k to return.
-            policy_fn: lambda that takes in item-side embeddings (B, X, D,) and user-side
-                embeddings (B * r, ...), and returns predictions (unnormalized logits)
-                of shape (B * r, X,).
-            invalid_ids: (B * r, N_0) x int64. The list of ids (if > 0) to filter from
-                results if present. Expect N_0 to be a small constant.
-            return_embeddings: bool if we should additionally return embeddings for the
-                top k results.
-
-        Returns:
-            A tuple of (top_k_ids, top_k_prs, top_k_embeddings) of shape (B * r, k, ...).
-        """
-        B: int = query_embeddings.size(0)
-        max_num_invalid_ids = 0
-        if invalid_ids is not None:
-            max_num_invalid_ids = invalid_ids.size(1)
-
-        k_prime = min(k + max_num_invalid_ids, self.num_objects)
-        top_k_prime_scores, top_k_prime_ids = top_k_module(
-            query_embeddings=query_embeddings, k=k_prime
-        )
-        # Masks out invalid items rowwise.
-        if invalid_ids is not None:
-            id_is_valid = ~(
-                (top_k_prime_ids.unsqueeze(2) == invalid_ids.unsqueeze(1)).max(2)[0]
-            )  # [B, K + N_0]
-            id_is_valid = torch.logical_and(
-                id_is_valid, torch.cumsum(id_is_valid.int(), dim=1) <= k
-            )
-            # [[1, 0, 1, 0], [0, 1, 1, 1]], k=2 -> [[0, 2], [1, 2]]
-            top_k_rowwise_offsets = torch.nonzero(id_is_valid, as_tuple=True)[1].view(
-                -1, k
-            )
-            top_k_scores = torch.gather(
-                top_k_prime_scores, dim=1, index=top_k_rowwise_offsets
-            )
-            top_k_ids = torch.gather(
-                top_k_prime_ids, dim=1, index=top_k_rowwise_offsets
-            )
-        else:
-            top_k_scores = top_k_prime_scores
-            top_k_ids = top_k_prime_ids
-
-        # TODO: this should be decoupled from candidate_index.
-        if return_embeddings:
-            raise ValueError("return_embeddings not supported yet.")
-        else:
-            top_k_embeddings = None
-        return top_k_ids, top_k_scores, top_k_embeddings
-
-    def apply_object_filter(self) -> "CandidateIndex":
-        """
-        Applies general per batch filters.
-        """
-        raise NotImplementedError("not implemented.")
diff --git a/recommendation_v4/generative_recommenders/research/indexing/utils.py b/recommendation_v4/generative_recommenders/research/indexing/utils.py
deleted file mode 100644
index 972d3c2e7..000000000
--- a/recommendation_v4/generative_recommenders/research/indexing/utils.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import torch
-from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
-from generative_recommenders.research.rails.indexing.mips_top_k import (
-    MIPSBruteForceTopK,
-)
-from generative_recommenders.research.rails.indexing.mol_top_k import MoLBruteForceTopK
-
-
-def get_top_k_module(
-    top_k_method: str,
-    model: torch.nn.Module,
-    item_embeddings: torch.Tensor,
-    item_ids: torch.Tensor,
-) -> TopKModule:
-    if top_k_method == "MIPSBruteForceTopK":
-        top_k_module = MIPSBruteForceTopK(
-            item_embeddings=item_embeddings,
-            item_ids=item_ids,
-        )
-    elif top_k_method == "MoLBruteForceTopK":
-        top_k_module = MoLBruteForceTopK(  # pyre-ignore [20]
-            item_embeddings=item_embeddings,
-            item_ids=item_ids,
-        )
-    else:
-        raise ValueError(f"Invalid top-k method {top_k_method}")
-    return top_k_module
diff --git a/recommendation_v4/generative_recommenders/research/modeling/initialization.py b/recommendation_v4/generative_recommenders/research/modeling/initialization.py
deleted file mode 100644
index c80d60075..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/initialization.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import torch
-
-
-def truncated_normal(x: torch.Tensor, mean: float, std: float) -> torch.Tensor:
-    with torch.no_grad():
-        size = x.shape
-        tmp = x.new_empty(size + (4,)).normal_()
-        valid = (tmp < 2) & (tmp > -2)
-        ind = valid.max(-1, keepdim=True)[1]
-        x.data.copy_(tmp.gather(-1, ind).squeeze(-1))
-        x.data.mul_(std).add_(mean)
-        return x
-
-
-def init_mlp_xavier_weights_zero_bias(m: torch.nn.Module) -> None:
-    if isinstance(m, torch.nn.Linear):
-        torch.nn.init.xavier_uniform(m.weight)
-        if getattr(m, "bias", None) is not None:
-            m.bias.data.fill_(0.0)
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py
deleted file mode 100644
index c32bedf0e..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/autoregressive_losses.py
+++ /dev/null
@@ -1,477 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import abc
-from collections import OrderedDict
-from typing import List, Tuple
-
-import torch
-import torch.nn.functional as F
-from generative_recommenders.research.rails.similarities.module import SimilarityModule
-from torch.utils.checkpoint import checkpoint
-
-
-class NegativesSampler(torch.nn.Module):
-    def __init__(self, l2_norm: bool, l2_norm_eps: float) -> None:
-        super().__init__()
-
-        self._l2_norm: bool = l2_norm
-        self._l2_norm_eps: float = l2_norm_eps
-
-    def normalize_embeddings(self, x: torch.Tensor) -> torch.Tensor:
-        return self._maybe_l2_norm(x)
-
-    def _maybe_l2_norm(self, x: torch.Tensor) -> torch.Tensor:
-        if self._l2_norm:
-            x = x / torch.clamp(
-                torch.linalg.norm(x, ord=2, dim=-1, keepdim=True),
-                min=self._l2_norm_eps,
-            )
-        return x
-
-    @abc.abstractmethod
-    def debug_str(self) -> str:
-        pass
-
-    @abc.abstractmethod
-    def process_batch(
-        self,
-        ids: torch.Tensor,
-        presences: torch.Tensor,
-        embeddings: torch.Tensor,
-    ) -> None:
-        pass
-
-    @abc.abstractmethod
-    def forward(
-        self,
-        positive_ids: torch.Tensor,
-        num_to_sample: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Returns:
-            A tuple of (sampled_ids, sampled_negative_embeddings).
-        """
-        pass
-
-
-class LocalNegativesSampler(NegativesSampler):
-    def __init__(
-        self,
-        num_items: int,
-        item_emb: torch.nn.Embedding,
-        all_item_ids: List[int],
-        l2_norm: bool,
-        l2_norm_eps: float,
-    ) -> None:
-        super().__init__(l2_norm=l2_norm, l2_norm_eps=l2_norm_eps)
-
-        self._num_items: int = len(all_item_ids)
-        self._item_emb: torch.nn.Embedding = item_emb
-        self.register_buffer("_all_item_ids", torch.tensor(all_item_ids))
-
-    def debug_str(self) -> str:
-        sampling_debug_str = (
-            f"local{f'-l2-eps{self._l2_norm_eps}' if self._l2_norm else ''}"
-        )
-        return sampling_debug_str
-
-    def process_batch(
-        self,
-        ids: torch.Tensor,
-        presences: torch.Tensor,
-        embeddings: torch.Tensor,
-    ) -> None:
-        pass
-
-    def forward(
-        self,
-        positive_ids: torch.Tensor,
-        num_to_sample: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Returns:
-            A tuple of (sampled_ids, sampled_negative_embeddings).
-        """
-        # assert torch.max(torch.abs(self._item_emb(positive_ids) - positive_embeddings)) < 1e-4
-        output_shape = positive_ids.size() + (num_to_sample,)
-        sampled_offsets = torch.randint(
-            low=0,
-            high=self._num_items,
-            size=output_shape,
-            dtype=positive_ids.dtype,
-            device=positive_ids.device,
-        )
-        sampled_ids = self._all_item_ids[sampled_offsets.view(-1)].reshape(output_shape)
-        return sampled_ids, self.normalize_embeddings(self._item_emb(sampled_ids))
-
-
-class InBatchNegativesSampler(NegativesSampler):
-    def __init__(
-        self,
-        l2_norm: bool,
-        l2_norm_eps: float,
-        dedup_embeddings: bool,
-    ) -> None:
-        super().__init__(l2_norm=l2_norm, l2_norm_eps=l2_norm_eps)
-
-        self._dedup_embeddings: bool = dedup_embeddings
-
-    def debug_str(self) -> str:
-        sampling_debug_str = (
-            f"in-batch{f'-l2-eps{self._l2_norm_eps}' if self._l2_norm else ''}"
-        )
-        if self._dedup_embeddings:
-            sampling_debug_str += "-dedup"
-        return sampling_debug_str
-
-    def process_batch(
-        self,
-        ids: torch.Tensor,
-        presences: torch.Tensor,
-        embeddings: torch.Tensor,
-    ) -> None:
-        """
-        Args:
-           ids: (N') or (B, N) x int64
-           presences: (N') or (B, N) x bool
-           embeddings: (N', D) or (B, N, D) x float
-        """
-        assert ids.size() == presences.size()
-        assert ids.size() == embeddings.size()[:-1]
-        if self._dedup_embeddings:
-            valid_ids = ids[presences]
-            unique_ids, unique_ids_inverse_indices = torch.unique(
-                input=valid_ids, sorted=False, return_inverse=True
-            )
-            device = unique_ids.device
-            unique_embedding_offsets = torch.empty(
-                (unique_ids.numel(),),
-                dtype=torch.int64,
-                device=device,
-            )
-            unique_embedding_offsets[unique_ids_inverse_indices] = torch.arange(
-                valid_ids.numel(), dtype=torch.int64, device=device
-            )
-            unique_embeddings = embeddings[presences][unique_embedding_offsets, :]
-            self._cached_embeddings = self._maybe_l2_norm(  # pyre-ignore [16]
-                unique_embeddings
-            )
-            self._cached_ids = unique_ids  # pyre-ignore [16]
-        else:
-            self._cached_embeddings = self._maybe_l2_norm(embeddings[presences])
-            self._cached_ids = ids[presences]
-
-    def get_all_ids_and_embeddings(self) -> Tuple[torch.Tensor, torch.Tensor]:
-        return self._cached_ids, self._cached_embeddings  # pyre-ignore [7]
-
-    def forward(
-        self,
-        positive_ids: torch.Tensor,
-        num_to_sample: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Returns:
-            A tuple of (sampled_ids, sampled_negative_embeddings,).
-        """
-        X = self._cached_ids.size(0)
-        sampled_offsets = torch.randint(
-            low=0,
-            high=X,
-            size=positive_ids.size() + (num_to_sample,),
-            dtype=positive_ids.dtype,
-            device=positive_ids.device,
-        )
-        return (
-            self._cached_ids[sampled_offsets],  # pyre-ignore [29]
-            self._cached_embeddings[sampled_offsets],  # pyre-ignore [29]
-        )
-
-
-class AutoregressiveLoss(torch.nn.Module):
-    @abc.abstractmethod
-    def jagged_forward(
-        self,
-        output_embeddings: torch.Tensor,
-        supervision_ids: torch.Tensor,
-        supervision_embeddings: torch.Tensor,
-        supervision_weights: torch.Tensor,
-        negatives_sampler: NegativesSampler,
-    ) -> torch.Tensor:
-        """
-        Variant of forward() when the tensors are already in jagged format.
-
-        Args:
-            output_embeddings: [N', D] x float, embeddings for the current
-                input sequence.
-            supervision_ids: [N'] x int64, (positive) supervision ids.
-            supervision_embeddings: [N', D] x float.
-            supervision_weights: Optional [N'] x float. Optional weights for
-                masking out invalid positions, or reweighting supervision labels.
-            negatives_sampler: sampler used to obtain negative examples paired with
-                positives.
-
-        Returns:
-            (1), loss for the current engaged sequence.
-        """
-        pass
-
-    @abc.abstractmethod
-    def forward(
-        self,
-        lengths: torch.Tensor,
-        output_embeddings: torch.Tensor,
-        supervision_ids: torch.Tensor,
-        supervision_embeddings: torch.Tensor,
-        supervision_weights: torch.Tensor,
-        negatives_sampler: NegativesSampler,
-    ) -> torch.Tensor:
-        """
-        Args:
-            lengths: [B] x int32 representing number of non-zero elements per row.
-            output_embeddings: [B, N, D] x float, embeddings for the current
-                input sequence.
-            supervision_ids: [B, N] x int64, (positive) supervision ids.
-            supervision_embeddings: [B, N, D] x float.
-            supervision_weights: Optional [B, N] x float. Optional weights for
-                masking out invalid positions, or reweighting supervision labels.
-            negatives_sampler: sampler used to obtain negative examples paired with
-                positives.
-
-        Returns:
-            (1), loss for the current engaged sequence.
-        """
-        pass
-
-
-class BCELoss(AutoregressiveLoss):
-    def __init__(
-        self,
-        temperature: float,
-        model: SimilarityModule,
-    ) -> None:
-        super().__init__()
-        self._temperature: float = temperature
-        self._model = model
-
-    def jagged_forward(
-        self,
-        output_embeddings: torch.Tensor,
-        supervision_ids: torch.Tensor,
-        supervision_embeddings: torch.Tensor,
-        supervision_weights: torch.Tensor,
-        negatives_sampler: NegativesSampler,
-    ) -> torch.Tensor:
-        assert output_embeddings.size() == supervision_embeddings.size()
-        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
-        assert supervision_ids.size() == supervision_weights.size()
-
-        sampled_ids, sampled_negative_embeddings = negatives_sampler(
-            positive_ids=supervision_ids,
-            num_to_sample=1,
-        )
-
-        positive_logits = (
-            self._model.interaction(  # pyre-ignore [29]
-                input_embeddings=output_embeddings,  # [B, D] = [N', D]
-                target_ids=supervision_ids.unsqueeze(1),  # [N', 1]
-                target_embeddings=supervision_embeddings.unsqueeze(
-                    1
-                ),  # [N', D] -> [N', 1, D]
-            )[0].squeeze(1)
-            / self._temperature
-        )  # [N']
-
-        sampled_negatives_logits = (
-            self._model.interaction(  # pyre-ignore [29]
-                input_embeddings=output_embeddings,  # [N', D]
-                target_ids=sampled_ids,  # [N', 1]
-                target_embeddings=sampled_negative_embeddings,  # [N', 1, D]
-            )[0].squeeze(1)
-            / self._temperature
-        )  # [N']
-        sampled_negatives_valid_mask = (
-            supervision_ids != sampled_ids.squeeze(1)
-        ).float()  # [N']
-        loss_weights = supervision_weights * sampled_negatives_valid_mask
-        weighted_losses = (
-            (
-                F.binary_cross_entropy_with_logits(
-                    input=positive_logits,
-                    target=torch.ones_like(positive_logits),
-                    reduction="none",
-                )
-                + F.binary_cross_entropy_with_logits(
-                    input=sampled_negatives_logits,
-                    target=torch.zeros_like(sampled_negatives_logits),
-                    reduction="none",
-                )
-            )
-            * loss_weights
-            * 0.5
-        )
-        return weighted_losses.sum() / loss_weights.sum()
-
-    def forward(
-        self,
-        lengths: torch.Tensor,
-        output_embeddings: torch.Tensor,
-        supervision_ids: torch.Tensor,
-        supervision_embeddings: torch.Tensor,
-        supervision_weights: torch.Tensor,
-        negatives_sampler: NegativesSampler,
-    ) -> torch.Tensor:
-        """
-        Args:
-          lengths: [B] x int32 representing number of non-zero elements per row.
-          output_embeddings: [B, N, D] x float, embeddings for the current
-              input sequence.
-          supervision_ids: [B, N] x int64, (positive) supervision ids.
-          supervision_embeddings: [B, N, D] x float.
-          supervision_weights: Optional [B, N] x float. Optional weights for
-              masking out invalid positions, or reweighting supervision labels.
-          negatives_sampler: sampler used to obtain negative examples paired with
-              positives.
-        Returns:
-          (1), loss for the current engaged sequence.
-        """
-        assert output_embeddings.size() == supervision_embeddings.size()
-        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
-        jagged_id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
-        jagged_supervision_ids = (
-            torch.ops.fbgemm.dense_to_jagged(
-                supervision_ids.unsqueeze(-1).float(), [jagged_id_offsets]
-            )[0]
-            .squeeze(1)
-            .long()
-        )
-        jagged_supervision_weights = torch.ops.fbgemm.dense_to_jagged(
-            supervision_weights.unsqueeze(-1),
-            [jagged_id_offsets],
-        )[0].squeeze(1)
-        return self.jagged_forward(
-            output_embeddings=torch.ops.fbgemm.dense_to_jagged(
-                output_embeddings,
-                [jagged_id_offsets],
-            )[0],
-            supervision_ids=jagged_supervision_ids,
-            supervision_embeddings=torch.ops.fbgemm.dense_to_jagged(
-                supervision_embeddings,
-                [jagged_id_offsets],
-            )[0],
-            supervision_weights=jagged_supervision_weights,
-            negatives_sampler=negatives_sampler,
-        )
-
-
-class BCELossWithRatings(AutoregressiveLoss):
-    def __init__(
-        self,
-        temperature: float,
-        model: SimilarityModule,
-    ) -> None:
-        super().__init__()
-        self._temperature: float = temperature
-        self._model = model
-
-    def jagged_forward(
-        self,
-        output_embeddings: torch.Tensor,
-        supervision_ids: torch.Tensor,
-        supervision_embeddings: torch.Tensor,
-        supervision_weights: torch.Tensor,
-        supervision_ratings: torch.Tensor,
-        negatives_sampler: NegativesSampler,
-    ) -> torch.Tensor:
-        assert output_embeddings.size() == supervision_embeddings.size()
-        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
-        assert supervision_ids.size() == supervision_weights.size()
-
-        target_logits = (
-            self._model.interaction(  # pyre-ignore [29]
-                input_embeddings=output_embeddings,  # [B, D] = [N', D]
-                target_ids=supervision_ids.unsqueeze(1),  # [N', 1]
-                target_embeddings=supervision_embeddings.unsqueeze(
-                    1
-                ),  # [N', D] -> [N', 1, D]
-            )[0].squeeze(1)
-            / self._temperature
-        )  # [N', 1]
-
-        weighted_losses = (
-            F.binary_cross_entropy_with_logits(
-                input=target_logits,
-                target=supervision_ratings.to(dtype=target_logits.dtype),
-                reduction="none",
-            )
-        ) * supervision_weights
-        return weighted_losses.sum() / supervision_weights.sum()
-
-    def forward(
-        self,
-        lengths: torch.Tensor,
-        output_embeddings: torch.Tensor,
-        supervision_ids: torch.Tensor,
-        supervision_embeddings: torch.Tensor,
-        supervision_weights: torch.Tensor,
-        supervision_ratings: torch.Tensor,
-        negatives_sampler: NegativesSampler,
-    ) -> torch.Tensor:
-        """
-        Args:
-          lengths: [B] x int32 representing number of non-zero elements per row.
-          output_embeddings: [B, N, D] x float, embeddings for the current
-              input sequence.
-          supervision_ids: [B, N] x int64, (positive) supervision ids.
-          supervision_embeddings: [B, N, D] x float.
-          supervision_weights: Optional [B, N] x float. Optional weights for
-              masking out invalid positions, or reweighting supervision labels.
-          negatives_sampler: sampler used to obtain negative examples paired with
-              positives.
-        Returns:
-          (1), loss for the current engaged sequence.
-        """
-        assert output_embeddings.size() == supervision_embeddings.size()
-        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
-        jagged_id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
-        jagged_supervision_ids = (
-            torch.ops.fbgemm.dense_to_jagged(
-                supervision_ids.unsqueeze(-1).float(), [jagged_id_offsets]
-            )[0]
-            .squeeze(1)
-            .long()
-        )
-        jagged_supervision_weights = torch.ops.fbgemm.dense_to_jagged(
-            supervision_weights.unsqueeze(-1),
-            [jagged_id_offsets],
-        )[0].squeeze(1)
-        return self.jagged_forward(
-            output_embeddings=torch.ops.fbgemm.dense_to_jagged(
-                output_embeddings,
-                [jagged_id_offsets],
-            )[0],
-            supervision_ids=jagged_supervision_ids,
-            supervision_embeddings=torch.ops.fbgemm.dense_to_jagged(
-                supervision_embeddings,
-                [jagged_id_offsets],
-            )[0],
-            supervision_weights=jagged_supervision_weights,
-            supervision_ratings=torch.ops.fbgemm.dense_to_jagged(
-                supervision_ratings.unsqueeze(-1),
-                [jagged_id_offsets],
-            )[0].squeeze(1),
-            negatives_sampler=negatives_sampler,
-        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py
deleted file mode 100644
index 6e85a62dd..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/embedding_modules.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import abc
-
-import torch
-from generative_recommenders.research.modeling.initialization import truncated_normal
-
-
-class EmbeddingModule(torch.nn.Module):
-    @abc.abstractmethod
-    def debug_str(self) -> str:
-        pass
-
-    @abc.abstractmethod
-    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
-        pass
-
-    @property
-    @abc.abstractmethod
-    def item_embedding_dim(self) -> int:
-        pass
-
-
-class LocalEmbeddingModule(EmbeddingModule):
-    def __init__(
-        self,
-        num_items: int,
-        item_embedding_dim: int,
-    ) -> None:
-        super().__init__()
-
-        self._item_embedding_dim: int = item_embedding_dim
-        self._item_emb = torch.nn.Embedding(
-            num_items + 1, item_embedding_dim, padding_idx=0
-        )
-        self.reset_params()
-
-    def debug_str(self) -> str:
-        return f"local_emb_d{self._item_embedding_dim}"
-
-    def reset_params(self) -> None:
-        for name, params in self.named_parameters():
-            if "_item_emb" in name:
-                print(
-                    f"Initialize {name} as truncated normal: {params.data.size()} params"
-                )
-                truncated_normal(params, mean=0.0, std=0.02)
-            else:
-                print(f"Skipping initializing params {name} - not configured")
-
-    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
-        return self._item_emb(item_ids)
-
-    @property
-    def item_embedding_dim(self) -> int:
-        return self._item_embedding_dim
-
-
-class CategoricalEmbeddingModule(EmbeddingModule):
-    def __init__(
-        self,
-        num_items: int,
-        item_embedding_dim: int,
-        item_id_to_category_id: torch.Tensor,
-    ) -> None:
-        super().__init__()
-
-        self._item_embedding_dim: int = item_embedding_dim
-        self._item_emb: torch.nn.Embedding = torch.nn.Embedding(
-            num_items + 1, item_embedding_dim, padding_idx=0
-        )
-        self.register_buffer("_item_id_to_category_id", item_id_to_category_id)
-        self.reset_params()
-
-    def debug_str(self) -> str:
-        return f"cat_emb_d{self._item_embedding_dim}"
-
-    def reset_params(self) -> None:
-        for name, params in self.named_parameters():
-            if "_item_emb" in name:
-                print(
-                    f"Initialize {name} as truncated normal: {params.data.size()} params"
-                )
-                truncated_normal(params, mean=0.0, std=0.02)
-            else:
-                print(f"Skipping initializing params {name} - not configured")
-
-    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
-        item_ids = self._item_id_to_category_id[(item_ids - 1).clamp(min=0)] + 1
-        return self._item_emb(item_ids)
-
-    @property
-    def item_embedding_dim(self) -> int:
-        return self._item_embedding_dim
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py
deleted file mode 100644
index dc64aa2cf..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/encoder_utils.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import gin
-from generative_recommenders.research.modeling.sequential.embedding_modules import (
-    EmbeddingModule,
-)
-from generative_recommenders.research.modeling.sequential.hstu import HSTU
-from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
-    InputFeaturesPreprocessorModule,
-)
-from generative_recommenders.research.modeling.sequential.output_postprocessors import (
-    OutputPostprocessorModule,
-)
-from generative_recommenders.research.modeling.sequential.sasrec import SASRec
-from generative_recommenders.research.modeling.similarity_module import (
-    SequentialEncoderWithLearnedSimilarityModule,
-)
-from generative_recommenders.research.rails.similarities.module import SimilarityModule
-
-
-@gin.configurable
-def sasrec_encoder(
-    max_sequence_length: int,
-    max_output_length: int,
-    embedding_module: EmbeddingModule,
-    similarity_module: SimilarityModule,
-    input_preproc_module: InputFeaturesPreprocessorModule,
-    output_postproc_module: OutputPostprocessorModule,
-    activation_checkpoint: bool,
-    verbose: bool,
-    ffn_hidden_dim: int = 64,
-    ffn_activation_fn: str = "relu",
-    ffn_dropout_rate: float = 0.2,
-    num_blocks: int = 2,
-    num_heads: int = 1,
-) -> SequentialEncoderWithLearnedSimilarityModule:
-    return SASRec(
-        embedding_module=embedding_module,
-        max_sequence_len=max_sequence_length,
-        max_output_len=max_output_length,
-        embedding_dim=embedding_module.item_embedding_dim,
-        ffn_hidden_dim=ffn_hidden_dim,
-        ffn_activation_fn=ffn_activation_fn,
-        ffn_dropout_rate=ffn_dropout_rate,
-        num_blocks=num_blocks,
-        num_heads=num_heads,
-        similarity_module=similarity_module,  # pyre-ignore [6]
-        input_features_preproc_module=input_preproc_module,
-        output_postproc_module=output_postproc_module,
-        activation_checkpoint=activation_checkpoint,
-        verbose=verbose,
-    )
-
-
-@gin.configurable
-def hstu_encoder(
-    max_sequence_length: int,
-    max_output_length: int,
-    embedding_module: EmbeddingModule,
-    similarity_module: SimilarityModule,
-    input_preproc_module: InputFeaturesPreprocessorModule,
-    output_postproc_module: OutputPostprocessorModule,
-    activation_checkpoint: bool,
-    verbose: bool,
-    num_blocks: int = 2,
-    num_heads: int = 1,
-    dqk: int = 64,
-    dv: int = 64,
-    linear_dropout_rate: float = 0.0,
-    attn_dropout_rate: float = 0.0,
-    normalization: str = "rel_bias",
-    linear_config: str = "uvqk",
-    linear_activation: str = "silu",
-    concat_ua: bool = False,
-    enable_relative_attention_bias: bool = True,
-) -> SequentialEncoderWithLearnedSimilarityModule:
-    return HSTU(
-        embedding_module=embedding_module,
-        similarity_module=similarity_module,  # pyre-ignore [6]
-        input_features_preproc_module=input_preproc_module,
-        output_postproc_module=output_postproc_module,
-        max_sequence_len=max_sequence_length,
-        max_output_len=max_output_length,
-        embedding_dim=embedding_module.item_embedding_dim,
-        num_blocks=num_blocks,
-        num_heads=num_heads,
-        attention_dim=dqk,
-        linear_dim=dv,
-        linear_dropout_rate=linear_dropout_rate,
-        attn_dropout_rate=attn_dropout_rate,
-        linear_config=linear_config,
-        linear_activation=linear_activation,
-        normalization=normalization,
-        concat_ua=concat_ua,
-        enable_relative_attention_bias=enable_relative_attention_bias,
-        verbose=verbose,
-    )
-
-
-@gin.configurable
-def get_sequential_encoder(
-    module_type: str,
-    max_sequence_length: int,
-    max_output_length: int,
-    embedding_module: EmbeddingModule,
-    interaction_module: SimilarityModule,
-    input_preproc_module: InputFeaturesPreprocessorModule,
-    output_postproc_module: OutputPostprocessorModule,
-    verbose: bool,
-    activation_checkpoint: bool = False,
-) -> SequentialEncoderWithLearnedSimilarityModule:
-    if module_type == "SASRec":
-        model = sasrec_encoder(
-            max_sequence_length=max_sequence_length,
-            max_output_length=max_output_length,
-            embedding_module=embedding_module,
-            similarity_module=interaction_module,
-            input_preproc_module=input_preproc_module,
-            output_postproc_module=output_postproc_module,
-            activation_checkpoint=activation_checkpoint,
-            verbose=verbose,
-        )
-    elif module_type == "HSTU":
-        model = hstu_encoder(
-            max_sequence_length=max_sequence_length,
-            max_output_length=max_output_length,
-            embedding_module=embedding_module,
-            similarity_module=interaction_module,
-            input_preproc_module=input_preproc_module,
-            output_postproc_module=output_postproc_module,
-            activation_checkpoint=activation_checkpoint,
-            verbose=verbose,
-        )
-    else:
-        raise ValueError(f"Unsupported module_type {module_type}")
-    return model
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/features.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/features.py
deleted file mode 100644
index 70bf80cc0..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/features.py
+++ /dev/null
@@ -1,94 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-from typing import Dict, NamedTuple, Optional, Tuple
-
-import torch
-
-
-class SequentialFeatures(NamedTuple):
-    # (B,) x int64. Requires past_lengths[i] > 0 \forall i.
-    past_lengths: torch.Tensor
-    # (B, N,) x int64. 0 denotes valid ids.
-    past_ids: torch.Tensor
-    # (B, N, D) x float.
-    past_embeddings: Optional[torch.Tensor]
-    # Implementation-specific payloads.
-    # e.g., past timestamps, past event_types (e.g., clicks, likes), etc.
-    past_payloads: Dict[str, torch.Tensor]
-
-
-def movielens_seq_features_from_row(
-    row: Dict[str, torch.Tensor],
-    device: int,
-    max_output_length: int,
-) -> Tuple[SequentialFeatures, torch.Tensor, torch.Tensor]:
-    historical_lengths = row["history_lengths"].to(device)  # [B]
-    historical_ids = row["historical_ids"].to(device)  # [B, N]
-    historical_ratings = row["historical_ratings"].to(device)
-    historical_timestamps = row["historical_timestamps"].to(device)
-    target_ids = row["target_ids"].to(device).unsqueeze(1)  # [B, 1]
-    target_ratings = row["target_ratings"].to(device).unsqueeze(1)
-    target_timestamps = row["target_timestamps"].to(device).unsqueeze(1)
-    if max_output_length > 0:
-        B = historical_lengths.size(0)
-        historical_ids = torch.cat(
-            [
-                historical_ids,
-                torch.zeros(
-                    (B, max_output_length), dtype=historical_ids.dtype, device=device
-                ),
-            ],
-            dim=1,
-        )
-        historical_ratings = torch.cat(
-            [
-                historical_ratings,
-                torch.zeros(
-                    (B, max_output_length),
-                    dtype=historical_ratings.dtype,
-                    device=device,
-                ),
-            ],
-            dim=1,
-        )
-        historical_timestamps = torch.cat(
-            [
-                historical_timestamps,
-                torch.zeros(
-                    (B, max_output_length),
-                    dtype=historical_timestamps.dtype,
-                    device=device,
-                ),
-            ],
-            dim=1,
-        )
-        historical_timestamps.scatter_(
-            dim=1,
-            index=historical_lengths.view(-1, 1),
-            src=target_timestamps.view(-1, 1),
-        )
-        # print(f"historical_ids.size()={historical_ids.size()}, historical_timestamps.size()={historical_timestamps.size()}")
-    features = SequentialFeatures(
-        past_lengths=historical_lengths,
-        past_ids=historical_ids,
-        past_embeddings=None,
-        past_payloads={
-            "timestamps": historical_timestamps,
-            "ratings": historical_ratings,
-        },
-    )
-    return features, target_ids, target_ratings
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py
deleted file mode 100644
index 3c89245a2..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/hstu.py
+++ /dev/null
@@ -1,808 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Implements HSTU (Hierarchical Sequential Transduction Unit) in
-Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations
-(https://arxiv.org/abs/2402.17152, ICML'24).
-"""
-
-import abc
-import math
-from typing import Callable, Dict, List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-from generative_recommenders.research.modeling.sequential.embedding_modules import (
-    EmbeddingModule,
-)
-from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
-    InputFeaturesPreprocessorModule,
-)
-from generative_recommenders.research.modeling.sequential.output_postprocessors import (
-    OutputPostprocessorModule,
-)
-from generative_recommenders.research.modeling.sequential.utils import (
-    get_current_embeddings,
-)
-from generative_recommenders.research.modeling.similarity_module import (
-    SequentialEncoderWithLearnedSimilarityModule,
-)
-from generative_recommenders.research.rails.similarities.module import SimilarityModule
-
-
-TIMESTAMPS_KEY = "timestamps"
-
-
-class RelativeAttentionBiasModule(torch.nn.Module):
-    @abc.abstractmethod
-    def forward(
-        self,
-        all_timestamps: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Args:
-            all_timestamps: [B, N] x int64
-        Returns:
-            torch.float tensor broadcastable to [B, N, N]
-        """
-        pass
-
-
-class RelativePositionalBias(RelativeAttentionBiasModule):
-    def __init__(self, max_seq_len: int) -> None:
-        super().__init__()
-
-        self._max_seq_len: int = max_seq_len
-        self._w = torch.nn.Parameter(
-            torch.empty(2 * max_seq_len - 1).normal_(mean=0, std=0.02),
-        )
-
-    def forward(
-        self,
-        all_timestamps: torch.Tensor,
-    ) -> torch.Tensor:
-        del all_timestamps
-        n: int = self._max_seq_len
-        t = F.pad(self._w[: 2 * n - 1], [0, n]).repeat(n)
-        t = t[..., :-n].reshape(1, n, 3 * n - 2)
-        r = (2 * n - 1) // 2
-        return t[..., r:-r]
-
-
-class RelativeBucketedTimeAndPositionBasedBias(RelativeAttentionBiasModule):
-    """
-    Bucketizes timespans based on ts(next-item) - ts(current-item).
-    """
-
-    def __init__(
-        self,
-        max_seq_len: int,
-        num_buckets: int,
-        bucketization_fn: Callable[[torch.Tensor], torch.Tensor],
-    ) -> None:
-        super().__init__()
-
-        self._max_seq_len: int = max_seq_len
-        self._ts_w = torch.nn.Parameter(
-            torch.empty(num_buckets + 1).normal_(mean=0, std=0.02),
-        )
-        self._pos_w = torch.nn.Parameter(
-            torch.empty(2 * max_seq_len - 1).normal_(mean=0, std=0.02),
-        )
-        self._num_buckets: int = num_buckets
-        self._bucketization_fn: Callable[[torch.Tensor], torch.Tensor] = (
-            bucketization_fn
-        )
-
-    def forward(
-        self,
-        all_timestamps: torch.Tensor,
-    ) -> torch.Tensor:
-        """
-        Args:
-            all_timestamps: (B, N).
-        Returns:
-            (B, N, N).
-        """
-        B = all_timestamps.size(0)
-        N = self._max_seq_len
-        t = F.pad(self._pos_w[: 2 * N - 1], [0, N]).repeat(N)
-        t = t[..., :-N].reshape(1, N, 3 * N - 2)
-        r = (2 * N - 1) // 2
-
-        # [B, N + 1] to simplify tensor manipulations.
-        ext_timestamps = torch.cat(
-            [all_timestamps, all_timestamps[:, N - 1 : N]], dim=1
-        )
-        # causal masking. Otherwise [:, :-1] - [:, 1:] works
-        bucketed_timestamps = torch.clamp(
-            self._bucketization_fn(
-                ext_timestamps[:, 1:].unsqueeze(2) - ext_timestamps[:, :-1].unsqueeze(1)
-            ),
-            min=0,
-            max=self._num_buckets,
-        ).detach()
-        rel_pos_bias = t[:, :, r:-r]
-        rel_ts_bias = torch.index_select(
-            self._ts_w, dim=0, index=bucketed_timestamps.view(-1)
-        ).view(B, N, N)
-        return rel_pos_bias + rel_ts_bias
-
-
-HSTUCacheState = Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
-
-
-def _hstu_attention_maybe_from_cache(
-    num_heads: int,
-    attention_dim: int,
-    linear_dim: int,
-    q: torch.Tensor,
-    k: torch.Tensor,
-    v: torch.Tensor,
-    cached_q: Optional[torch.Tensor],
-    cached_k: Optional[torch.Tensor],
-    delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]],
-    x_offsets: torch.Tensor,
-    all_timestamps: Optional[torch.Tensor],
-    invalid_attn_mask: torch.Tensor,
-    rel_attn_bias: RelativeAttentionBiasModule,
-) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-    B: int = x_offsets.size(0) - 1
-    n: int = invalid_attn_mask.size(-1)
-    if delta_x_offsets is not None:
-        padded_q, padded_k = cached_q, cached_k
-        flattened_offsets = delta_x_offsets[1] + torch.arange(
-            start=0,
-            end=B * n,
-            step=n,
-            device=delta_x_offsets[1].device,
-            dtype=delta_x_offsets[1].dtype,
-        )
-        assert isinstance(padded_q, torch.Tensor)
-        assert isinstance(padded_k, torch.Tensor)
-        padded_q = (
-            padded_q.view(B * n, -1)
-            .index_copy_(
-                dim=0,
-                index=flattened_offsets,
-                source=q,
-            )
-            .view(B, n, -1)
-        )
-        padded_k = (
-            padded_k.view(B * n, -1)
-            .index_copy_(
-                dim=0,
-                index=flattened_offsets,
-                source=k,
-            )
-            .view(B, n, -1)
-        )
-    else:
-        padded_q = torch.ops.fbgemm.jagged_to_padded_dense(
-            values=q, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
-        )
-        padded_k = torch.ops.fbgemm.jagged_to_padded_dense(
-            values=k, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
-        )
-
-    qk_attn = torch.einsum(
-        "bnhd,bmhd->bhnm",
-        padded_q.view(B, n, num_heads, attention_dim),
-        padded_k.view(B, n, num_heads, attention_dim),
-    )
-    if all_timestamps is not None:
-        qk_attn = qk_attn + rel_attn_bias(all_timestamps).unsqueeze(1)
-    qk_attn = F.silu(qk_attn) / n
-    qk_attn = qk_attn * invalid_attn_mask.unsqueeze(0).unsqueeze(0)
-    attn_output = torch.ops.fbgemm.dense_to_jagged(
-        torch.einsum(
-            "bhnm,bmhd->bnhd",
-            qk_attn,
-            torch.ops.fbgemm.jagged_to_padded_dense(v, [x_offsets], [n]).reshape(
-                B, n, num_heads, linear_dim
-            ),
-        ).reshape(B, n, num_heads * linear_dim),
-        [x_offsets],
-    )[0]
-    return attn_output, padded_q, padded_k
-
-
-class SequentialTransductionUnitJagged(torch.nn.Module):
-    def __init__(
-        self,
-        embedding_dim: int,
-        linear_hidden_dim: int,
-        attention_dim: int,
-        dropout_ratio: float,
-        attn_dropout_ratio: float,
-        num_heads: int,
-        linear_activation: str,
-        relative_attention_bias_module: Optional[RelativeAttentionBiasModule] = None,
-        normalization: str = "rel_bias",
-        linear_config: str = "uvqk",
-        concat_ua: bool = False,
-        epsilon: float = 1e-6,
-        max_length: Optional[int] = None,
-    ) -> None:
-        super().__init__()
-        self._embedding_dim: int = embedding_dim
-        self._linear_dim: int = linear_hidden_dim
-        self._attention_dim: int = attention_dim
-        self._dropout_ratio: float = dropout_ratio
-        self._attn_dropout_ratio: float = attn_dropout_ratio
-        self._num_heads: int = num_heads
-        self._rel_attn_bias: Optional[RelativeAttentionBiasModule] = (
-            relative_attention_bias_module
-        )
-        self._normalization: str = normalization
-        self._linear_config: str = linear_config
-        if self._linear_config == "uvqk":
-            self._uvqk: torch.nn.Parameter = torch.nn.Parameter(
-                torch.empty(
-                    (
-                        embedding_dim,
-                        linear_hidden_dim * 2 * num_heads
-                        + attention_dim * num_heads * 2,
-                    )
-                ).normal_(mean=0, std=0.02),
-            )
-        else:
-            raise ValueError(f"Unknown linear_config {self._linear_config}")
-        self._linear_activation: str = linear_activation
-        self._concat_ua: bool = concat_ua
-        self._o = torch.nn.Linear(
-            in_features=linear_hidden_dim * num_heads * (3 if concat_ua else 1),
-            out_features=embedding_dim,
-        )
-        torch.nn.init.xavier_uniform_(self._o.weight)
-        self._eps: float = epsilon
-
-    def _norm_input(self, x: torch.Tensor) -> torch.Tensor:
-        return F.layer_norm(x, normalized_shape=[self._embedding_dim], eps=self._eps)
-
-    def _norm_attn_output(self, x: torch.Tensor) -> torch.Tensor:
-        return F.layer_norm(
-            x, normalized_shape=[self._linear_dim * self._num_heads], eps=self._eps
-        )
-
-    def forward(  # pyre-ignore [3]
-        self,
-        x: torch.Tensor,
-        x_offsets: torch.Tensor,
-        all_timestamps: Optional[torch.Tensor],
-        invalid_attn_mask: torch.Tensor,
-        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        cache: Optional[HSTUCacheState] = None,
-        return_cache_states: bool = False,
-    ):
-        """
-        Args:
-            x: (\sum_i N_i, D) x float.
-            x_offsets: (B + 1) x int32.
-            all_timestamps: optional (B, N) x int64.
-            invalid_attn_mask: (B, N, N) x float, each element in {0, 1}.
-            delta_x_offsets: optional 2-tuple ((B,) x int32, (B,) x int32).
-                For the 1st element in the tuple, each element is in [0, x_offsets[-1]). For the
-                2nd element in the tuple, each element is in [0, N).
-            cache: Optional 4-tuple of (v, padded_q, padded_k, output) from prior runs,
-                where all except padded_q, padded_k are jagged.
-        Returns:
-            x' = f(x), (\sum_i N_i, D) x float.
-        """
-        n: int = invalid_attn_mask.size(-1)
-        cached_q = None
-        cached_k = None
-        if delta_x_offsets is not None:
-            # In this case, for all the following code, x, u, v, q, k become restricted to
-            # [delta_x_offsets[0], :].
-            assert cache is not None
-            x = x[delta_x_offsets[0], :]
-            cached_v, cached_q, cached_k, cached_outputs = cache
-
-        normed_x = self._norm_input(x)
-
-        if self._linear_config == "uvqk":
-            batched_mm_output = torch.mm(normed_x, self._uvqk)
-            if self._linear_activation == "silu":
-                batched_mm_output = F.silu(batched_mm_output)
-            elif self._linear_activation == "none":
-                batched_mm_output = batched_mm_output
-            u, v, q, k = torch.split(
-                batched_mm_output,
-                [
-                    self._linear_dim * self._num_heads,
-                    self._linear_dim * self._num_heads,
-                    self._attention_dim * self._num_heads,
-                    self._attention_dim * self._num_heads,
-                ],
-                dim=1,
-            )
-        else:
-            raise ValueError(f"Unknown self._linear_config {self._linear_config}")
-
-        if delta_x_offsets is not None:
-            v = cached_v.index_copy_(dim=0, index=delta_x_offsets[0], source=v)
-
-        B: int = x_offsets.size(0) - 1
-        if self._normalization == "rel_bias" or self._normalization == "hstu_rel_bias":
-            assert self._rel_attn_bias is not None
-            attn_output, padded_q, padded_k = _hstu_attention_maybe_from_cache(
-                num_heads=self._num_heads,
-                attention_dim=self._attention_dim,
-                linear_dim=self._linear_dim,
-                q=q,
-                k=k,
-                v=v,
-                cached_q=cached_q,
-                cached_k=cached_k,
-                delta_x_offsets=delta_x_offsets,
-                x_offsets=x_offsets,
-                all_timestamps=all_timestamps,
-                invalid_attn_mask=invalid_attn_mask,
-                rel_attn_bias=self._rel_attn_bias,
-            )
-        elif self._normalization == "softmax_rel_bias":
-            if delta_x_offsets is not None:
-                B = x_offsets.size(0) - 1
-                padded_q, padded_k = cached_q, cached_k
-                flattened_offsets = delta_x_offsets[1] + torch.arange(
-                    start=0,
-                    end=B * n,
-                    step=n,
-                    device=delta_x_offsets[1].device,
-                    dtype=delta_x_offsets[1].dtype,
-                )
-                assert padded_q is not None
-                assert padded_k is not None
-                padded_q = (
-                    padded_q.view(B * n, -1)
-                    .index_copy_(
-                        dim=0,
-                        index=flattened_offsets,
-                        source=q,
-                    )
-                    .view(B, n, -1)
-                )
-                padded_k = (
-                    padded_k.view(B * n, -1)
-                    .index_copy_(
-                        dim=0,
-                        index=flattened_offsets,
-                        source=k,
-                    )
-                    .view(B, n, -1)
-                )
-            else:
-                padded_q = torch.ops.fbgemm.jagged_to_padded_dense(
-                    values=q, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
-                )
-                padded_k = torch.ops.fbgemm.jagged_to_padded_dense(
-                    values=k, offsets=[x_offsets], max_lengths=[n], padding_value=0.0
-                )
-
-            qk_attn = torch.einsum("bnd,bmd->bnm", padded_q, padded_k)
-            if self._rel_attn_bias is not None:
-                qk_attn = qk_attn + self._rel_attn_bias(all_timestamps)
-            qk_attn = F.softmax(qk_attn / math.sqrt(self._attention_dim), dim=-1)
-            qk_attn = qk_attn * invalid_attn_mask
-            attn_output = torch.ops.fbgemm.dense_to_jagged(
-                torch.bmm(
-                    qk_attn,
-                    torch.ops.fbgemm.jagged_to_padded_dense(v, [x_offsets], [n]),
-                ),
-                [x_offsets],
-            )[0]
-        else:
-            raise ValueError(f"Unknown normalization method {self._normalization}")
-
-        attn_output = (
-            attn_output
-            if delta_x_offsets is None
-            else attn_output[delta_x_offsets[0], :]
-        )
-        if self._concat_ua:
-            a = self._norm_attn_output(attn_output)
-            o_input = torch.cat([u, a, u * a], dim=-1)
-        else:
-            o_input = u * self._norm_attn_output(attn_output)
-
-        new_outputs = (
-            self._o(
-                F.dropout(
-                    o_input,
-                    p=self._dropout_ratio,
-                    training=self.training,
-                )
-            )
-            + x
-        )
-
-        if delta_x_offsets is not None:
-            new_outputs = cached_outputs.index_copy_(
-                dim=0, index=delta_x_offsets[0], source=new_outputs
-            )
-
-        if return_cache_states and delta_x_offsets is None:
-            v = v.contiguous()
-
-        return new_outputs, (v, padded_q, padded_k, new_outputs)
-
-
-class HSTUJagged(torch.nn.Module):
-    def __init__(
-        self,
-        modules: List[SequentialTransductionUnitJagged],
-        autocast_dtype: Optional[torch.dtype],
-    ) -> None:
-        super().__init__()
-
-        self._attention_layers: torch.nn.ModuleList = torch.nn.ModuleList(
-            modules=modules
-        )
-        self._autocast_dtype: Optional[torch.dtype] = autocast_dtype
-
-    def jagged_forward(
-        self,
-        x: torch.Tensor,
-        x_offsets: torch.Tensor,
-        all_timestamps: Optional[torch.Tensor],
-        invalid_attn_mask: torch.Tensor,
-        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        cache: Optional[List[HSTUCacheState]] = None,
-        return_cache_states: bool = False,
-    ) -> Tuple[torch.Tensor, List[HSTUCacheState]]:
-        """
-        Args:
-            x: (\sum_i N_i, D) x float
-            x_offsets: (B + 1) x int32
-            all_timestamps: (B, 1 + N) x int64
-            invalid_attn_mask: (B, N, N) x float, each element in {0, 1}
-            return_cache_states: bool. True if we should return cache states.
-
-        Returns:
-            x' = f(x), (\sum_i N_i, D) x float
-        """
-        cache_states: List[HSTUCacheState] = []
-
-        with torch.autocast(
-            "cuda",
-            enabled=self._autocast_dtype is not None,
-            dtype=self._autocast_dtype or torch.float16,
-        ):
-            for i, layer in enumerate(self._attention_layers):
-                x, cache_states_i = layer(
-                    x=x,
-                    x_offsets=x_offsets,
-                    all_timestamps=all_timestamps,
-                    invalid_attn_mask=invalid_attn_mask,
-                    delta_x_offsets=delta_x_offsets,
-                    cache=cache[i] if cache is not None else None,
-                    return_cache_states=return_cache_states,
-                )
-                if return_cache_states:
-                    cache_states.append(cache_states_i)
-
-        return x, cache_states
-
-    def forward(
-        self,
-        x: torch.Tensor,
-        x_offsets: torch.Tensor,
-        all_timestamps: Optional[torch.Tensor],
-        invalid_attn_mask: torch.Tensor,
-        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        cache: Optional[List[HSTUCacheState]] = None,
-        return_cache_states: bool = False,
-    ) -> Tuple[torch.Tensor, List[HSTUCacheState]]:
-        """
-        Args:
-            x: (B, N, D) x float.
-            x_offsets: (B + 1) x int32.
-            all_timestamps: (B, 1 + N) x int64
-            invalid_attn_mask: (B, N, N) x float, each element in {0, 1}.
-        Returns:
-            x' = f(x), (B, N, D) x float
-        """
-        if len(x.size()) == 3:
-            x = torch.ops.fbgemm.dense_to_jagged(x, [x_offsets])[0]
-
-        jagged_x, cache_states = self.jagged_forward(
-            x=x,
-            x_offsets=x_offsets,
-            all_timestamps=all_timestamps,
-            invalid_attn_mask=invalid_attn_mask,
-            delta_x_offsets=delta_x_offsets,
-            cache=cache,
-            return_cache_states=return_cache_states,
-        )
-        y = torch.ops.fbgemm.jagged_to_padded_dense(
-            values=jagged_x,
-            offsets=[x_offsets],
-            max_lengths=[invalid_attn_mask.size(1)],
-            padding_value=0.0,
-        )
-        return y, cache_states
-
-
-class HSTU(SequentialEncoderWithLearnedSimilarityModule):
-    """
-    Implements HSTU (Hierarchical Sequential Transduction Unit) in
-    Actions Speak Louder than Words: Trillion-Parameter Sequential Transducers for Generative Recommendations,
-    https://arxiv.org/abs/2402.17152.
-
-    Note that this implementation is intended for reproducing experiments in
-    the traditional sequential recommender setting (Section 4.1.1), and does
-    not yet use optimized kernels discussed in the paper.
-    """
-
-    def __init__(
-        self,
-        max_sequence_len: int,
-        max_output_len: int,
-        embedding_dim: int,
-        num_blocks: int,
-        num_heads: int,
-        linear_dim: int,
-        attention_dim: int,
-        normalization: str,
-        linear_config: str,
-        linear_activation: str,
-        linear_dropout_rate: float,
-        attn_dropout_rate: float,
-        embedding_module: EmbeddingModule,
-        similarity_module: SimilarityModule,
-        input_features_preproc_module: InputFeaturesPreprocessorModule,
-        output_postproc_module: OutputPostprocessorModule,
-        enable_relative_attention_bias: bool = True,
-        concat_ua: bool = False,
-        verbose: bool = True,
-    ) -> None:
-        super().__init__(ndp_module=similarity_module)
-
-        self._embedding_dim: int = embedding_dim
-        self._item_embedding_dim: int = embedding_module.item_embedding_dim
-        self._max_sequence_length: int = max_sequence_len
-        self._embedding_module: EmbeddingModule = embedding_module
-        self._input_features_preproc: InputFeaturesPreprocessorModule = (
-            input_features_preproc_module
-        )
-        self._output_postproc: OutputPostprocessorModule = output_postproc_module
-        self._num_blocks: int = num_blocks
-        self._num_heads: int = num_heads
-        self._dqk: int = attention_dim
-        self._dv: int = linear_dim
-        self._linear_activation: str = linear_activation
-        self._linear_dropout_rate: float = linear_dropout_rate
-        self._attn_dropout_rate: float = attn_dropout_rate
-        self._enable_relative_attention_bias: bool = enable_relative_attention_bias
-        self._hstu = HSTUJagged(
-            modules=[
-                SequentialTransductionUnitJagged(
-                    embedding_dim=self._embedding_dim,
-                    linear_hidden_dim=linear_dim,
-                    attention_dim=attention_dim,
-                    normalization=normalization,
-                    linear_config=linear_config,
-                    linear_activation=linear_activation,
-                    num_heads=num_heads,
-                    # TODO: change to lambda x.
-                    relative_attention_bias_module=(
-                        RelativeBucketedTimeAndPositionBasedBias(
-                            max_seq_len=max_sequence_len
-                            + max_output_len,  # accounts for next item.
-                            num_buckets=128,
-                            bucketization_fn=lambda x: (
-                                torch.log(torch.abs(x).clamp(min=1)) / 0.301
-                            ).long(),
-                        )
-                        if enable_relative_attention_bias
-                        else None
-                    ),
-                    dropout_ratio=linear_dropout_rate,
-                    attn_dropout_ratio=attn_dropout_rate,
-                    concat_ua=concat_ua,
-                )
-                for _ in range(num_blocks)
-            ],
-            autocast_dtype=None,
-        )
-        # causal forward, w/ +1 for padding.
-        self.register_buffer(
-            "_attn_mask",
-            torch.triu(
-                torch.ones(
-                    (
-                        self._max_sequence_length + max_output_len,
-                        self._max_sequence_length + max_output_len,
-                    ),
-                    dtype=torch.bool,
-                ),
-                diagonal=1,
-            ),
-        )
-        self._verbose: bool = verbose
-        self.reset_params()
-
-    def reset_params(self) -> None:
-        for name, params in self.named_parameters():
-            if ("_hstu" in name) or ("_embedding_module" in name):
-                if self._verbose:
-                    print(f"Skipping init for {name}")
-                continue
-            try:
-                torch.nn.init.xavier_normal_(params.data)
-                if self._verbose:
-                    print(
-                        f"Initialize {name} as xavier normal: {params.data.size()} params"
-                    )
-            except:
-                if self._verbose:
-                    print(f"Failed to initialize {name}: {params.data.size()} params")
-
-    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
-        return self._embedding_module.get_item_embeddings(item_ids)
-
-    def debug_str(self) -> str:
-        debug_str = (
-            f"HSTU-b{self._num_blocks}-h{self._num_heads}-dqk{self._dqk}-dv{self._dv}"
-            + f"-l{self._linear_activation}d{self._linear_dropout_rate}"
-            + f"-ad{self._attn_dropout_rate}"
-        )
-        if not self._enable_relative_attention_bias:
-            debug_str += "-norab"
-        return debug_str
-
-    def generate_user_embeddings(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        cache: Optional[List[HSTUCacheState]] = None,
-        return_cache_states: bool = False,
-    ) -> Tuple[torch.Tensor, List[HSTUCacheState]]:
-        """
-        [B, N] -> [B, N, D].
-        """
-        device = past_lengths.device
-        float_dtype = past_embeddings.dtype
-        B, N, _ = past_embeddings.size()
-
-        past_lengths, user_embeddings, _ = self._input_features_preproc(
-            past_lengths=past_lengths,
-            past_ids=past_ids,
-            past_embeddings=past_embeddings,
-            past_payloads=past_payloads,
-        )
-
-        float_dtype = user_embeddings.dtype
-        user_embeddings, cached_states = self._hstu(
-            x=user_embeddings,
-            x_offsets=torch.ops.fbgemm.asynchronous_complete_cumsum(past_lengths),
-            all_timestamps=(
-                past_payloads[TIMESTAMPS_KEY]
-                if TIMESTAMPS_KEY in past_payloads
-                else None
-            ),
-            invalid_attn_mask=1.0 - self._attn_mask.to(float_dtype),
-            delta_x_offsets=delta_x_offsets,
-            cache=cache,
-            return_cache_states=return_cache_states,
-        )
-        return self._output_postproc(user_embeddings), cached_states
-
-    def forward(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-        batch_id: Optional[int] = None,
-    ) -> torch.Tensor:
-        """
-        Runs the main encoder.
-
-        Args:
-            past_lengths: (B,) x int64
-            past_ids: (B, N,) x int64 where the latest engaged ids come first. In
-                particular, past_ids[i, past_lengths[i] - 1] should correspond to
-                the latest engaged values.
-            past_embeddings: (B, N, D) x float or (\sum_b N_b, D) x float.
-            past_payloads: implementation-specific keyed tensors of shape (B, N, ...).
-
-        Returns:
-            encoded_embeddings of [B, N, D].
-        """
-        encoded_embeddings, _ = self.generate_user_embeddings(
-            past_lengths=past_lengths,
-            past_ids=past_ids,
-            past_embeddings=past_embeddings,
-            past_payloads=past_payloads,
-        )
-        return encoded_embeddings
-
-    def _encode(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]],
-        cache: Optional[List[HSTUCacheState]],
-        return_cache_states: bool,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[HSTUCacheState]]]:
-        """
-        Args:
-            past_lengths: (B,) x int64.
-            past_ids: (B, N,) x int64.
-            past_embeddings: (B, N, D,) x float.
-            past_payloads: implementation-specific keyed tensors of shape (B, N, ...).
-            return_cache_states: bool.
-
-        Returns:
-            (B, D) x float, representing embeddings for the current state.
-        """
-        encoded_seq_embeddings, cache_states = self.generate_user_embeddings(
-            past_lengths=past_lengths,
-            past_ids=past_ids,
-            past_embeddings=past_embeddings,
-            past_payloads=past_payloads,
-            delta_x_offsets=delta_x_offsets,
-            cache=cache,
-            return_cache_states=return_cache_states,
-        )  # [B, N, D]
-        current_embeddings = get_current_embeddings(
-            lengths=past_lengths, encoded_embeddings=encoded_seq_embeddings
-        )
-        if return_cache_states:
-            return current_embeddings, cache_states
-        else:
-            return current_embeddings
-
-    def encode(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-        delta_x_offsets: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        cache: Optional[List[HSTUCacheState]] = None,
-        return_cache_states: bool = False,
-    ) -> Union[torch.Tensor, Tuple[torch.Tensor, List[HSTUCacheState]]]:
-        """
-        Runs encoder to obtain the current hidden states.
-
-        Args:
-            past_lengths: (B,) x int.
-            past_ids: (B, N,) x int.
-            past_embeddings: (B, N, D) x float.
-            past_payloads: implementation-specific keyed tensors of shape (B, N, ...).
-
-        Returns:
-            (B, D,) x float, representing encoded states at the most recent time step.
-        """
-        return self._encode(
-            past_lengths=past_lengths,
-            past_ids=past_ids,
-            past_embeddings=past_embeddings,
-            past_payloads=past_payloads,
-            delta_x_offsets=delta_x_offsets,
-            cache=cache,
-            return_cache_states=return_cache_states,
-        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py
deleted file mode 100644
index a461ab879..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/input_features_preprocessors.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import abc
-import math
-from typing import Dict, Tuple
-
-import torch
-from generative_recommenders.research.modeling.initialization import truncated_normal
-
-
-class InputFeaturesPreprocessorModule(torch.nn.Module):
-    @abc.abstractmethod
-    def debug_str(self) -> str:
-        pass
-
-    @abc.abstractmethod
-    def forward(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        pass
-
-
-class LearnablePositionalEmbeddingInputFeaturesPreprocessor(
-    InputFeaturesPreprocessorModule
-):
-    def __init__(
-        self,
-        max_sequence_len: int,
-        embedding_dim: int,
-        dropout_rate: float,
-    ) -> None:
-        super().__init__()
-
-        self._embedding_dim: int = embedding_dim
-        self._pos_emb: torch.nn.Embedding = torch.nn.Embedding(
-            max_sequence_len,
-            self._embedding_dim,
-        )
-        self._dropout_rate: float = dropout_rate
-        self._emb_dropout = torch.nn.Dropout(p=dropout_rate)
-        self.reset_state()
-
-    def debug_str(self) -> str:
-        return f"posi_d{self._dropout_rate}"
-
-    def reset_state(self) -> None:
-        truncated_normal(
-            self._pos_emb.weight.data,
-            mean=0.0,
-            std=math.sqrt(1.0 / self._embedding_dim),
-        )
-
-    def forward(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        B, N = past_ids.size()
-        D = past_embeddings.size(-1)
-
-        user_embeddings = past_embeddings * (self._embedding_dim**0.5) + self._pos_emb(
-            torch.arange(N, device=past_ids.device).unsqueeze(0).repeat(B, 1)
-        )
-        user_embeddings = self._emb_dropout(user_embeddings)
-
-        valid_mask = (past_ids != 0).unsqueeze(-1).float()  # [B, N, 1]
-        user_embeddings *= valid_mask
-        return past_lengths, user_embeddings, valid_mask
-
-
-class LearnablePositionalEmbeddingRatedInputFeaturesPreprocessor(
-    InputFeaturesPreprocessorModule
-):
-    def __init__(
-        self,
-        max_sequence_len: int,
-        item_embedding_dim: int,
-        dropout_rate: float,
-        rating_embedding_dim: int,
-        num_ratings: int,
-    ) -> None:
-        super().__init__()
-
-        self._embedding_dim: int = item_embedding_dim + rating_embedding_dim
-        self._pos_emb: torch.nn.Embedding = torch.nn.Embedding(
-            max_sequence_len,
-            self._embedding_dim,
-        )
-        self._dropout_rate: float = dropout_rate
-        self._emb_dropout = torch.nn.Dropout(p=dropout_rate)
-        self._rating_emb: torch.nn.Embedding = torch.nn.Embedding(
-            num_ratings,
-            rating_embedding_dim,
-        )
-        self.reset_state()
-
-    def debug_str(self) -> str:
-        return f"posir_d{self._dropout_rate}"
-
-    def reset_state(self) -> None:
-        truncated_normal(
-            self._pos_emb.weight.data,
-            mean=0.0,
-            std=math.sqrt(1.0 / self._embedding_dim),
-        )
-        truncated_normal(
-            self._rating_emb.weight.data,
-            mean=0.0,
-            std=math.sqrt(1.0 / self._embedding_dim),
-        )
-
-    def forward(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        B, N = past_ids.size()
-
-        user_embeddings = torch.cat(
-            [past_embeddings, self._rating_emb(past_payloads["ratings"].int())],
-            dim=-1,
-        ) * (self._embedding_dim**0.5) + self._pos_emb(
-            torch.arange(N, device=past_ids.device).unsqueeze(0).repeat(B, 1)
-        )
-        user_embeddings = self._emb_dropout(user_embeddings)
-
-        valid_mask = (past_ids != 0).unsqueeze(-1).float()  # [B, N, 1]
-        user_embeddings *= valid_mask
-        return past_lengths, user_embeddings, valid_mask
-
-
-class CombinedItemAndRatingInputFeaturesPreprocessor(InputFeaturesPreprocessorModule):
-    def __init__(
-        self,
-        max_sequence_len: int,
-        item_embedding_dim: int,
-        dropout_rate: float,
-        num_ratings: int,
-    ) -> None:
-        super().__init__()
-
-        self._embedding_dim: int = item_embedding_dim
-        # Due to [item_0, rating_0, item_1, rating_1, ...]
-        self._pos_emb: torch.nn.Embedding = torch.nn.Embedding(
-            max_sequence_len * 2,
-            self._embedding_dim,
-        )
-        self._dropout_rate: float = dropout_rate
-        self._emb_dropout = torch.nn.Dropout(p=dropout_rate)
-        self._rating_emb: torch.nn.Embedding = torch.nn.Embedding(
-            num_ratings,
-            item_embedding_dim,
-        )
-        self.reset_state()
-
-    def debug_str(self) -> str:
-        return f"combir_d{self._dropout_rate}"
-
-    def reset_state(self) -> None:
-        truncated_normal(
-            self._pos_emb.weight.data,
-            mean=0.0,
-            std=math.sqrt(1.0 / self._embedding_dim),
-        )
-        truncated_normal(
-            self._rating_emb.weight.data,
-            mean=0.0,
-            std=math.sqrt(1.0 / self._embedding_dim),
-        )
-
-    def get_preprocessed_ids(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-    ) -> torch.Tensor:
-        """
-        Returns (B, N * 2,) x int64.
-        """
-        B, N = past_ids.size()
-        return torch.cat(
-            [
-                past_ids.unsqueeze(2),  # (B, N, 1)
-                past_payloads["ratings"].to(past_ids.dtype).unsqueeze(2),
-            ],
-            dim=2,
-        ).reshape(B, N * 2)
-
-    def get_preprocessed_masks(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-    ) -> torch.Tensor:
-        """
-        Returns (B, N * 2,) x bool.
-        """
-        B, N = past_ids.size()
-        return (past_ids != 0).unsqueeze(2).expand(-1, -1, 2).reshape(B, N * 2)
-
-    def forward(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        B, N = past_ids.size()
-        D = past_embeddings.size(-1)
-
-        user_embeddings = torch.cat(
-            [
-                past_embeddings,  # (B, N, D)
-                self._rating_emb(past_payloads["ratings"].int()),
-            ],
-            dim=2,
-        ) * (self._embedding_dim**0.5)
-        user_embeddings = user_embeddings.view(B, N * 2, D)
-        user_embeddings = user_embeddings + self._pos_emb(
-            torch.arange(N * 2, device=past_ids.device).unsqueeze(0).repeat(B, 1)
-        )
-        user_embeddings = self._emb_dropout(user_embeddings)
-
-        valid_mask = (
-            self.get_preprocessed_masks(
-                past_lengths,
-                past_ids,
-                past_embeddings,
-                past_payloads,
-            )
-            .unsqueeze(2)
-            .float()
-        )  # (B, N * 2, 1,)
-        user_embeddings *= valid_mask
-        return past_lengths * 2, user_embeddings, valid_mask
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py
deleted file mode 100644
index 8e2195783..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/losses/sampled_softmax.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-from collections import OrderedDict
-from typing import Dict, Optional, Tuple
-
-import torch
-import torch.nn.functional as F
-from generative_recommenders.research.modeling.sequential.autoregressive_losses import (
-    AutoregressiveLoss,
-    NegativesSampler,
-)
-from torch.utils.checkpoint import checkpoint
-
-
-class SampledSoftmaxLoss(AutoregressiveLoss):
-    def __init__(
-        self,
-        num_to_sample: int,
-        softmax_temperature: float,
-        model,
-        activation_checkpoint: bool = False,
-    ) -> None:
-        super().__init__()
-
-        self._num_to_sample: int = num_to_sample
-        self._softmax_temperature: float = softmax_temperature
-        self._model = model
-        self._activation_checkpoint: bool = activation_checkpoint
-
-    def jagged_forward(  # pyre-ignore [15]
-        self,
-        output_embeddings: torch.Tensor,
-        supervision_ids: torch.Tensor,
-        supervision_embeddings: torch.Tensor,
-        supervision_weights: torch.Tensor,
-        negatives_sampler: NegativesSampler,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        assert output_embeddings.size() == supervision_embeddings.size()
-        assert supervision_ids.size() == supervision_embeddings.size()[:-1]
-        assert supervision_ids.size() == supervision_weights.size()
-
-        sampled_ids, sampled_negative_embeddings = negatives_sampler(
-            positive_ids=supervision_ids,
-            num_to_sample=self._num_to_sample,
-        )
-        positive_embeddings = negatives_sampler.normalize_embeddings(
-            supervision_embeddings
-        )
-        positive_logits, aux_losses = self._model.similarity_fn(
-            query_embeddings=output_embeddings,  # [B, D] = [N', D]
-            item_ids=supervision_ids.unsqueeze(1),  # [N', 1]
-            item_embeddings=positive_embeddings.unsqueeze(1),  # [N', D] -> [N', 1, D]
-            **kwargs,
-        )
-        positive_logits = positive_logits / self._softmax_temperature  # [0]
-        sampled_negatives_logits, _ = self._model.similarity_fn(
-            query_embeddings=output_embeddings,  # [N', D]
-            item_ids=sampled_ids,  # [N', R]
-            item_embeddings=sampled_negative_embeddings,  # [N', R, D]
-            **kwargs,
-        )  # [N', R]  # [0]
-        sampled_negatives_logits = torch.where(
-            supervision_ids.unsqueeze(1) == sampled_ids,  # [N', R]
-            -5e4,
-            sampled_negatives_logits / self._softmax_temperature,
-        )
-        jagged_loss = -F.log_softmax(
-            torch.cat([positive_logits, sampled_negatives_logits], dim=1), dim=1
-        )[:, 0]
-        return (
-            jagged_loss * supervision_weights
-        ).sum() / supervision_weights.sum(), aux_losses
-
-    def forward(  # pyre-ignore [15]
-        self,
-        lengths: torch.Tensor,
-        output_embeddings: torch.Tensor,
-        supervision_ids: torch.Tensor,
-        supervision_embeddings: torch.Tensor,
-        supervision_weights: torch.Tensor,
-        negatives_sampler: NegativesSampler,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            lengths: [B] x int32 representing number of non-zero elements per row.
-            output_embeddings: [B, N, D] x float, embeddings for the current
-                input sequence.
-            supervision_ids: [B, N] x int64, (positive) supervision ids.
-            supervision_embeddings: [B, N, D] x float.
-            supervision_weights: Optional [B, N] x float. Optional weights for
-                masking out invalid positions, or reweighting supervision labels.
-            negatives_sampler: sampler used to obtain negative examples paired with
-                positives.
-
-        Returns:
-            Tuple of (loss for the current engaged sequence, str-keyed aux_losses).
-        """
-        torch._assert(
-            output_embeddings.size() == supervision_embeddings.size(),
-            "Invalid supervision embeddings size.",
-        )
-        torch._assert(
-            supervision_ids.size() == supervision_embeddings.size()[:-1],
-            "Invalid supervision ids size.",
-        )
-
-        jagged_id_offsets = torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)
-        jagged_supervision_ids = (
-            torch.ops.fbgemm.dense_to_jagged(
-                supervision_ids.unsqueeze(-1).float(), [jagged_id_offsets]
-            )[0]
-            .squeeze(1)
-            .long()
-        )
-        if "user_ids" in kwargs:
-            # expand to jagged.
-            max_length: int = int(lengths.max())
-            kwargs["user_ids"] = torch.ops.fbgemm.dense_to_jagged(
-                kwargs["user_ids"]
-                .unsqueeze(1)
-                .expand(-1, max_length)
-                .unsqueeze(2),  # (B, max_length, 1)
-                [jagged_id_offsets],
-            )[0].squeeze(1)
-
-        args = OrderedDict(
-            [
-                (
-                    "output_embeddings",
-                    torch.ops.fbgemm.dense_to_jagged(
-                        output_embeddings,
-                        [jagged_id_offsets],
-                    )[0],
-                ),
-                ("supervision_ids", jagged_supervision_ids),
-                (
-                    "supervision_embeddings",
-                    torch.ops.fbgemm.dense_to_jagged(
-                        supervision_embeddings,
-                        [jagged_id_offsets],
-                    )[0],
-                ),
-                (
-                    "supervision_weights",
-                    torch.ops.fbgemm.dense_to_jagged(
-                        supervision_weights.unsqueeze(-1),
-                        [jagged_id_offsets],
-                    )[0].squeeze(1),
-                ),
-                ("negatives_sampler", negatives_sampler),
-            ]
-        )
-        args.update(kwargs)
-        if self._activation_checkpoint:
-            return checkpoint(
-                self.jagged_forward,
-                *args.values(),
-                use_reentrant=False,
-            )
-        else:
-            return self.jagged_forward(
-                output_embeddings=torch.ops.fbgemm.dense_to_jagged(
-                    output_embeddings,
-                    [jagged_id_offsets],
-                )[0],
-                supervision_ids=jagged_supervision_ids,
-                supervision_embeddings=torch.ops.fbgemm.dense_to_jagged(
-                    supervision_embeddings,
-                    [jagged_id_offsets],
-                )[0],
-                supervision_weights=torch.ops.fbgemm.dense_to_jagged(
-                    supervision_weights.unsqueeze(-1),
-                    [jagged_id_offsets],
-                )[0].squeeze(1),
-                negatives_sampler=negatives_sampler,
-                **kwargs,
-            )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py
deleted file mode 100644
index 3319dfd93..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/output_postprocessors.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import abc
-
-import torch
-import torch.nn.functional as F
-
-
-class OutputPostprocessorModule(torch.nn.Module):
-    @abc.abstractmethod
-    def debug_str(self) -> str:
-        pass
-
-    @abc.abstractmethod
-    def forward(
-        self,
-        output_embeddings: torch.Tensor,
-    ) -> torch.Tensor:
-        pass
-
-
-class L2NormEmbeddingPostprocessor(OutputPostprocessorModule):
-    def __init__(
-        self,
-        embedding_dim: int,
-        eps: float = 1e-6,
-    ) -> None:
-        super().__init__()
-        self._embedding_dim: int = embedding_dim
-        self._eps: float = eps
-
-    def debug_str(self) -> str:
-        return "l2"
-
-    def forward(
-        self,
-        output_embeddings: torch.Tensor,
-    ) -> torch.Tensor:
-        output_embeddings = output_embeddings[..., : self._embedding_dim]
-        return output_embeddings / torch.clamp(
-            torch.linalg.norm(output_embeddings, ord=None, dim=-1, keepdim=True),
-            min=self._eps,
-        )
-
-
-class LayerNormEmbeddingPostprocessor(OutputPostprocessorModule):
-    def __init__(
-        self,
-        embedding_dim: int,
-        eps: float = 1e-6,
-    ) -> None:
-        super().__init__()
-        self._embedding_dim: int = embedding_dim
-        self._eps: float = eps
-
-    def debug_str(self) -> str:
-        return "ln"
-
-    def forward(
-        self,
-        output_embeddings: torch.Tensor,
-    ) -> torch.Tensor:
-        output_embeddings = output_embeddings[..., : self._embedding_dim]
-        return F.layer_norm(
-            output_embeddings,
-            normalized_shape=(self._embedding_dim,),
-            eps=self._eps,
-        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py
deleted file mode 100644
index 2709ddb08..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/sasrec.py
+++ /dev/null
@@ -1,316 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Implements SASRec (Self-Attentive Sequential Recommendation, https://arxiv.org/abs/1808.09781, ICDM'18).
-
-Compared with the original paper which used BCE loss, this implementation is modified so that
-we can utilize a Sampled Softmax loss proposed in Revisiting Neural Retrieval on Accelerators
-(https://arxiv.org/abs/2306.04039, KDD'23) and Turning Dross Into Gold Loss: is BERT4Rec really
-better than SASRec? (https://arxiv.org/abs/2309.07602, RecSys'23), where the authors showed
-sampled softmax loss to significantly improved SASRec model quality.
-"""
-
-from typing import Dict, Optional, Tuple
-
-import torch
-import torch.nn.functional as F
-from generative_recommenders.research.modeling.sequential.embedding_modules import (
-    EmbeddingModule,
-)
-from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
-    InputFeaturesPreprocessorModule,
-)
-from generative_recommenders.research.modeling.sequential.output_postprocessors import (
-    OutputPostprocessorModule,
-)
-from generative_recommenders.research.modeling.sequential.utils import (
-    get_current_embeddings,
-)
-from generative_recommenders.research.modeling.similarity_module import (
-    SequentialEncoderWithLearnedSimilarityModule,
-)
-from generative_recommenders.research.rails.similarities.module import SimilarityModule
-
-
-class StandardAttentionFF(torch.nn.Module):
-    def __init__(
-        self,
-        embedding_dim: int,
-        hidden_dim: int,
-        activation_fn: str,
-        dropout_rate: float,
-    ) -> None:
-        super().__init__()
-
-        assert activation_fn == "relu" or activation_fn == "gelu", (
-            f"Invalid activation_fn {activation_fn}"
-        )
-
-        self._conv1d = torch.nn.Sequential(
-            torch.nn.Conv1d(
-                in_channels=embedding_dim,
-                out_channels=hidden_dim,
-                kernel_size=1,
-            ),
-            torch.nn.GELU() if activation_fn == "gelu" else torch.nn.ReLU(),
-            torch.nn.Dropout(p=dropout_rate),
-            torch.nn.Conv1d(
-                in_channels=hidden_dim,
-                out_channels=embedding_dim,
-                kernel_size=1,
-            ),
-            torch.nn.Dropout(p=dropout_rate),
-        )
-
-    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
-        # Conv1D requires (B, D, N)
-        return self._conv1d(inputs.transpose(-1, -2)).transpose(-1, -2) + inputs
-
-
-class SASRec(SequentialEncoderWithLearnedSimilarityModule):
-    """
-    Implements SASRec (Self-Attentive Sequential Recommendation, https://arxiv.org/abs/1808.09781, ICDM'18).
-
-    Compared with the original paper which used BCE loss, this implementation is modified so that
-    we can utilize a Sampled Softmax loss proposed in Revisiting Neural Retrieval on Accelerators
-    (https://arxiv.org/abs/2306.04039, KDD'23) and Turning Dross Into Gold Loss: is BERT4Rec really
-    better than SASRec? (https://arxiv.org/abs/2309.07602, RecSys'23), where the authors showed
-    sampled softmax loss to significantly improved SASRec model quality.
-    """
-
-    def __init__(
-        self,
-        max_sequence_len: int,
-        max_output_len: int,
-        embedding_dim: int,
-        num_blocks: int,
-        num_heads: int,
-        ffn_hidden_dim: int,
-        ffn_activation_fn: str,
-        ffn_dropout_rate: float,
-        embedding_module: EmbeddingModule,
-        similarity_module: SimilarityModule,
-        input_features_preproc_module: InputFeaturesPreprocessorModule,
-        output_postproc_module: OutputPostprocessorModule,
-        activation_checkpoint: bool = False,
-        verbose: bool = False,
-    ) -> None:
-        super().__init__(ndp_module=similarity_module)
-
-        self._embedding_module: EmbeddingModule = embedding_module
-        self._embedding_dim: int = embedding_dim
-        self._item_embedding_dim: int = embedding_module.item_embedding_dim
-        self._max_sequence_length: int = max_sequence_len + max_output_len
-        self._input_features_preproc: InputFeaturesPreprocessorModule = (
-            input_features_preproc_module
-        )
-        self._output_postproc: OutputPostprocessorModule = output_postproc_module
-        self._activation_checkpoint: bool = activation_checkpoint
-        self._verbose: bool = verbose
-
-        self.attention_layers = torch.nn.ModuleList()
-        self.forward_layers = torch.nn.ModuleList()
-        self._num_blocks: int = num_blocks
-        self._num_heads: int = num_heads
-        self._ffn_hidden_dim: int = ffn_hidden_dim
-        self._ffn_activation_fn: str = ffn_activation_fn
-        self._ffn_dropout_rate: float = ffn_dropout_rate
-
-        for _ in range(num_blocks):
-            self.attention_layers.append(
-                torch.nn.MultiheadAttention(
-                    embed_dim=self._embedding_dim,
-                    num_heads=num_heads,
-                    dropout=ffn_dropout_rate,
-                    batch_first=True,
-                )
-            )
-            self.forward_layers.append(
-                StandardAttentionFF(
-                    embedding_dim=self._embedding_dim,
-                    hidden_dim=ffn_hidden_dim,
-                    activation_fn=ffn_activation_fn,
-                    dropout_rate=self._ffn_dropout_rate,
-                )
-            )
-
-        self.register_buffer(
-            "_attn_mask",
-            torch.triu(
-                torch.ones(
-                    (self._max_sequence_length, self._max_sequence_length),
-                    dtype=torch.bool,
-                ),
-                diagonal=1,
-            ),
-        )
-        self.reset_state()
-
-    def reset_state(self) -> None:
-        for name, params in self.named_parameters():
-            if (
-                "_input_features_preproc" in name
-                or "_embedding_module" in name
-                or "_output_postproc" in name
-            ):
-                if self._verbose:
-                    print(f"Skipping initialization for {name}")
-                continue
-            try:
-                torch.nn.init.xavier_normal_(params.data)
-                if self._verbose:
-                    print(
-                        f"Initialize {name} as xavier normal: {params.data.size()} params"
-                    )
-            except:
-                if self._verbose:
-                    print(f"Failed to initialize {name}: {params.data.size()} params")
-
-    def get_item_embeddings(self, item_ids: torch.Tensor) -> torch.Tensor:
-        return self._embedding_module.get_item_embeddings(item_ids)
-
-    def debug_str(self) -> str:
-        return (
-            f"SASRec-d{self._item_embedding_dim}-b{self._num_blocks}-h{self._num_heads}"
-            + "-"
-            + self._input_features_preproc.debug_str()
-            + "-"
-            + self._output_postproc.debug_str()
-            + f"-ffn{self._ffn_hidden_dim}-{self._ffn_activation_fn}-d{self._ffn_dropout_rate}"
-            + f"{'-ac' if self._activation_checkpoint else ''}"
-        )
-
-    def _run_one_layer(
-        self,
-        i: int,
-        user_embeddings: torch.Tensor,
-        valid_mask: torch.Tensor,
-    ) -> torch.Tensor:
-        Q = F.layer_norm(
-            user_embeddings,
-            normalized_shape=(self._embedding_dim,),
-            eps=1e-8,
-        )
-        mha_outputs, _ = self.attention_layers[i](
-            query=Q,
-            key=user_embeddings,
-            value=user_embeddings,
-            attn_mask=self._attn_mask,
-        )
-        user_embeddings = self.forward_layers[i](
-            F.layer_norm(
-                Q + mha_outputs,
-                normalized_shape=(self._embedding_dim,),
-                eps=1e-8,
-            )
-        )
-        user_embeddings *= valid_mask
-        return user_embeddings
-
-    def generate_user_embeddings(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-    ) -> torch.Tensor:
-        """
-        Args:
-            past_ids: (B, N,) x int
-
-        Returns:
-            (B, N, D,) x float
-        """
-        past_lengths, user_embeddings, valid_mask = self._input_features_preproc(
-            past_lengths=past_lengths,
-            past_ids=past_ids,
-            past_embeddings=past_embeddings,
-            past_payloads=past_payloads,
-        )
-
-        for i in range(len(self.attention_layers)):
-            if self._activation_checkpoint:
-                user_embeddings = torch.utils.checkpoint.checkpoint(
-                    self._run_one_layer,
-                    i,
-                    user_embeddings,
-                    valid_mask,
-                    use_reentrant=False,
-                )
-            else:
-                user_embeddings = self._run_one_layer(i, user_embeddings, valid_mask)
-
-        return self._output_postproc(user_embeddings)
-
-    def forward(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-        batch_id: Optional[int] = None,
-    ) -> torch.Tensor:
-        """
-        Args:
-            past_ids: [B, N] x int64 where the latest engaged ids come first. In
-                particular, [:, 0] should correspond to the last engaged values.
-            past_ratings: [B, N] x int64.
-            past_timestamps: [B, N] x int64.
-
-        Returns:
-            encoded_embeddings of [B, N, D].
-        """
-        encoded_embeddings = self.generate_user_embeddings(
-            past_lengths,
-            past_ids,
-            past_embeddings,
-            past_payloads,
-        )
-        return encoded_embeddings
-
-    def encode(
-        self,
-        past_lengths: torch.Tensor,
-        past_ids: torch.Tensor,  # [B, N] x int64
-        past_embeddings: torch.Tensor,
-        past_payloads: Dict[str, torch.Tensor],
-    ) -> torch.Tensor:
-        encoded_seq_embeddings = self.generate_user_embeddings(
-            past_lengths, past_ids, past_embeddings, past_payloads
-        )  # [B, N, D]
-        return get_current_embeddings(
-            lengths=past_lengths, encoded_embeddings=encoded_seq_embeddings
-        )
-
-    def predict(
-        self,
-        past_ids: torch.Tensor,
-        past_ratings: torch.Tensor,
-        past_timestamps: torch.Tensor,
-        next_timestamps: torch.Tensor,
-        target_ids: torch.Tensor,
-        batch_id: Optional[int] = None,
-    ) -> torch.Tensor:
-        return self.interaction(  # pyre-ignore [29]
-            self.encode(
-                past_ids,
-                past_ratings,
-                past_timestamps,
-                next_timestamps,  # pyre-ignore [6]
-            ),
-            target_ids,
-        )  # [B, X]
diff --git a/recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py b/recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py
deleted file mode 100644
index 60dfb8e44..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/sequential/utils.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import torch
-
-
-def batch_gather_embeddings(
-    rowwise_indices: torch.Tensor,
-    embeddings: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Args:
-        rowwise_indices: (B, N) x int, where each entry is in [0, X).
-        embeddings: (B, X, D,) x float.
-
-    Returns:
-        (B, N, D,) x float, embeddings corresponding to rowwise_indices.
-    """
-    _, N = rowwise_indices.size()
-    B, X, D = embeddings.size()
-    flattened_indices = (
-        rowwise_indices
-        + torch.arange(
-            start=0,
-            end=B,
-            step=1,
-            dtype=rowwise_indices.dtype,
-            device=rowwise_indices.device,
-        )
-        .unsqueeze(1)
-        .expand(-1, N)
-        * X
-    )
-    return embeddings.view(-1, D)[flattened_indices, :].reshape(
-        rowwise_indices.size() + (D,)
-    )
-
-
-def batch_scatter_embeddings(
-    dst_embeddings: torch.Tensor,
-    rowwise_indices: torch.Tensor,
-    src_embeddings: torch.Tensor,
-) -> None:
-    """
-    Args:
-        dst_embeddings: (B, N, D,) x float.
-        rowwise_indices: (B,) x int, where each entry is in [0, N - 1).
-        source_embeddings: (B, D,) x float.
-    """
-    B, N, D = dst_embeddings.size()
-    flattened_indices = rowwise_indices + torch.arange(
-        start=0,
-        end=B * N,
-        step=N,
-        dtype=rowwise_indices.dtype,
-        device=rowwise_indices.device,
-    )
-    dst_embeddings.view(B * N, D)[flattened_indices, :] = src_embeddings
-
-
-def get_current_embeddings(
-    lengths: torch.Tensor,
-    encoded_embeddings: torch.Tensor,
-) -> torch.Tensor:
-    """
-    Args:
-        lengths: (B,) x int
-        seq_embeddings: (B, N, D,) x float
-
-    Returns:
-        (B, D,) x float, where [i, :] == encoded_embeddings[i, lengths[i] - 1, :]
-    """
-    B, N, D = encoded_embeddings.size()
-    flattened_offsets = (lengths - 1) + torch.arange(
-        start=0, end=B, step=1, dtype=lengths.dtype, device=lengths.device
-    ) * N
-    return encoded_embeddings.reshape(-1, D)[flattened_offsets, :].reshape(B, D)
-
-
-def jagged_or_dense_repeat_interleave_dim0(
-    x: torch.Tensor, lengths: torch.Tensor, repeats: int
-) -> torch.Tensor:
-    if len(x.size()) == 3:
-        return x.repeat_interleave(repeats, dim=0)
-    else:
-        assert len(x.size()) == 2, f"x.size() = {x.size()}"
-        padded_x = torch.ops.fbgemm.jagged_to_padded_dense(
-            values=x,
-            offsets=[torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)],
-            max_lengths=[lengths.max()],
-            padding_value=0.0,
-        )
-        lengths = lengths.repeat_interleave(repeats, dim=0)
-        return torch.ops.fbgemm.dense_to_jagged(
-            padded_x.repeat_interleave(repeats, dim=0),
-            [torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)],
-        )[0]
-
-
-def jagged_or_dense_index_select_dim0(
-    x: torch.Tensor, lengths: torch.Tensor, indices: torch.Tensor
-) -> torch.Tensor:
-    if len(x.size()) == 3:
-        return x[indices, :, :]
-    else:
-        assert len(x.size()) == 2, f"x.size() = {x.size()}"
-        padded_x = torch.ops.fbgemm.jagged_to_padded_dense(
-            values=x,
-            offsets=[torch.ops.fbgemm.asynchronous_complete_cumsum(lengths)],
-            max_lengths=[lengths.max()],
-            padding_value=0.0,
-        )
-        return torch.ops.fbgemm.dense_to_jagged(
-            padded_x[indices, :],
-            [torch.ops.fbgemm.asynchronous_complete_cumsum(lengths[indices])],
-        )[0]
diff --git a/recommendation_v4/generative_recommenders/research/modeling/similarity_module.py b/recommendation_v4/generative_recommenders/research/modeling/similarity_module.py
deleted file mode 100644
index 3ba32d239..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/similarity_module.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import abc
-from typing import Optional
-
-import torch
-from generative_recommenders.research.rails.similarities.module import SimilarityModule
-
-
-class SequentialEncoderWithLearnedSimilarityModule(torch.nn.Module):
-    """
-    Interface enabling using various similarity functions (besides inner products)
-    as part of a sequential encoder/decoder.
-
-    See rails/ for more details.
-    """
-
-    def __init__(
-        self,
-        ndp_module: SimilarityModule,
-    ) -> None:
-        super().__init__()
-
-        self._ndp_module: SimilarityModule = ndp_module
-
-    @abc.abstractmethod
-    def debug_str(
-        self,
-    ) -> str:
-        pass
-
-    def similarity_fn(
-        self,
-        query_embeddings: torch.Tensor,
-        item_ids: torch.Tensor,
-        item_embeddings: Optional[torch.Tensor] = None,
-        **kwargs,
-    ) -> torch.Tensor:
-        torch._assert(
-            len(query_embeddings.size()) == 2, "len(query_embeddings.size()) must be 2"
-        )
-        torch._assert(len(item_ids.size()) == 2, "len(item_ids.size()) must be 2")
-        if item_embeddings is None:
-            item_embeddings = self.get_item_embeddings(item_ids)  # pyre-ignore [29]
-        torch._assert(
-            len(item_embeddings.size()) == 3, "len(item_embeddings.size()) must be 3"
-        )
-
-        return self._ndp_module(
-            query_embeddings=query_embeddings,  # (B, query_embedding_dim)
-            item_embeddings=item_embeddings,  # (1/B, X, item_embedding_dim)
-            item_ids=item_ids,
-            **kwargs,
-        )
diff --git a/recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py b/recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py
deleted file mode 100644
index 7fd870b4b..000000000
--- a/recommendation_v4/generative_recommenders/research/modeling/similarity_utils.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-from typing import List, Optional, Tuple
-
-import gin
-import torch
-from generative_recommenders.research.rails.similarities.dot_product_similarity_fn import (
-    DotProductSimilarity,
-)
-from generative_recommenders.research.rails.similarities.layers import SwiGLU
-from generative_recommenders.research.rails.similarities.mol.item_embeddings_fn import (
-    RecoMoLItemEmbeddingsFn,
-)
-from generative_recommenders.research.rails.similarities.mol.query_embeddings_fn import (
-    RecoMoLQueryEmbeddingsFn,
-)
-from generative_recommenders.research.rails.similarities.mol.similarity_fn import (
-    MoLSimilarity,
-    SoftmaxDropoutCombiner,
-)
-
-
-def init_mlp_xavier_weights_zero_bias(m) -> None:
-    if isinstance(m, torch.nn.Linear):
-        torch.nn.init.xavier_uniform(m.weight)
-        if getattr(m, "bias", None) is not None:
-            m.bias.data.fill_(0.0)
-
-
-@gin.configurable
-def create_mol_interaction_module(
-    query_embedding_dim: int,
-    item_embedding_dim: int,
-    dot_product_dimension: int,
-    query_dot_product_groups: int,
-    item_dot_product_groups: int,
-    temperature: float,
-    query_dropout_rate: float,
-    query_hidden_dim: int,
-    item_dropout_rate: float,
-    item_hidden_dim: int,
-    gating_query_hidden_dim: int,
-    gating_qi_hidden_dim: int,
-    gating_item_hidden_dim: int,
-    softmax_dropout_rate: float,
-    bf16_training: bool,
-    gating_query_fn: bool = True,
-    gating_item_fn: bool = True,
-    dot_product_l2_norm: bool = True,
-    query_nonlinearity: str = "geglu",
-    item_nonlinearity: str = "geglu",
-    uid_dropout_rate: float = 0.5,
-    uid_embedding_hash_sizes: Optional[List[int]] = None,
-    uid_embedding_level_dropout: bool = False,
-    gating_combination_type: str = "glu_silu",
-    gating_item_dropout_rate: float = 0.0,
-    gating_qi_dropout_rate: float = 0.0,
-    eps: float = 1e-6,
-) -> Tuple[MoLSimilarity, str]:
-    """
-    Gin wrapper for creating MoL learned similarity.
-    """
-    mol_module = MoLSimilarity(
-        query_embedding_dim=query_embedding_dim,
-        item_embedding_dim=item_embedding_dim,
-        dot_product_dimension=dot_product_dimension,
-        query_dot_product_groups=query_dot_product_groups,
-        item_dot_product_groups=item_dot_product_groups,
-        temperature=temperature,
-        dot_product_l2_norm=dot_product_l2_norm,
-        query_embeddings_fn=RecoMoLQueryEmbeddingsFn(
-            query_embedding_dim=query_embedding_dim,
-            query_dot_product_groups=query_dot_product_groups,
-            dot_product_dimension=dot_product_dimension,
-            dot_product_l2_norm=dot_product_l2_norm,
-            proj_fn=lambda input_dim, output_dim: (
-                torch.nn.Sequential(
-                    torch.nn.Dropout(p=query_dropout_rate),
-                    SwiGLU(
-                        in_features=input_dim,
-                        out_features=query_hidden_dim,
-                    ),
-                    torch.nn.Linear(
-                        in_features=query_hidden_dim,
-                        out_features=output_dim,
-                    ),
-                ).apply(init_mlp_xavier_weights_zero_bias)
-            ),
-            eps=eps,
-        ),
-        item_embeddings_fn=RecoMoLItemEmbeddingsFn(
-            item_embedding_dim=item_embedding_dim,
-            item_dot_product_groups=item_dot_product_groups,
-            dot_product_dimension=dot_product_dimension,
-            dot_product_l2_norm=dot_product_l2_norm,
-            proj_fn=lambda input_dim, output_dim: (
-                torch.nn.Sequential(
-                    torch.nn.Dropout(p=item_dropout_rate),
-                    SwiGLU(in_features=input_dim, out_features=item_hidden_dim),
-                    torch.nn.Linear(
-                        in_features=item_hidden_dim,
-                        out_features=output_dim,
-                    ),
-                ).apply(init_mlp_xavier_weights_zero_bias)
-            ),
-            eps=eps,
-        ),
-        gating_query_only_partial_fn=lambda input_dim, output_dim: (  # pyre-ignore [6]
-            torch.nn.Sequential(
-                torch.nn.Linear(
-                    in_features=input_dim,
-                    out_features=gating_query_hidden_dim,
-                ),
-                torch.nn.SiLU(),
-                torch.nn.Linear(
-                    in_features=gating_query_hidden_dim,
-                    out_features=output_dim,
-                    bias=False,
-                ),
-            ).apply(init_mlp_xavier_weights_zero_bias)
-            if gating_query_fn
-            else None
-        ),
-        gating_item_only_partial_fn=lambda input_dim, output_dim: (  # pyre-ignore [6]
-            torch.nn.Sequential(
-                torch.nn.Dropout(p=gating_item_dropout_rate),
-                torch.nn.Linear(
-                    in_features=input_dim,
-                    out_features=gating_item_hidden_dim,
-                ),
-                torch.nn.SiLU(),
-                torch.nn.Linear(
-                    in_features=gating_item_hidden_dim,
-                    out_features=output_dim,
-                    bias=False,
-                ),
-            ).apply(init_mlp_xavier_weights_zero_bias)
-            if gating_item_fn
-            else None
-        ),
-        gating_qi_partial_fn=lambda input_dim, output_dim: (  # pyre-ignore [6]
-            torch.nn.Sequential(
-                torch.nn.Dropout(p=gating_qi_dropout_rate),
-                torch.nn.Linear(
-                    in_features=input_dim,
-                    out_features=gating_qi_hidden_dim,
-                ),
-                torch.nn.SiLU(),
-                torch.nn.Linear(
-                    in_features=gating_qi_hidden_dim,
-                    out_features=output_dim,
-                ),
-            ).apply(init_mlp_xavier_weights_zero_bias)
-            if gating_qi_hidden_dim > 0
-            else torch.nn.Sequential(
-                torch.nn.Dropout(p=gating_qi_dropout_rate),
-                torch.nn.Linear(
-                    in_features=input_dim,
-                    out_features=output_dim,
-                ),
-            ).apply(init_mlp_xavier_weights_zero_bias)
-        ),
-        gating_combination_type=gating_combination_type,
-        gating_normalization_fn=lambda _: SoftmaxDropoutCombiner(
-            dropout_rate=softmax_dropout_rate, eps=1e-6
-        ),
-        eps=eps,
-        autocast_bf16=bf16_training,
-    )
-    interaction_module_debug_str = (
-        f"MoL-{query_dot_product_groups}x{item_dot_product_groups}x{dot_product_dimension}"
-        + f"-t{temperature}-d{softmax_dropout_rate}"
-        + f"{'-l2' if dot_product_l2_norm else ''}"
-        + f"-q{query_hidden_dim}d{query_dropout_rate}{query_nonlinearity}"
-        + f"-i{item_hidden_dim}d{item_dropout_rate}{item_nonlinearity}"
-        + (f"-gq{gating_query_hidden_dim}" if gating_query_fn else "")
-        + (
-            f"-gi{gating_item_hidden_dim}d{gating_item_dropout_rate}"
-            if gating_item_fn
-            else ""
-        )
-        + f"-gqi{gating_qi_hidden_dim}d{gating_qi_dropout_rate}-x-{gating_combination_type}"
-    )
-    return mol_module, interaction_module_debug_str
-
-
-@gin.configurable
-def get_similarity_function(
-    module_type: str,
-    query_embedding_dim: int,
-    item_embedding_dim: int,
-    bf16_training: bool = False,
-    activation_checkpoint: bool = False,
-) -> Tuple[torch.nn.Module, str]:
-    if module_type == "DotProduct":
-        interaction_module = DotProductSimilarity()
-        interaction_module_debug_str = "DotProduct"
-    elif module_type == "MoL":
-        interaction_module, interaction_module_debug_str = (
-            create_mol_interaction_module(
-                query_embedding_dim=query_embedding_dim,
-                item_embedding_dim=item_embedding_dim,
-                bf16_training=bf16_training,
-            )
-        )
-    else:
-        raise ValueError(f"Unknown interaction_module_type {module_type}")
-    return interaction_module, interaction_module_debug_str
diff --git a/recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py b/recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py
deleted file mode 100644
index f628468ce..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/indexing/candidate_index.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import abc
-from typing import Tuple
-
-import torch
-
-
-class TopKModule(torch.nn.Module):
-    @abc.abstractmethod
-    def forward(
-        self,
-        query_embeddings: torch.Tensor,
-        k: int,
-        sorted: bool = True,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Args:
-            query_embeddings: (B, X, ...). Implementation-specific.
-            k: int. top k to return.
-            sorted: bool.
-
-        Returns:
-            Tuple of (top_k_scores, top_k_ids), both of shape (B, K,)
-        """
-        pass
diff --git a/recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py b/recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py
deleted file mode 100644
index 810b24c42..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/indexing/mips_top_k.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-from typing import Tuple
-
-import torch
-from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
-
-
-class MIPSTopKModule(TopKModule):
-    def __init__(
-        self,
-        item_embeddings: torch.Tensor,
-        item_ids: torch.Tensor,
-    ) -> None:
-        """
-        Args:
-            item_embeddings: (1, X, D)
-            item_ids: (1, X,)
-        """
-        super().__init__()
-
-        self._item_embeddings: torch.Tensor = item_embeddings
-        self._item_ids: torch.Tensor = item_ids
-
-
-class MIPSBruteForceTopK(MIPSTopKModule):
-    def __init__(
-        self,
-        item_embeddings: torch.Tensor,
-        item_ids: torch.Tensor,
-    ) -> None:
-        super().__init__(
-            item_embeddings=item_embeddings,
-            item_ids=item_ids,
-        )
-        del self._item_embeddings
-        self._item_embeddings_t: torch.Tensor = item_embeddings.permute(
-            2, 1, 0
-        ).squeeze(2)
-
-    def forward(
-        self,
-        query_embeddings: torch.Tensor,
-        k: int,
-        sorted: bool = True,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Args:
-            query_embeddings: (B, ...). Implementation-specific.
-            k: int. final top-k to return.
-            sorted: bool. whether to sort final top-k results or not.
-
-        Returns:
-            Tuple of (top_k_scores x float, top_k_ids x int), both of shape (B, K,)
-        """
-        # (B, X,)
-        all_logits = torch.mm(query_embeddings, self._item_embeddings_t)
-        top_k_logits, top_k_indices = torch.topk(
-            all_logits,
-            dim=1,
-            k=k,
-            sorted=sorted,
-            largest=True,
-        )  # (B, k,)
-        return top_k_logits, self._item_ids.squeeze(0)[top_k_indices]
diff --git a/recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py b/recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py
deleted file mode 100644
index fe88ca919..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/indexing/mol_top_k.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Defines exact- and approximate- Top-K modules for Mixture-of-Logits (MoL),
-discussed in Retrieval with Learned Similarities (https://arxiv.org/abs/2407.15462).
-
-Forked from bailuding/rails @ 664fdb9.
-"""
-
-from typing import Tuple
-
-import torch
-from generative_recommenders.research.rails.indexing.candidate_index import TopKModule
-from generative_recommenders.research.rails.similarities.mol.similarity_fn import (
-    MoLSimilarity,
-)
-
-
-class MoLTopKModule(TopKModule):
-    def __init__(
-        self,
-        mol_module: MoLSimilarity,
-        item_embeddings: torch.Tensor,
-        item_ids: torch.Tensor,
-        flatten_item_ids_and_embeddings: bool,
-        keep_component_level_item_embeddings: bool,
-        component_level_item_embeddings_dtype: torch.dtype = torch.bfloat16,
-    ) -> None:
-        """
-        Args:
-            mol_module: MoLSimilarity.
-            item_embeddings: (1, X, D) if mol_module._apply_item_embeddings_fn is True,
-                (1, X, P_X, D_P) otherwise.
-            item_ids: (1, X,) representing the item ids.
-            flatten_item_ids_and_embeddings: bool. If true, do not keep the extra (1,)
-                dimension at size(0).
-            keep_component_level_item_embeddings: bool. If true, keep P_x component-level
-                embeddings in `self._mol_item_embeddings` for downstream applications.
-            component_level_item_embeddings_dtype: torch.dtype. If set, the dtype
-                to keep component-level item embeddings in. By default we use bfloat16.
-        """
-        super().__init__()
-
-        self._mol_module: MoLSimilarity = mol_module
-        self._item_embeddings: torch.Tensor = (
-            item_embeddings
-            if not flatten_item_ids_and_embeddings
-            else item_embeddings.squeeze(0)
-        )
-
-        if keep_component_level_item_embeddings:
-            self._mol_item_embeddings: torch.Tensor = (
-                mol_module.get_item_component_embeddings(
-                    (
-                        self._item_embeddings.squeeze(0)
-                        if not flatten_item_ids_and_embeddings
-                        else self._item_embeddings
-                    ),
-                    decoupled_inference=True,
-                )[0]  # (X, D) -> (X, P_X, D_P)
-            ).to(component_level_item_embeddings_dtype)
-
-        self._item_ids: torch.Tensor = (
-            item_ids if not flatten_item_ids_and_embeddings else item_ids.squeeze(0)
-        )
-
-    @property
-    def mol_module(self) -> MoLSimilarity:
-        return self._mol_module
-
-
-class MoLBruteForceTopK(MoLTopKModule):
-    def __init__(
-        self,
-        mol_module: MoLSimilarity,
-        item_embeddings: torch.Tensor,
-        item_ids: torch.Tensor,
-    ) -> None:
-        super().__init__(
-            mol_module=mol_module,
-            item_embeddings=item_embeddings,
-            item_ids=item_ids,
-            flatten_item_ids_and_embeddings=False,
-            keep_component_level_item_embeddings=False,
-        )
-
-    def forward(
-        self,
-        query_embeddings: torch.Tensor,
-        k: int,
-        sorted: bool = True,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Args:
-            query_embeddings: (B, X, D) if mol_module._apply_query_embeddings_fn is True,
-                (B, X, P_Q, D_P) otherwise.
-            k: int. final top-k to return.
-            sorted: bool. whether to sort final top-k results or not.
-            **kwargs: Implementation-specific keys/values.
-
-        Returns:
-            Tuple of (top_k_scores x float, top_k_ids x int), both of shape (B, K,)
-        """
-        # (B, X,)
-        all_logits, _ = self.mol_module(
-            query_embeddings,
-            self._item_embeddings,
-            **kwargs,
-        )
-        top_k_logits, top_k_indices = torch.topk(
-            all_logits,
-            dim=1,
-            k=k,
-            sorted=sorted,
-            largest=True,
-        )  # (B, k,)
-        return top_k_logits, self._item_ids.squeeze(0)[top_k_indices]
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py
deleted file mode 100644
index 9357fd0e4..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/similarities/dot_product_similarity_fn.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-from typing import Dict, Tuple
-
-import torch
-from generative_recommenders.research.rails.similarities.module import SimilarityModule
-
-
-class DotProductSimilarity(SimilarityModule):
-    def __init__(
-        self,
-    ) -> None:
-        super().__init__()
-
-    def debug_str(self) -> str:
-        return "dp"
-
-    def forward(
-        self,
-        query_embeddings: torch.Tensor,
-        item_embeddings: torch.Tensor,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            query_embeddings: (B, D,) or (B * r, D) x float.
-            item_embeddings: (1, X, D) or (B, X, D) x float.
-
-        Returns:
-            (B, X) x float.
-        """
-
-        B_I, X, D = item_embeddings.size()
-        if B_I == 1:
-            # [B, D] x ([1, X, D] -> [D, X]) => [B, X]
-            return (
-                torch.mm(query_embeddings, item_embeddings.squeeze(0).t()),
-                {},
-            )  # [B, X]
-        elif query_embeddings.size(0) != B_I:
-            # (B * r, D) x (B, X, D).
-            return (
-                torch.bmm(
-                    query_embeddings.view(B_I, -1, D),
-                    item_embeddings.permute(0, 2, 1),
-                ).view(-1, X),
-                {},
-            )
-        else:
-            # [B, X, D] x ([B, D] -> [B, D, 1]) => [B, X, 1] -> [B, X]
-            return (
-                torch.bmm(item_embeddings, query_embeddings.unsqueeze(2)).squeeze(2),
-                {},
-            )
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/layers.py b/recommendation_v4/generative_recommenders/research/rails/similarities/layers.py
deleted file mode 100644
index 3f838bc48..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/similarities/layers.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Defines network architectures used in constructing various learned similarities.
-
-Forked from bailuding/rails @ 664fdb9.
-"""
-
-import torch
-import torch.nn.functional as F
-
-
-class GeGLU(torch.nn.Module):
-    def __init__(
-        self,
-        in_features: int,
-        out_features: int,
-    ) -> None:
-        super().__init__()
-
-        self._in_features = in_features
-        self._out_features = out_features
-        self._w = torch.nn.Parameter(
-            torch.empty((in_features, out_features * 2)).normal_(mean=0, std=0.02),
-        )
-        self._b = torch.nn.Parameter(
-            torch.zeros((1, out_features * 2)),
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        bs = x.size()[:-1]
-        lhs, rhs = torch.split(
-            torch.mm(x.reshape(-1, self._in_features), self._w) + self._b,
-            [self._out_features, self._out_features],
-            dim=-1,
-        )
-        return (F.gelu(lhs) * rhs).reshape(bs + (self._out_features,))
-
-
-class SwiGLU(torch.nn.Module):
-    """
-    SwiGLU from https://arxiv.org/abs/2002.05202.
-    """
-
-    def __init__(
-        self,
-        in_features: int,
-        out_features: int,
-    ) -> None:
-        super().__init__()
-
-        self._in_features = in_features
-        self._out_features = out_features
-        self._w = torch.nn.Parameter(
-            torch.empty((in_features, out_features * 2)).normal_(mean=0, std=0.02),
-        )
-        self._b = torch.nn.Parameter(
-            torch.zeros((1, out_features * 2)),
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        bs = x.size()[:-1]
-        lhs, rhs = torch.split(
-            torch.mm(x.reshape(-1, self._in_features), self._w) + self._b,
-            [self._out_features, self._out_features],
-            dim=-1,
-        )
-        return (F.silu(lhs) * rhs).reshape(bs + (self._out_features,))
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/module.py b/recommendation_v4/generative_recommenders/research/rails/similarities/module.py
deleted file mode 100644
index e4061fa74..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/similarities/module.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import abc
-from typing import Dict, Tuple
-
-import torch
-
-
-class SimilarityModule(torch.nn.Module):
-    """
-    Interface enabling interfacing with various similarity functions.
-
-    While the discussions in our initial ICML'24 paper are based on inner products
-    for simplicity, we provide this interface (SimilarityModule) to support various
-    learned similarities at the retrieval stage, such as MLPs, Factorization Machines
-    (FMs), and Mixture-of-Logits (MoL), which we discussed in
-    - Revisiting Neural Retrieval on Accelerators (KDD'23), and
-    - Retrieval with Learned Similarities (https://arxiv.org/abs/2407.15462).
-    """
-
-    @abc.abstractmethod
-    def forward(
-        self,
-        query_embeddings: torch.Tensor,
-        item_embeddings: torch.Tensor,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            query_embeddings: (B, input_embedding_dim) x float.
-            item_embeddings: (1/B, X, item_embedding_dim) x float.
-            **kwargs: Implementation-specific keys/values (e.g.,
-                item ids / sideinfo, etc.)
-
-        Returns:
-            A tuple of (
-                (B, X,) similarity values,
-                keyed outputs representing auxiliary losses at training time.
-            ).
-        """
-        pass
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py
deleted file mode 100644
index fd94e6f22..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/embeddings_fn.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Defines interface for generating query- and item-side embeddings for MoL.
-
-Forked from bailuding/rails @ 664fdb9.
-"""
-
-import abc
-from typing import Dict, Tuple
-
-import torch
-
-
-class MoLEmbeddingsFn(torch.nn.Module):
-    """
-    Generates K_Q query-side (K_I item-side) embeddings for MoL based on
-    input embeddings and other optional implementation-specific tensors.
-    """
-
-    @abc.abstractmethod
-    def forward(
-        self,
-        input_embeddings: torch.Tensor,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            input_embeddings: (B, ...) x float where B is the batch size.
-            kwargs: implementation-specific.
-
-        Returns:
-            Tuple of (
-                (B, query_dot_product_groups/item_dot_product_groups, dot_product_embedding_dim) x float,
-                str-keyed auxiliary losses.
-            ).
-        """
-        pass
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py
deleted file mode 100644
index 237cd8942..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/item_embeddings_fn.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Defines functions to generate item-side embeddings for MoL.
-
-Forked from bailuding/rails @ 664fdb9.
-"""
-
-from typing import Callable, Dict, Tuple
-
-import torch
-from generative_recommenders.research.rails.similarities.mol.embeddings_fn import (
-    MoLEmbeddingsFn,
-)
-
-
-def init_mlp_xavier_weights_zero_bias(m) -> None:
-    if isinstance(m, torch.nn.Linear):
-        torch.nn.init.xavier_uniform_(m.weight)
-        if getattr(m, "bias", None) is not None:
-            m.bias.data.fill_(0.0)
-
-
-class RecoMoLItemEmbeddingsFn(MoLEmbeddingsFn):
-    """
-    Generates P_X query-side embeddings for MoL based on input embeddings and other
-    optional tensors for recommendation models. Tested for sequential retrieval
-    scenarios.
-    """
-
-    def __init__(
-        self,
-        item_embedding_dim: int,
-        item_dot_product_groups: int,
-        dot_product_dimension: int,
-        dot_product_l2_norm: bool,
-        proj_fn: Callable[[int, int], torch.nn.Module],
-        eps: float,
-    ) -> None:
-        super().__init__()
-
-        self._item_emb_based_dot_product_groups: int = item_dot_product_groups
-        self._item_emb_proj_module: torch.nn.Module = proj_fn(
-            item_embedding_dim,
-            dot_product_dimension * self._item_emb_based_dot_product_groups,
-        )
-        self._dot_product_dimension: int = dot_product_dimension
-        self._dot_product_l2_norm: bool = dot_product_l2_norm
-        self._eps: float = eps
-
-    def forward(
-        self,
-        input_embeddings: torch.Tensor,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            input_embeddings: (B, item_embedding_dim,) x float where B is the batch size.
-            kwargs: str-keyed tensors. Implementation-specific.
-
-        Returns:
-            Tuple of (
-                (B, item_dot_product_groups, dot_product_embedding_dim) x float,
-                str-keyed aux_losses,
-            ).
-        """
-        split_item_embeddings = self._item_emb_proj_module(input_embeddings).reshape(
-            input_embeddings.size()[:-1]
-            + (
-                self._item_emb_based_dot_product_groups,
-                self._dot_product_dimension,
-            )
-        )
-
-        if self._dot_product_l2_norm:
-            split_item_embeddings = split_item_embeddings / torch.clamp(
-                torch.linalg.norm(
-                    split_item_embeddings,
-                    ord=None,
-                    dim=-1,
-                    keepdim=True,
-                ),
-                min=self._eps,
-            )
-        return split_item_embeddings, {}
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py
deleted file mode 100644
index 8fe28ee11..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/query_embeddings_fn.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Defines functions to generate query-side embeddings for MoL.
-
-Forked from bailuding/rails @ 664fdb9.
-"""
-
-from typing import Callable, Dict, List, Optional, Tuple
-
-import torch
-import torch.nn.functional as F
-from generative_recommenders.research.rails.similarities.mol.embeddings_fn import (
-    MoLEmbeddingsFn,
-)
-
-
-def init_mlp_xavier_weights_zero_bias(m) -> None:
-    if isinstance(m, torch.nn.Linear):
-        torch.nn.init.xavier_uniform_(m.weight)
-        if getattr(m, "bias", None) is not None:
-            m.bias.data.fill_(0.0)
-
-
-class RecoMoLQueryEmbeddingsFn(MoLEmbeddingsFn):
-    """
-    Generates P_Q query-side embeddings for MoL based on input embeddings and other
-    optional tensors for recommendation models. Tested for sequential retrieval
-    scenarios.
-
-    The current implementation accesses user_ids associated with the query from
-    `user_ids' in kwargs.
-    """
-
-    def __init__(
-        self,
-        query_embedding_dim: int,
-        query_dot_product_groups: int,
-        dot_product_dimension: int,
-        dot_product_l2_norm: bool,
-        proj_fn: Callable[[int, int], torch.nn.Module],
-        eps: float,
-        uid_embedding_hash_sizes: Optional[List[int]] = None,
-        uid_dropout_rate: float = 0.0,
-        uid_embedding_level_dropout: bool = False,
-    ) -> None:
-        super().__init__()
-        self._uid_embedding_hash_sizes: List[int] = uid_embedding_hash_sizes or []
-        self._query_emb_based_dot_product_groups: int = query_dot_product_groups - len(
-            self._uid_embedding_hash_sizes
-        )
-        self._query_emb_proj_module: torch.nn.Module = proj_fn(
-            query_embedding_dim,
-            dot_product_dimension * self._query_emb_based_dot_product_groups,
-        )
-        self._dot_product_dimension: int = dot_product_dimension
-        self._dot_product_l2_norm: bool = dot_product_l2_norm
-        if len(self._uid_embedding_hash_sizes) > 0:
-            for i, hash_size in enumerate(self._uid_embedding_hash_sizes):
-                setattr(
-                    self,
-                    f"_uid_embeddings_{i}",
-                    torch.nn.Embedding(
-                        hash_size + 1, dot_product_dimension, padding_idx=0
-                    ),
-                )
-        self._uid_dropout_rate: float = uid_dropout_rate
-        self._uid_embedding_level_dropout: bool = uid_embedding_level_dropout
-        self._eps: float = eps
-
-    def forward(
-        self,
-        input_embeddings: torch.Tensor,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            input_embeddings: (B, query_embedding_dim,) x float where B is the batch size.
-            kwargs: str-keyed tensors. Implementation-specific.
-
-        Returns:
-            Tuple of (
-                (B, query_dot_product_groups, dot_product_embedding_dim) x float,
-                str-keyed aux_losses,
-            ).
-        """
-        split_query_embeddings = self._query_emb_proj_module(input_embeddings).reshape(
-            (
-                input_embeddings.size(0),
-                self._query_emb_based_dot_product_groups,
-                self._dot_product_dimension,
-            )
-        )
-
-        aux_losses: Dict[str, torch.Tensor] = {}
-
-        if len(self._uid_embedding_hash_sizes) > 0:
-            all_uid_embeddings = []
-            for i, hash_size in enumerate(self._uid_embedding_hash_sizes):
-                # TODO: decouple this from MoLQueryEmbeddingFn.
-                uid_embeddings = getattr(self, f"_uid_embeddings_{i}")(
-                    (kwargs["user_ids"] % hash_size) + 1
-                )
-                if self.training:
-                    l2_norm = (uid_embeddings * uid_embeddings).sum(-1).mean()
-                    if i == 0:
-                        aux_losses["uid_embedding_l2_norm"] = l2_norm
-                    else:
-                        aux_losses["uid_embedding_l2_norm"] = (
-                            aux_losses["uid_embedding_l2_norm"] + l2_norm
-                        )
-
-                if self._uid_dropout_rate > 0.0:
-                    if self._uid_embedding_level_dropout:
-                        # conditionally dropout the entire embedding.
-                        if self.training:
-                            uid_dropout_mask = (
-                                torch.rand(
-                                    uid_embeddings.size()[:-1],
-                                    device=uid_embeddings.device,
-                                )
-                                > self._uid_dropout_rate
-                            )
-                            uid_embeddings = (
-                                uid_embeddings
-                                * uid_dropout_mask.unsqueeze(-1)
-                                / (1.0 - self._uid_dropout_rate)
-                            )
-                    else:
-                        uid_embeddings = F.dropout(
-                            uid_embeddings,
-                            p=self._uid_dropout_rate,
-                            training=self.training,
-                        )
-                all_uid_embeddings.append(uid_embeddings.unsqueeze(1))
-            split_query_embeddings = torch.cat(
-                [split_query_embeddings] + all_uid_embeddings, dim=1
-            )
-
-        if self._dot_product_l2_norm:
-            split_query_embeddings = split_query_embeddings / torch.clamp(
-                torch.linalg.norm(
-                    split_query_embeddings,
-                    ord=None,
-                    dim=-1,
-                    keepdim=True,
-                ),
-                min=self._eps,
-            )
-        return split_query_embeddings, aux_losses
diff --git a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py b/recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py
deleted file mode 100644
index 34e4c4a23..000000000
--- a/recommendation_v4/generative_recommenders/research/rails/similarities/mol/similarity_fn.py
+++ /dev/null
@@ -1,388 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Implements MoL (Mixture-of-Logits) with load balancing regularization loss, as discussed in:
-- Revisiting Neural Retrieval on Accelerators (https://arxiv.org/abs/2306.04039, KDD'23).
-- Retrieval with Learned Similarities (https://arxiv.org/abs/2407.15462).
-
-Forked from bailuding/rails @ 664fdb9.
-"""
-
-from typing import Callable, Dict, Optional, Tuple
-
-import torch
-import torch.nn.functional as F
-from generative_recommenders.research.rails.similarities.module import SimilarityModule
-from generative_recommenders.research.rails.similarities.mol.embeddings_fn import (
-    MoLEmbeddingsFn,
-)
-
-
-@torch.compile(dynamic=True)
-def _softmax_dropout_combiner_fn(
-    x: torch.Tensor,
-    y: torch.Tensor,
-    dropout_pr: float,
-    eps: float,
-    training: bool,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """
-    Computes (_softmax_dropout_fn(x) * y).sum(-1).
-    """
-    x = F.softmax(x, dim=-1)
-    if dropout_pr > 0.0:
-        x = F.dropout(x, p=dropout_pr, training=training)
-        x = x / torch.clamp(x.sum(-1, keepdims=True), min=eps)  # pyre-ignore [19]
-    return x, (x * y).sum(-1)
-
-
-@torch.compile
-def _load_balancing_mi_loss_fn(
-    gating_prs: torch.Tensor,
-    eps: float,
-) -> torch.Tensor:
-    """
-    See Retrieval with Learned Similarities (RAILS, https://arxiv.org/abs/2407.15462) for discussions.
-    """
-    B, X, E = gating_prs.size()
-    expert_util_prs = gating_prs.view(B * X, E).sum(0, keepdim=False) / (1.0 * B * X)
-    expert_util_entropy = -(expert_util_prs * torch.log(expert_util_prs + eps)).sum()
-    per_example_expert_entropy = -(gating_prs * torch.log(gating_prs + eps)).sum() / (
-        1.0 * B * X
-    )
-    return -expert_util_entropy + per_example_expert_entropy
-
-
-class SoftmaxDropoutCombiner(torch.nn.Module):
-    def __init__(
-        self,
-        dropout_rate: float,
-        eps: float,
-    ) -> None:
-        super().__init__()
-
-        self._dropout_rate: float = dropout_rate
-        self._eps: float = eps
-
-    def forward(
-        self,
-        gating_weights: torch.Tensor,
-        x: torch.Tensor,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        gating_prs, combined_logits = _softmax_dropout_combiner_fn(
-            x=gating_weights,
-            y=x,
-            dropout_pr=self._dropout_rate,
-            eps=self._eps,
-            training=self.training,
-        )
-
-        aux_losses = {}
-        if self.training:
-            aux_losses["mi_loss"] = _load_balancing_mi_loss_fn(
-                gating_prs, eps=self._eps
-            )
-
-        return combined_logits, aux_losses
-
-
-class MoLGatingFn(torch.nn.Module):
-    """
-    Implements the gating function for MoL, used to compute pi_p(q, x) for a given (p, x) pair.
-    """
-
-    def __init__(
-        self,
-        num_logits: int,
-        query_embedding_dim: int,
-        item_embedding_dim: int,
-        query_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
-        item_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
-        qi_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
-        combination_type: str,
-        normalization_fn: Callable[[int], torch.nn.Module],
-    ) -> None:
-        super().__init__()
-
-        self._query_only_partial_module: Optional[torch.nn.Module] = (
-            query_only_partial_fn(query_embedding_dim, num_logits)
-            if query_only_partial_fn
-            else None
-        )
-        self._item_only_partial_module: Optional[torch.nn.Module] = (
-            item_only_partial_fn(item_embedding_dim, num_logits)
-            if item_only_partial_fn
-            else None
-        )
-        self._qi_partial_module: Optional[torch.nn.Module] = (
-            qi_partial_fn(
-                num_logits,
-                num_logits,
-            )
-            if qi_partial_fn is not None
-            else None
-        )
-        if (
-            self._query_only_partial_module is None
-            and self._item_only_partial_module is None
-            and self._qi_partial_module is None
-        ):
-            raise ValueError(
-                "At least one of query_only_partial_fn, item_only_partial_fn, "
-                "and qi_partial_fn must not be None."
-            )
-        self._num_logits: int = num_logits
-        self._combination_type: str = combination_type
-        self._normalization_fn: torch.nn.Module = normalization_fn(num_logits)
-
-    def forward(
-        self,
-        logits: torch.Tensor,
-        query_embeddings: torch.Tensor,
-        item_embeddings: torch.Tensor,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            logits: (B, X, P_Q * P_X) x float;
-            query_embeddings: (B, D) x float;
-            item_embeddings: (1/B, X, D') x float;
-
-        Returns:
-            (B, X) x float, Dict[str, Tensor] representing auxiliary losses.
-        """
-        B, X, _ = logits.size()
-        # [B, 1, F], [1/B, X, F], [B, X, F]
-        query_partial_inputs, item_partial_inputs, qi_partial_inputs = None, None, None
-        if self._query_only_partial_module is not None:
-            query_partial_inputs = self._query_only_partial_module(
-                query_embeddings
-            ).unsqueeze(1)
-        if self._item_only_partial_module is not None:
-            item_partial_inputs = self._item_only_partial_module(item_embeddings)
-        if self._qi_partial_module is not None:
-            qi_partial_inputs = self._qi_partial_module(logits)
-
-        if self._combination_type == "glu_silu":
-            gating_inputs = (
-                query_partial_inputs * item_partial_inputs + qi_partial_inputs
-            )
-            gating_weights = gating_inputs * F.sigmoid(gating_inputs)
-        elif self._combination_type == "glu_silu_ln":
-            gating_inputs = (
-                query_partial_inputs * item_partial_inputs + qi_partial_inputs
-            )
-            gating_weights = gating_inputs * F.sigmoid(
-                F.layer_norm(gating_inputs, normalized_shape=[self._num_logits])
-            )
-        elif self._combination_type == "none":
-            gating_inputs = query_partial_inputs
-            if gating_inputs is None:
-                gating_inputs = item_partial_inputs
-            elif item_partial_inputs is not None:
-                gating_inputs += item_partial_inputs
-            if gating_inputs is None:
-                gating_inputs = qi_partial_inputs
-            elif qi_partial_inputs is not None:
-                gating_inputs += qi_partial_inputs
-            gating_weights = gating_inputs
-        else:
-            raise ValueError(f"Unknown combination_type {self._combination_type}")
-
-        return self._normalization_fn(gating_weights, logits)
-
-
-class MoLSimilarity(SimilarityModule):
-    def __init__(
-        self,
-        query_embedding_dim: int,
-        item_embedding_dim: int,
-        dot_product_dimension: int,
-        query_dot_product_groups: int,
-        item_dot_product_groups: int,
-        temperature: float,
-        dot_product_l2_norm: bool,
-        query_embeddings_fn: MoLEmbeddingsFn,
-        item_embeddings_fn: Optional[MoLEmbeddingsFn],
-        gating_query_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
-        gating_item_only_partial_fn: Optional[Callable[[int, int], torch.nn.Module]],
-        gating_qi_partial_fn: Optional[Callable[[int], torch.nn.Module]],
-        gating_combination_type: str,
-        gating_normalization_fn: Callable[[int], torch.nn.Module],
-        eps: float,
-        apply_query_embeddings_fn: bool = True,
-        apply_item_embeddings_fn: bool = True,
-        autocast_bf16: bool = False,
-    ) -> None:
-        """
-        Args:
-            apply_query_embeddings_fn: bool. If true, compute query_embeddings_fn
-                to input during forward(). Otherwise, we assume the caller will
-                invoke get_query_component_embeddings() separately before
-                calling forward().
-            apply_item_embeddings_fn: bool. If true, compute item_embeddings_fn
-                to input during forward(). Otherwise, we assume the caller will
-                invoke get_item_component_embeddings() separately before
-                calling forward().
-        """
-        super().__init__()
-
-        self._gating_fn: MoLGatingFn = MoLGatingFn(
-            num_logits=query_dot_product_groups * item_dot_product_groups,
-            query_embedding_dim=query_embedding_dim,
-            item_embedding_dim=item_embedding_dim,
-            query_only_partial_fn=gating_query_only_partial_fn,
-            item_only_partial_fn=gating_item_only_partial_fn,
-            qi_partial_fn=gating_qi_partial_fn,  # pyre-ignore [6]
-            combination_type=gating_combination_type,
-            normalization_fn=gating_normalization_fn,
-        )
-        self._query_embeddings_fn: MoLEmbeddingsFn = query_embeddings_fn
-        self._item_embeddings_fn: MoLEmbeddingsFn = (  # pyre-ignore [8]
-            item_embeddings_fn
-        )
-        self._apply_query_embeddings_fn: bool = apply_query_embeddings_fn
-        self._apply_item_embeddings_fn: bool = apply_item_embeddings_fn
-        self._dot_product_l2_norm: bool = dot_product_l2_norm
-        self._query_dot_product_groups: int = query_dot_product_groups
-        self._item_dot_product_groups: int = item_dot_product_groups
-        self._dot_product_dimension: int = dot_product_dimension
-        self._temperature: float = temperature
-        self._eps: float = eps
-        self._autocast_bf16: bool = autocast_bf16
-
-    def get_query_component_embeddings(
-        self,
-        input_embeddings: torch.Tensor,
-        decoupled_inference: bool = False,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            input_embeddings: (B, self._input_embedding_dim,) x float
-                or (B, P_Q, self._dot_product_dimension) x float.
-            decoupled_inference: bool. If true, the call represents an attempt to run
-                forward() in decoupled mode at inference time (e.g., to pre-compute
-                component-level query embeddings for filtering, etc.). We simulate
-                the logic in forward() in this case (e.g., if forward() doesn't apply
-                query_embeddings_fn, then this call won't either).
-            kwargs: additional implementation-specific arguments.
-
-        Returns:
-            (B, query_dot_product_groups, dot_product_embedding_dim) x float.
-        """
-        if decoupled_inference and not self._apply_query_embeddings_fn:
-            return input_embeddings, {}
-        return self._query_embeddings_fn(input_embeddings, **kwargs)
-
-    def get_item_component_embeddings(
-        self,
-        input_embeddings: torch.Tensor,
-        decoupled_inference: bool = False,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            input_embeddings: (..., self._input_embedding_dim,) x float
-                or (..., P_X, self._dot_product_dimension) x float.
-            decoupled_inference: bool. If true, the call represents an attempt to run
-                forward() in decoupled mode at inference time (e.g., to pre-compute
-                component-level item embeddings for filtering, etc.). We simulate
-                the logic in forward() in this case (e.g., if forward() doesn't apply
-                item_embeddings_fn, then this call won't either).
-            kwargs: additional implementation-specific arguments.
-
-        Returns:
-            (..., item_dot_product_groups, dot_product_embedding_dim) x float.
-        """
-        if decoupled_inference and not self._apply_item_embeddings_fn:
-            return input_embeddings, {}
-
-        return self._item_embeddings_fn(input_embeddings, **kwargs)
-
-    def forward(
-        self,
-        query_embeddings: torch.Tensor,
-        item_embeddings: torch.Tensor,
-        **kwargs,
-    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
-        """
-        Args:
-            query_embeddings: (B, self._input_embedding_dim) x float or
-                (B, P_Q, self._dot_product_dimension) x float (when query_embeddings_fn
-                is applied externally).
-            item_embeddings: (1/B, X, self._item_embedding_dim) x float or
-                (1/B, X, P_X, self._dot_product_dimension) x float (when item_embeddings_fn
-                is applied externally).
-            kwargs: additional implementation-specific arguments.
-
-        Returns:
-            (B, X) x float, Dict[str, Tensor] representing auxiliary losses.
-        """
-        with torch.autocast(
-            enabled=self._autocast_bf16, dtype=torch.bfloat16, device_type="cuda"
-        ):
-            B = query_embeddings.size(0)
-            B_prime = item_embeddings.shape[0]  # 1 or B
-            X = item_embeddings.shape[1]
-
-            if self._apply_query_embeddings_fn:
-                (
-                    split_query_embeddings,
-                    query_aux_losses,
-                ) = self.get_query_component_embeddings(
-                    query_embeddings,
-                    **kwargs,
-                )
-            else:
-                split_query_embeddings, query_aux_losses = query_embeddings, {}
-
-            if self._apply_item_embeddings_fn:
-                (
-                    split_item_embeddings,
-                    item_aux_losses,
-                ) = self.get_item_component_embeddings(
-                    input_embeddings=item_embeddings,
-                    **kwargs,
-                )
-            else:
-                split_item_embeddings, item_aux_losses = item_embeddings, {}
-
-            if B_prime == 1:
-                logits = torch.einsum(
-                    "bnd,xmd->bxnm",
-                    split_query_embeddings,
-                    split_item_embeddings.squeeze(0),
-                ).reshape(
-                    B, X, self._query_dot_product_groups * self._item_dot_product_groups
-                )
-            else:
-                logits = torch.einsum(
-                    "bnd,bxmd->bxnm", split_query_embeddings, split_item_embeddings
-                ).reshape(
-                    B, X, self._query_dot_product_groups * self._item_dot_product_groups
-                )
-
-            gated_outputs, gating_aux_losses = self._gating_fn(
-                logits=logits / self._temperature,  # [B, X, L]
-                query_embeddings=query_embeddings,  # [B, D]
-                item_embeddings=item_embeddings,  # [1/B, X, D']
-            )
-            return gated_outputs, {
-                **gating_aux_losses,
-                **query_aux_losses,
-                **item_aux_losses,
-            }
diff --git a/recommendation_v4/generative_recommenders/research/trainer/data_loader.py b/recommendation_v4/generative_recommenders/research/trainer/data_loader.py
deleted file mode 100644
index 390b04bdb..000000000
--- a/recommendation_v4/generative_recommenders/research/trainer/data_loader.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import os
-from typing import Optional, Tuple
-
-import gin
-import torch
-
-
-@gin.configurable
-def create_data_loader(
-    dataset: torch.utils.data.Dataset,
-    batch_size: int,
-    world_size: int,
-    rank: int,
-    shuffle: bool,
-    prefetch_factor: int = 128,
-    num_workers: Optional[int] = os.cpu_count(),
-    drop_last: bool = False,
-) -> Tuple[
-    Optional[torch.utils.data.distributed.DistributedSampler[torch.utils.data.Dataset]],
-    torch.utils.data.DataLoader,
-]:
-    if shuffle:
-        sampler = torch.utils.data.distributed.DistributedSampler(
-            dataset,
-            num_replicas=world_size,
-            rank=rank,
-            shuffle=True,
-            seed=0,
-            drop_last=drop_last,
-        )
-    else:
-        sampler = None
-    data_loader = torch.utils.data.DataLoader(
-        dataset,
-        batch_size=batch_size,
-        # shuffle=True, cannot use with sampler
-        num_workers=num_workers or 0,
-        sampler=sampler,
-        prefetch_factor=prefetch_factor,
-    )
-    return sampler, data_loader
diff --git a/recommendation_v4/generative_recommenders/research/trainer/train.py b/recommendation_v4/generative_recommenders/research/trainer/train.py
deleted file mode 100644
index 6d2da5be7..000000000
--- a/recommendation_v4/generative_recommenders/research/trainer/train.py
+++ /dev/null
@@ -1,532 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-import logging
-import os
-import random
-import time
-from datetime import date
-from typing import Dict, Optional
-
-import gin
-import torch
-import torch.distributed as dist
-from generative_recommenders.research.data.eval import (
-    _avg,
-    add_to_summary_writer,
-    eval_metrics_v2_from_tensors,
-    get_eval_state,
-)
-from generative_recommenders.research.data.reco_dataset import get_reco_dataset
-from generative_recommenders.research.indexing.utils import get_top_k_module
-from generative_recommenders.research.modeling.sequential.autoregressive_losses import (
-    BCELoss,
-    InBatchNegativesSampler,
-    LocalNegativesSampler,
-)
-from generative_recommenders.research.modeling.sequential.embedding_modules import (
-    EmbeddingModule,
-    LocalEmbeddingModule,
-)
-from generative_recommenders.research.modeling.sequential.encoder_utils import (
-    get_sequential_encoder,
-)
-from generative_recommenders.research.modeling.sequential.features import (
-    movielens_seq_features_from_row,
-)
-from generative_recommenders.research.modeling.sequential.input_features_preprocessors import (
-    LearnablePositionalEmbeddingInputFeaturesPreprocessor,
-)
-from generative_recommenders.research.modeling.sequential.losses.sampled_softmax import (
-    SampledSoftmaxLoss,
-)
-from generative_recommenders.research.modeling.sequential.output_postprocessors import (
-    L2NormEmbeddingPostprocessor,
-    LayerNormEmbeddingPostprocessor,
-)
-from generative_recommenders.research.modeling.similarity_utils import (
-    get_similarity_function,
-)
-from generative_recommenders.research.trainer.data_loader import create_data_loader
-from torch.nn.parallel import DistributedDataParallel as DDP
-from torch.utils.tensorboard import SummaryWriter
-
-
-def setup(rank: int, world_size: int, master_port: int) -> None:
-    os.environ["MASTER_ADDR"] = "localhost"
-    os.environ["MASTER_PORT"] = str(master_port)
-
-    # initialize the process group
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
-
-def cleanup() -> None:
-    dist.destroy_process_group()
-
-
-@gin.configurable
-def get_weighted_loss(
-    main_loss: torch.Tensor,
-    aux_losses: Dict[str, torch.Tensor],
-    weights: Dict[str, float],
-) -> torch.Tensor:
-    weighted_loss = main_loss
-    for key, weight in weights.items():
-        cur_weighted_loss = aux_losses[key] * weight
-        weighted_loss = weighted_loss + cur_weighted_loss
-    return weighted_loss
-
-
-@gin.configurable
-def train_fn(
-    rank: int,
-    world_size: int,
-    master_port: int,
-    dataset_name: str = "ml-20m",
-    max_sequence_length: int = 200,
-    positional_sampling_ratio: float = 1.0,
-    local_batch_size: int = 128,
-    eval_batch_size: int = 128,
-    eval_user_max_batch_size: Optional[int] = None,
-    main_module: str = "SASRec",
-    main_module_bf16: bool = False,
-    dropout_rate: float = 0.2,
-    user_embedding_norm: str = "l2_norm",
-    sampling_strategy: str = "in-batch",
-    loss_module: str = "SampledSoftmaxLoss",
-    loss_weights: Optional[Dict[str, float]] = {},
-    num_negatives: int = 1,
-    loss_activation_checkpoint: bool = False,
-    item_l2_norm: bool = False,
-    temperature: float = 0.05,
-    num_epochs: int = 101,
-    learning_rate: float = 1e-3,
-    num_warmup_steps: int = 0,
-    weight_decay: float = 1e-3,
-    top_k_method: str = "MIPSBruteForceTopK",
-    eval_interval: int = 100,
-    full_eval_every_n: int = 1,
-    save_ckpt_every_n: int = 1000,
-    partial_eval_num_iters: int = 32,
-    embedding_module_type: str = "local",
-    item_embedding_dim: int = 240,
-    interaction_module_type: str = "",
-    gr_output_length: int = 10,
-    l2_norm_eps: float = 1e-6,
-    enable_tf32: bool = False,
-    random_seed: int = 42,
-) -> None:
-    # to enable more deterministic results.
-    random.seed(random_seed)
-    torch.backends.cuda.matmul.allow_tf32 = enable_tf32
-    torch.backends.cudnn.allow_tf32 = enable_tf32
-    logging.info(f"cuda.matmul.allow_tf32: {enable_tf32}")
-    logging.info(f"cudnn.allow_tf32: {enable_tf32}")
-    logging.info(f"Training model on rank {rank}.")
-    setup(rank, world_size, master_port)
-
-    dataset = get_reco_dataset(
-        dataset_name=dataset_name,
-        max_sequence_length=max_sequence_length,
-        chronological=True,
-        positional_sampling_ratio=positional_sampling_ratio,
-    )
-
-    train_data_sampler, train_data_loader = create_data_loader(
-        dataset.train_dataset,
-        batch_size=local_batch_size,
-        world_size=world_size,
-        rank=rank,
-        shuffle=True,
-        drop_last=world_size > 1,
-    )
-    eval_data_sampler, eval_data_loader = create_data_loader(
-        dataset.eval_dataset,
-        batch_size=eval_batch_size,
-        world_size=world_size,
-        rank=rank,
-        shuffle=True,  # needed for partial eval
-        drop_last=world_size > 1,
-    )
-
-    model_debug_str = main_module
-    if embedding_module_type == "local":
-        embedding_module: EmbeddingModule = LocalEmbeddingModule(
-            num_items=dataset.max_item_id,
-            item_embedding_dim=item_embedding_dim,
-        )
-    else:
-        raise ValueError(f"Unknown embedding_module_type {embedding_module_type}")
-    model_debug_str += f"-{embedding_module.debug_str()}"
-
-    interaction_module, interaction_module_debug_str = get_similarity_function(
-        module_type=interaction_module_type,
-        query_embedding_dim=item_embedding_dim,
-        item_embedding_dim=item_embedding_dim,
-    )
-
-    assert user_embedding_norm == "l2_norm" or user_embedding_norm == "layer_norm", (
-        f"Not implemented for {user_embedding_norm}"
-    )
-    output_postproc_module = (
-        L2NormEmbeddingPostprocessor(
-            embedding_dim=item_embedding_dim,
-            eps=1e-6,
-        )
-        if user_embedding_norm == "l2_norm"
-        else LayerNormEmbeddingPostprocessor(
-            embedding_dim=item_embedding_dim,
-            eps=1e-6,
-        )
-    )
-    input_preproc_module = LearnablePositionalEmbeddingInputFeaturesPreprocessor(
-        max_sequence_len=dataset.max_sequence_length + gr_output_length + 1,
-        embedding_dim=item_embedding_dim,
-        dropout_rate=dropout_rate,
-    )
-
-    model = get_sequential_encoder(
-        module_type=main_module,
-        max_sequence_length=dataset.max_sequence_length,
-        max_output_length=gr_output_length + 1,
-        embedding_module=embedding_module,
-        interaction_module=interaction_module,
-        input_preproc_module=input_preproc_module,
-        output_postproc_module=output_postproc_module,
-        verbose=True,
-    )
-    model_debug_str = model.debug_str()
-
-    # loss
-    loss_debug_str = loss_module
-    if loss_module == "BCELoss":
-        loss_debug_str = loss_debug_str[:-4]
-        assert temperature == 1.0
-        ar_loss = BCELoss(temperature=temperature, model=model)
-    elif loss_module == "SampledSoftmaxLoss":
-        loss_debug_str = "ssl"
-        if temperature != 1.0:
-            loss_debug_str += f"-t{temperature}"
-        ar_loss = SampledSoftmaxLoss(
-            num_to_sample=num_negatives,
-            softmax_temperature=temperature,
-            model=model,
-            activation_checkpoint=loss_activation_checkpoint,
-        )
-        loss_debug_str += (
-            f"-n{num_negatives}{'-ac' if loss_activation_checkpoint else ''}"
-        )
-    else:
-        raise ValueError(f"Unrecognized loss module {loss_module}.")
-
-    # sampling
-    if sampling_strategy == "in-batch":
-        negatives_sampler = InBatchNegativesSampler(
-            l2_norm=item_l2_norm,
-            l2_norm_eps=l2_norm_eps,
-            dedup_embeddings=True,
-        )
-        sampling_debug_str = (
-            f"in-batch{f'-l2-eps{l2_norm_eps}' if item_l2_norm else ''}-dedup"
-        )
-    elif sampling_strategy == "local":
-        negatives_sampler = LocalNegativesSampler(
-            num_items=dataset.max_item_id,
-            item_emb=model._embedding_module._item_emb,
-            all_item_ids=dataset.all_item_ids,
-            l2_norm=item_l2_norm,
-            l2_norm_eps=l2_norm_eps,
-        )
-    else:
-        raise ValueError(f"Unrecognized sampling strategy {sampling_strategy}.")
-    sampling_debug_str = negatives_sampler.debug_str()
-
-    # Creates model and moves it to GPU with id rank
-    device = rank
-    if main_module_bf16:
-        model = model.to(torch.bfloat16)
-    model = model.to(device)
-    ar_loss = ar_loss.to(device)
-    negatives_sampler = negatives_sampler.to(device)
-    model = DDP(model, device_ids=[rank], broadcast_buffers=False)
-
-    # TODO: wrap in create_optimizer.
-    opt = torch.optim.AdamW(
-        model.parameters(),
-        lr=learning_rate,
-        betas=(0.9, 0.98),
-        weight_decay=weight_decay,
-    )
-
-    date_str = date.today().strftime("%Y-%m-%d")
-    model_subfolder = f"{dataset_name}-l{max_sequence_length}"
-    model_desc = (
-        f"{model_subfolder}"
-        + f"/{model_debug_str}_{interaction_module_debug_str}_{sampling_debug_str}_{loss_debug_str}"
-        + f"{f'-ddp{world_size}' if world_size > 1 else ''}-b{local_batch_size}-lr{learning_rate}-wu{num_warmup_steps}-wd{weight_decay}{'' if enable_tf32 else '-notf32'}-{date_str}"
-    )
-    if full_eval_every_n > 1:
-        model_desc += f"-fe{full_eval_every_n}"
-    if positional_sampling_ratio is not None and positional_sampling_ratio < 1:
-        model_desc += f"-d{positional_sampling_ratio}"
-    # creates subfolders.
-    os.makedirs(f"./exps/{model_subfolder}", exist_ok=True)
-    os.makedirs(f"./ckpts/{model_subfolder}", exist_ok=True)
-    log_dir = f"./exps/{model_desc}"
-    if rank == 0:
-        writer = SummaryWriter(log_dir=log_dir)
-        logging.info(f"Rank {rank}: writing logs to {log_dir}")
-    else:
-        writer = None
-        logging.info(f"Rank {rank}: disabling summary writer")
-
-    last_training_time = time.time()
-    torch.autograd.set_detect_anomaly(True)
-
-    batch_id = 0
-    epoch = 0
-    for epoch in range(num_epochs):
-        if train_data_sampler is not None:
-            train_data_sampler.set_epoch(epoch)
-        if eval_data_sampler is not None:
-            eval_data_sampler.set_epoch(epoch)
-        model.train()
-        for row in iter(train_data_loader):
-            seq_features, target_ids, target_ratings = movielens_seq_features_from_row(
-                row,
-                device=device,
-                max_output_length=gr_output_length + 1,
-            )
-
-            if (batch_id % eval_interval) == 0:
-                model.eval()
-
-                eval_state = get_eval_state(
-                    model=model.module,
-                    all_item_ids=dataset.all_item_ids,
-                    negatives_sampler=negatives_sampler,
-                    top_k_module_fn=lambda item_embeddings, item_ids: get_top_k_module(
-                        top_k_method=top_k_method,
-                        model=model.module,
-                        item_embeddings=item_embeddings,
-                        item_ids=item_ids,
-                    ),
-                    device=device,
-                    float_dtype=torch.bfloat16 if main_module_bf16 else None,
-                )
-                eval_dict = eval_metrics_v2_from_tensors(
-                    eval_state,
-                    model.module,
-                    seq_features,
-                    target_ids=target_ids,
-                    target_ratings=target_ratings,
-                    user_max_batch_size=eval_user_max_batch_size,
-                    dtype=torch.bfloat16 if main_module_bf16 else None,
-                )
-                add_to_summary_writer(
-                    writer, batch_id, eval_dict, prefix="eval", world_size=world_size
-                )
-                logging.info(
-                    f"rank {rank}:  batch-stat (eval): iter {batch_id} (epoch {epoch}): "
-                    + f"NDCG@10 {_avg(eval_dict['ndcg@10'], world_size):.4f}, "
-                    f"HR@10 {_avg(eval_dict['hr@10'], world_size):.4f}, "
-                    f"HR@50 {_avg(eval_dict['hr@50'], world_size):.4f}, "
-                    + f"MRR {_avg(eval_dict['mrr'], world_size):.4f} "
-                )
-                model.train()
-
-            # TODO: consider separating this out?
-            B, N = seq_features.past_ids.shape
-            seq_features.past_ids.scatter_(
-                dim=1,
-                index=seq_features.past_lengths.view(-1, 1),
-                src=target_ids.view(-1, 1),
-            )
-
-            opt.zero_grad()
-            input_embeddings = model.module.get_item_embeddings(seq_features.past_ids)
-            seq_embeddings = model(
-                past_lengths=seq_features.past_lengths,
-                past_ids=seq_features.past_ids,
-                past_embeddings=input_embeddings,
-                past_payloads=seq_features.past_payloads,
-            )  # [B, X]
-
-            supervision_ids = seq_features.past_ids
-
-            if sampling_strategy == "in-batch":
-                # get_item_embeddings currently assume 1-d tensor.
-                in_batch_ids = supervision_ids.view(-1)
-                negatives_sampler.process_batch(
-                    ids=in_batch_ids,
-                    presences=(in_batch_ids != 0),
-                    embeddings=model.module.get_item_embeddings(in_batch_ids),
-                )
-            else:
-                # pyre-fixme[16]: `InBatchNegativesSampler` has no attribute
-                #  `_item_emb`.
-                negatives_sampler._item_emb = model.module._embedding_module._item_emb
-
-            ar_mask = supervision_ids[:, 1:] != 0
-            loss, aux_losses = ar_loss(
-                lengths=seq_features.past_lengths,  # [B],
-                output_embeddings=seq_embeddings[:, :-1, :],  # [B, N-1, D]
-                supervision_ids=supervision_ids[:, 1:],  # [B, N-1]
-                supervision_embeddings=input_embeddings[:, 1:, :],  # [B, N - 1, D]
-                supervision_weights=ar_mask.float(),
-                negatives_sampler=negatives_sampler,
-                **seq_features.past_payloads,
-            )  # [B, N]
-
-            main_loss = loss.detach().clone()
-            loss = get_weighted_loss(loss, aux_losses, weights=loss_weights or {})
-
-            if rank == 0:
-                assert writer is not None
-                writer.add_scalar("losses/ar_loss", loss, batch_id)
-                writer.add_scalar("losses/main_loss", main_loss, batch_id)
-
-            loss.backward()
-
-            # Optional linear warmup.
-            if batch_id < num_warmup_steps:
-                lr_scalar = min(1.0, float(batch_id + 1) / num_warmup_steps)
-                for pg in opt.param_groups:
-                    pg["lr"] = lr_scalar * learning_rate
-                lr = lr_scalar * learning_rate
-            else:
-                lr = learning_rate
-
-            if (batch_id % eval_interval) == 0:
-                logging.info(
-                    f" rank: {rank}, batch-stat (train): step {batch_id} "
-                    f"(epoch {epoch} in {time.time() - last_training_time:.2f}s): {loss:.6f}"
-                )
-                last_training_time = time.time()
-                if rank == 0:
-                    assert writer is not None
-                    writer.add_scalar("loss/train", loss, batch_id)
-                    writer.add_scalar("lr", lr, batch_id)
-
-            opt.step()
-
-            batch_id += 1
-
-        def is_full_eval(epoch: int) -> bool:
-            return (epoch % full_eval_every_n) == 0
-
-        # eval per epoch
-        eval_dict_all = None
-        eval_start_time = time.time()
-        model.eval()
-        eval_state = get_eval_state(
-            model=model.module,
-            all_item_ids=dataset.all_item_ids,
-            negatives_sampler=negatives_sampler,
-            top_k_module_fn=lambda item_embeddings, item_ids: get_top_k_module(
-                top_k_method=top_k_method,
-                model=model.module,
-                item_embeddings=item_embeddings,
-                item_ids=item_ids,
-            ),
-            device=device,
-            float_dtype=torch.bfloat16 if main_module_bf16 else None,
-        )
-        for eval_iter, row in enumerate(iter(eval_data_loader)):
-            seq_features, target_ids, target_ratings = movielens_seq_features_from_row(
-                row, device=device, max_output_length=gr_output_length + 1
-            )
-            eval_dict = eval_metrics_v2_from_tensors(
-                eval_state,
-                model.module,
-                seq_features,
-                target_ids=target_ids,
-                target_ratings=target_ratings,
-                user_max_batch_size=eval_user_max_batch_size,
-                dtype=torch.bfloat16 if main_module_bf16 else None,
-            )
-
-            if eval_dict_all is None:
-                eval_dict_all = {}
-                for k, v in eval_dict.items():
-                    eval_dict_all[k] = []
-
-            for k, v in eval_dict.items():
-                eval_dict_all[k] = eval_dict_all[k] + [v]
-            del eval_dict
-
-            if (eval_iter + 1 >= partial_eval_num_iters) and (not is_full_eval(epoch)):
-                logging.info(
-                    f"Truncating epoch {epoch} eval to {eval_iter + 1} iters to save cost.."
-                )
-                break
-
-        assert eval_dict_all is not None
-        for k, v in eval_dict_all.items():
-            eval_dict_all[k] = torch.cat(v, dim=-1)
-
-        ndcg_10 = _avg(eval_dict_all["ndcg@10"], world_size=world_size)
-        ndcg_50 = _avg(eval_dict_all["ndcg@50"], world_size=world_size)
-        hr_10 = _avg(eval_dict_all["hr@10"], world_size=world_size)
-        hr_50 = _avg(eval_dict_all["hr@50"], world_size=world_size)
-        mrr = _avg(eval_dict_all["mrr"], world_size=world_size)
-
-        add_to_summary_writer(
-            writer,
-            batch_id=epoch,
-            metrics=eval_dict_all,
-            prefix="eval_epoch",
-            world_size=world_size,
-        )
-        if full_eval_every_n > 1 and is_full_eval(epoch):
-            add_to_summary_writer(
-                writer,
-                batch_id=epoch,
-                metrics=eval_dict_all,
-                prefix="eval_epoch_full",
-                world_size=world_size,
-            )
-        if rank == 0 and epoch > 0 and (epoch % save_ckpt_every_n) == 0:
-            torch.save(
-                {
-                    "epoch": epoch,
-                    "model_state_dict": model.state_dict(),
-                    "optimizer_state_dict": opt.state_dict(),
-                },
-                f"./ckpts/{model_desc}_ep{epoch}",
-            )
-
-        logging.info(
-            f"rank {rank}: eval @ epoch {epoch} in {time.time() - eval_start_time:.2f}s: "
-            f"NDCG@10 {ndcg_10:.4f}, NDCG@50 {ndcg_50:.4f}, HR@10 {hr_10:.4f}, HR@50 {hr_50:.4f}, MRR {mrr:.4f}"
-        )
-        last_training_time = time.time()
-
-    if rank == 0:
-        if writer is not None:
-            writer.flush()
-            writer.close()
-
-        torch.save(
-            {
-                "epoch": epoch,
-                "model_state_dict": model.state_dict(),
-                "optimizer_state_dict": opt.state_dict(),
-            },
-            f"./ckpts/{model_desc}_ep{epoch}",
-        )
-
-    cleanup()
diff --git a/recommendation_v4/main.py b/recommendation_v4/main.py
deleted file mode 100644
index 445f25820..000000000
--- a/recommendation_v4/main.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Main entry point for model training. Please refer to README.md for usage instructions.
-"""
-
-import logging
-import os
-from typing import List, Optional
-
-os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"  # Hide excessive tensorflow debug messages
-import sys
-
-import fbgemm_gpu  # noqa: F401, E402
-import gin
-import torch
-import torch.multiprocessing as mp
-from absl import app, flags
-from generative_recommenders.research.trainer.train import train_fn
-
-logging.basicConfig(stream=sys.stdout, level=logging.INFO)
-
-
-def delete_flags(FLAGS, keys_to_delete: List[str]) -> None:  # pyre-ignore [2]
-    keys = [key for key in FLAGS._flags()]
-    for key in keys:
-        if key in keys_to_delete:
-            delattr(FLAGS, key)
-
-
-delete_flags(flags.FLAGS, ["gin_config_file", "master_port"])
-flags.DEFINE_string("gin_config_file", None, "Path to the config file.")
-flags.DEFINE_integer("master_port", 12355, "Master port.")
-FLAGS = flags.FLAGS  # pyre-ignore [5]
-
-
-def mp_train_fn(
-    rank: int,
-    world_size: int,
-    master_port: int,
-    gin_config_file: Optional[str],
-) -> None:
-    if gin_config_file is not None:
-        # Hack as absl doesn't support flag parsing inside multiprocessing.
-        logging.info(f"Rank {rank}: loading gin config from {gin_config_file}")
-        gin.parse_config_file(gin_config_file)
-
-    train_fn(rank, world_size, master_port)
-
-
-def _main(argv) -> None:  # pyre-ignore [2]
-    world_size = torch.cuda.device_count()
-
-    mp.set_start_method("forkserver")
-    mp.spawn(
-        mp_train_fn,
-        args=(world_size, FLAGS.master_port, FLAGS.gin_config_file),
-        nprocs=world_size,
-        join=True,
-    )
-
-
-def main() -> None:
-    app.run(_main)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/preprocess_public_data.py b/recommendation_v4/preprocess_public_data.py
deleted file mode 100644
index 927ccf4c6..000000000
--- a/recommendation_v4/preprocess_public_data.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Usage: mkdir -p tmp/ && python3 preprocess_public_data.py
-"""
-
-from generative_recommenders.research.data.preprocessor import get_common_preprocessors
-
-
-def main() -> None:
-    get_common_preprocessors()["ml-1m"].preprocess_rating()
-    get_common_preprocessors()["ml-20m"].preprocess_rating()
-    # get_common_preprocessors()["ml-1b"].preprocess_rating()
-    get_common_preprocessors()["amzn-books"].preprocess_rating()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/recommendation_v4/run_fractal_expansion.py b/recommendation_v4/run_fractal_expansion.py
deleted file mode 100644
index 308eadea2..000000000
--- a/recommendation_v4/run_fractal_expansion.py
+++ /dev/null
@@ -1,588 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# pyre-unsafe
-
-"""
-Run fractal expansion introduced in https://arxiv.org/abs/1901.08910.
-Implementation adapted from the scripts used to generate MovieLens-1B
-(https://grouplens.org/datasets/movielens/movielens-1b/).
-"""
-
-# Generate a 3B dataset (takes around 50 minutes):
-# python run_fractal_expansion.py --input-csv-file ~/data/ml-20m/ratings.csv --write-dataset True --output-prefix ~/data/ml-3b/
-# Generate a 13B dataset with 440M item size:
-# python run_fractal_expansion.py --input-csv-file ~/data/ml-20m/ratings.csv --write-dataset True --output-prefix ~/data/ml-13b/ --num-row-multiplier 16 --num-col-multiplier 16384 --element-sample-rate 0.2 --block-sample-rate 0.05
-# Generate a 18B dataset with 1B item size:
-# python run_fractal_expansion.py --input-csv-file ~/data/ml-20m/ratings.csv --write-dataset True --output-prefix ~/data/ml-18b/ --num-row-multiplier 20 --num-col-multiplier 36864 --element-sample-rate 0.08 --block-sample-rate 0.05
-
-import csv
-import linecache
-import logging
-import os
-import pickle
-from dataclasses import dataclass
-
-import click
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-import scipy.linalg
-import skimage.transform as transform
-from scipy import sparse
-from scipy.sparse import linalg
-from sklearn.utils import shuffle
-from tqdm import tqdm
-
-
-logging.basicConfig()
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.INFO)
-
-
-@dataclass
-class SparseMatrixMetadata:
-    num_interactions: int = 0
-    num_rows: int = 0
-    num_cols: int = 0
-
-
-def _dropout_sparse_coo_matrix(
-    sparse_matrix, rate, min_dropout_rate=0.005, max_dropout_rate=0.999
-):
-    assert min_dropout_rate <= max_dropout_rate
-    sampling_rate = 1.0 - rate
-
-    sampled_fraction = min(
-        max(sampling_rate, 1.0 - max_dropout_rate), 1.0 - min_dropout_rate
-    )
-    if sampled_fraction != sampling_rate:
-        logger.warning(
-            f"Desired sampling rate {sampling_rate} clipped to {sampled_fraction}."
-        )
-    num_sampled = min(
-        max(int(sparse_matrix.nnz * sampled_fraction), 1), sparse_matrix.nnz
-    )
-    sampled_indices = np.random.choice(
-        sparse_matrix.nnz, size=num_sampled, replace=False
-    )
-    return sparse.coo_matrix(
-        (
-            sparse_matrix.data[sampled_indices],
-            (sparse_matrix.row[sampled_indices], sparse_matrix.col[sampled_indices]),
-        ),
-        shape=sparse_matrix.shape,
-    )
-
-
-def shuffle_sparse_matrix(
-    sparse_matrix, dropout_rate=0.0, min_dropout_rate=0.005, max_dropout_rate=0.999
-):
-    """
-    Shuffle sparse matrix encoded as a SciPy csr matrix.
-    """
-
-    assert dropout_rate >= 0.0 and dropout_rate <= 1.0
-    (num_rows, num_cols) = sparse_matrix.shape
-    shuffled_rows = shuffle(np.arange(num_rows))
-    shuffled_cols = shuffle(np.arange(num_cols))
-    sparse_matrix = _dropout_sparse_coo_matrix(
-        sparse_matrix, dropout_rate, min_dropout_rate, max_dropout_rate
-    )
-    new_row = np.take(shuffled_rows, sparse_matrix.row)
-    new_col = np.take(shuffled_cols, sparse_matrix.col)
-    return sparse.csr_matrix(
-        (sparse_matrix.data, (new_row, new_col)), shape=(num_rows, num_cols)
-    )
-
-
-def graph_reduce(usv, num_rows, num_cols):
-    """Apply algorithm 2 in https://arxiv.org/pdf/1901.08910.pdf."""
-
-    def _closest_column_orthogonal_matrix(matrix):
-        return np.matmul(
-            matrix, np.linalg.inv(scipy.linalg.sqrtm(np.matmul(matrix.T, matrix)))
-        )
-
-    u, s, v = usv
-    k = min(num_rows, num_cols)
-    u_random_proj = transform.resize(u[:, :k], (num_rows, k))
-    v_random_proj = transform.resize(v[:k, :], (k, num_cols))
-    u_random_proj_orth = _closest_column_orthogonal_matrix(u_random_proj)
-    v_random_proj_orth = _closest_column_orthogonal_matrix(v_random_proj.T).T
-    return np.matmul(u_random_proj_orth, np.matmul(np.diag(s[:k]), v_random_proj_orth))
-
-
-def rescale(matrix, rescale_w_abs=False, element_sample_rate=1.0):
-    """Rescale all values of the matrix into [0, 1]."""
-    if rescale_w_abs:
-        abs_matrix = np.abs(matrix.copy())
-        out = abs_matrix / abs_matrix.max()
-    else:
-        out = (matrix - matrix.min()) / (matrix.max() - matrix.min())
-        assert out.min() >= 0 and out.max() <= 1
-    return out * element_sample_rate
-
-
-def _compute_row_block(
-    i, left_matrix, right_matrix, block_sample_rate, indices_out_path, remove_empty_rows
-):
-    """Compute row block of expansion for row i of the left_matrix."""
-
-    kron_blocks = []
-    num_rows = 0
-    num_removed_rows = 0
-    num_interactions = 0
-
-    for j in range(left_matrix.shape[1]):
-        if np.random.random() <= block_sample_rate:
-            dropout_rate = 1.0 - left_matrix[i, j]
-            kron_block = shuffle_sparse_matrix(right_matrix, dropout_rate).tocsr()
-            num_interactions += kron_block.nnz
-            kron_blocks.append(kron_block)
-            logger.info(f"Kronecker block ({i}, {j}) processed.")
-        else:
-            kron_blocks.append(sparse.csr_matrix(right_matrix.shape))
-            logger.info(f"Kronecker block ({i}, {j}) skipped.")
-
-    rows_to_write = sparse.hstack(kron_blocks).tocsr()
-    logger.info("Writing dataset row by row.")
-
-    # Write Kronecker product line per line.
-    filepath = f"{indices_out_path}_{i}.csv"
-    os.makedirs(os.path.dirname(filepath), exist_ok=True)
-    with open(filepath, "w", newline="") as file:
-        writer = csv.writer(file)
-        for k in range(right_matrix.shape[0]):
-            items_to_write = rows_to_write.getrow(k).indices
-            ratings_to_write = rows_to_write.getrow(k).data
-            num = items_to_write.shape[0]
-            if remove_empty_rows and (not num):
-                logger.info(f"Removed empty output row {i * left_matrix.shape[0] + k}.")
-                num_removed_rows += 1
-                continue
-            num_rows += 1
-            writer.writerow(
-                [
-                    i * right_matrix.shape[0] + k,
-                    ",".join([str(x) for x in items_to_write]),
-                    ",".join([str(x) for x in ratings_to_write]),
-                ]
-            )
-            if k % 100000 == 0:
-                logger.info(f"Done producing data set row {k}.")
-
-    num_cols = rows_to_write.shape[1]
-    metadata = SparseMatrixMetadata(
-        num_interactions=num_interactions, num_rows=num_rows, num_cols=num_cols
-    )
-    logger.info(
-        f"Done with left matrix row {i}, {num_interactions} interactions written in shard, {num_removed_rows} rows removed in shard."
-    )
-    return (num_removed_rows, metadata)
-
-
-def visualize_samples(
-    right_matrix,
-    visualize_num_samples,
-    expanded_file_name,
-    output_prefix,
-):
-    # Note: only the rows of the first Kronecker block are visualized.
-    logger.info("visualize dataset row by row.")
-    fig, axs = plt.subplots(1, 2, figsize=(12, 5))
-    axs[0].set_title("Original data Histogram")
-    axs[0].set_xlabel("Value")
-    axs[0].set_ylabel("Frequency")
-    axs[1].set_title("Expended Row Histogram")
-    axs[1].set_xlabel("Value")
-    axs[1].set_ylabel("Frequency")
-    for k in range(visualize_num_samples):
-        original_row = right_matrix.getrow(k).data
-        line = linecache.getline(expanded_file_name, k + 1)
-        reader = csv.reader([line])
-        parsed_line = next(reader)
-        expended_row = eval(parsed_line[2])
-        original_hist_counts, original_bin_edges = np.histogram(original_row, bins=9)
-        expended_hist_counts, expended_bin_edges = np.histogram(expended_row, bins=9)
-        axs[0].plot(original_bin_edges[:-1], original_hist_counts, alpha=0.2)
-        axs[1].plot(expended_bin_edges[:-1], expended_hist_counts, alpha=0.2)
-        axs[0].fill_between(original_bin_edges[:-1], original_hist_counts, alpha=0.2)
-        axs[1].fill_between(expended_bin_edges[:-1], expended_hist_counts, alpha=0.2)
-    plt.tight_layout()
-    plt.savefig(f"{output_prefix}_sample_distribution.png")
-    logger.info("Sample visualization finished.")
-
-
-def build_randomized_kronecker(
-    left_matrix,
-    right_matrix,
-    block_sample_rate,
-    indices_out_path,
-    metadata_out_path=None,
-    remove_empty_rows=True,
-):
-    """Compute randomized Kronecker product and dump it on the fly based on https://arxiv.org/pdf/1901.08910.pdf."""
-    logger.info(f"Writing item sequences to pickle files {metadata_out_path}.")
-
-    num_rows = 0
-    num_removed_rows = 0
-    num_cols = left_matrix.shape[1] * right_matrix.shape[1]
-    num_interactions = 0
-
-    filepath = f"{indices_out_path}_users.csv"
-    os.makedirs(os.path.dirname(filepath), exist_ok=True)
-    with open(filepath, "w", newline="") as file:
-        writer = csv.writer(file)
-        for i in tqdm(range(left_matrix.shape[0])):
-            (shard_num_removed_rows, shard_metadata) = _compute_row_block(
-                i,
-                left_matrix,
-                right_matrix,
-                block_sample_rate,
-                indices_out_path,
-                remove_empty_rows,
-            )
-            writer.writerow([i, shard_metadata.num_rows])
-            file.flush()
-            num_rows += shard_metadata.num_rows
-            num_removed_rows += shard_num_removed_rows
-            num_interactions += shard_metadata.num_interactions
-
-    logger.info(f"{num_interactions / num_rows} average sequence length")
-    logger.info(f"{num_interactions} total interactions written.")
-    logger.info(f"{num_removed_rows} total rows removed.")
-
-    metadata = SparseMatrixMetadata(
-        num_interactions=num_interactions, num_rows=num_rows, num_cols=num_cols
-    )
-    if metadata_out_path is not None:
-        logger.info(f"Writing metadata file to {metadata_out_path}")
-        with open(metadata_out_path, "wb") as output_file:
-            pickle.dump(metadata, output_file)
-    return metadata
-
-
-def _preprocess_movie_lens(ratings_df, binary=False):
-    """
-    Filters out users with less than three distinct timestamps.
-    """
-
-    def _create_index(df, colname):
-        value_set = sorted(set(df[colname].values))
-        num_unique = len(value_set)
-        return dict(zip(value_set, range(num_unique)))
-
-    if not binary:
-        ratings_df["data"] = ratings_df["rating"]
-    else:
-        ratings_df["data"] = 1.0
-    ratings_df["binary_data"] = 1.0
-    num_timestamps = ratings_df[["userId", "timestamp"]].groupby("userId").nunique()
-    ratings_df["numberOfTimestamps"] = ratings_df["userId"].apply(
-        lambda x: num_timestamps["timestamp"][x]
-    )
-    ratings_df = ratings_df[ratings_df["numberOfTimestamps"] > 2]
-    user_id_to_user_idx = _create_index(ratings_df, "userId")
-    item_id_to_item_idx = _create_index(ratings_df, "movieId")
-    ratings_df["row"] = ratings_df["userId"].apply(lambda x: user_id_to_user_idx[x])
-    ratings_df["col"] = ratings_df["movieId"].apply(lambda x: item_id_to_item_idx[x])
-    return ratings_df
-
-
-def normalize(matrix):
-    norm_matrix = matrix.copy()
-    if isinstance(norm_matrix, np.ndarray):
-        norm_matrix -= norm_matrix.mean()
-    else:
-        norm_matrix.data -= norm_matrix.mean()
-    max_val = norm_matrix.max()
-    min_val = norm_matrix.min()
-    if isinstance(norm_matrix, np.ndarray):
-        norm_matrix /= max(abs(max_val), abs(min_val))
-    else:
-        norm_matrix.data /= max(abs(max_val), abs(min_val))
-    return norm_matrix
-
-
-def plot_distribution(user_wise_sum, item_wise_sum, s, title_prefix, normalized=False):
-    y_label = "rating sums" if normalized else "number of ratings"
-    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
-    ax1.loglog(
-        np.arange(len(user_wise_sum)) + 1,
-        np.sort(user_wise_sum)[::-1],
-        linestyle="-",
-        color="blue",
-        marker="",
-    )
-    ax1.set_title(f"{title_prefix} matrix user-wise rating sums")
-    ax1.set_xlabel("User rank")
-    ax1.set_ylabel(y_label)
-    ax1.grid(True)
-    ax2.loglog(
-        np.arange(len(item_wise_sum)) + 1,
-        np.sort(item_wise_sum)[::-1],
-        linestyle="-",
-        color="green",
-        marker="",
-    )
-    ax2.set_title(f"{title_prefix} matrix item-wise rating sums")
-    ax2.set_xlabel("Item rank")
-    ax2.set_ylabel(y_label)
-    ax2.grid(True)
-    ax3.loglog(
-        np.arange(len(s)) + 1, np.sort(s)[::-1], linestyle="-", color="red", marker=""
-    )
-    ax3.set_title(f"{title_prefix} matrix singular values")
-    ax3.set_xlabel("Singular value Rank")
-    ax3.set_ylabel("Magnitude")
-    ax3.grid(True)
-    plt.tight_layout()
-    plt.savefig(f"{title_prefix}_distribution.png")
-
-
-def visualize_distribution(mat, reduced_mat, s, reduced_s, normalized=False, title=""):
-    user_wise_sum = np.asarray(mat.sum(axis=1)).flatten()
-    item_wise_sum = np.asarray(mat.sum(axis=0)).flatten()
-    assert len(user_wise_sum) == mat.shape[0]
-    assert len(item_wise_sum) == mat.shape[1]
-    plot_distribution(
-        user_wise_sum,
-        item_wise_sum,
-        s,
-        title_prefix=f"{title}_Original",
-        normalized=normalized,
-    )
-
-    reduced_user_wise_sum = np.asarray(reduced_mat.sum(axis=1)).flatten()
-    reduced_item_wise_sum = np.asarray(reduced_mat.sum(axis=0)).flatten()
-    assert len(reduced_user_wise_sum) == reduced_mat.shape[0]
-    assert len(reduced_item_wise_sum) == reduced_mat.shape[1]
-    plot_distribution(
-        reduced_user_wise_sum,
-        reduced_item_wise_sum,
-        reduced_s,
-        title_prefix=f"{title}_Reduced",
-        normalized=normalized,
-    )
-
-    expanded_s = np.einsum("i,j->ij", reduced_s, s).flatten()
-    expanded_user_wise_sum = np.einsum("ij,k->ik", reduced_mat, user_wise_sum).flatten()
-    expanded_item_wise_sum = np.einsum("ij,k->jk", reduced_mat, item_wise_sum).flatten()
-    assert len(expanded_user_wise_sum) == reduced_mat.shape[0] * mat.shape[0]
-    assert len(expanded_item_wise_sum) == reduced_mat.shape[1] * mat.shape[1]
-    plot_distribution(
-        expanded_user_wise_sum,
-        expanded_item_wise_sum,
-        expanded_s,
-        title_prefix=f"{title}_Expanded",
-        normalized=normalized,
-    )
-
-
-def expand_dataset(
-    ratings_matrix,
-    binary_ratings_matrix,
-    num_users,
-    num_items,
-    reduced_num_rows,
-    reduced_num_cols,
-    rescale_w_abs,
-    element_sample_rate,
-    block_sample_rate,
-    visualize,
-    write_dataset,
-    output_prefix,
-):
-    k = min(reduced_num_rows, reduced_num_cols)
-    norm_rating_matrix = normalize(ratings_matrix)
-    (u, s, v) = linalg.svds(
-        norm_rating_matrix, k=k, maxiter=None, return_singular_vectors=True
-    )
-
-    logger.info(
-        f"Creating reduced rating matrix (size {reduced_num_rows}, {reduced_num_cols})"
-    )
-    reduced_matrix = graph_reduce((u, s, v), reduced_num_rows, reduced_num_cols)
-    norm_reduced_matrix = normalize(reduced_matrix)
-    (_, s_reduce, _) = linalg.svds(
-        norm_reduced_matrix, k=k - 1, maxiter=None, return_singular_vectors=True
-    )
-    reduced_matrix = rescale(
-        reduced_matrix,
-        rescale_w_abs=rescale_w_abs,
-        element_sample_rate=element_sample_rate,
-    )
-    logger.info(f"largest singular value of the reduced matrix is {s_reduce[-1]}")
-    logger.info(
-        f"Sampling rate mean is {reduced_matrix.mean()}, var is {reduced_matrix.var()}, min is {reduced_matrix.min()}, max is {reduced_matrix.max()}"
-    )
-    samples = reduced_matrix.sum() * ratings_matrix.nnz * block_sample_rate
-    logger.info(
-        f"Expected number of synthetic samples: {samples}, sparsity is {samples / (num_users * num_items * reduced_num_rows * reduced_num_cols)}, average seqlen is {samples / (num_users * reduced_num_rows)}"
-    )
-
-    if visualize:
-        s = linalg.svds(
-            norm_rating_matrix, k=20 * k, maxiter=None, return_singular_vectors=False
-        )
-        visualize_distribution(
-            norm_rating_matrix,
-            norm_reduced_matrix,
-            s,
-            s_reduce,
-            normalized=True,
-            title="Normalized",
-        )
-        visualize_distribution(
-            binary_ratings_matrix,
-            reduced_matrix,
-            s,
-            s_reduce,
-            normalized=False,
-            title="Binary",
-        )
-    if write_dataset:
-        output_file = (
-            output_prefix + str(reduced_num_rows) + "x" + str(reduced_num_cols)
-        )
-        output_file_metadata = None
-
-        logger.info(f"Creating synthetic dataset and dumping to {output_file}.")
-        build_randomized_kronecker(
-            left_matrix=reduced_matrix,
-            right_matrix=ratings_matrix.tocoo(),
-            block_sample_rate=block_sample_rate,
-            indices_out_path=output_file,
-            metadata_out_path=output_file_metadata,
-        )
-
-
-@click.command()
-@click.option(
-    "--random-seed",
-    type=int,
-    default=0,
-)
-@click.option(
-    "--input-csv-file",
-    type=str,
-    default="ratings.csv",
-)
-@click.option(
-    "--output-prefix",
-    type=str,
-    default="",
-)
-@click.option(
-    "--num-row-multiplier",
-    type=int,
-    default=16,
-)
-@click.option(
-    "--num-col-multiplier",
-    type=int,
-    default=32,
-)
-@click.option(
-    "--element-sample-rate",
-    type=float,
-    default=1.0,
-)
-@click.option(
-    "--block-sample-rate",
-    type=float,
-    default=1.0,
-)
-@click.option(
-    "--visualize",
-    type=bool,
-    default=False,
-)
-@click.option(
-    "--write-dataset",
-    type=bool,
-    default=False,
-)
-@click.option(
-    "--visualize-num-samples",
-    type=int,
-    default=0,
-)
-def main(
-    random_seed: int,
-    input_csv_file: str,
-    output_prefix: str,
-    num_row_multiplier: int,
-    num_col_multiplier: int,
-    element_sample_rate: float,
-    block_sample_rate: float,
-    visualize: bool,
-    write_dataset: bool,
-    visualize_num_samples: int,
-):
-    np.random.seed(random_seed)
-
-    logger.info(f"Loading and preprocessing MovieLens-20m from {input_csv_file}")
-    with open(input_csv_file, "r") as infile:
-        ratings_df = pd.read_csv(infile, sep=",", header=0)
-    ratings_df = _preprocess_movie_lens(ratings_df, binary=False)
-    num_ratings = len(ratings_df)
-    num_users = len(set(ratings_df["row"].values))
-    num_items = len(set(ratings_df["col"].values))
-    logger.info(
-        f"number of ratings of input dataset is {num_ratings}, number of users is {num_users}, number of items is {num_items}, sparsity is {num_ratings / (num_users * num_items)}, average seqlen is {num_ratings / num_users}"
-    )
-
-    ratings_matrix = sparse.csr_matrix(
-        (
-            ratings_df["data"].values,
-            (ratings_df["row"].values, ratings_df["col"].values),
-        ),
-        shape=(num_users, num_items),
-    )
-    binary_ratings_matrix = sparse.csr_matrix(
-        (
-            ratings_df["binary_data"].values,
-            (ratings_df["row"].values, ratings_df["col"].values),
-        ),
-        shape=(num_users, num_items),
-    )
-    if write_dataset or visualize:
-        expand_dataset(
-            ratings_matrix=ratings_matrix,
-            binary_ratings_matrix=binary_ratings_matrix,
-            num_users=num_users,
-            num_items=num_items,
-            reduced_num_rows=num_row_multiplier,
-            reduced_num_cols=num_col_multiplier,
-            rescale_w_abs=False,
-            element_sample_rate=element_sample_rate,
-            block_sample_rate=block_sample_rate,
-            visualize=visualize,
-            write_dataset=write_dataset,
-            output_prefix=output_prefix,
-        )
-    if visualize_num_samples > 0:
-        logger.info(f"Visualizing {visualize_num_samples} samples.")
-        visualize_samples(
-            right_matrix=ratings_matrix.tocoo(),
-            visualize_num_samples=visualize_num_samples,
-            expanded_file_name=f"{output_prefix}{num_row_multiplier}x{num_col_multiplier}_0.csv",
-            output_prefix="Sample_Histogram",
-        )
-
-
-if __name__ == "__main__":
-    main()

From b68bfb711130372781c8abbde3ca2d566fe02413 Mon Sep 17 00:00:00 2001
From: Chris Cai <chcai@amd.com>
Date: Fri, 26 Jun 2026 05:05:31 +0000
Subject: [PATCH 103/113] Make data-fraction eval cadence the default

Change the default eval cadence from per-window (EVAL_EVERY_N_WINDOWS=1)
to data-fraction every 0.5% of data (EVAL_EVERY_N_WINDOWS=0,
EVAL_EVERY_DATA_PCT=0.005). Per-window spacing is uneven in data volume
since each daily window holds a different number of samples; the
data-fraction cadence yields ~200 evenly-spaced-by-compute eval points.

Updates the gin defaults and the launch_slurm.sh / launch_local.sh
fallbacks together so the two cadences are never both >0 (which raises a
ValueError at startup), and corrects the corresponding comments.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 29 +++++++++++--------
 recommendation_v4/scripts/launch_local.sh     |  5 +++-
 recommendation_v4/scripts/launch_slurm.sh     |  5 +++-
 3 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index dd645f06a..14bc9d106 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -358,21 +358,24 @@ pl/env_int.default = 1
 # (EVAL_EVERY_DATA_PCT>0) requires EVAL_EVERY_N_WINDOWS=0; setting both >0 raises
 # a ValueError at startup. The final end-of-run eval always runs in either mode.
 #
-# (1) PER-WINDOW cadence (EVAL_EVERY_N_WINDOWS, the default).
+# (1) PER-WINDOW cadence (EVAL_EVERY_N_WINDOWS).
 # Full-holdout eval cadence (single knob; replaces the old EVAL_EACH_WINDOW
-# on/off switch). 0 = eval disabled (train-only, e.g. perf benchmarking or the
-# resume test; the eval dataloader isn't even built). 1 (default) = eval after
-# every window. N>1 (e.g. 5 via $EVAL_EVERY_N_WINDOWS) = eval every Nth window
-# (and always the final one) to amortize the cost of consuming the full next-day
-# eval window. The cadence is anchored to the absolute ts grid so eval points
-# stay stable across a mid-run resume.
+# on/off switch). 0 (default) = per-window cadence OFF -> defer to the
+# data-fraction cadence below (EVAL_EVERY_DATA_PCT); if that is also 0, eval is
+# disabled entirely (train-only, e.g. perf benchmarking or the resume test; the
+# eval dataloader isn't even built). 1 = eval after every window. N>1 (e.g. 5
+# via $EVAL_EVERY_N_WINDOWS) = eval every Nth window (and always the final one)
+# to amortize the cost of consuming the full next-day eval window. The cadence is
+# anchored to the absolute ts grid so eval points stay stable across a mid-run
+# resume.
 # NOTE: each daily window has a DIFFERENT number of training samples, so a
 # per-window cadence produces eval points that are UNEVENLY spaced in terms of
-# how much data was trained between them. Use the data-fraction cadence below if
-# you want evenly-spaced-by-data eval points instead.
+# how much data was trained between them. This is why the data-fraction cadence
+# below is now the default; enable this per-window knob only if you specifically
+# want eval anchored to the daily window grid.
 streaming_train_eval_loop.eval_every_n_windows = @evn/env_int()
 evn/env_int.key = "EVAL_EVERY_N_WINDOWS"
-evn/env_int.default = 1
+evn/env_int.default = 0
 #
 # (2) DATA-FRACTION cadence (EVAL_EVERY_DATA_PCT).
 # Run the full-holdout eval every time the run has trained this FRACTION of the
@@ -380,7 +383,9 @@ evn/env_int.default = 1
 # (compute), independent of how many samples each daily window happens to hold.
 # This is the fix for the per-window cadence's uneven spacing noted above.
 #   value semantics (it is a fraction in (0, 1], NOT a percent number):
-#     0.0  (default) = OFF  -> fall back to the per-window EVAL_EVERY_N_WINDOWS.
+#     0.0            = OFF  -> fall back to the per-window EVAL_EVERY_N_WINDOWS;
+#                      if that is also 0, eval is disabled entirely (train-only).
+#     0.005 (default)= eval every 0.5% of the data -> ~200 eval points total.
 #     0.01           = eval every 1%  of the data  -> ~100 eval points total.
 #     0.05           = eval every 5%  of the data  -> ~20  eval points total.
 #     0.10           = eval every 10% of the data  -> ~10  eval points total.
@@ -397,7 +402,7 @@ evn/env_int.default = 1
 # trajectory can be plotted against data volume. Override via $EVAL_EVERY_DATA_PCT.
 streaming_train_eval_loop.eval_every_data_pct = @edp/env_float()
 edp/env_float.key     = "EVAL_EVERY_DATA_PCT"
-edp/env_float.default = 0.0
+edp/env_float.default = 0.005
 # Double-buffer windows: prepare the next window (index mask + first-batch
 # prefetch) in a background thread during the current window's compute, hiding
 # the per-window reset. Needs persistent_loader=1. Override via env.
diff --git a/recommendation_v4/scripts/launch_local.sh b/recommendation_v4/scripts/launch_local.sh
index 9a69a36e5..45f4bb6f2 100755
--- a/recommendation_v4/scripts/launch_local.sh
+++ b/recommendation_v4/scripts/launch_local.sh
@@ -89,7 +89,10 @@ if [ "$SMOKE" = "1" ]; then
   export NUM_TRAIN_TS=${NUM_TRAIN_TS:-1}
   export NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
   export NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
-  export EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
+  # Default eval cadence: per-window OFF (0), data-fraction every 0.5% of data
+  # (0.005). Mutually exclusive (both >0 raises a ValueError at startup).
+  export EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-0}
+  export EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0.005}
   export METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
   # Smaller per-sample shape keeps the smoke run light; drop these to use the
   # gin defaults (4086/4096). Reuse an existing hstu_cache_L<N>/ if present.
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index a1b6334e3..af9ff3907 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -142,7 +142,10 @@ orchestrate() {
   NUM_TRAIN_BATCHES=${NUM_TRAIN_BATCHES:-20}
   NUM_EVAL_BATCHES=${NUM_EVAL_BATCHES:-10}
   EVAL_EACH_WINDOW=${EVAL_EACH_WINDOW:-1}
-  EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-1}
+  # Default eval cadence: per-window OFF (0), data-fraction every 0.5% of data
+  # (0.005). The two are mutually exclusive (both >0 raises a ValueError).
+  EVAL_EVERY_N_WINDOWS=${EVAL_EVERY_N_WINDOWS:-0}
+  EVAL_EVERY_DATA_PCT=${EVAL_EVERY_DATA_PCT:-0.005}
   METRIC_LOG_FREQ=${METRIC_LOG_FREQ:-5}
   FORCE_PROVISION=${FORCE_PROVISION:-0}
 

From 7e8de35d1ff10130c8cf5345a239a216945a4467 Mon Sep 17 00:00:00 2001
From: chris <chcai@amd.com>
Date: Fri, 26 Jun 2026 09:26:05 +0000
Subject: [PATCH 104/113] dlrmv3 streaming: fix distributed sync + generalize
 checkpoint/resume e2e test

- broadcast total_train_anchors from rank-0 (avoid redundant mmap-gather +
  UID-hash recompute on every rank) and add a window-boundary dist.barrier()
  to prevent NCCL collective deadlock on skewed per-rank data prep.
- generalize streaming_resume_test.sh with --platform auto-detect for both
  NVIDIA B200 and AMD MI350/355 (container names, dataset paths, ckpt roots,
  node-local data staging), adding midwindow + multiwindow scenarios.
- extend streaming_resume_test.py with a `summarize` subcommand that parses
  logs for anchor-broadcast, window-barrier, eval-trigger and resume signals.
- env-gated barrier debug log in utils.py for test observability.

Validated end-to-end on B200: both midwindow and multiwindow scenarios PASS.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/_env_bootstrap.py           |  12 +-
 .../train/tests/streaming_resume_test.py      | 125 +++-
 .../train/tests/streaming_resume_test.sh      | 561 ++++++++++++------
 .../dlrm_v3/train/utils.py                    | 106 +++-
 recommendation_v4/scripts/launch_slurm.sh     |   5 +-
 5 files changed, 619 insertions(+), 190 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py
index 2890851de..5470d4e39 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/_env_bootstrap.py
@@ -23,6 +23,16 @@
 def apply_env_bootstrap(
     TRITON_FULL_AUTOTUNE: Optional[bool] = None,
 ) -> None:
-    if TRITON_FULL_AUTOTUNE is not None:
+    # A pre-set environment variable wins over the gin binding. The pinned
+    # triton configs are MI350X-specific, so a different GPU arch (e.g. B200
+    # sm_100) sets TRITON_FULL_AUTOTUNE=1 in the launcher environment to
+    # re-enable the full autotune search WITHOUT editing this (AMD-default)
+    # gin file. Cross-cluster launchers thus stay config-as-code via env.
+    if "TRITON_FULL_AUTOTUNE" in os.environ:
+        logger.info(
+            "env bootstrap: honoring pre-set TRITON_FULL_AUTOTUNE=%s (overrides gin binding)",
+            os.environ["TRITON_FULL_AUTOTUNE"],
+        )
+    elif TRITON_FULL_AUTOTUNE is not None:
         os.environ["TRITON_FULL_AUTOTUNE"] = "1" if TRITON_FULL_AUTOTUNE else "0"
         logger.info("env bootstrap: TRITON_FULL_AUTOTUNE=%s", os.environ["TRITON_FULL_AUTOTUNE"])
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
index b46da936b..cfe9b2e84 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
@@ -14,12 +14,27 @@
 
 """End-to-end failure-injection test for streaming resume.
 
-Validates the four resume features end-to-end on the yambda-5b stack:
+Two scenarios, driven by the sibling `streaming_resume_test.sh` (see its header
+for the full B200 launch wiring). This module is the shared log parser + a CLI
+the driver shells out to.
+
+SCENARIO `midwindow` — exact-once mid-window resume. Validates the four
+single-window resume features end-to-end on the yambda-5b stack:
   1. Mid-window save (in_window_checkpoint_frequency)
   2. Within-window exact-once skip (StreamingWindowSampler.set_window skip)
   3. Auto-detect-latest checkpoint subdir
   4. keep_last_n retention (default 1)
 
+SCENARIO `multiwindow` — distributed-sync regression guard for the two fixes the
+mid-window test cannot reach (it runs ONE window with per-window eval off):
+  A. total_train_anchors() computed once on rank 0 + broadcast (not world_size×).
+  B. window-boundary dist.barrier() before the first forward of each window.
+Both only matter across >=2 windows with the data-fraction eval cadence
+(EVAL_EVERY_DATA_PCT>0) active, and the deadlock they fix originally struck at a
+window boundary mid-run — so the scenario trains multiple windows AND resumes
+across a completed-window boundary. The signals are extracted by `summarize`
+(see `summarize_run`) and asserted in the shell driver.
+
 Test flow (driven by the sibling `streaming_resume_test.sh`):
   Phase 1 (baseline): Run streaming-train-eval for N=2 train_ts × K batches/window
     with die_at_step=-1. Capture per-batch window_ne / window_auc into traj_baseline.json.
@@ -49,7 +64,7 @@
 import json
 import re
 import sys
-from typing import Dict, Tuple
+from typing import Dict, List, Optional, Tuple
 
 # Per-step metrics from MetricsLogger.compute_and_log are emitted like:
 #   "train - Step 51 metrics: {'metric/lifetime_ne/listen_plus': tensor(1.0954, ...)
@@ -61,6 +76,97 @@
 _WACC_RE = re.compile(r"window_accuracy/listen_plus.*?tensor\(([0-9.]+)")
 
 
+# --- multi-window / data-pct-eval regression signals -------------------------
+# These cover the two distributed-sync fixes that the single-window mid-window
+# test above does NOT exercise (it runs one window with per-window eval off):
+#
+#   (A) total_train_anchors() rank-0 broadcast. The data-fraction eval cadence
+#       needs total_train_anchors — a multi-minute, single-threaded O(N) gather
+#       + uid-hash over the mmap'd anchor array. Run on EVERY rank it both wastes
+#       8x CPU and desyncs the NCCL stream (a fast rank races into the first
+#       embedding all-to-all while slow ranks still hash) → deadlock. The fix
+#       computes it ONCE on rank 0 and broadcasts the scalar. yambda logs exactly
+#       one `total_train_anchors(start_ts=…)` line per call, so the regression
+#       guard is: that line appears EXACTLY ONCE per launch (was world_size×).
+#
+#   (B) window-boundary barrier. Per-window data prep (`window_indices`, an O(N)
+#       mask over the ~18GB mmap) finishes at very different times across ranks;
+#       without a sync before the first forward the collective stream desyncs and
+#       the job hangs at the boundary. The fix adds a dist.barrier() at each
+#       window boundary. It is silent on the healthy path, so the trainer emits a
+#       `[window-barrier] … rendezvous complete` line (rank 0) per crossed window
+#       ONLY under WINDOW_BARRIER_DEBUG=1 — the guard counts those == #windows.
+_TTA_RE = re.compile(r"total_train_anchors\(start_ts=(\d+),\s*num_ts=(\d+)\):")
+_BARRIER_RE = re.compile(r"\[window-barrier\] train_ts=(\d+) rendezvous complete")
+_DATA_PCT_SETUP_RE = re.compile(
+    r"\[data-pct-eval\] eval_every_data_pct=.*?eval_interval_steps=(\d+)"
+)
+_DATA_PCT_TRIGGER_RE = re.compile(r"\[data-pct-eval\] trigger eval train_ts=(\d+)")
+_RESUME_COMPLETED_RE = re.compile(r"Resuming from completed train_ts=(\d+)")
+_RESUME_MIDWINDOW_RE = re.compile(
+    r"Resuming mid-window at train_ts=(\d+) batch_idx_in_window=(\d+)"
+)
+# Test driver appends this sentinel after the trainer returns (clean OR crash);
+# code 0 == the run finished all requested windows + final eval without hanging.
+_PHASE_EXIT_RE = re.compile(r"PHASE_EXIT=(-?\d+)")
+
+
+def summarize_run(log_path: str) -> Dict[str, object]:
+    """Extract the multi-window / data-pct-eval regression signals from a run log.
+
+    Returns a JSON-able dict the shell driver asserts on. All counts are over the
+    WHOLE log (one launch's worth — the driver uses a fresh per-phase log)."""
+    tta_calls: List[Tuple[int, int]] = []
+    barrier_windows: List[int] = []
+    data_pct_eval_setup: bool = False
+    data_pct_eval_interval: Optional[int] = None
+    data_pct_eval_triggers: List[int] = []
+    resume_completed_ts: Optional[int] = None
+    resume_midwindow: Optional[Tuple[int, int]] = None
+    phase_exit: Optional[int] = None
+    with open(log_path, "r", errors="replace") as f:
+        for line in f:
+            m = _TTA_RE.search(line)
+            if m:
+                tta_calls.append((int(m.group(1)), int(m.group(2))))
+            m = _BARRIER_RE.search(line)
+            if m:
+                barrier_windows.append(int(m.group(1)))
+            m = _DATA_PCT_SETUP_RE.search(line)
+            if m:
+                data_pct_eval_setup = True
+                data_pct_eval_interval = int(m.group(1))
+            m = _DATA_PCT_TRIGGER_RE.search(line)
+            if m:
+                data_pct_eval_triggers.append(int(m.group(1)))
+            m = _RESUME_COMPLETED_RE.search(line)
+            if m:
+                resume_completed_ts = int(m.group(1))
+            m = _RESUME_MIDWINDOW_RE.search(line)
+            if m:
+                resume_midwindow = (int(m.group(1)), int(m.group(2)))
+            m = _PHASE_EXIT_RE.search(line)
+            if m:
+                phase_exit = int(m.group(1))
+    return {
+        # (A) rank-0 broadcast: must be exactly 1 (was world_size× before the fix)
+        "total_train_anchors_calls": len(tta_calls),
+        "total_train_anchors_args": tta_calls,
+        # (B) barrier executed once per crossed window (rank 0, debug-gated)
+        "window_barrier_count": len(barrier_windows),
+        "windows_trained": sorted(set(barrier_windows)),
+        # data-fraction eval cadence active + actually fired
+        "data_pct_eval_setup": data_pct_eval_setup,
+        "data_pct_eval_interval_steps": data_pct_eval_interval,
+        "data_pct_eval_trigger_count": len(data_pct_eval_triggers),
+        # resume classification
+        "resume_completed_ts": resume_completed_ts,
+        "resume_midwindow": resume_midwindow,
+        # terminal status (None => still running / killed without sentinel)
+        "phase_exit": phase_exit,
+    }
+
+
 def parse_trajectory(log_path: str) -> Dict[int, Dict[str, float]]:
     """Extract a {step: {window_ne, window_auc, window_accuracy}} dict from a
     train.log. The grep is loose on the metric line itself — we accept the
@@ -142,6 +248,13 @@ def main() -> int:
     p_cmp.add_argument("--min-resume-step", type=int, required=True)
     p_cmp.add_argument("--atol", type=float, default=0.15)
 
+    p_sum = sub.add_parser(
+        "summarize",
+        help="Emit multi-window / data-pct-eval regression signals from a run log",
+    )
+    p_sum.add_argument("log")
+    p_sum.add_argument("out", nargs="?", help="optional JSON output path")
+
     args = ap.parse_args()
     if args.cmd == "parse":
         traj = parse_trajectory(args.log)
@@ -159,6 +272,14 @@ def main() -> int:
         )
         print(msg)
         return 0 if ok else 1
+    if args.cmd == "summarize":
+        summary = summarize_run(args.log)
+        out = json.dumps(summary, indent=2)
+        if args.out:
+            with open(args.out, "w") as f:
+                f.write(out)
+        print(out)
+        return 0
     return 0
 
 
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
index e14e557e8..afdc65805 100755
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -1,242 +1,443 @@
 #!/bin/bash
 # End-to-end failure-injection + resume test for streaming-train-eval.
 #
-# Validates exact-once mid-window resume on the yambda-5b stack:
-#   Phase 1 (baseline):     uninterrupted run for N=2 train_ts × K batches/window
-#   Phase 2 (interrupted):  same config but die_at_step=M → exits at step M
-#                           after the in-window checkpoint lands
-#   Phase 3 (resume):       re-launch with same CKPT_PATH → auto-latest picks
-#                           the in-window save → finishes the partial window
-#                           and the rest of the requested train_ts list
-# Assertion: traj_resumed[step].window_ne / window_auc / window_accuracy match
-# traj_baseline bit-equal (np.allclose atol=1e-4) for all step > die_at_step.
+# PLATFORM-GENERAL: runs on both NVIDIA B200 and AMD MI350/MI355 (ROCm/meta64).
+# The only hardware-specific bits are picked by --platform (auto-detected from the
+# running container if omitted): the container name, the dataset path, and the
+# checkpoint root. Everything else — the worker entrypoint (scripts/launch_slurm.sh,
+# which is the shared launcher both clusters' supervisors use), the env-driven gin
+# knobs, and all assertions — is identical across platforms.
 #
-# Driven entirely via env-driven gin knobs defined in yambda_5b.gin:
-#   NUM_TRAIN_TS / NUM_TRAIN_BATCHES / IN_WINDOW_CKPT_FREQ / DIE_AT_STEP /
-#   CKPT_PATH / KEEP_LAST_N / EVAL_EVERY_N_WINDOWS
+# Two scenarios (select with --scenario; default runs both):
+#
+#  midwindow  — exact-once MID-WINDOW resume (single window).
+#     P1 baseline:    uninterrupted 1 train_ts × K batches.
+#     P2 interrupted: same + die_at_step=M → exits AFTER the in-window ckpt at M.
+#     P3 resume:      relaunch w/ same CKPT_PATH → auto-latest picks the in-window
+#                     save, skips the M already-trained batches, finishes.
+#     Gates: re-entered at batch_idx_in_window=M, per-rank RNG restored, first
+#     resumed step == M+1, atomic save + keep_last_n, trajectory within --atol.
+#
+#  multiwindow — distributed-sync REGRESSION guard for the two fixes the
+#     mid-window test cannot reach (it runs ONE window with per-window eval off):
+#        (A) total_train_anchors() computed ONCE on rank 0 + broadcast (the
+#            data-fraction eval cadence needs it; running the multi-minute O(N)
+#            mmap gather + uid-hash on every rank desynced NCCL → boundary hang).
+#        (B) a dist.barrier() at every window boundary before the first forward
+#            (per-rank data-prep skew otherwise desyncs the collective stream).
+#     Both only bite across >=2 windows with EVAL_EVERY_DATA_PCT>0, and the
+#     deadlock struck at a boundary mid-run, so:
+#        P1 mw_baseline: cold run over MW_TS windows w/ data-pct eval. Asserts
+#            total_train_anchors logged EXACTLY ONCE (computed at setup + broadcast
+#            from rank 0), the barrier fired on EVERY window, the data-pct cadence
+#            was set up, and the run COMPLETED (no boundary hang).
+#        P2 mw_seed:     1 window → clean end-of-window (WINDOW_COMPLETE) ckpt.
+#        P3 mw_resume:   relaunch over MW_TS windows w/ same CKPT_PATH → resumes
+#            past the completed window and CROSSES the boundary into the next
+#            windows. Asserts "Resuming from completed", barrier fired on each
+#            remaining window, anchors broadcast once, and the run COMPLETED —
+#            i.e. the exact boundary-crossing-on-resume case that used to hang.
+#
+# Driven entirely via env-driven gin knobs (yambda_5b.gin) through the SAME B200
+# worker entrypoint the production supervisor uses: `bash scripts/launch_slurm.sh`
+# (worker phase, auto-detected inside the container). WINDOW_BARRIER_DEBUG=1 makes
+# the otherwise-silent barrier emit one rank-0 line per crossed window.
+#
+# CHECKPOINT/DATASET PLACEMENT (the one real platform difference):
+#   * B200: virtiofs/NFS WEDGES under the trainer's concurrent mmap LOAD, so the
+#     checkpoint root AND the mmap'd dataset cache MUST be node-local (defaults
+#     /tmp/...). The dataset must already be staged node-local at --data-path
+#     (the e2e supervisor's stage_data_in does this); the test fails fast if not.
+#   * MI350/MI355 (meta64): NFS mmap is fine, so the checkpoint root + dataset
+#     read directly from shared NFS (defaults /apps/chcai/...), as the original
+#     test did. No staging needed.
+#   Logs always use read()/write() only, so they live on shared /apps/chcai and
+#   are grep-able from the head node on both platforms.
 #
 # Usage:
-#   bash generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh --jobid <slurm-jobid>
-#       [--container yambda_primus]
-#       [--num-train-batches 200]
-#       [--die-at-step 350]
-#       [--keep]    # retain LOG_DIR + CKPT after run for inspection
+#   bash generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh \
+#       --jobid <slurm-jobid> [--platform b200|mi350] [--scenario all]
+#       [--container <name>] [--data-path <path>] [--ckpt-root <path>] [--start-ts 150]
+#       [--num-train-batches 200] [--die-at-step 100]      # midwindow knobs
+#       [--mw-num-train-ts 3] [--mw-num-train-batches 20]  # multiwindow knobs
+#       [--mw-eval-pct 0.34] [--keep]
+#   --platform is auto-detected from the running container when omitted. Any of
+#   --container/--data-path/--ckpt-root override the platform default.
 
 set -uo pipefail
 
 JOBID=""
-CONTAINER="yambda_primus"
+REPO=/home/chcai/training/recommendation_v4
+DATASET_SUBDIR=processed_5b/hstu_cache_L4086
+SCENARIO=all                                # midwindow | multiwindow | all
+START_TS=150
+KEEP=0
+LOG_DIR=/apps/chcai/streaming_resume_test   # shared NFS (read()/write() only)
+# Platform + the three platform-specific paths. Empty sentinels here; filled by
+# apply_platform_defaults() AFTER platform detection unless the user overrode
+# them on the command line. (DATA_PATH uses a distinct sentinel because an
+# explicit empty value is meaningful: "do not inject DLRM_DATA_PATH; let the gin
+# default apply".)
+PLATFORM=""                                 # b200 | mi350 | mi355 ; auto if empty
+CONTAINER=""                                # default: per-platform
+DATA_PATH="__AUTO__"                         # default: per-platform
+CKPT_ROOT=""                                # default: per-platform (node-local on B200)
+
+# --- midwindow knobs ---
 NUM_TRAIN_BATCHES=200
-DIE_AT_STEP=350
+NUM_EVAL_BATCHES=5      # cap the per-phase FINAL eval (0 = full holdout, very slow)
+DIE_AT_STEP=100
 IN_WINDOW_FREQ=50
-KEEP=0
-# Trajectory closeness bound — NOT a bit-equality check. The ROCm training stack
-# is nondeterministic across runs (non-deterministic atomic scatter-add in the
-# embedding/attention backward): two independent *cold* runs already drift
-# ~7e-4 in window_ne over 20 steps, and early-training chaos (AUC~0.5) amplifies
-# any seed difference. So resume-vs-baseline can legitimately differ by a few
-# percent. This bound just catches GROSS divergence (wrong data skip, totally
-# unrestored state) while tolerating nondeterministic drift. The HARD resume
-# correctness gates are the functional-invariant checks below (RNG restored,
-# resumed-at-correct-step, atomic/keep_last_n), not this number.
-ATOL=0.15
-CKPT_ROOT=/apps/chcai/ckpts_resume_test
-LOG_DIR=/apps/chcai/streaming_resume_test
-REPO=/home/chcai/training/recommendation_v4
+ATOL=0.15           # trajectory closeness bound (NOT bit-equality; see py module)
+MW_TIMEOUT=1800
+
+# --- multiwindow knobs ---
+MW_TS=3                 # windows to train (>=2 to cross a boundary)
+MW_BATCHES=20           # train batches per window (small = fast)
+MW_EVAL_BATCHES=5       # holdout eval batches per fired eval
+MW_EVAL_PCT=0.34        # data-fraction eval cadence (>0 enables the anchors path)
+MW_SPLIT=0.90           # train split (<1 => holdout exists => uid-hash anchor path)
+MW_HOLDOUT_TS=200       # PINNED holdout window (must match across seed→resume)
+MW_RUN_TIMEOUT=3600     # generous: init + planner + anchors gather can take min
 
 while [[ $# -gt 0 ]]; do
     case $1 in
         --jobid) JOBID="$2"; shift 2;;
+        --platform) PLATFORM="$2"; shift 2;;
         --container) CONTAINER="$2"; shift 2;;
+        --repo) REPO="$2"; shift 2;;
+        --data-path) DATA_PATH="$2"; shift 2;;
+        --dataset-subdir) DATASET_SUBDIR="$2"; shift 2;;
+        --scenario) SCENARIO="$2"; shift 2;;
+        --start-ts) START_TS="$2"; shift 2;;
+        --ckpt-root) CKPT_ROOT="$2"; shift 2;;
+        --log-dir) LOG_DIR="$2"; shift 2;;
         --num-train-batches) NUM_TRAIN_BATCHES="$2"; shift 2;;
+        --num-eval-batches) NUM_EVAL_BATCHES="$2"; shift 2;;
         --die-at-step) DIE_AT_STEP="$2"; shift 2;;
         --in-window-freq) IN_WINDOW_FREQ="$2"; shift 2;;
         --atol) ATOL="$2"; shift 2;;
+        --mw-num-train-ts) MW_TS="$2"; shift 2;;
+        --mw-num-train-batches) MW_BATCHES="$2"; shift 2;;
+        --mw-num-eval-batches) MW_EVAL_BATCHES="$2"; shift 2;;
+        --mw-eval-pct) MW_EVAL_PCT="$2"; shift 2;;
+        --mw-split) MW_SPLIT="$2"; shift 2;;
+        --mw-holdout-ts) MW_HOLDOUT_TS="$2"; shift 2;;
         --keep) KEEP=1; shift;;
         *) echo "Unknown arg: $1"; exit 1;;
     esac
 done
 [[ -z "$JOBID" ]] && { echo "Error: --jobid required"; exit 1; }
+case "$SCENARIO" in midwindow|multiwindow|all) ;; *) echo "Error: --scenario must be midwindow|multiwindow|all"; exit 1;; esac
+(( MW_TS < 2 )) && { echo "Error: --mw-num-train-ts must be >=2 to cross a boundary"; exit 1; }
+[[ -n "$PLATFORM" ]] && case "$PLATFORM" in b200|mi350|mi355) ;; *) echo "Error: --platform must be b200|mi350|mi355"; exit 1;; esac
+
+# --- resolve platform + its three hardware-specific paths --------------------
+# Precedence: explicit --platform > inferred from explicit --container > probe
+# the allocation's docker for a known training container > default b200.
+if [[ -z "$PLATFORM" ]]; then
+    if [[ "$CONTAINER" == "yambda_b200" ]]; then PLATFORM=b200
+    elif [[ "$CONTAINER" == "yambda_primus" ]]; then PLATFORM=mi350
+    else
+        _names=$(srun --jobid="$JOBID" --overlap docker ps -a --format '{{.Names}}' 2>/dev/null)
+        if grep -qx yambda_b200 <<<"$_names"; then PLATFORM=b200
+        elif grep -qx yambda_primus <<<"$_names"; then PLATFORM=mi350
+        else PLATFORM=b200; echo "Warning: could not auto-detect platform (no known container on job $JOBID) — defaulting to b200"; fi
+    fi
+    echo "[$(date)] auto-detected platform: $PLATFORM"
+fi
+case "$PLATFORM" in
+    b200)
+        : "${CONTAINER:=yambda_b200}"
+        # B200: mmap (ckpt LOAD + dataset cache) must NOT touch virtiofs/NFS.
+        [[ "$DATA_PATH" == "__AUTO__" ]] && DATA_PATH=/tmp/yambda_data
+        : "${CKPT_ROOT:=/tmp/yambda_resume_test/ckpts}"
+        ;;
+    mi350|mi355)
+        : "${CONTAINER:=yambda_primus}"
+        # meta64: NFS mmap is fine — read dataset + write ckpt directly on NFS
+        # (matches the original MI350 test). /apps/chcai/dlrm_data is the gin default.
+        [[ "$DATA_PATH" == "__AUTO__" ]] && DATA_PATH=/apps/chcai/dlrm_data
+        : "${CKPT_ROOT:=/apps/chcai/ckpts_resume_test}"
+        ;;
+esac
+echo "[$(date)] platform=$PLATFORM container=$CONTAINER data_path=${DATA_PATH:-<gin default>} ckpt_root=$CKPT_ROOT"
 
 mkdir -p "$LOG_DIR"
+PYHELPER="$REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py"
 
-# Single-window mid-window resume: NUM_TRAIN_TS=1, so the whole test runs inside
-# train_ts=START_TS. die_at_step must land strictly inside that window, AT a
-# multiple of IN_WINDOW_FREQ so an in-window checkpoint is saved right before
-# the crash (resume then skips exactly DIE_AT_STEP already-trained batches).
-if (( DIE_AT_STEP <= 0 || DIE_AT_STEP >= NUM_TRAIN_BATCHES )); then
-    echo "Warning: die_at_step=$DIE_AT_STEP not strictly inside window (0, $NUM_TRAIN_BATCHES)" >&2
-fi
-if (( DIE_AT_STEP % IN_WINDOW_FREQ != 0 )); then
-    echo "Warning: die_at_step=$DIE_AT_STEP not a multiple of in_window_freq=$IN_WINDOW_FREQ; no save lands exactly at crash" >&2
-fi
+# --- container helpers (inspect CKPT/dataset via docker exec — works whether the
+#     path is node-local on B200 or shared NFS on MI350) ---
+sx() { srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc "$1" 2>/dev/null; }
 
 cleanup_workers() {
-    srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
-        "pkill -9 -f generative_recommenders 2>/dev/null; sleep 2; \
-         pkill -9 -f spawn_main 2>/dev/null; sleep 3; true" 2>/dev/null || true
+    sx "pkill -9 -f train_ranker 2>/dev/null; pkill -9 -f generative_recommenders 2>/dev/null; \
+        pkill -9 -f multiprocessing 2>/dev/null; sleep 2; pkill -9 -f spawn_main 2>/dev/null; sleep 3; true" || true
 }
-clean_ckpt() {
-    srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" rm -rf "$CKPT_ROOT" 2>/dev/null || true
+clean_ckpt() { sx "rm -rf '$1'" || true; }
+
+# Precheck: the dataset cache must be readable at $DATA_PATH. On B200 it must be
+# staged node-local (the supervisor's stage_data_in does this) since mmap from
+# virtiofs/NFS wedges; on MI350 it reads directly from NFS. Skipped when DATA_PATH
+# is empty (the trainer falls back to its gin default and we don't know the path).
+precheck_data() {
+    [[ -z "$DATA_PATH" ]] && { echo "[$(date)] data path unset — trainer will use its gin default; skipping precheck"; return 0; }
+    local ok
+    ok=$(sx "[ -d '$DATA_PATH/$DATASET_SUBDIR' ] && echo yes || echo no")
+    if [[ "$ok" != "yes" ]]; then
+        echo "FAIL: dataset cache not found at $DATA_PATH/$DATASET_SUBDIR inside '$CONTAINER' (platform=$PLATFORM)."
+        if [[ "$PLATFORM" == "b200" ]]; then
+            echo "      B200: stage it node-local first (the e2e supervisor does this via stage_data_in),"
+            echo "      or pass --data-path to an already-staged local mirror. mmap from virtiofs/NFS wedges."
+        else
+            echo "      MI350/MI355: pass --data-path to the NFS dataset root (gin default is /apps/chcai/dlrm_data)."
+        fi
+        exit 1
+    fi
 }
 
-# Wait for a log line to appear OR a crash sentinel. Returns 0 if target found,
-# 1 if crash sentinel found first.
+# Wait (host-side grep on the shared-NFS log) for a target regex OR a crash
+# sentinel. 0=target found, 1=crash first, 2=timeout.
 wait_for_log() {
-    local log="$1"; local target_re="$2"; local timeout_s="${3:-1500}"
+    local log="$LOG_DIR/$1.log"; local target_re="$2"; local timeout_s="${3:-1800}"
     local elapsed=0
     while (( elapsed < timeout_s )); do
-        if grep -qE "$target_re" "$log" 2>/dev/null; then
-            return 0
-        fi
-        if grep -qE "Traceback|RuntimeError|OutOfMemoryError" "$log" 2>/dev/null; then
-            return 1
-        fi
-        sleep 5
-        elapsed=$((elapsed + 5))
+        grep -qE "$target_re" "$log" 2>/dev/null && return 0
+        grep -qE "Traceback|RuntimeError|OutOfMemoryError|CUDA error" "$log" 2>/dev/null && return 1
+        sleep 5; elapsed=$((elapsed + 5))
     done
     return 2
 }
 
-# Single train window of NUM_TRAIN_BATCHES steps → last train step == NUM_TRAIN_BATCHES.
-LAST_STEP=$NUM_TRAIN_BATCHES
-
+# Launch one trainer phase (detached), appending a PHASE_EXIT sentinel after the
+# trainer returns (clean OR crash) — exactly like the production supervisor. The
+# common env (data path, mode, start_ts, barrier debug) is fixed; per-phase knobs
+# are passed as additional "K=V" words.
 run_phase() {
     local name="$1"; shift
     local log="$LOG_DIR/${name}.log"
-    # Join the per-phase env overrides into ONE word. Using `$*` (not `$@`) is
-    # essential: `$@` embedded mid-string in the double-quoted `bash -lc "..."`
-    # expands to *multiple* arguments, so bash -lc would only run up to the
-    # first override and treat the rest as positional params — launch_smoke
-    # would never execute (silent 0-byte log).
+    # `$*` (joined into ONE word), NOT `$@`: embedded mid-string in the
+    # double-quoted `bash -lc "..."`, `$@` would expand to multiple args and
+    # bash -lc would stop after the first override (silent 0-byte log).
     local env_overrides="$*"
+    # Inject DLRM_DATA_PATH only when a path is set; an empty DATA_PATH means
+    # "use the trainer's gin default" (the meta64 NFS root).
+    local data_env=""
+    [[ -n "$DATA_PATH" ]] && data_env="DLRM_DATA_PATH=$DATA_PATH"
     : > "$log"
     echo "[$(date)] === phase '$name' ==="
     cleanup_workers
     srun --jobid="$JOBID" --overlap docker exec -d "$CONTAINER" bash -lc "
         cd $REPO &&
+        $data_env \
         HSTU_HAMMER_KERNEL=TRITON \
-        $env_overrides \
+        MODE=streaming-train-eval \
+        START_TS=$START_TS \
+        WINDOW_BARRIER_DEBUG=1 \
         RUN_NAME=resume_test_$name \
         LOG=$log \
-        bash scripts/launch_smoke_8gpu.sh
+        $env_overrides \
+        bash scripts/launch_slurm.sh;
+        echo \"PHASE_EXIT=\$? \$(date '+%F %T')\" >> $log
     "
 }
 
-# === Phase 1: baseline ===
-clean_ckpt
-run_phase baseline \
-    "NUM_TRAIN_TS=1" \
-    "EVAL_EVERY_N_WINDOWS=0" \
-    "METRIC_LOG_FREQ=1" \
-    "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
-    "DIE_AT_STEP=-1"
-wait_for_log "$LOG_DIR/baseline.log" "train - Step $LAST_STEP metrics" 1500
-rc=$?
-cleanup_workers
-[[ $rc -ne 0 ]] && { echo "FAIL: baseline didn't finish"; tail -20 "$LOG_DIR/baseline.log"; exit 1; }
-
-# === Phase 2: interrupted ===
-clean_ckpt
-run_phase interrupt \
-    "NUM_TRAIN_TS=1" \
-    "EVAL_EVERY_N_WINDOWS=0" \
-    "METRIC_LOG_FREQ=1" \
-    "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
-    "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" \
-    "KEEP_LAST_N=1" \
-    "DIE_AT_STEP=$DIE_AT_STEP" \
-    "CKPT_PATH=$CKPT_ROOT"
-wait_for_log "$LOG_DIR/interrupt.log" "die_at_step=$DIE_AT_STEP hit" 1500
-rc=$?
-cleanup_workers
-[[ $rc -ne 0 ]] && { echo "FAIL: interrupt didn't hit die_at_step"; tail -20 "$LOG_DIR/interrupt.log"; exit 1; }
-
-SAVED=$(srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" ls "$CKPT_ROOT" 2>/dev/null | tr '\n' ' ')
-echo "Saved checkpoints after interrupt: $SAVED"
-
-# === Phase 3: resume ===
-run_phase resume \
-    "NUM_TRAIN_TS=1" \
-    "EVAL_EVERY_N_WINDOWS=0" \
-    "METRIC_LOG_FREQ=1" \
-    "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" \
-    "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" \
-    "KEEP_LAST_N=1" \
-    "DIE_AT_STEP=-1" \
-    "CKPT_PATH=$CKPT_ROOT"
-wait_for_log "$LOG_DIR/resume.log" "train - Step $LAST_STEP metrics" 1500
-rc=$?
-[[ $rc -ne 0 ]] && { cleanup_workers; echo "FAIL: resume didn't finish"; tail -20 "$LOG_DIR/resume.log"; exit 1; }
-# The resume run performs an end-of-window checkpoint save AFTER the final
-# step's metric line. That save (hundreds of GB) writes <ts>.tmp and then
-# atomically renames it onto <ts>, logging "checkpoint successfully saved" only
-# once the rename completes. If we kill workers right after the step line we'd
-# orphan a half-written <ts>.tmp and trip the stale-dir gate below — a harness
-# race, not a resume bug. Wait for the save to finish before tearing down.
-wait_for_log "$LOG_DIR/resume.log" "checkpoint successfully saved" 1500
-save_rc=$?
-cleanup_workers
-[[ $save_rc -ne 0 ]] && { echo "FAIL: resume end-of-window checkpoint save did not complete"; tail -20 "$LOG_DIR/resume.log"; exit 1; }
-
-# === HARD resume-correctness gates (functional invariants) ===
-# These — not the trajectory closeness check below — are the authoritative
-# proof the resume path is correct, because they're deterministic and immune
-# to the GPU nondeterminism that perturbs the metric trajectory.
-
-# (1) Re-entered the partial window at exactly the saved batch_idx_in_window.
-if ! grep -qE "Resuming mid-window at train_ts=[0-9]+ batch_idx_in_window=$DIE_AT_STEP\b" "$LOG_DIR/resume.log" 2>/dev/null; then
-    echo "FAIL: resume did not re-enter mid-window at batch_idx_in_window=$DIE_AT_STEP"
-    grep -E "Resuming" "$LOG_DIR/resume.log" 2>/dev/null | head -2
-    exit 1
-fi
-# (2) Per-rank RNG state was actually restored (dropout determinism path).
-RNG_RESTORED=$(grep -c "RNG state restored from" "$LOG_DIR/resume.log" 2>/dev/null || echo 0)
-echo "RNG state restored on $RNG_RESTORED ranks"
-[[ "$RNG_RESTORED" -lt 1 ]] && { echo "FAIL: no RNG state restored on resume"; exit 1; }
-# (3) The FIRST training step after resume is exactly die_at_step+1, i.e. the
-#     skip-already-trained-batches logic emitted the next unseen batch (not a
-#     restart from step 1, and not a gap).
-FIRST_RESUMED=$(grep -oE 'train - Step [0-9]+ metrics: \{.metric' "$LOG_DIR/resume.log" 2>/dev/null \
-    | grep -oE 'Step [0-9]+' | awk '{print $2}' | sort -n | head -1)
-echo "First resumed train step: $FIRST_RESUMED (expect $((DIE_AT_STEP + 1)))"
-[[ "$FIRST_RESUMED" != "$((DIE_AT_STEP + 1))" ]] && {
-    echo "FAIL: resume did not continue at step $((DIE_AT_STEP + 1)) (got $FIRST_RESUMED)"; exit 1; }
-
-# === Final on-disk state checks (atomic save + retention) ===
-NUM_CKPT=$(srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
-    "ls $CKPT_ROOT 2>/dev/null | grep -E '^[0-9]+$' | wc -l" | tr -d ' ')
-# Both .tmp (interrupted write) and .old (interrupted atomic-overwrite swap)
-# must be absent — their presence means a save crashed without clean recovery.
-STALE_CKPT=$(srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc \
-    "ls $CKPT_ROOT 2>/dev/null | grep -E '\\.(tmp|old)$' | wc -l" | tr -d ' ')
-echo "Final: $NUM_CKPT numeric ckpt subdirs, $STALE_CKPT stale (.tmp/.old) dirs (expect 1, 0)"
-[[ "$NUM_CKPT" != "1" ]] && { echo "FAIL: keep_last_n=1 violated"; exit 1; }
-[[ "$STALE_CKPT" != "0" ]] && { echo "FAIL: stale .tmp/.old dirs left behind"; exit 1; }
-echo "=== Resume functional invariants: ALL PASS ==="
-
-# === Trajectory closeness (sanity bound, NOT bit-equality) ===
-# Catches gross resume bugs (wrong data slice, unrestored model) that throw the
-# metric trajectory far off. Small drift is expected & tolerated (see ATOL note
-# at top). The functional invariants above are the real correctness proof.
-python3 $REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py parse \
-    "$LOG_DIR/baseline.log" "$LOG_DIR/traj_baseline.json"
-python3 $REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py parse \
-    "$LOG_DIR/resume.log" "$LOG_DIR/traj_resumed.json"
-
-python3 $REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py compare \
-    "$LOG_DIR/traj_baseline.json" "$LOG_DIR/traj_resumed.json" \
-    --min-resume-step $((DIE_AT_STEP + 1)) --atol $ATOL
-RC=$?
+# Read a scalar field from a summarize-JSON.
+jget() { python3 -c "import json,sys;print(json.load(open(sys.argv[1])).get(sys.argv[2]))" "$1" "$2"; }
+
+FAIL=0
+fail() { echo "FAIL: $*"; FAIL=1; }
+
+precheck_data
+
+# =============================================================================
+# SCENARIO: midwindow
+# =============================================================================
+run_midwindow() {
+    echo "########## scenario: midwindow ##########"
+    local LAST_STEP=$NUM_TRAIN_BATCHES
+    if (( DIE_AT_STEP <= 0 || DIE_AT_STEP >= NUM_TRAIN_BATCHES )); then
+        echo "Warning: die_at_step=$DIE_AT_STEP not strictly inside window (0, $NUM_TRAIN_BATCHES)" >&2
+    fi
+    if (( DIE_AT_STEP % IN_WINDOW_FREQ != 0 )); then
+        echo "Warning: die_at_step=$DIE_AT_STEP not a multiple of in_window_freq=$IN_WINDOW_FREQ; no save lands exactly at crash" >&2
+    fi
+
+    # P1 baseline
+    clean_ckpt "$CKPT_ROOT"
+    run_phase baseline \
+        "NUM_TRAIN_TS=1" "EVAL_EVERY_N_WINDOWS=0" "METRIC_LOG_FREQ=1" \
+        "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" "NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES" \
+        "TRAIN_SPLIT_PERCENTAGE=1.0" "DIE_AT_STEP=-1"
+    wait_for_log baseline "PHASE_EXIT=0" "$MW_TIMEOUT"; local rc=$?
+    cleanup_workers
+    (( rc != 0 )) && { echo "FAIL: midwindow baseline didn't finish (rc=$rc)"; tail -20 "$LOG_DIR/baseline.log"; return 1; }
+
+    # P2 interrupted
+    clean_ckpt "$CKPT_ROOT"
+    run_phase interrupt \
+        "NUM_TRAIN_TS=1" "EVAL_EVERY_N_WINDOWS=0" "METRIC_LOG_FREQ=1" \
+        "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" "NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES" \
+        "TRAIN_SPLIT_PERCENTAGE=1.0" \
+        "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" "KEEP_LAST_N=1" \
+        "DIE_AT_STEP=$DIE_AT_STEP" "CKPT_PATH=$CKPT_ROOT"
+    wait_for_log interrupt "die_at_step=$DIE_AT_STEP hit" "$MW_TIMEOUT"; rc=$?
+    cleanup_workers
+    (( rc != 0 )) && { echo "FAIL: interrupt didn't hit die_at_step (rc=$rc)"; tail -20 "$LOG_DIR/interrupt.log"; return 1; }
+    echo "Saved checkpoints after interrupt: $(sx "ls '$CKPT_ROOT' 2>/dev/null | tr '\n' ' '")"
+
+    # P3 resume
+    run_phase resume \
+        "NUM_TRAIN_TS=1" "EVAL_EVERY_N_WINDOWS=0" "METRIC_LOG_FREQ=1" \
+        "NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES" "NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES" \
+        "TRAIN_SPLIT_PERCENTAGE=1.0" \
+        "IN_WINDOW_CKPT_FREQ=$IN_WINDOW_FREQ" "KEEP_LAST_N=1" \
+        "DIE_AT_STEP=-1" "CKPT_PATH=$CKPT_ROOT"
+    # PHASE_EXIT=0 only after the (blocking) end-of-window save renames cleanly,
+    # so this also confirms the final atomic save completed.
+    wait_for_log resume "PHASE_EXIT=0" "$MW_TIMEOUT"; rc=$?
+    cleanup_workers
+    (( rc != 0 )) && { echo "FAIL: resume didn't finish (rc=$rc)"; tail -20 "$LOG_DIR/resume.log"; return 1; }
+
+    # HARD functional invariants (deterministic; the real correctness proof).
+    if ! grep -qE "Resuming mid-window at train_ts=[0-9]+ batch_idx_in_window=$DIE_AT_STEP\b" "$LOG_DIR/resume.log" 2>/dev/null; then
+        fail "resume did not re-enter mid-window at batch_idx_in_window=$DIE_AT_STEP"
+        grep -E "Resuming" "$LOG_DIR/resume.log" 2>/dev/null | head -2
+    fi
+    local rng_restored
+    rng_restored=$(grep -c "RNG state restored from" "$LOG_DIR/resume.log" 2>/dev/null || echo 0)
+    echo "RNG state restored on $rng_restored ranks"
+    (( rng_restored < 1 )) && fail "no RNG state restored on resume"
+    local first_resumed
+    first_resumed=$(grep -oE 'train - Step [0-9]+ metrics: \{.metric' "$LOG_DIR/resume.log" 2>/dev/null \
+        | grep -oE 'Step [0-9]+' | awk '{print $2}' | sort -n | head -1)
+    echo "First resumed train step: $first_resumed (expect $((DIE_AT_STEP + 1)))"
+    [[ "$first_resumed" != "$((DIE_AT_STEP + 1))" ]] && fail "resume did not continue at step $((DIE_AT_STEP + 1)) (got $first_resumed)"
+
+    # On-disk: atomic save + retention.
+    local num_ckpt stale_ckpt
+    num_ckpt=$(sx "ls '$CKPT_ROOT' 2>/dev/null | grep -E '^[0-9]+$' | wc -l" | tr -d ' ')
+    stale_ckpt=$(sx "ls '$CKPT_ROOT' 2>/dev/null | grep -E '\\.(tmp|old|staging)$' | wc -l" | tr -d ' ')
+    echo "Final: $num_ckpt numeric ckpt subdirs, $stale_ckpt stale dirs (expect 1, 0)"
+    [[ "$num_ckpt" != "1" ]] && fail "keep_last_n=1 violated (got $num_ckpt)"
+    [[ "$stale_ckpt" != "0" ]] && fail "stale .tmp/.old/.staging dirs left behind ($stale_ckpt)"
+
+    # Trajectory closeness (loose sanity bound, NOT bit-equality).
+    python3 "$PYHELPER" parse "$LOG_DIR/baseline.log" "$LOG_DIR/traj_baseline.json"
+    python3 "$PYHELPER" parse "$LOG_DIR/resume.log" "$LOG_DIR/traj_resumed.json"
+    if ! python3 "$PYHELPER" compare "$LOG_DIR/traj_baseline.json" "$LOG_DIR/traj_resumed.json" \
+            --min-resume-step $((DIE_AT_STEP + 1)) --atol "$ATOL"; then
+        fail "trajectory diverged beyond $ATOL (likely wrong data slice / unrestored state)"
+    fi
+    (( FAIL == 0 )) && echo "=== midwindow: PASS ===" || echo "=== midwindow: FAIL ==="
+}
+
+# =============================================================================
+# SCENARIO: multiwindow  (regression guard for the broadcast + barrier fixes)
+# =============================================================================
+# Common split contract — MUST be byte-identical between mw_seed and mw_resume,
+# else the resume aborts on a split-contract mismatch (the holdout_ts default of
+# start_ts+num_train_ts differs between a 1-window seed and an MW_TS resume, so
+# it is PINNED here).
+MW_SPLIT_ENV=( "TRAIN_SPLIT_PERCENTAGE=$MW_SPLIT" "SPLIT_SALT=0"
+               "EVAL_HOLDOUT_TS=$MW_HOLDOUT_TS" "EVAL_HOLDOUT_NUM_WINDOWS=1" )
+
+run_multiwindow() {
+    echo "########## scenario: multiwindow ##########"
+    local sum
+
+    # P1 mw_baseline — cold multi-window run with data-pct eval.
+    clean_ckpt "$CKPT_ROOT"
+    run_phase mw_baseline \
+        "NUM_TRAIN_TS=$MW_TS" "NUM_TRAIN_BATCHES=$MW_BATCHES" "NUM_EVAL_BATCHES=$MW_EVAL_BATCHES" \
+        "EVAL_EVERY_N_WINDOWS=0" "EVAL_EVERY_DATA_PCT=$MW_EVAL_PCT" "METRIC_LOG_FREQ=1" \
+        "${MW_SPLIT_ENV[@]}"
+    wait_for_log mw_baseline "PHASE_EXIT=" "$MW_RUN_TIMEOUT"; local rc=$?
+    cleanup_workers
+    (( rc == 1 )) && { echo "FAIL: mw_baseline crashed"; tail -30 "$LOG_DIR/mw_baseline.log"; return 1; }
+    (( rc == 2 )) && { echo "FAIL: mw_baseline timed out (possible boundary deadlock)"; tail -30 "$LOG_DIR/mw_baseline.log"; return 1; }
+
+    sum="$LOG_DIR/mw_baseline.summary.json"
+    python3 "$PYHELPER" summarize "$LOG_DIR/mw_baseline.log" "$sum" >/dev/null
+    echo "--- mw_baseline summary ---"; cat "$sum"
+    local exit_code anchors barriers dpct_setup dpct_trig
+    exit_code=$(jget "$sum" phase_exit)
+    anchors=$(jget "$sum" total_train_anchors_calls)
+    barriers=$(jget "$sum" window_barrier_count)
+    dpct_setup=$(jget "$sum" data_pct_eval_setup)
+    dpct_trig=$(jget "$sum" data_pct_eval_trigger_count)
+    # (barrier B) ran through ALL windows and exited 0 — no boundary deadlock.
+    [[ "$exit_code" != "0" ]] && fail "mw_baseline did not complete cleanly (phase_exit=$exit_code)"
+    [[ "$barriers" != "$MW_TS" ]] && fail "window barrier fired $barriers times, expected $MW_TS (one per window; need world_size>=2)"
+    # (broadcast A) total_train_anchors computed exactly once (rank 0), not Nx.
+    # It is computed at loop SETUP (before any training), so this exercises the
+    # broadcast regardless of whether an eval later fires.
+    [[ "$anchors" != "1" ]] && fail "total_train_anchors computed $anchors times, expected 1 (rank-0 broadcast regressed)"
+    # data-fraction eval cadence set up (the path that needs total_train_anchors).
+    [[ "$dpct_setup" != "True" ]] && fail "data-pct eval cadence not set up (total_train_anchors path not reached)"
+    # Trigger firing depends on (full-window) anchor count vs the few test steps,
+    # so it is informational — not required to exercise the broadcast fix.
+    echo "data-pct eval triggers fired: $dpct_trig (informational)"
+
+    # P2 mw_seed — 1 window → clean WINDOW_COMPLETE checkpoint.
+    clean_ckpt "$CKPT_ROOT"
+    run_phase mw_seed \
+        "NUM_TRAIN_TS=1" "NUM_TRAIN_BATCHES=$MW_BATCHES" "NUM_EVAL_BATCHES=$MW_EVAL_BATCHES" \
+        "EVAL_EVERY_N_WINDOWS=0" "EVAL_EVERY_DATA_PCT=$MW_EVAL_PCT" "METRIC_LOG_FREQ=1" \
+        "KEEP_LAST_N=1" "CKPT_PATH=$CKPT_ROOT" "${MW_SPLIT_ENV[@]}"
+    wait_for_log mw_seed "PHASE_EXIT=0" "$MW_RUN_TIMEOUT"; rc=$?
+    cleanup_workers
+    (( rc != 0 )) && { echo "FAIL: mw_seed didn't finish/checkpoint (rc=$rc)"; tail -30 "$LOG_DIR/mw_seed.log"; return 1; }
+    local seed_ckpt
+    seed_ckpt=$(sx "ls '$CKPT_ROOT' 2>/dev/null | grep -E '^[0-9]+$' | sort -n | tail -1" | tr -d ' ')
+    echo "mw_seed end-of-window checkpoint: ${seed_ckpt:-<none>} (expect $START_TS)"
+    [[ "$seed_ckpt" != "$START_TS" ]] && { fail "mw_seed did not save end-of-window ckpt $START_TS (got '$seed_ckpt')"; return 1; }
+
+    # P3 mw_resume — relaunch over MW_TS windows; resume past the completed
+    # window and CROSS the boundary into the remaining windows (the exact case
+    # that used to deadlock). The full split contract matches the seed.
+    run_phase mw_resume \
+        "NUM_TRAIN_TS=$MW_TS" "NUM_TRAIN_BATCHES=$MW_BATCHES" "NUM_EVAL_BATCHES=$MW_EVAL_BATCHES" \
+        "EVAL_EVERY_N_WINDOWS=0" "EVAL_EVERY_DATA_PCT=$MW_EVAL_PCT" "METRIC_LOG_FREQ=1" \
+        "KEEP_LAST_N=1" "CKPT_PATH=$CKPT_ROOT" "${MW_SPLIT_ENV[@]}"
+    wait_for_log mw_resume "PHASE_EXIT=" "$MW_RUN_TIMEOUT"; rc=$?
+    cleanup_workers
+    (( rc == 1 )) && { echo "FAIL: mw_resume crashed"; tail -30 "$LOG_DIR/mw_resume.log"; return 1; }
+    (( rc == 2 )) && { echo "FAIL: mw_resume timed out (possible boundary deadlock on resume)"; tail -30 "$LOG_DIR/mw_resume.log"; return 1; }
+
+    sum="$LOG_DIR/mw_resume.summary.json"
+    python3 "$PYHELPER" summarize "$LOG_DIR/mw_resume.log" "$sum" >/dev/null
+    echo "--- mw_resume summary ---"; cat "$sum"
+    local r_exit r_resume_ts r_anchors r_barriers r_dpct
+    r_exit=$(jget "$sum" phase_exit)
+    r_resume_ts=$(jget "$sum" resume_completed_ts)
+    r_anchors=$(jget "$sum" total_train_anchors_calls)
+    r_barriers=$(jget "$sum" window_barrier_count)
+    r_dpct=$(jget "$sum" data_pct_eval_setup)
+    # Resumed from the completed seed window (advanced past the boundary).
+    [[ "$r_resume_ts" != "$START_TS" ]] && fail "mw_resume did not resume from completed train_ts=$START_TS (got $r_resume_ts)"
+    # Crossed the boundary into the remaining MW_TS-1 windows and exited 0.
+    [[ "$r_exit" != "0" ]] && fail "mw_resume did not complete cleanly (phase_exit=$r_exit) — boundary deadlock on resume?"
+    [[ "$r_barriers" != "$((MW_TS - 1))" ]] && fail "mw_resume barrier fired $r_barriers times, expected $((MW_TS - 1)) (windows after the resumed one)"
+    # Broadcast still once on the resume path; data-pct cadence rebuilt.
+    [[ "$r_anchors" != "1" ]] && fail "mw_resume total_train_anchors computed $r_anchors times, expected 1"
+    [[ "$r_dpct" != "True" ]] && fail "mw_resume data-pct eval cadence not set up"
+
+    (( FAIL == 0 )) && echo "=== multiwindow: PASS ===" || echo "=== multiwindow: FAIL ==="
+}
+
+# =============================================================================
+[[ "$SCENARIO" == "midwindow"  || "$SCENARIO" == "all" ]] && run_midwindow
+[[ "$SCENARIO" == "multiwindow" || "$SCENARIO" == "all" ]] && run_multiwindow
 
 if [[ "$KEEP" != "1" ]]; then
     rm -rf "$LOG_DIR"
-    clean_ckpt
+    clean_ckpt "$CKPT_ROOT"
 fi
 
-if [[ $RC -eq 0 ]]; then
-    echo "=== PASS: resume validated (functional invariants + trajectory within $ATOL of baseline) ==="
-else
-    echo "=== FAIL: trajectory diverged beyond $ATOL — likely a real resume bug (wrong data slice / unrestored state), not nondeterminism ==="
+if (( FAIL == 0 )); then
+    echo "=== PASS: all selected scenarios validated ==="
+    exit 0
 fi
-exit $RC
+echo "=== FAIL: one or more scenarios failed (see above) ==="
+exit 1
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 526a6f89d..9c422c161 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -154,7 +154,11 @@ def setup(
     os.environ["MASTER_PORT"] = str(master_port)
 
     BACKEND = dist.Backend.NCCL
-    TIMEOUT = 1800
+    # Process-group / NCCL watchdog timeout (seconds). Env-overridable so a
+    # diagnostic run can use a short, finite timeout that trips the NCCL flight
+    # recorder dump (TORCH_NCCL_TRACE_BUFFER_SIZE + TORCH_NCCL_DUMP_ON_TIMEOUT)
+    # on a collective desync instead of hanging for the full default.
+    TIMEOUT = int(os.environ.get("PG_TIMEOUT_S", "1800"))
 
     # set device BEFORE init_process_group so NCCL binds this rank to its
     # own GPU; otherwise every rank's first CUDA context lands on GPU 0,
@@ -196,6 +200,55 @@ def cleanup() -> None:
     dist.destroy_process_group()
 
 
+def _window_boundary_barrier(
+    device: torch.device, world_size: int, train_ts: int
+) -> None:
+    """Collective rendezvous at a streaming window boundary.
+
+    The per-window data prep (``window_indices``: an O(N) mask over the ~18 GB
+    mmap'd ``anchor_ts`` array) can complete at very different times across
+    ranks. The embedding input-dist all-to-all that follows is a collective, so
+    if a fast rank reaches it while a slow rank is still in prep, the NCCL
+    stream desyncs and the job deadlocks (one rank a collective behind the
+    rest). Synchronizing here makes prep-time skew harmless: every rank waits
+    until all ranks have a ready window before any issues the first forward.
+
+    Cost is one near-zero-payload barrier per window (299 total over a full
+    run). In the healthy case prep already overlapped the previous window's
+    compute via the prefetcher, so the barrier returns immediately; it only
+    blocks for the real prep skew it is there to absorb.
+    """
+    if not (dist.is_available() and dist.is_initialized()) or world_size <= 1:
+        return
+    t0 = time.time()
+    if device.type == "cuda":
+        dist.barrier(device_ids=[device.index])
+    else:
+        dist.barrier()
+    waited = time.time() - t0
+    # Surface non-trivial skew (the thing this barrier exists to absorb) so a
+    # node with a slow rank is visible without trawling the flight recorder.
+    if waited > 5.0:
+        logger.warning(
+            "[window-barrier] train_ts=%d: waited %.1fs at boundary "
+            "rendezvous (per-rank data-prep skew)",
+            train_ts,
+            waited,
+        )
+    # Test/debug observability: the healthy-path barrier is otherwise SILENT
+    # (the skew warning above only fires on >5s waits), so the resume e2e test
+    # has no signal that the boundary rendezvous actually executed. When
+    # WINDOW_BARRIER_DEBUG=1, rank 0 emits exactly one line per crossed window
+    # so the test can assert the barrier ran at EVERY boundary (regression guard
+    # for the desync deadlock the barrier fixes). Off by default — zero prod cost.
+    if os.environ.get("WINDOW_BARRIER_DEBUG") == "1" and dist.get_rank() == 0:
+        logger.info(
+            "[window-barrier] train_ts=%d rendezvous complete (waited %.3fs)",
+            train_ts,
+            waited,
+        )
+
+
 class HammerToTorchDataset(TorchDataset):
     def __init__(
         self,
@@ -534,6 +587,14 @@ def make_optimizer_and_shard(
     # local_world_size = GPUs per node so the planner respects the intra-node
     # (xGMI/NVLink) vs inter-node hierarchy when placing shards. Defaults to
     # world_size for the single-node case (no behavior change).
+    logger.info(
+        "[hbm-cap] make_optimizer_and_shard: hbm_cap_gb=%s (planner Topology hbm_cap=%d bytes), "
+        "world_size=%s local_world_size=%s",
+        hbm_cap_gb,
+        hbm_cap_gb * 1024 * 1024 * 1024,
+        world_size,
+        local_world_size or world_size,
+    )
     planner = EmbeddingShardingPlanner(
         topology=Topology(
             local_world_size=local_world_size or world_size,
@@ -1728,9 +1789,30 @@ def streaming_train_eval_loop(
             else int(os.environ.get("BATCH_SIZE", "1024"))
         )
         if hasattr(dataset.dataset, "total_train_anchors"):
-            total_train_anchors = dataset.dataset.total_train_anchors(  # pyre-ignore[16]
-                eval_anchor_ts, requested_end_ts - eval_anchor_ts
-            )
+            # total_train_anchors does a full-range gather over the mmap'd uid
+            # array for ~billions of positions + a uid hash. It is slow
+            # (minutes, single-threaded) AND, run on every rank independently,
+            # a large per-rank skew source: a fast rank finishes and races into
+            # the first embedding all-to-all while slow ranks are still hashing,
+            # desyncing the NCCL collective stream and hanging the job. The
+            # result is a pure function of the (identical) dataset + split, so
+            # compute it ONCE on rank 0 and broadcast the scalar; ranks 1..N
+            # skip the gather entirely (no 8x mmap/CPU contention, no skew).
+            if world_size > 1 and torch.distributed.is_initialized():
+                _tta = (
+                    dataset.dataset.total_train_anchors(  # pyre-ignore[16]
+                        eval_anchor_ts, requested_end_ts - eval_anchor_ts
+                    )
+                    if rank == 0
+                    else 0
+                )
+                _tta_t = torch.tensor([_tta], dtype=torch.int64, device=device)
+                torch.distributed.broadcast(_tta_t, src=0)
+                total_train_anchors = int(_tta_t.item())
+            else:
+                total_train_anchors = dataset.dataset.total_train_anchors(  # pyre-ignore[16]
+                    eval_anchor_ts, requested_end_ts - eval_anchor_ts
+                )
             total_train_steps = total_train_anchors // max(1, bs * world_size)
             eval_interval_steps = max(
                 1, round(eval_every_data_pct * total_train_steps)
@@ -2401,6 +2483,18 @@ def _do_eval_db(train_ts: int, gstep: int) -> None:
                 if i == 0 and resume_batch_idx_in_window > 0
                 else 0
             )
+            # Rendezvous all ranks at the window boundary BEFORE the first
+            # forward of this window. The prefetcher has already handed back a
+            # ready iterator (this window's window_indices mmap scan is done),
+            # but that O(N) scan over the ~18 GB anchor_ts array can finish at
+            # very different times across ranks. Without this barrier a fast
+            # rank issues the first embedding all-to-all while a slow rank is
+            # still in prep, desyncing the NCCL collective stream and hanging
+            # the job (observed at a window boundary via the flight recorder:
+            # ranks split across consecutive collective seq ids). This only
+            # absorbs prep skew (one near-zero sync per window); it does not
+            # serialize the background prefetch of future windows.
+            _window_boundary_barrier(device, world_size, train_ts)
             if _per_window_blocks:
                 mlt.block_start()
             _run_train_window(
@@ -2489,6 +2583,10 @@ def _do_eval_nb(train_ts: int, gstep: int) -> None:
                 if i == 0 and resume_batch_idx_in_window > 0
                 else 0
             )
+            # See the double-buffer path: rendezvous all ranks at the window
+            # boundary before the first forward so per-rank data-prep skew
+            # cannot desync the NCCL collective stream and hang the job.
+            _window_boundary_barrier(device, world_size, train_ts)
             if _per_window_blocks:
                 mlt.block_start()
             _run_train_window(
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index af9ff3907..2177f446c 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -35,10 +35,10 @@
 #                       what run_streaming_e2e.sh invokes per relaunch)
 #   Perf pair:
 #     LOG=/apps/chcai/perf_1node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
-#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
+#       EVAL_EVERY_N_WINDOWS=0 METRIC_LOG_FREQ=20 \
 #       sbatch --nodes=1 --job-name=y1 scripts/launch_slurm.sh
 #     LOG=/apps/chcai/perf_2node.log NUM_TRAIN_BATCHES=200 NUM_EVAL_BATCHES=0 \
-#       EVAL_EACH_WINDOW=0 METRIC_LOG_FREQ=20 \
+#       EVAL_EVERY_N_WINDOWS=0 METRIC_LOG_FREQ=20 \
 #       sbatch --nodes=2 --job-name=y2 scripts/launch_slurm.sh
 #     # then: bash scripts/compare_node_perf.sh /apps/chcai/perf_1node.log /apps/chcai/perf_2node.log
 #
@@ -285,7 +285,6 @@ orchestrate() {
       -e MODE=$MODE \
       -e START_TS=$START_TS \
       -e NUM_TRAIN_TS=$NUM_TRAIN_TS \
-      -e EVAL_EACH_WINDOW=$EVAL_EACH_WINDOW \
       -e EVAL_EVERY_N_WINDOWS=$EVAL_EVERY_N_WINDOWS \
       ${EVAL_EVERY_DATA_PCT:+-e EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT} \
       -e NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \

From f8807ca280409f10eadb6978c8db60b3c2975a1f Mon Sep 17 00:00:00 2001
From: chriscai-amd <chcai@amd.com>
Date: Fri, 26 Jun 2026 22:16:43 +0000
Subject: [PATCH 105/113] dlrmv3 streaming: make resume e2e test pass on
 MI350/NFS

Platform-aware per-phase timeouts (meta64 NFS full-model checkpoints take
~9 min each vs B200 node-local NVMe), exposed via new --phase-timeout /
--mw-run-timeout overrides. Fix cleanup_workers self-kill: a plain
`pkill -f generative_recommenders` matched its own shell and SIGKILLed
cleanup mid-run, leaking trainer VRAM so the next phase OOM'd; now uses
bracketed patterns and blocks until trainers exit and VRAM drains.

Validated PASS end-to-end on MI350 (midwindow + multiwindow).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../train/tests/streaming_resume_test.py      |  4 +-
 .../train/tests/streaming_resume_test.sh      | 76 ++++++++++++++++---
 2 files changed, 68 insertions(+), 12 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
index cfe9b2e84..e3f78886c 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py
@@ -15,8 +15,8 @@
 """End-to-end failure-injection test for streaming resume.
 
 Two scenarios, driven by the sibling `streaming_resume_test.sh` (see its header
-for the full B200 launch wiring). This module is the shared log parser + a CLI
-the driver shells out to.
+for the full, platform-general launch wiring — NVIDIA B200 and AMD MI350/MI355).
+This module is the shared log parser + a CLI the driver shells out to.
 
 SCENARIO `midwindow` — exact-once mid-window resume. Validates the four
 single-window resume features end-to-end on the yambda-5b stack:
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
index afdc65805..8acbf8269 100755
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -38,8 +38,8 @@
 #            remaining window, anchors broadcast once, and the run COMPLETED —
 #            i.e. the exact boundary-crossing-on-resume case that used to hang.
 #
-# Driven entirely via env-driven gin knobs (yambda_5b.gin) through the SAME B200
-# worker entrypoint the production supervisor uses: `bash scripts/launch_slurm.sh`
+# Driven entirely via env-driven gin knobs (yambda_5b.gin) through the SAME worker
+# entrypoint both platforms' production supervisors use: `bash scripts/launch_slurm.sh`
 # (worker phase, auto-detected inside the container). WINDOW_BARRIER_DEBUG=1 makes
 # the otherwise-silent barrier emit one rank-0 line per crossed window.
 #
@@ -60,14 +60,22 @@
 #       [--container <name>] [--data-path <path>] [--ckpt-root <path>] [--start-ts 150]
 #       [--num-train-batches 200] [--die-at-step 100]      # midwindow knobs
 #       [--mw-num-train-ts 3] [--mw-num-train-batches 20]  # multiwindow knobs
-#       [--mw-eval-pct 0.34] [--keep]
+#       [--mw-eval-pct 0.34] [--phase-timeout S] [--mw-run-timeout S] [--keep]
 #   --platform is auto-detected from the running container when omitted. Any of
 #   --container/--data-path/--ckpt-root override the platform default.
+#   Per-phase wait budgets default per-platform (B200 node-local NVMe: 1800/3600s;
+#   MI350/MI355 shared-NFS full-model ckpts ~9 min each: 5400/5400s) and can be
+#   overridden with --phase-timeout (midwindow) / --mw-run-timeout (multiwindow).
 
 set -uo pipefail
 
 JOBID=""
-REPO=/home/chcai/training/recommendation_v4
+# Repo root is derived from THIS script's location
+# (<repo>/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh —
+# four levels up), so the test is not pinned to any one user's home. Override with
+# --repo if the repo is mounted at a different path inside the container.
+_SELF_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+REPO=$(cd "$_SELF_DIR/../../../.." && pwd)
 DATASET_SUBDIR=processed_5b/hstu_cache_L4086
 SCENARIO=all                                # midwindow | multiwindow | all
 START_TS=150
@@ -89,7 +97,12 @@ NUM_EVAL_BATCHES=5      # cap the per-phase FINAL eval (0 = full holdout, very s
 DIE_AT_STEP=100
 IN_WINDOW_FREQ=50
 ATOL=0.15           # trajectory closeness bound (NOT bit-equality; see py module)
-MW_TIMEOUT=1800
+# Per-phase wait budget. Left empty here and filled per-platform below (a B200
+# ckpt save/load hits node-local NVMe and is fast; on meta64 each full-model DCP
+# save/load lands on shared NFS and takes ~9 min, and the resume phase does a
+# LOAD + several in-window saves + an end-of-window save, so it needs far longer).
+# Override explicitly with --phase-timeout.
+MW_TIMEOUT=""
 
 # --- multiwindow knobs ---
 MW_TS=3                 # windows to train (>=2 to cross a boundary)
@@ -98,7 +111,9 @@ MW_EVAL_BATCHES=5       # holdout eval batches per fired eval
 MW_EVAL_PCT=0.34        # data-fraction eval cadence (>0 enables the anchors path)
 MW_SPLIT=0.90           # train split (<1 => holdout exists => uid-hash anchor path)
 MW_HOLDOUT_TS=200       # PINNED holdout window (must match across seed→resume)
-MW_RUN_TIMEOUT=3600     # generous: init + planner + anchors gather can take min
+# generous: init + planner + anchors gather can take min; on NFS add ckpt save/load.
+# Empty => filled per-platform below. Override with --mw-run-timeout.
+MW_RUN_TIMEOUT=""
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -117,6 +132,8 @@ while [[ $# -gt 0 ]]; do
         --die-at-step) DIE_AT_STEP="$2"; shift 2;;
         --in-window-freq) IN_WINDOW_FREQ="$2"; shift 2;;
         --atol) ATOL="$2"; shift 2;;
+        --phase-timeout) MW_TIMEOUT="$2"; shift 2;;
+        --mw-run-timeout) MW_RUN_TIMEOUT="$2"; shift 2;;
         --mw-num-train-ts) MW_TS="$2"; shift 2;;
         --mw-num-train-batches) MW_BATCHES="$2"; shift 2;;
         --mw-num-eval-batches) MW_EVAL_BATCHES="$2"; shift 2;;
@@ -142,7 +159,20 @@ if [[ -z "$PLATFORM" ]]; then
         _names=$(srun --jobid="$JOBID" --overlap docker ps -a --format '{{.Names}}' 2>/dev/null)
         if grep -qx yambda_b200 <<<"$_names"; then PLATFORM=b200
         elif grep -qx yambda_primus <<<"$_names"; then PLATFORM=mi350
-        else PLATFORM=b200; echo "Warning: could not auto-detect platform (no known container on job $JOBID) — defaulting to b200"; fi
+        else
+            # No known training container yet (e.g. container not provisioned).
+            # Fall back to probing the allocation's GPU vendor on the host so we
+            # do NOT silently assume a platform.
+            _vendor=$(srun --jobid="$JOBID" --overlap bash -lc \
+                'if command -v rocm-smi >/dev/null 2>&1; then echo amd; \
+                 elif command -v nvidia-smi >/dev/null 2>&1; then echo nvidia; \
+                 else echo unknown; fi' 2>/dev/null | head -1)
+            case "$_vendor" in
+                amd)    PLATFORM=mi350; echo "[$(date)] no known container — detected AMD GPU host (rocm-smi) → mi350";;
+                nvidia) PLATFORM=b200;  echo "[$(date)] no known container — detected NVIDIA GPU host (nvidia-smi) → b200";;
+                *) echo "Error: could not auto-detect platform on job $JOBID (no yambda_b200/yambda_primus container and no rocm-smi/nvidia-smi). Pass --platform b200|mi350|mi355."; exit 1;;
+            esac
+        fi
     fi
     echo "[$(date)] auto-detected platform: $PLATFORM"
 fi
@@ -152,6 +182,9 @@ case "$PLATFORM" in
         # B200: mmap (ckpt LOAD + dataset cache) must NOT touch virtiofs/NFS.
         [[ "$DATA_PATH" == "__AUTO__" ]] && DATA_PATH=/tmp/yambda_data
         : "${CKPT_ROOT:=/tmp/yambda_resume_test/ckpts}"
+        # Node-local NVMe: full-model save/load is fast.
+        : "${MW_TIMEOUT:=1800}"
+        : "${MW_RUN_TIMEOUT:=3600}"
         ;;
     mi350|mi355)
         : "${CONTAINER:=yambda_primus}"
@@ -159,9 +192,15 @@ case "$PLATFORM" in
         # (matches the original MI350 test). /apps/chcai/dlrm_data is the gin default.
         [[ "$DATA_PATH" == "__AUTO__" ]] && DATA_PATH=/apps/chcai/dlrm_data
         : "${CKPT_ROOT:=/apps/chcai/ckpts_resume_test}"
+        # Shared NFS: each full-model DCP save/load is ~9 min. The midwindow resume
+        # phase chains a LOAD + multiple in-window saves + an end-of-window save
+        # (>2000s observed), so the B200 budgets are far too tight — abandoning a
+        # still-running trainer leaks GPU VRAM and OOMs the next phase. Be generous.
+        : "${MW_TIMEOUT:=5400}"
+        : "${MW_RUN_TIMEOUT:=5400}"
         ;;
 esac
-echo "[$(date)] platform=$PLATFORM container=$CONTAINER data_path=${DATA_PATH:-<gin default>} ckpt_root=$CKPT_ROOT"
+echo "[$(date)] platform=$PLATFORM container=$CONTAINER data_path=${DATA_PATH:-<gin default>} ckpt_root=$CKPT_ROOT phase_timeout=${MW_TIMEOUT}s mw_run_timeout=${MW_RUN_TIMEOUT}s"
 
 mkdir -p "$LOG_DIR"
 PYHELPER="$REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.py"
@@ -170,9 +209,26 @@ PYHELPER="$REPO/generative_recommenders/dlrm_v3/train/tests/streaming_resume_tes
 #     path is node-local on B200 or shared NFS on MI350) ---
 sx() { srun --jobid="$JOBID" --overlap docker exec "$CONTAINER" bash -lc "$1" 2>/dev/null; }
 
+# Kill any lingering trainer procs from a prior phase AND block until they are
+# really gone, so the freed GPU VRAM is reclaimed before the next phase shards
+# its embedding tables (otherwise it OOMs on the leaked memory).
+#   * Bracketed patterns ([t]rain_ranker, …) are REQUIRED: a plain `pkill -f
+#     train_ranker` issued inside `bash -lc "...train_ranker..."` matches its OWN
+#     command line and SIGKILLs this very shell (docker exec returns 137), which
+#     silently aborted the rest of the old cleanup and leaked trainers/VRAM.
+#   * After signalling, poll until no trainer remains (bounded), then a short
+#     settle so the driver finishes reclaiming device memory.
 cleanup_workers() {
-    sx "pkill -9 -f train_ranker 2>/dev/null; pkill -9 -f generative_recommenders 2>/dev/null; \
-        pkill -9 -f multiprocessing 2>/dev/null; sleep 2; pkill -9 -f spawn_main 2>/dev/null; sleep 3; true" || true
+    sx '
+        for pat in "[t]rain_ranker" "[g]enerative_recommenders" "[s]pawn_main" "[m]ultiprocessing"; do
+            pkill -9 -f "$pat" 2>/dev/null
+        done
+        for _ in $(seq 1 30); do
+            pgrep -f "[t]rain_ranker" >/dev/null 2>&1 || \
+            pgrep -f "[g]enerative_recommenders" >/dev/null 2>&1 || break
+            sleep 2
+        done
+        sleep 3; true' || true
 }
 clean_ckpt() { sx "rm -rf '$1'" || true; }
 

From ea370649de4d424bc85206327b993c1dc5add8b7 Mon Sep 17 00:00:00 2001
From: chris <chcai@amd.com>
Date: Fri, 26 Jun 2026 23:17:05 +0000
Subject: [PATCH 106/113] dlrmv3 streaming: document midwindow vs multiwindow
 resume test purpose

Add a plain-language header section to streaming_resume_test.sh explaining
why there are two scenarios and how they differ: midwindow guards resume
CORRECTNESS (land on the right batch/RNG/checkpoint within one window), while
multiwindow guards LIVENESS at window seams (all ranks cross a window boundary
in lockstep without an NCCL desync deadlock). Includes before/after timelines
of the boundary hang vs the rank-0-broadcast + dist.barrier fixes, why a
single-window test structurally cannot catch those bugs, and a comparison
table. Comments only; no behavior change.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../train/tests/streaming_resume_test.sh      | 67 +++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
index 8acbf8269..e093fb31a 100755
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/tests/streaming_resume_test.sh
@@ -1,6 +1,73 @@
 #!/bin/bash
 # End-to-end failure-injection + resume test for streaming-train-eval.
 #
+# ============================================================================
+# WHY TWO TESTS (the intuition, in plain language)
+# ============================================================================
+# Training runs over consecutive time WINDOWS (window 0, then 1, then 2, ...).
+# All N GPUs must march from one window to the next IN LOCKSTEP: they constantly
+# do "everybody-talk-at-once" group ops (NCCL collectives — sharing embeddings
+# across GPUs), and every GPU must enter each group-op at the same time. If one
+# GPU is late, the rest wait for it forever and the whole job FREEZES (deadlock).
+#
+# The two scenarios check DIFFERENT kinds of failure — not bigger vs smaller:
+#
+#   midwindow   = CORRECTNESS. "When I crash and resume, do I land on the exact
+#                 right batch with the right RNG state and produce the same
+#                 numbers?"  (stays inside ONE window; never crosses a seam.)
+#
+#   multiwindow = LIVENESS / NO-DEADLOCK. "Can all N GPUs hand off across a
+#                 window SEAM together without one falling out of step and
+#                 hanging the job?"  (needs >=2 windows so a seam actually exists.)
+#
+# The dangerous spot is the SEAM between two windows: there, each GPU does solo
+# prep work (load next window's data; count anchors for the eval cadence) and
+# they DON'T all finish at the same speed. Two bugs lived exactly there, and BOTH
+# are invisible to a single-window test:
+#   (A) every GPU separately ran a slow O(N) "count all the data" pass -> they
+#       finished at different times -> fast GPU barged into the next group-op
+#       while others were still counting -> freeze.
+#       FIX: only rank 0 counts, then broadcasts the number to everyone else.
+#   (B) no rendezvous at the seam -> uneven data-prep -> same desync -> freeze.
+#       FIX: a dist.barrier() at every window boundary (all GPUs wait, then cross
+#       together).  WINDOW_BARRIER_DEBUG=1 makes rank 0 log one line per seam.
+#
+#   TIMELINE — without the fixes (each GPU on its own clock at the seam):
+#       win0 train  | solo prep (varies) | next group-op
+#       GPU0 ########|=====|              >> waiting..........
+#       GPU1 ########|========|           >> waiting.......
+#       GPU2 ########|===========|        >> waiting....
+#       GPU3 ########|==============|     >> never lines up  -> HANG
+#
+#   TIMELINE — with the fixes (rank 0 counts + a barrier gate at the seam):
+#       win0 train  | [== BARRIER: all wait ==] | win1 train
+#       GPU0 ########| count |  wait           # |########
+#       GPU1 ########|       |  wait           # |########
+#       GPU2 ########|       |  wait           # |########
+#       GPU3 ########|       |  wait           # |########
+#                 rank0 shares count ^   all cross together ^  -> OK
+#
+# Why midwindow can NOT catch (A)/(B): it runs a SINGLE window with per-window
+# eval off (NUM_TRAIN_TS=1, EVAL_EVERY_N_WINDOWS=0, split=1.0), so it never
+# reaches a seam and never turns on the data-fraction-eval/anchor-count path.
+# A broken barrier or broken broadcast passes midwindow silently.
+#
+# Why the multiwindow RESUME phase (P3 below) is the meanest case: restarting
+# from a checkpoint loads the saved window and then IMMEDIATELY steps across a
+# seam into the next window — landing right on the spot that used to freeze, AND
+# re-running all that slow setup on the resume path. If (A)/(B) regressed, P3
+# hangs and the test fails by timing out.
+#
+#                 | midwindow            | multiwindow
+#   --------------+----------------------+-----------------------------
+#   proves        | resume to RIGHT spot | cross seam WITHOUT freezing
+#   windows       | 1 (no seam)          | >=2 (crosses >=1 seam)
+#   data-pct eval | off                  | on (exercises the anchor count)
+#   catches       | wrong batch/RNG/ckpt | missing barrier/broadcast -> HANG
+#   failure mode  | wrong NUMBERS        | job FREEZES forever
+# They are complementary: you need BOTH.
+# ============================================================================
+#
 # PLATFORM-GENERAL: runs on both NVIDIA B200 and AMD MI350/MI355 (ROCm/meta64).
 # The only hardware-specific bits are picked by --platform (auto-detected from the
 # running container if omitted): the container name, the dataset path, and the

From 8db66f3dfc7ca2a1de925f34315890d0677a4305 Mon Sep 17 00:00:00 2001
From: chris <chcai@amd.com>
Date: Sat, 27 Jun 2026 00:54:12 +0000
Subject: [PATCH 107/113] dlrmv3: gin/env-configurable embedding table
 placement (HBM/UVM)

Add per-table and global control over embedding table placement via gin,
overridable by env var. make_optimizer_and_shard now translates an
EMB_PLACEMENT global default plus per-table EMB_PLACEMENT_OVERRIDES
(hbm|uvm|uvm_caching|auto) into torchrec ParameterConstraints fed to the
EmbeddingShardingPlanner; "auto" leaves the table to the planner so the
default is byte-identical to the prior behavior (constraints=None). A new
env_str_map gin helper parses "name=val,name=val" with opt-in per-key merge
so a launch-time env tweak layers over the gin default. Also logs the
planner's ACTUAL per-table compute kernel ([emb-placement] plan: ...).

Validated end-to-end on 8x B200: force-HBM put every table on fused, and a
per-table override put uid on fused_uvm_caching while the rest stayed fused;
both trained cleanly.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           |  21 +++
 .../dlrm_v3/train/train_ranker.py             |   1 +
 .../dlrm_v3/train/utils.py                    | 131 +++++++++++++++++-
 .../generative_recommenders/dlrm_v3/utils.py  |  39 ++++++
 4 files changed, 191 insertions(+), 1 deletion(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 14bc9d106..23cd7886b 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -158,6 +158,27 @@ make_optimizer_and_shard.hbm_cap_gb = @env_int()
 env_int.key = "HBM_CAP_GB"
 env_int.default = 260
 
+# Embedding table placement. Global default applied to EVERY table:
+#   auto        -> no constraint; planner decides from the HBM cap (default)
+#   hbm         -> FUSED          (resident in GPU HBM)
+#   uvm         -> FUSED_UVM      (host DDR via UVM, no HBM cache)
+#   uvm_caching -> FUSED_UVM_CACHING (host DDR + HBM cache)
+# "force HBM" for all tables = set this to "hbm" (or export EMB_PLACEMENT=hbm).
+make_optimizer_and_shard.embedding_placement = @emp/env_str()
+emp/env_str.key     = "EMB_PLACEMENT"
+emp/env_str.default = "auto"
+
+# Per-table placement overrides (win over the global default above). Configure
+# each table independently here as a gin dict; allowed values per table:
+#   "hbm" | "uvm" | "uvm_caching" | "auto"  (unlisted tables use the global
+# default above). merge=True means a per-run env override LAYERS ON TOP of this
+# dict per key (tweak one table at launch, keep the rest), e.g.:
+#   EMB_PLACEMENT_OVERRIDES="uid=hbm"   # only retargets uid; others stay as below
+make_optimizer_and_shard.embedding_placement_overrides = @env_str_map()
+env_str_map.key     = "EMB_PLACEMENT_OVERRIDES"
+env_str_map.merge   = True
+env_str_map.default = {}
+
 # Sparse embedding all-to-all wire precision. The embedding shuffle is the
 # dominant, bandwidth-bound (esp. multi-node) collective; quantizing it via
 # TorchRec QCommsConfig halves (bf16/fp16, both 2 bytes) the wire volume.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
index d5797697b..a57153f60 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/train_ranker.py
@@ -150,6 +150,7 @@ def _main_func(
         device=device,
         world_size=world_size,
         local_world_size=gpus_per_node,
+        embedding_table_configs=embedding_table_configs,
     )
     # Decorrelate forward-time stochasticity (HSTU dropout) per data-parallel
     # rank. MUST run after make_model() + make_optimizer_and_shard() so the
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 9c422c161..8d52a055b 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -55,8 +55,10 @@
 from torch.optim.optimizer import Optimizer
 from torch.utils.data import DataLoader, Dataset as TorchDataset
 from torch.utils.data.distributed import _T_co, DistributedSampler
+from torchrec.distributed.embedding_types import EmbeddingComputeKernel
 from torchrec.distributed.model_parallel import DistributedModelParallel
 from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
+from torchrec.distributed.planner.types import ParameterConstraints
 from torchrec.distributed.sharding_plan import get_default_sharders
 from torchrec.distributed.types import ShardedTensor, ShardingEnv
 from torchrec.modules.embedding_configs import EmbeddingConfig
@@ -75,6 +77,16 @@
     EmbeddingCollection,
 }
 
+# Embedding placement vocabulary -> torchrec compute kernel. Used by
+# make_optimizer_and_shard to translate the gin/env placement strings
+# ("hbm"/"uvm"/"uvm_caching") into ParameterConstraints. "auto" (or anything not
+# in this map) means "no constraint": the planner decides from the HBM cap.
+_PLACEMENT_TO_KERNEL: Dict[str, EmbeddingComputeKernel] = {
+    "hbm": EmbeddingComputeKernel.FUSED,
+    "uvm": EmbeddingComputeKernel.FUSED_UVM,
+    "uvm_caching": EmbeddingComputeKernel.FUSED_UVM_CACHING,
+}
+
 
 @gin.configurable
 def seed_everything(seed: int = -1, rank: int = 0) -> None:
@@ -552,6 +564,89 @@ def _maybe_apply_qcomm_a2a(
         return sharders
 
 
+def _embedding_table_names(
+    model: torch.nn.Module,
+    embedding_table_configs: Optional[Dict[str, EmbeddingConfig]],
+) -> List[str]:
+    """All embedding table names the planner will place.
+
+    Prefers the authoritative `embedding_table_configs` (keys == table names ==
+    planner parameter names). Falls back to walking the model's EBC/EC modules
+    when the configs are not passed in.
+    """
+    if embedding_table_configs:
+        return list(embedding_table_configs.keys())
+    names: List[str] = []
+    for _, module in model.named_modules():
+        if type(module) in TORCHREC_TYPES:
+            if isinstance(module, EmbeddingBagCollection):
+                names.extend(c.name for c in module.embedding_bag_configs())
+            elif isinstance(module, EmbeddingCollection):
+                names.extend(c.name for c in module.embedding_configs())
+    return names
+
+
+def _build_placement_constraints(
+    model: torch.nn.Module,
+    embedding_placement: str,
+    embedding_placement_overrides: Dict[str, str],
+    embedding_table_configs: Optional[Dict[str, EmbeddingConfig]],
+) -> Dict[str, ParameterConstraints]:
+    """Translate gin/env placement strings into torchrec ParameterConstraints.
+
+    Resolution per table: ``overrides.get(name, embedding_placement)``. A value
+    of ``auto`` (or empty) means "no constraint" and the table is omitted so the
+    planner keeps deciding from the HBM cap. Unknown values raise ValueError.
+    """
+    valid = set(_PLACEMENT_TO_KERNEL) | {"auto", ""}
+    for where, val in [
+        ("embedding_placement", embedding_placement),
+        *[
+            (f"embedding_placement_overrides[{k}]", v)
+            for k, v in embedding_placement_overrides.items()
+        ],
+    ]:
+        if val not in valid:
+            raise ValueError(
+                f"Invalid embedding placement {val!r} for {where}; "
+                f"expected one of {sorted(valid - {''})}."
+            )
+
+    names = _embedding_table_names(model, embedding_table_configs)
+    unknown = set(embedding_placement_overrides) - set(names)
+    if unknown:
+        logger.warning(
+            "[emb-placement] override(s) for unknown table(s) %s ignored; "
+            "known tables: %s",
+            sorted(unknown),
+            sorted(names),
+        )
+
+    constraints: Dict[str, ParameterConstraints] = {}
+    resolved: Dict[str, str] = {}
+    for name in names:
+        placement = embedding_placement_overrides.get(name, embedding_placement)
+        resolved[name] = placement or "auto"
+        kernel = _PLACEMENT_TO_KERNEL.get(placement)
+        if kernel is not None:
+            constraints[name] = ParameterConstraints(
+                compute_kernels=[kernel.value]
+            )
+
+    rank = dist.get_rank() if dist.is_initialized() else 0
+    if rank == 0:
+        logger.info(
+            "[emb-placement] global=%r overrides=%s -> resolved=%s "
+            "(constrained=%d/%d tables; the rest are planner-auto)",
+            embedding_placement,
+            embedding_placement_overrides or {},
+            resolved,
+            len(constraints),
+            len(names),
+        )
+    return constraints
+
+
 @gin.configurable
 def make_optimizer_and_shard(
     model: torch.nn.Module,
@@ -561,6 +656,9 @@ def make_optimizer_and_shard(
     hbm_cap_gb: int = 260,
     sparse_a2a_forward_precision: str = "fp32",
     sparse_a2a_backward_precision: str = "fp32",
+    embedding_placement: str = "auto",
+    embedding_placement_overrides: Optional[Dict[str, str]] = None,
+    embedding_table_configs: Optional[Dict[str, EmbeddingConfig]] = None,
 ) -> Tuple[DistributedModelParallel, torch.optim.Optimizer]:
     dense_opt_cls, dense_opt_args, dense_opt_factory = (
         dense_optimizer_factory_and_class()
@@ -595,6 +693,17 @@ def make_optimizer_and_shard(
         world_size,
         local_world_size or world_size,
     )
+    # Resolve per-table embedding placement (gin/env-driven). Global default
+    # `embedding_placement` applies to every table; `embedding_placement_overrides`
+    # (table name -> placement) wins per table. Tables resolving to "auto" carry
+    # no constraint (planner decides from hbm_cap). When nothing is constrained we
+    # pass constraints=None so the plan is byte-identical to the legacy path.
+    constraints = _build_placement_constraints(
+        model=model,
+        embedding_placement=embedding_placement,
+        embedding_placement_overrides=embedding_placement_overrides or {},
+        embedding_table_configs=embedding_table_configs,
+    )
     planner = EmbeddingShardingPlanner(
         topology=Topology(
             local_world_size=local_world_size or world_size,
@@ -602,7 +711,8 @@ def make_optimizer_and_shard(
             compute_device="cuda",
             hbm_cap=hbm_cap_gb * 1024 * 1024 * 1024,
             ddr_cap=0,
-        )
+        ),
+        constraints=constraints or None,
     )
     pg = dist.GroupMember.WORLD
     env = ShardingEnv.from_process_group(pg)  # pyre-ignore [6]
@@ -610,6 +720,25 @@ def make_optimizer_and_shard(
 
     plan = planner.collective_plan(model, sharders, pg)
 
+    # Authoritative placement log: report the compute kernel the planner ACTUALLY
+    # assigned to each table (vs the [emb-placement] line above, which reports what
+    # was requested). "fused" = HBM; "fused_uvm"/"fused_uvm_caching" = UVM-backed.
+    # Rank 0 only, best-effort (never break the build over a logging shape change).
+    if (dist.get_rank() if dist.is_initialized() else 0) == 0:
+        try:
+            for module_path, param_plans in plan.plan.items():
+                for param_name, ps in param_plans.items():
+                    logger.info(
+                        "[emb-placement] plan: %s.%s -> compute_kernel=%s "
+                        "sharding_type=%s",
+                        module_path,
+                        param_name,
+                        getattr(ps, "compute_kernel", "?"),
+                        getattr(ps, "sharding_type", "?"),
+                    )
+        except Exception as e:  # logging only; must never fail the build
+            logger.warning("[emb-placement] could not dump plan kernels: %s", e)
+
     # Re-seed right before DMP materializes/inits the sharded embedding tables.
     # The per-table seeded init_fn (configs.get_embedding_table_config) handles
     # the eager path, but the fused FBGEMM TBE path inits weights on-device and
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
index ed456bde6..1e5d79993 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/utils.py
@@ -1500,6 +1500,45 @@ def env_str(key: str = "", default: str = "") -> str:
     return raw if raw else default
 
 
+@gin.configurable
+def env_str_map(
+    key: str = "",
+    default: Optional[Dict[str, str]] = None,
+    merge: bool = False,
+) -> Dict[str, str]:
+    """Parse os.environ[key] as 'name=value,name=value' into a dict.
+
+    Falls back to `default` (gin) when the env var is unset/empty. Companion to
+    `env_str` for map-valued overrides (e.g. per-table embedding placement).
+    Example gin usage:
+
+        make_optimizer_and_shard.embedding_placement_overrides = @env_str_map()
+        env_str_map.key     = "EMB_PLACEMENT_OVERRIDES"
+        env_str_map.default = {}
+
+    Example env override: EMB_PLACEMENT_OVERRIDES="uid=uvm_caching,item_id=hbm".
+
+    `merge` controls how the parsed env entries combine with `default`:
+      * False (default): the env var REPLACES `default` wholesale (whole-dict
+        override) — set the env var and you fully define the map.
+      * True: the parsed env entries are LAYERED ON TOP of `default` (per-key
+        override) — gin `default` defines the base map and the env var tweaks
+        only the named keys, leaving the rest of `default` intact.
+    """
+    base = dict(default or {})
+    raw = os.environ.get(key) if key else None
+    if not raw:
+        return base
+    parsed: Dict[str, str] = {}
+    for pair in raw.split(","):
+        pair = pair.strip()
+        if not pair:
+            continue
+        k, _, v = pair.partition("=")
+        parsed[k.strip()] = v.strip()
+    return {**base, **parsed} if merge else parsed
+
+
 @gin.configurable
 def env_int(key: str = "", default: int = 0) -> int:
     """Resolve an int from os.environ[key], falling back to `default`.

From d0ce11e2fa1919482b7c91a4961eb32788767085 Mon Sep 17 00:00:00 2001
From: Chris Cai <chcai@amd.com>
Date: Mon, 29 Jun 2026 06:40:15 +0000
Subject: [PATCH 108/113] dlrmv3 qcomm: hard-fail instead of silently falling
 back to fp32 a2a
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When embedding all-to-all quantization is configured (SPARSE_A2A_FWD/BWD)
but cannot actually be enabled, _maybe_apply_qcomm_a2a previously logged a
warning and returned the unquantized sharders, silently running fp32. That
hides real misconfiguration and, worse, could leave some ranks on fp32 while
others run fp16 — desyncing the embedding collectives.

Now every "configured but not enabled" path raises (on all ranks, so the job
aborts consistently):
  - unknown precision string -> ValueError
  - codec registry build failure -> RuntimeError (chained from cause)
  - codec built but no EmbeddingCollectionSharder to bind it to -> RuntimeError

The legitimate no-quant default (forward=backward=fp32) still returns the
sharders untouched.
---
 .../dlrm_v3/train/utils.py                    | 78 +++++++++++--------
 1 file changed, 44 insertions(+), 34 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 8d52a055b..19451078a 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -512,14 +512,12 @@ def _maybe_apply_qcomm_a2a(
     rank0 = (not dist.is_initialized()) or dist.get_rank() == 0
     for name, p in (("forward", fwd), ("backward", bwd)):
         if p not in _COMM:
-            if rank0:
-                logger.warning(
-                    "DLRMV4 qcomm a2a: unknown %s precision %r (want "
-                    "fp32|bf16|fp16); using fp32 a2a",
-                    name,
-                    p,
-                )
-            return sharders
+            # Misconfigured precision: fail loudly rather than silently running
+            # fp32. A typo in SPARSE_A2A_{FWD,BWD} must not pass as "no quant".
+            raise ValueError(
+                f"DLRMV4 qcomm a2a: unknown {name} precision {p!r} "
+                f"(want one of fp32|bf16|fp16)"
+            )
     if fwd == "fp32" and bwd == "fp32":
         return sharders
     try:
@@ -535,33 +533,45 @@ def _maybe_apply_qcomm_a2a(
             backward_precision=getattr(CommType, _COMM[bwd]),
         )
         registry = get_qcomm_codecs_registry(qcfg, device=device)
-        new_sharders = []
-        replaced = False
-        for s in sharders:
-            if type(s).__name__ == "EmbeddingCollectionSharder" and not replaced:
-                new_sharders.append(
-                    EmbeddingCollectionSharder(qcomm_codecs_registry=registry)
-                )
-                replaced = True
-            else:
-                new_sharders.append(s)
-        if rank0:
-            logger.info(
-                "DLRMV4 qcomm a2a ENABLED: forward=%s backward=%s "
-                "replaced_ec_sharder=%s",
-                fwd,
-                bwd,
-                replaced,
+    except Exception as e:  # noqa: BLE001
+        # A configured quantized a2a that fails to build is a hard error. Silently
+        # downgrading to fp32 would change numerics/throughput with no signal, and
+        # a partial failure (one rank fp32, others fp16) would also desync the
+        # collectives. Raise on every rank so the whole job aborts consistently.
+        raise RuntimeError(
+            f"DLRMV4 qcomm a2a: failed to enable configured quantization "
+            f"(forward={fwd} backward={bwd}): {type(e).__name__}: {e}"
+        ) from e
+
+    new_sharders = []
+    replaced = False
+    for s in sharders:
+        if type(s).__name__ == "EmbeddingCollectionSharder" and not replaced:
+            new_sharders.append(
+                EmbeddingCollectionSharder(qcomm_codecs_registry=registry)
             )
-        return new_sharders
-    except Exception as e:  # noqa: BLE001 — fall back to fp32 a2a on any failure
-        if rank0:
-            logger.warning(
-                "DLRMV4 qcomm a2a: failed to enable (%s: %s); using fp32 a2a",
-                type(e).__name__,
-                e,
-            )
-        return sharders
+            replaced = True
+        else:
+            new_sharders.append(s)
+    if not replaced:
+        # Codec registry built fine, but there was no EmbeddingCollectionSharder to
+        # bind it to, so the quantized a2a would be silently inert. Treat this as a
+        # hard failure too — "configured but not applied" is the bug we want caught.
+        raise RuntimeError(
+            f"DLRMV4 qcomm a2a: quantization configured (forward={fwd} "
+            f"backward={bwd}) but no EmbeddingCollectionSharder was found to attach "
+            f"the qcomm codec registry to; refusing to run with quantization "
+            f"silently disabled"
+        )
+    if rank0:
+        logger.info(
+            "DLRMV4 qcomm a2a ENABLED: forward=%s backward=%s "
+            "replaced_ec_sharder=%s",
+            fwd,
+            bwd,
+            replaced,
+        )
+    return new_sharders
 
 
 def _embedding_table_names(

From 584cb66f3902a9a2c46bc647d4226c02674efa20 Mon Sep 17 00:00:00 2001
From: Chris Cai <chcai@amd.com>
Date: Mon, 29 Jun 2026 18:57:16 +0000
Subject: [PATCH 109/113] dlrmv3 yambda-5b: default embedding a2a quantization
 to fp16/fp16
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flip SPARSE_A2A_FWD/BWD gin defaults from fp32 to fp16. An A/B run vs the fp32
interleaved baseline matched window AUC to within fixed-seed noise (mean
Δ≈-1e-6, max|Δ|≈5e-5, Pearson r=1.0 over the 0-54.5% data overlap) while
halving the embedding all-to-all wire volume (~6% end-to-end speedup). Grads on
yambda-5b stay well inside fp16 range (grad-clip=1.0, lr=1e-7), so fp16 backward
is convergence-neutral here; set both to "fp32" to restore the unquantized path.
---
 .../dlrm_v3/train/gin/yambda_5b.gin             | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 23cd7886b..008fe6d0b 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -183,17 +183,20 @@ env_str_map.default = {}
 # dominant, bandwidth-bound (esp. multi-node) collective; quantizing it via
 # TorchRec QCommsConfig halves (bf16/fp16, both 2 bytes) the wire volume.
 # Forward and backward are set independently (each: "fp32" | "bf16" | "fp16").
-# Both "fp32" = off (default; numerically identical to baseline trunk).
-# Per TorchRec golden_training, fwd=fp16 / bwd=bf16 is the recommended quantized
-# mix: fp16's mantissa suits bounded forward activations, while bf16's wider
-# exponent range avoids overflow on gradients.
-# Override via $SPARSE_A2A_FWD / $SPARSE_A2A_BWD.
+# Both "fp32" = off (numerically identical to baseline trunk).
+# Default is now fp16/fp16: an A/B run vs the fp32 baseline matched window AUC to
+# within fixed-seed noise (mean Δ≈-1e-6, max|Δ|≈5e-5, r=1.0 over 0-54.5% data)
+# while cutting the embedding-a2a wire volume in half (~6% end-to-end speedup).
+# TorchRec golden_training suggests fwd=fp16 / bwd=bf16 (bf16's wider exponent
+# hedges gradient overflow), but on yambda-5b the grads stay well in fp16 range
+# (grad-clip=1.0, lr=1e-7), so fp16/fp16 is convergence-neutral here.
+# Override via $SPARSE_A2A_FWD / $SPARSE_A2A_BWD (e.g. set both "fp32" to disable).
 make_optimizer_and_shard.sparse_a2a_forward_precision = @saaf/env_str()
 saaf/env_str.key     = "SPARSE_A2A_FWD"
-saaf/env_str.default = "fp32"
+saaf/env_str.default = "fp16"
 make_optimizer_and_shard.sparse_a2a_backward_precision = @saab/env_str()
 saab/env_str.key     = "SPARSE_A2A_BWD"
-saab/env_str.default = "fp32"
+saab/env_str.default = "fp16"
 
 get_dataset.name = %dataset
 get_dataset.new_path_prefix = %DATA_PATH

From 3a695608d471cb2a37a632bbb582944301449985 Mon Sep 17 00:00:00 2001
From: Chris Cai <chcai@amd.com>
Date: Mon, 29 Jun 2026 19:04:18 +0000
Subject: [PATCH 110/113] dlrmv3 yambda-5b: default gin to the canonical fp16
 full-corpus run

Align the gin defaults so a no-override streaming-train-eval launch reproduces
the validated fp16 run:
  - START_TS 150 -> 0 and NUM_TRAIN_TS 149 -> 299 (sweep the full ts=0..298
    corpus instead of the dense ts=150..298 sub-range)
  - EVAL_HOLDOUT_TS -1 -> 299 (the window just past training; equivalent to the
    prior runtime-resolved value under the new sweep, now explicit)
  - CKPT_TIME_INTERVAL_S 0.0 -> 3600 (hourly saves)
Comments updated; each remains env-overridable to restore the old behavior.
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 31 ++++++++++---------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 008fe6d0b..2834f52ee 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -335,15 +335,17 @@ make_persistent_streaming_dataloader.num_workers = %num_workers
 make_persistent_streaming_dataloader.prefetch_factor = %prefetch_factor
 streaming_train_eval_loop.num_train_ts = @nts/env_int()
 nts/env_int.key = "NUM_TRAIN_TS"
-# 149 daily windows -> with start_ts=150 the run sweeps ts=150..298, the full
-# dense range of the corpus (matches the long e2e runs). Clamped to the
-# dataset's available window count at runtime. Override via $NUM_TRAIN_TS.
-nts/env_int.default = 149
+# 299 daily windows -> with start_ts=0 the run sweeps ts=0..298, the full corpus
+# (matches the long fp16 e2e run). Clamped to the dataset's available window
+# count at runtime. Override via $NUM_TRAIN_TS.
+nts/env_int.default = 299
 # Anchors need >= history_length prior events, so the first ~130 daily windows
-# are near-empty warm-up; start at a dense window. Override via $START_TS.
+# are near-empty warm-up. Default start_ts=0 trains the full corpus from the
+# start (matches the canonical fp16 run); set $START_TS=150 to skip warm-up and
+# begin at a dense window. Override via $START_TS.
 streaming_train_eval_loop.start_ts = @sts/env_int()
 sts/env_int.key = "START_TS"
-sts/env_int.default = 150
+sts/env_int.default = 0
 # Per-step metric logging cadence. Default 50 (one compute_and_log GPU->CPU
 # sync per 50 batches). The streaming-resume test sets METRIC_LOG_FREQ=1 so
 # every step emits a parseable "Step N metrics" line for trajectory comparison.
@@ -434,12 +436,13 @@ streaming_train_eval_loop.double_buffer = @db/env_int()
 db/env_int.key = "DOUBLE_BUFFER"
 db/env_int.default = 1
 # Fixed eval-holdout window range (held-out users' anchors over these windows
-# form the eval set evaluated at EVERY eval step). EVAL_HOLDOUT_TS<0 (default)
-# resolves at runtime to start_ts+num_train_ts (the window just past training),
-# which is stable across resume. EVAL_HOLDOUT_NUM_WINDOWS widens the eval span.
+# form the eval set evaluated at EVERY eval step). Default 299 = the window just
+# past the ts=0..298 training sweep (matches the canonical fp16 run). Set
+# EVAL_HOLDOUT_TS<0 to instead resolve at runtime to start_ts+num_train_ts (also
+# stable across resume). EVAL_HOLDOUT_NUM_WINDOWS widens the eval span.
 streaming_train_eval_loop.eval_holdout_ts = @eht/env_int()
 eht/env_int.key     = "EVAL_HOLDOUT_TS"
-eht/env_int.default = -1
+eht/env_int.default = 299
 streaming_train_eval_loop.eval_holdout_num_windows = @ehnw/env_int()
 ehnw/env_int.key     = "EVAL_HOLDOUT_NUM_WINDOWS"
 ehnw/env_int.default = 1
@@ -545,12 +548,12 @@ streaming_train_eval_loop.checkpoint_step_frequency = @csf/env_int()
 csf/env_int.key     = "CKPT_STEP_FREQ"
 csf/env_int.default = 0
 # Wall-clock checkpoint cadence in seconds: save when >= this many seconds have
-# elapsed since the last save (e.g. 3600 for hourly). Rank 0 owns the clock and
-# broadcasts the decision so all ranks save together. 0.0 (default) = off.
-# Override via $CKPT_TIME_INTERVAL_S.
+# elapsed since the last save. Rank 0 owns the clock and broadcasts the decision
+# so all ranks save together. Default 3600 = hourly saves (matches the canonical
+# fp16 run); set 0.0 to disable. Override via $CKPT_TIME_INTERVAL_S.
 streaming_train_eval_loop.checkpoint_time_interval_s = @ctis/env_float()
 ctis/env_float.key     = "CKPT_TIME_INTERVAL_S"
-ctis/env_float.default = 0.0
+ctis/env_float.default = 3600.0
 # Cap each train_ts window's batch count (mostly for the resume test driver).
 # Unset / 0 = use the full window.
 streaming_train_eval_loop.num_train_batches = @ntb/env_int()

From 4f987c1f6c6f83cf69da3ad301a858c0635208fc Mon Sep 17 00:00:00 2001
From: chriscai-amd <chriscai-amd@users.noreply.github.com>
Date: Tue, 30 Jun 2026 21:05:53 +0000
Subject: [PATCH 111/113] dlrmv3 qcomm: low-memory fbgemm fp16/bf16 a2a codec
 to fix skewed-batch OOM hang

The embedding all-to-all is row-wise sharded, so a data-skewed batch routes a
few extremely hot IDs to a single owner rank, ballooning its a2a input tensor.
fbgemm's quant codec packs the payload via `torch.clamp(t, MIN, MAX).half()`,
where clamp() allocates a full-size fp32 temp before the cast (~2.5x input
peak). On the hottest shard that temp reached ~81.5 GiB and OOM'd the rank,
which then dropped out of the collective while peers blocked in the a2a -> a
deterministic ~30-min NCCL-watchdog hang (yambda-5b 4-node fp16, window 235 /
global step 43621, every run).

Fix: monkeypatch the codec to cast first then clamp in place
(`t.half().clamp_(MIN, MAX)`), dropping the full-size fp32 temp. Bit-for-bit
identical output (values above HALF_MAX cast to +inf which clamp_ maps back to
HALF_MAX; NaNs unchanged) and no throughput regression (strictly less memory
traffic). Validated: the patched run trains through step 43621 with 0 OOM /
0 watchdog timeouts and an unperturbed window-AUC trajectory.

Gin-configurable via make_optimizer_and_shard.qcomm_lowmem_clamp_cast
($QCOMM_LOWMEM_CODEC), ON by default, under a new RUNTIME PATCHES section in
yambda_5b.gin documenting the rationale; a no-op when the a2a is unquantized.

launch_slurm.sh: forward PG_TIMEOUT_S + TORCH_NCCL_* flight-recorder env into
the container (the instrumentation used to root-cause this hang).

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 58 ++++++++++++
 .../dlrm_v3/train/utils.py                    | 90 +++++++++++++++++++
 recommendation_v4/scripts/launch_slurm.sh     |  5 ++
 3 files changed, 153 insertions(+)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 2834f52ee..2eb92b30f 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -198,6 +198,64 @@ make_optimizer_and_shard.sparse_a2a_backward_precision = @saab/env_str()
 saab/env_str.key     = "SPARSE_A2A_BWD"
 saab/env_str.default = "fp16"
 
+# =============================================================================
+# RUNTIME PATCHES / MONKEYPATCHES
+# -----------------------------------------------------------------------------
+# Third-party kernels we override AT RUNTIME (no fork of the dependency). Each
+# knob here is a kill switch for one such patch: ON by default (it fixes a real
+# bug we hit), but flippable per-run so we can A/B the patch, reproduce the
+# original failure, or fall back to stock if a future dependency version makes
+# the patch unnecessary or incorrect. Patches are applied during model
+# build/shard (make_optimizer_and_shard), before any training step.
+# =============================================================================
+#
+# --- qcomm_lowmem_clamp_cast: low-memory fbgemm fp16/bf16 quant codec ---------
+# WHAT IT PATCHES
+#   fbgemm_gpu's embedding-a2a quantizer, fp32_to_fp16_with_clamp (and the bf16
+#   variant), which the TorchRec qcomm codec calls to pack the sparse
+#   embedding all-to-all payload onto the wire. Stock implementation is:
+#       torch.clamp(tensor, HALF_MIN, HALF_MAX).half()
+#   torch.clamp() materializes a SECOND full-size fp32 tensor (same numel as the
+#   input) BEFORE the cast, so the transient peak is
+#       input(fp32) + clamp_temp(fp32) + output(fp16)  ~= 2.5x the input.
+#   The patch reorders to an in-place, allocation-free-equivalent:
+#       tensor.half().clamp_(HALF_MIN, HALF_MAX)
+#   i.e. cast FIRST (only the fp16 output is allocated), then clamp IN PLACE —
+#   dropping the full-size fp32 clamp temp and cutting the transient peak by the
+#   size of the input tensor.
+#
+# WHY IT'S NEEDED (the bug it fixes)
+#   Embeddings are ROW-WISE sharded: every lookup of row r routes to the single
+#   rank that owns r. On a data-skewed batch a few extremely hot IDs send a
+#   disproportionate share of the global lookups to ONE owner rank, so that
+#   rank's a2a input tensor balloons. With the stock codec the extra fp32 clamp
+#   temp on that rank reached ~81.5 GiB, which OOM'd the rank mid-forward. The
+#   OOM'd rank then dropped out of the collective while its peers blocked forever
+#   in the embedding all-to-all -> ~30-min NCCL watchdog timeout -> the whole job
+#   SIGABRTs. This was a DETERMINISTIC hang (same window/step every run; e.g.
+#   yambda-5b 4-node fp16 a2a hit it at window 235 / global step 43621).
+#   See generative_recommenders/dlrm_v3/train/utils.py
+#   (_patch_fbgemm_lowmem_clamp_cast) for the full diagnosis + flight-recorder
+#   evidence.
+#
+# CORRECTNESS / PERF
+#   Numerically IDENTICAL (bit-for-bit) to stock: the single fp32->fp16 cast is
+#   the only rounding step in both orders; an fp32 value above HALF_MAX casts to
+#   +inf which clamp_ maps back to HALF_MAX, and NaNs pass through unchanged.
+#   In-place is safe because the codec encode() runs inside the qcomm autograd
+#   Function.forward with grad disabled (no graph to corrupt). No throughput
+#   regression (measured step time within run-to-run noise; it does strictly
+#   LESS memory traffic + allocation than stock).
+#
+# WHEN TO TURN OFF (set 0): only to reproduce the original OOM/hang for
+#   debugging, or to revalidate against stock after an fbgemm_gpu upgrade.
+#   Override via $QCOMM_LOWMEM_CODEC. Only takes effect when the embedding a2a is
+#   actually quantized (SPARSE_A2A_FWD/BWD not both fp32); with fp32 a2a there is
+#   no codec to patch and this knob is a no-op.
+make_optimizer_and_shard.qcomm_lowmem_clamp_cast = @qlcc/env_int()
+qlcc/env_int.key     = "QCOMM_LOWMEM_CODEC"
+qlcc/env_int.default = 1
+
 get_dataset.name = %dataset
 get_dataset.new_path_prefix = %DATA_PATH
 # Total user-interaction-history (UIH) budget per sample, distributed evenly
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index 19451078a..dae3d92e7 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -480,11 +480,94 @@ def sparse_optimizer_factory_and_class(
     return optimizer_cls, kwargs, optimizer_factory
 
 
+_FBGEMM_LOWMEM_PATCHED = False
+
+
+def _patch_fbgemm_lowmem_clamp_cast(enabled: bool = True, rank0: bool = False) -> None:
+    """Replace fbgemm's quant clamp+cast with a memory-frugal equivalent.
+
+    ``enabled`` is the gin/env-driven kill switch (see
+    ``make_optimizer_and_shard.qcomm_lowmem_clamp_cast`` /
+    ``$QCOMM_LOWMEM_CODEC``). Default ON; pass ``enabled=False`` to fall back to
+    stock fbgemm (e.g. to reproduce the pre-patch OOM, or if a future fbgemm
+    version changes the codec and the patch needs revalidation).
+
+    fbgemm's ``fp32_to_fp16_with_clamp`` (and the bf16 variant) does
+    ``torch.clamp(tensor, MIN, MAX).half()``. ``torch.clamp(...)`` allocates a
+    SECOND full-size fp32 tensor (same numel as the input) *before* the cast, so
+    the transient peak is input(fp32) + clamp_temp(fp32) + output(fp16) ~= 2.5x
+    the input. On a skewed row-wise-sharded batch the hottest shard's embedding
+    tensor is huge (observed 81.5 GiB clamp temp), and that extra fp32 copy is
+    exactly the allocation that OOMs the rank — which then exits the train loop
+    while peers block forever in the a2a (a 30-min NCCL-watchdog hang). See
+    HANG_ROOTCAUSE.md / flight-recorder dump for the full diagnosis.
+
+    Cast FIRST then clamp IN PLACE: ``tensor.half().clamp_(MIN, MAX)``. This
+    allocates only the fp16 output (no full-size fp32 temp), cutting the peak by
+    the size of the input tensor, while being numerically identical: an fp32
+    value above HALF_MAX casts to +inf, which clamp_ maps back to HALF_MAX (and
+    NaNs pass through unchanged), matching clamp-then-cast bit for bit. Safe to
+    do in place because the codec ``encode()`` runs inside the qcomm autograd
+    ``Function.forward`` (grad disabled), so there is no graph to corrupt.
+    """
+    global _FBGEMM_LOWMEM_PATCHED
+    if not enabled:
+        if rank0:
+            logger.warning(
+                "[qcomm-lowmem] DISABLED (qcomm_lowmem_clamp_cast=False / "
+                "QCOMM_LOWMEM_CODEC=0) — running stock fbgemm clamp+cast, which "
+                "allocates a full-size fp32 clamp temp and can OOM->hang the "
+                "hottest row-wise embedding shard on skewed batches."
+            )
+        return
+    if _FBGEMM_LOWMEM_PATCHED:
+        return
+    try:
+        from fbgemm_gpu import quantize_comm, quantize_utils
+
+        _HMIN = quantize_utils.TORCH_HALF_MIN
+        _HMAX = quantize_utils.TORCH_HALF_MAX
+        _BMIN = quantize_utils.TORCH_BFLOAT16_MIN
+        _BMAX = quantize_utils.TORCH_BFLOAT16_MAX
+
+        def _lowmem_fp16(tensor: torch.Tensor) -> torch.Tensor:
+            return tensor.half().clamp_(_HMIN, _HMAX)
+
+        def _lowmem_bf16(tensor: torch.Tensor) -> torch.Tensor:
+            return tensor.bfloat16().clamp_(_BMIN, _BMAX)
+
+        # Patch BOTH the definition module and quantize_comm, which imported the
+        # names directly (``from .quantize_utils import fp32_to_fp16_with_clamp``)
+        # so its module-level reference must be overridden too.
+        for _mod in (quantize_utils, quantize_comm):
+            if hasattr(_mod, "fp32_to_fp16_with_clamp"):
+                _mod.fp32_to_fp16_with_clamp = _lowmem_fp16
+            if hasattr(_mod, "fp32_to_bf16_with_clamp"):
+                _mod.fp32_to_bf16_with_clamp = _lowmem_bf16
+
+        _FBGEMM_LOWMEM_PATCHED = True
+        if rank0:
+            logger.info(
+                "[qcomm-lowmem] patched fbgemm fp32->fp16/bf16 clamp+cast to "
+                "cast-then-clamp_ (drops the full-size fp32 clamp temp; avoids "
+                "OOM on skewed row-wise embedding a2a)"
+            )
+    except Exception as e:  # noqa: BLE001 — patch is best-effort, never fatal
+        if rank0:
+            logger.warning(
+                "[qcomm-lowmem] could not patch fbgemm clamp+cast (%s: %s); "
+                "running with stock (higher-peak) quantizer",
+                type(e).__name__,
+                e,
+            )
+
+
 def _maybe_apply_qcomm_a2a(
     sharders: List[Any],
     device: torch.device,
     forward_precision: str = "fp32",
     backward_precision: str = "fp32",
+    lowmem_clamp_cast: bool = True,
 ) -> List[Any]:
     """Optionally quantize the embedding all-to-all payload via TorchRec qcomm.
 
@@ -520,6 +603,11 @@ def _maybe_apply_qcomm_a2a(
             )
     if fwd == "fp32" and bwd == "fp32":
         return sharders
+    # Before building the codec, swap fbgemm's clamp+cast for a memory-frugal
+    # equivalent — see _patch_fbgemm_lowmem_clamp_cast for why (avoids a full
+    # extra fp32 temp that OOMs the hottest row-wise shard on skewed batches).
+    # Gated by `lowmem_clamp_cast` (gin/env); ON by default.
+    _patch_fbgemm_lowmem_clamp_cast(enabled=lowmem_clamp_cast, rank0=rank0)
     try:
         from torchrec.distributed.embedding import EmbeddingCollectionSharder
         from torchrec.distributed.fbgemm_qcomm_codec import (
@@ -666,6 +754,7 @@ def make_optimizer_and_shard(
     hbm_cap_gb: int = 260,
     sparse_a2a_forward_precision: str = "fp32",
     sparse_a2a_backward_precision: str = "fp32",
+    qcomm_lowmem_clamp_cast: bool = True,
     embedding_placement: str = "auto",
     embedding_placement_overrides: Optional[Dict[str, str]] = None,
     embedding_table_configs: Optional[Dict[str, EmbeddingConfig]] = None,
@@ -691,6 +780,7 @@ def make_optimizer_and_shard(
         device,
         forward_precision=sparse_a2a_forward_precision,
         backward_precision=sparse_a2a_backward_precision,
+        lowmem_clamp_cast=qcomm_lowmem_clamp_cast,
     )
     # local_world_size = GPUs per node so the planner respects the intra-node
     # (xGMI/NVLink) vs inter-node hierarchy when placing shards. Defaults to
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 2177f446c..2994b11dd 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -325,6 +325,11 @@ orchestrate() {
       ${CKPT_PATH:+-e CKPT_PATH=$CKPT_PATH} \
       ${SPARSE_A2A_FWD:+-e SPARSE_A2A_FWD=$SPARSE_A2A_FWD} \
       ${SPARSE_A2A_BWD:+-e SPARSE_A2A_BWD=$SPARSE_A2A_BWD} \
+      ${PG_TIMEOUT_S:+-e PG_TIMEOUT_S=$PG_TIMEOUT_S} \
+      ${TORCH_NCCL_TRACE_BUFFER_SIZE:+-e TORCH_NCCL_TRACE_BUFFER_SIZE=$TORCH_NCCL_TRACE_BUFFER_SIZE} \
+      ${TORCH_NCCL_DUMP_ON_TIMEOUT:+-e TORCH_NCCL_DUMP_ON_TIMEOUT=$TORCH_NCCL_DUMP_ON_TIMEOUT} \
+      ${TORCH_NCCL_TRACE_CPP_STACK:+-e TORCH_NCCL_TRACE_CPP_STACK=$TORCH_NCCL_TRACE_CPP_STACK} \
+      ${TORCH_NCCL_DEBUG_INFO_TEMP_FILE:+-e TORCH_NCCL_DEBUG_INFO_TEMP_FILE=$TORCH_NCCL_DEBUG_INFO_TEMP_FILE} \
       -e LOG=$LOG \
       $NCCL_ENV_ARGS \
       $CONTAINER bash -lc 'cd $REPO && LAUNCH_SLURM_PHASE=worker bash scripts/launch_slurm.sh'

From a856229c7575a4c945c927e4e286db1084199d3e Mon Sep 17 00:00:00 2001
From: chriscai-amd <chriscai-amd@users.noreply.github.com>
Date: Tue, 30 Jun 2026 21:29:10 +0000
Subject: [PATCH 112/113] dlrmv3 launch_slurm: forward QCOMM_LOWMEM_CODEC env
 into the container

Completes the qcomm low-memory codec knob: without this the gin
qlcc/env_int($QCOMM_LOWMEM_CODEC) binding could never see an env override inside
the container (it only saw the gin default). Now $QCOMM_LOWMEM_CODEC set at
submit time reaches the trainer, so the patch can be toggled per-run.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 recommendation_v4/scripts/launch_slurm.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 2994b11dd..80bb3643c 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -325,6 +325,7 @@ orchestrate() {
       ${CKPT_PATH:+-e CKPT_PATH=$CKPT_PATH} \
       ${SPARSE_A2A_FWD:+-e SPARSE_A2A_FWD=$SPARSE_A2A_FWD} \
       ${SPARSE_A2A_BWD:+-e SPARSE_A2A_BWD=$SPARSE_A2A_BWD} \
+      ${QCOMM_LOWMEM_CODEC:+-e QCOMM_LOWMEM_CODEC=$QCOMM_LOWMEM_CODEC} \
       ${PG_TIMEOUT_S:+-e PG_TIMEOUT_S=$PG_TIMEOUT_S} \
       ${TORCH_NCCL_TRACE_BUFFER_SIZE:+-e TORCH_NCCL_TRACE_BUFFER_SIZE=$TORCH_NCCL_TRACE_BUFFER_SIZE} \
       ${TORCH_NCCL_DUMP_ON_TIMEOUT:+-e TORCH_NCCL_DUMP_ON_TIMEOUT=$TORCH_NCCL_DUMP_ON_TIMEOUT} \

From b8f82985b9121d4c071073337fe2cc0f7379031f Mon Sep 17 00:00:00 2001
From: chriscai-amd <chriscai-amd@users.noreply.github.com>
Date: Wed, 1 Jul 2026 03:21:53 +0000
Subject: [PATCH 113/113] dlrmv3 sharding: gin/env-configurable per-table
 embedding sharding-type overrides

Add EMB_SHARDING_OVERRIDES (gin/env) so individual embedding tables can be
pinned to a sharding type, orthogonal to the existing placement override.
Default OFF -> plan is byte-identical to the legacy all-ROW_WISE path.

Motivation: ROW_WISE routes every lookup of a hot ID to its single owner rank,
so a few popular albums/artists concentrate the embedding all-to-all onto one
rank; the burst scales ~linearly with global batch size and OOM'd the hot rank
(~208-238 GiB / 288) at window ~248 on the yambda-5b 4-node run. Moving
album_id/artist_id to COLUMN_WISE balances the a2a by rank regardless of hot-ID
skew. Validated by a reshard smoke: DCP loaded the ROW_WISE ckpt into the CW
plan cleanly, window_auc stayed ~0.78-0.80, and the hot rank sat at ~58%
(~120 GiB free) through the previously-OOM window.

- utils: _build_placement_constraints/make_optimizer_and_shard accept
  embedding_sharding_overrides and merge them into ParameterConstraints.
- gin: EMB_SHARDING_OVERRIDES env_str_map binding + rationale/example comments.
- launch_slurm: forward EMB_SHARDING_OVERRIDES (+ placement) into the container.

Co-authored-by: Cursor <cursoragent@cursor.com>
---
 .../dlrm_v3/train/gin/yambda_5b.gin           | 38 ++++++++
 .../dlrm_v3/train/utils.py                    | 88 +++++++++++++++----
 recommendation_v4/scripts/launch_slurm.sh     |  4 +
 3 files changed, 113 insertions(+), 17 deletions(-)

diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
index 2eb92b30f..5da1d31c7 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/gin/yambda_5b.gin
@@ -179,6 +179,44 @@ env_str_map.key     = "EMB_PLACEMENT_OVERRIDES"
 env_str_map.merge   = True
 env_str_map.default = {}
 
+# Per-table SHARDING-TYPE overrides (shard layout, orthogonal to placement above).
+# Absent/"auto" tables keep the planner's choice (ROW_WISE for the large yambda
+# tables). Allowed per table: "row_wise"|"column_wise"|"table_wise"|
+# "table_row_wise"|"table_column_wise"|"data_parallel"|"auto" (aliases rw/cw/tw/
+# twrw). DEFAULT OFF ({}) -> plan is byte-identical to the legacy all-ROW_WISE
+# path. Opt in per run via env, e.g.:
+#   EMB_SHARDING_OVERRIDES="album_id=column_wise,artist_id=column_wise"
+# WHY: ROW_WISE routes every lookup of row r to the single owner rank, so a few
+# hot IDs concentrate the embedding all-to-all onto one rank and OOM it (the
+# yambda-5b skew hang; album_id ~2.8x, artist_id ~1.3x per-rank load). COLUMN_WISE
+# splits the table by embedding dim (every rank holds all rows, dim/world cols),
+# so the a2a load is balanced by RANK regardless of which IDs are hot — removing
+# the value-skew OOM — at identical per-rank table bytes. Convert only the skewed
+# high-volume tables (album_id, artist_id); leave the balanced, highest-volume
+# item_id and the tiny length-1 contextual/cross tables on ROW_WISE.
+# WHY THIS GETS WORSE AT LARGER GLOBAL BATCH: the ROW_WISE a2a input buffer on the
+# owner rank is sized by how many times its hot IDs appear across the WHOLE global
+# batch, so that transient scales ~linearly with global batch size (here 32 ranks
+# x 1024 = 32768, each carrying ~4096-token UIH sequences -> tens of millions of
+# lookups/step, heavily re-hitting the same few popular albums/artists). Doubling
+# the global batch ~doubles the hot-rank burst while every other rank stays idle,
+# which is exactly what tipped GPU5/GPU3 from a saturated steady state (~208-238
+# GiB) over 288 GiB at window ~248. COLUMN_WISE makes each rank receive dim/world
+# of EVERY lookup, so the per-rank a2a volume is ~global_batch/world (balanced)
+# and grows with world size, not with which IDs are hot -> it scales cleanly as
+# you push global batch / add ranks, instead of piling the growth onto one shard.
+# Example (the yambda-5b 4-node run, global batch 32768, that hit the skew OOM):
+#   EMB_SHARDING_OVERRIDES="album_id=column_wise,artist_id=column_wise"
+# validated by the CW smoke: reshard-loaded the ROW_WISE ckpt cleanly, window_auc
+# stayed ~0.78-0.80, and the hot rank sat at ~58% (~120 GiB free) THROUGH the old
+# OOM window (vs 16 GiB free right before the ROW_WISE crash).
+# NOTE: changing the sharding plan changes the on-disk shard layout; a checkpoint
+# written under a different plan must be resharded/validated on load.
+make_optimizer_and_shard.embedding_sharding_overrides = @esh/env_str_map()
+esh/env_str_map.key     = "EMB_SHARDING_OVERRIDES"
+esh/env_str_map.merge   = True
+esh/env_str_map.default = {}
+
 # Sparse embedding all-to-all wire precision. The embedding shuffle is the
 # dominant, bandwidth-bound (esp. multi-node) collective; quantizing it via
 # TorchRec QCommsConfig halves (bf16/fp16, both 2 bytes) the wire volume.
diff --git a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
index dae3d92e7..f226595a8 100644
--- a/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
+++ b/recommendation_v4/generative_recommenders/dlrm_v3/train/utils.py
@@ -60,7 +60,7 @@
 from torchrec.distributed.planner import EmbeddingShardingPlanner, Topology
 from torchrec.distributed.planner.types import ParameterConstraints
 from torchrec.distributed.sharding_plan import get_default_sharders
-from torchrec.distributed.types import ShardedTensor, ShardingEnv
+from torchrec.distributed.types import ShardedTensor, ShardingEnv, ShardingType
 from torchrec.modules.embedding_configs import EmbeddingConfig
 from torchrec.modules.embedding_modules import (
     EmbeddingBagCollection,
@@ -87,6 +87,25 @@
     "uvm_caching": EmbeddingComputeKernel.FUSED_UVM_CACHING,
 }
 
+# Per-table sharding-type vocabulary -> torchrec ShardingType. Used by
+# make_optimizer_and_shard to pin a table's shard layout via ParameterConstraints
+# (e.g. move a hot, data-skewed table off ROW_WISE to COLUMN_WISE so its
+# embedding all-to-all load is balanced by rank instead of routed by row/value).
+# "auto" (or anything not in this map) means "no constraint": the planner decides.
+# Short aliases (rw/cw/tw/twrw) are accepted alongside the canonical names.
+_SHARDING_TO_TYPE: Dict[str, ShardingType] = {
+    "row_wise": ShardingType.ROW_WISE,
+    "column_wise": ShardingType.COLUMN_WISE,
+    "table_wise": ShardingType.TABLE_WISE,
+    "table_row_wise": ShardingType.TABLE_ROW_WISE,
+    "table_column_wise": ShardingType.TABLE_COLUMN_WISE,
+    "data_parallel": ShardingType.DATA_PARALLEL,
+    "rw": ShardingType.ROW_WISE,
+    "cw": ShardingType.COLUMN_WISE,
+    "tw": ShardingType.TABLE_WISE,
+    "twrw": ShardingType.TABLE_ROW_WISE,
+}
+
 
 @gin.configurable
 def seed_everything(seed: int = -1, rank: int = 0) -> None:
@@ -689,14 +708,27 @@ def _build_placement_constraints(
     embedding_placement: str,
     embedding_placement_overrides: Dict[str, str],
     embedding_table_configs: Optional[Dict[str, EmbeddingConfig]],
+    embedding_sharding_overrides: Optional[Dict[str, str]] = None,
 ) -> Dict[str, ParameterConstraints]:
-    """Translate gin/env placement strings into torchrec ParameterConstraints.
-
-    Resolution per table: ``overrides.get(name, embedding_placement)``. A value
-    of ``auto`` (or empty) means "no constraint" and the table is omitted so the
-    planner keeps deciding from the HBM cap. Unknown values raise ValueError.
+    """Translate gin/env placement + sharding strings into ParameterConstraints.
+
+    Two orthogonal per-table knobs are merged into one constraint per table:
+
+    * Placement (compute kernel / memory tier):
+      ``embedding_placement_overrides.get(name, embedding_placement)``.
+      ``auto``/empty -> no compute-kernel constraint (planner decides from HBM).
+    * Sharding type (shard layout): ``embedding_sharding_overrides.get(name)``.
+      ``auto``/empty (or absent) -> no sharding-type constraint (planner decides,
+      which is ROW_WISE for the large yambda tables). Use e.g. ``column_wise``
+      to move a hot, data-skewed table off ROW_WISE so its embedding all-to-all
+      is balanced by rank instead of routed by (hot) row.
+
+    A table is added to the returned dict only if at least one knob is set for it
+    (so with everything ``auto`` we return {} and the plan is byte-identical to
+    the legacy path). Unknown values raise ValueError.
     """
-    valid = set(_PLACEMENT_TO_KERNEL) | {"auto", ""}
+    embedding_sharding_overrides = embedding_sharding_overrides or {}
+    valid_place = set(_PLACEMENT_TO_KERNEL) | {"auto", ""}
     for where, val in [
         ("embedding_placement", embedding_placement),
         *[
@@ -704,14 +736,24 @@ def _build_placement_constraints(
             for k, v in embedding_placement_overrides.items()
         ],
     ]:
-        if val not in valid:
+        if val not in valid_place:
             raise ValueError(
                 f"Invalid embedding placement {val!r} for {where}; "
-                f"expected one of {sorted(valid - {''})}."
+                f"expected one of {sorted(valid_place - {''})}."
+            )
+    valid_shard = set(_SHARDING_TO_TYPE) | {"auto", ""}
+    for k, v in embedding_sharding_overrides.items():
+        if v not in valid_shard:
+            raise ValueError(
+                f"Invalid embedding sharding {v!r} for "
+                f"embedding_sharding_overrides[{k}]; "
+                f"expected one of {sorted(valid_shard - {''})}."
             )
 
     names = _embedding_table_names(model, embedding_table_configs)
-    unknown = set(embedding_placement_overrides) - set(names)
+    unknown = (
+        set(embedding_placement_overrides) | set(embedding_sharding_overrides)
+    ) - set(names)
     if unknown:
         logger.warning(
             "[emb-placement] override(s) for unknown table(s) %s ignored; "
@@ -721,24 +763,34 @@ def _build_placement_constraints(
         )
 
     constraints: Dict[str, ParameterConstraints] = {}
-    resolved: Dict[str, str] = {}
+    resolved_place: Dict[str, str] = {}
+    resolved_shard: Dict[str, str] = {}
     for name in names:
         placement = embedding_placement_overrides.get(name, embedding_placement)
-        resolved[name] = placement or "auto"
+        sharding = embedding_sharding_overrides.get(name, "auto")
+        resolved_place[name] = placement or "auto"
+        resolved_shard[name] = sharding or "auto"
         kernel = _PLACEMENT_TO_KERNEL.get(placement)
+        stype = _SHARDING_TO_TYPE.get(sharding)
+        kwargs: Dict[str, Any] = {}
         if kernel is not None:
-            constraints[name] = ParameterConstraints(
-                compute_kernels=[kernel.value]
-            )
+            kwargs["compute_kernels"] = [kernel.value]
+        if stype is not None:
+            kwargs["sharding_types"] = [stype.value]
+        if kwargs:
+            constraints[name] = ParameterConstraints(**kwargs)
 
     rank = dist.get_rank() if dist.is_initialized() else 0
     if rank == 0:
         logger.info(
-            "[emb-placement] global=%r overrides=%s -> resolved=%s "
+            "[emb-placement] placement(global=%r overrides=%s) sharding(overrides=%s) "
+            "-> resolved_placement=%s resolved_sharding=%s "
             "(constrained=%d/%d tables; the rest are planner-auto)",
             embedding_placement,
             embedding_placement_overrides or {},
-            resolved,
+            embedding_sharding_overrides or {},
+            resolved_place,
+            resolved_shard,
             len(constraints),
             len(names),
         )
@@ -757,6 +809,7 @@ def make_optimizer_and_shard(
     qcomm_lowmem_clamp_cast: bool = True,
     embedding_placement: str = "auto",
     embedding_placement_overrides: Optional[Dict[str, str]] = None,
+    embedding_sharding_overrides: Optional[Dict[str, str]] = None,
     embedding_table_configs: Optional[Dict[str, EmbeddingConfig]] = None,
 ) -> Tuple[DistributedModelParallel, torch.optim.Optimizer]:
     dense_opt_cls, dense_opt_args, dense_opt_factory = (
@@ -802,6 +855,7 @@ def make_optimizer_and_shard(
         model=model,
         embedding_placement=embedding_placement,
         embedding_placement_overrides=embedding_placement_overrides or {},
+        embedding_sharding_overrides=embedding_sharding_overrides or {},
         embedding_table_configs=embedding_table_configs,
     )
     planner = EmbeddingShardingPlanner(
diff --git a/recommendation_v4/scripts/launch_slurm.sh b/recommendation_v4/scripts/launch_slurm.sh
index 80bb3643c..cb171593c 100755
--- a/recommendation_v4/scripts/launch_slurm.sh
+++ b/recommendation_v4/scripts/launch_slurm.sh
@@ -289,6 +289,7 @@ orchestrate() {
       ${EVAL_EVERY_DATA_PCT:+-e EVAL_EVERY_DATA_PCT=$EVAL_EVERY_DATA_PCT} \
       -e NUM_TRAIN_BATCHES=$NUM_TRAIN_BATCHES \
       -e NUM_EVAL_BATCHES=$NUM_EVAL_BATCHES \
+      ${DIE_AT_STEP:+-e DIE_AT_STEP=$DIE_AT_STEP} \
       -e METRIC_LOG_FREQ=$METRIC_LOG_FREQ \
       ${MLPERF_LOGGING:+-e MLPERF_LOGGING=$MLPERF_LOGGING} \
       ${MLPERF_TRAIN_LOSS_LOG_FREQ:+-e MLPERF_TRAIN_LOSS_LOG_FREQ=$MLPERF_TRAIN_LOSS_LOG_FREQ} \
@@ -326,6 +327,9 @@ orchestrate() {
       ${SPARSE_A2A_FWD:+-e SPARSE_A2A_FWD=$SPARSE_A2A_FWD} \
       ${SPARSE_A2A_BWD:+-e SPARSE_A2A_BWD=$SPARSE_A2A_BWD} \
       ${QCOMM_LOWMEM_CODEC:+-e QCOMM_LOWMEM_CODEC=$QCOMM_LOWMEM_CODEC} \
+      ${EMB_SHARDING_OVERRIDES:+-e EMB_SHARDING_OVERRIDES=$EMB_SHARDING_OVERRIDES} \
+      ${EMB_PLACEMENT_OVERRIDES:+-e EMB_PLACEMENT_OVERRIDES=$EMB_PLACEMENT_OVERRIDES} \
+      ${EMB_PLACEMENT:+-e EMB_PLACEMENT=$EMB_PLACEMENT} \
       ${PG_TIMEOUT_S:+-e PG_TIMEOUT_S=$PG_TIMEOUT_S} \
       ${TORCH_NCCL_TRACE_BUFFER_SIZE:+-e TORCH_NCCL_TRACE_BUFFER_SIZE=$TORCH_NCCL_TRACE_BUFFER_SIZE} \
       ${TORCH_NCCL_DUMP_ON_TIMEOUT:+-e TORCH_NCCL_DUMP_ON_TIMEOUT=$TORCH_NCCL_DUMP_ON_TIMEOUT} \